From b449f1bd00a3e9340fe5413cd4e3a4c9ea4c387b Mon Sep 17 00:00:00 2001 From: harvey Date: Thu, 25 Dec 2025 12:16:59 -0500 Subject: [PATCH 1/7] feat: creates rust pyo3 bindings for AGON encodings --- Cargo.toml | 24 + crates/agon-core/Cargo.toml | 20 + crates/agon-core/src/error.rs | 102 ++ crates/agon-core/src/formats/columns.rs | 1313 +++++++++++++++++ crates/agon-core/src/formats/mod.rs | 357 +++++ crates/agon-core/src/formats/struct_fmt.rs | 1504 ++++++++++++++++++++ crates/agon-core/src/formats/text.rs | 1339 +++++++++++++++++ crates/agon-core/src/lib.rs | 503 +++++++ crates/agon-core/src/types.rs | 130 ++ crates/agon-core/src/utils.rs | 30 + pyproject.toml | 38 +- python/agon/__init__.py | 32 + {src => python}/agon/core.py | 56 +- {src => python}/agon/encoding.py | 0 python/agon/errors.py | 15 + {src => python}/agon/py.typed | 0 src/agon/__init__.py | 23 - src/agon/errors.py | 23 - src/agon/formats/__init__.py | 14 - src/agon/formats/base.py | 93 -- src/agon/formats/columns.py | 895 ------------ src/agon/formats/struct.py | 1070 -------------- src/agon/formats/text.py | 837 ----------- 23 files changed, 5415 insertions(+), 3003 deletions(-) create mode 100644 Cargo.toml create mode 100644 crates/agon-core/Cargo.toml create mode 100644 crates/agon-core/src/error.rs create mode 100644 crates/agon-core/src/formats/columns.rs create mode 100644 crates/agon-core/src/formats/mod.rs create mode 100644 crates/agon-core/src/formats/struct_fmt.rs create mode 100644 crates/agon-core/src/formats/text.rs create mode 100644 crates/agon-core/src/lib.rs create mode 100644 crates/agon-core/src/types.rs create mode 100644 crates/agon-core/src/utils.rs create mode 100644 python/agon/__init__.py rename {src => python}/agon/core.py (87%) rename {src => python}/agon/encoding.py (100%) create mode 100644 python/agon/errors.py rename {src => python}/agon/py.typed (100%) delete mode 100644 src/agon/__init__.py delete mode 100644 src/agon/errors.py delete mode 100644 
src/agon/formats/__init__.py delete mode 100644 src/agon/formats/base.py delete mode 100644 src/agon/formats/columns.py delete mode 100644 src/agon/formats/struct.py delete mode 100644 src/agon/formats/text.py diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..6b9d10d --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,24 @@ +[workspace] +resolver = "2" +members = ["crates/*"] + +[workspace.package] +version = "0.1.0" +edition = "2021" +license = "MIT" +authors = ["Harvey Tseng "] +repository = "https://github.com/Verdenroz/agon-python" + +[workspace.dependencies] +pyo3 = { version = "0.27.2", features = ["extension-module"] } +serde = { version = "1.0", features = ["derive"] } +serde_json = { version = "1.0", features = ["preserve_order"] } +rayon = "1.10" +thiserror = "2.0" +regex = "1.11" +tiktoken-rs = "0.9.1" + +[profile.release] +lto = true +codegen-units = 1 +opt-level = 3 diff --git a/crates/agon-core/Cargo.toml b/crates/agon-core/Cargo.toml new file mode 100644 index 0000000..1247274 --- /dev/null +++ b/crates/agon-core/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "agon-core" +version.workspace = true +edition.workspace = true +license.workspace = true +authors.workspace = true +description = "Rust core for AGON encoding formats" + +[lib] +name = "agon_core" +crate-type = ["cdylib", "rlib"] + +[dependencies] +pyo3.workspace = true +serde.workspace = true +serde_json.workspace = true +rayon.workspace = true +thiserror.workspace = true +regex.workspace = true +tiktoken-rs.workspace = true diff --git a/crates/agon-core/src/error.rs b/crates/agon-core/src/error.rs new file mode 100644 index 0000000..337812d --- /dev/null +++ b/crates/agon-core/src/error.rs @@ -0,0 +1,102 @@ +//! 
Error types for AGON encoding/decoding

use pyo3::exceptions::PyValueError;
use pyo3::PyErr;
use thiserror::Error;

/// Every failure mode surfaced by the AGON encoders/decoders.
#[derive(Error, Debug)]
pub enum AgonError {
    #[error("Invalid AGON format: {0}")]
    InvalidFormat(String),

    #[error("Encoding error: {0}")]
    EncodingError(String),

    #[error("Decoding error: {0}")]
    DecodingError(String),

    #[error("JSON error: {0}")]
    JsonError(#[from] serde_json::Error),

    #[error("Invalid data structure: {0}")]
    InvalidData(String),

    #[error("Parse error at line {line}: {message}")]
    ParseError { line: usize, message: String },

    #[error("Python error: {0}")]
    PyError(String),
}

// Surface any AgonError to Python as a ValueError carrying the Display text.
impl From<AgonError> for PyErr {
    fn from(err: AgonError) -> PyErr {
        PyValueError::new_err(err.to_string())
    }
}

// Wrap Python-side failures so Rust code can propagate them uniformly.
impl From<PyErr> for AgonError {
    fn from(err: PyErr) -> AgonError {
        AgonError::PyError(err.to_string())
    }
}

/// Crate-wide result alias.
pub type Result<T> = std::result::Result<T, AgonError>;

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_invalid_format_error() {
        let err = AgonError::InvalidFormat("unknown".to_string());
        assert_eq!(err.to_string(), "Invalid AGON format: unknown");
    }

    #[test]
    fn test_encoding_error() {
        let err = AgonError::EncodingError("failed to encode".to_string());
        assert_eq!(err.to_string(), "Encoding error: failed to encode");
    }

    #[test]
    fn test_decoding_error() {
        let err = AgonError::DecodingError("invalid payload".to_string());
        assert_eq!(err.to_string(), "Decoding error: invalid payload");
    }

    #[test]
    fn test_invalid_data_error() {
        let err = AgonError::InvalidData("bad structure".to_string());
        assert_eq!(err.to_string(), "Invalid data structure: bad structure");
    }

    #[test]
    fn test_parse_error() {
        let err = AgonError::ParseError {
            line: 42,
            message: "unexpected token".to_string(),
        };
        assert_eq!(err.to_string(), "Parse error at line 42: unexpected token");
    }

    #[test]
    fn test_py_error() {
        let err = AgonError::PyError("Python exception".to_string());
        assert_eq!(err.to_string(), "Python error: Python exception");
    }

    #[test]
    fn test_json_error_from() {
        // Create a JSON parse error
        let json_err = serde_json::from_str::<serde_json::Value>("invalid json").unwrap_err();
        let agon_err: AgonError = json_err.into();
        assert!(agon_err.to_string().contains("JSON error"));
    }

    #[test]
    fn test_error_debug_format() {
        let err = AgonError::InvalidFormat("test".to_string());
        let debug_str = format!("{:?}", err);
        assert!(debug_str.contains("InvalidFormat"));
    }
}
diff --git a/crates/agon-core/src/formats/columns.rs b/crates/agon-core/src/formats/columns.rs
new file mode 100644
index 0000000..420500f
--- /dev/null
+++ b/crates/agon-core/src/formats/columns.rs
@@ -0,0 +1,1313 @@
//! AGONColumns format encoder/decoder
//!
//! Columnar encoding with type clustering for wide tables.
//!
//! Format structure:
//!   @AGON columns
//!   name[N]
//!   ├ field1: val1<TAB>val2...
//!   ├ field2: val1<TAB>val2...
//!   └ fieldN: val1<TAB>val2...

use serde_json::{Map, Value};

use crate::error::{AgonError, Result};

const HEADER: &str = "@AGON columns";
const DEFAULT_DELIMITER: &str = "\t";
const INDENT: &str = "  ";

/// Encode data to AGONColumns format.
///
/// `include_header` prepends the `@AGON columns` banner plus a blank line.
pub fn encode(data: &Value, include_header: bool) -> Result<String> {
    let mut lines = Vec::new();
    let delimiter = DEFAULT_DELIMITER;

    if include_header {
        lines.push(HEADER.to_string());
        lines.push(String::new());
    }

    encode_value(data, &mut lines, 0, delimiter, None);

    Ok(lines.join("\n"))
}

/// Decode an AGONColumns payload back into a JSON value.
///
/// Errors on an empty payload or a missing `@AGON columns` header;
/// a header with no body decodes to `Value::Null`.
pub fn decode(payload: &str) -> Result<Value> {
    let lines: Vec<&str> = payload.lines().collect();
    if lines.is_empty() {
        return Err(AgonError::DecodingError("Empty payload".to_string()));
    }

    let mut idx = 0;

    // Parse header
    let header_line = lines[idx].trim();
    if !header_line.starts_with("@AGON columns") {
        return Err(AgonError::DecodingError(format!(
            "Invalid header: {}",
            header_line
        )));
    }
    idx += 1;

    // Skip blank lines
    while
idx < lines.len() && lines[idx].trim().is_empty() {
        idx += 1;
    }

    if idx >= lines.len() {
        return Ok(Value::Null);
    }

    let (result, _) = decode_value(&lines, idx, 0, DEFAULT_DELIMITER)?;
    Ok(result)
}

// ============================================================================
// Encoding helpers
// ============================================================================

/// Render a single JSON primitive as AGON text.
/// Strings are quoted (with escapes) whenever `needs_quote` says the bare
/// form would be ambiguous.
fn format_primitive(val: &Value) -> String {
    match val {
        Value::Null => "null".to_string(),
        Value::Bool(b) => b.to_string(),
        Value::Number(n) => n.to_string(),
        Value::String(s) => {
            // Quote if contains delimiter, special chars, or could be parsed as another type
            if needs_quote(s) {
                format!(
                    "\"{}\"",
                    s.replace('\\', "\\\\")
                        .replace('"', "\\\"")
                        .replace('\n', "\\n")
                        .replace('\t', "\\t")
                )
            } else {
                s.clone()
            }
        }
        // Arrays/objects never reach here from the encoders; fall back to JSON.
        _ => serde_json::to_string(val).unwrap_or_default(),
    }
}

/// Check if a string needs quoting to preserve its type on roundtrip.
fn needs_quote(s: &str) -> bool {
    if s.is_empty() {
        return true;
    }
    // Strings with leading/trailing whitespace need quoting
    if s != s.trim() {
        return true;
    }
    // Delimiter and special chars
    if s.contains('\t') || s.contains('\n') || s.contains('\\') || s.contains('"') {
        return true;
    }
    // Tree-drawing / markup chars at start would be misread by the decoder
    if ['├', '└', '|', '@', '#', '-'].iter().any(|&c| s.starts_with(c)) {
        return true;
    }
    // Boolean/null keywords (case-insensitive)
    if matches!(s.to_lowercase().as_str(), "true" | "false" | "null") {
        return true;
    }
    // Looks like a number - needs quoting to preserve string type
    s.parse::<i64>().is_ok() || s.parse::<f64>().is_ok()
}

/// Parse an AGON scalar token back into a JSON value.
///
/// Order matters: quoted string, then keywords, then integer, then float;
/// anything else stays a bare string.
fn parse_primitive(s: &str) -> Value {
    let s = s.trim();
    if s.is_empty() {
        return Value::Null;
    }

    // Quoted string. The `len() >= 2` guard fixes a panic: a lone `"` token
    // satisfies both starts_with and ends_with on the SAME character, and
    // `&s[1..s.len() - 1]` would slice `[1..0]` and panic at runtime.
    if s.len() >= 2 && s.starts_with('"') && s.ends_with('"') {
        let inner = &s[1..s.len() - 1];
        // NOTE(review): sequential replaces can mis-handle overlapping escapes
        // such as `\\n`; kept as-is to preserve existing wire behavior.
        return Value::String(
            inner
                .replace("\\n", "\n")
                .replace("\\t", "\t")
                .replace("\\\"", "\"")
                .replace("\\\\", "\\"),
        );
    }

    // Boolean/null (case-insensitive)
    match s.to_lowercase().as_str() {
        "null" => return Value::Null,
        "true" => return Value::Bool(true),
        "false" => return Value::Bool(false),
        _ => {}
    }

    // Number: prefer integers, fall back to representable floats
    if let Ok(i) = s.parse::<i64>() {
        return Value::Number(i.into());
    }
    if let Ok(f) = s.parse::<f64>() {
        if let Some(n) = serde_json::Number::from_f64(f) {
            return Value::Number(n);
        }
    }

    Value::String(s.to_string())
}

/// Parse a columnar cell value.
/// Returns None for empty/missing cells, Some(value) for present values
/// (including explicit "null").
fn parse_columnar_cell(s: &str) -> Option<Value> {
    let trimmed = s.trim();
    if trimmed.is_empty() {
        // Empty cell means field is missing (absent from object)
        return None;
    }
    // Non-empty cell means field is present (could be explicit "null")
    Some(parse_primitive(s))
}

/// Decide whether `arr` can be emitted in columnar form: every element must
/// be an object whose values are all primitives. Returns the union of keys
/// in first-seen order.
fn is_uniform_array(arr: &[Value]) -> (bool, Vec<String>) {
    if arr.is_empty() {
        return (false, vec![]);
    }

    if !arr.iter().all(|v| v.is_object()) {
        return (false, vec![]);
    }

    // Check all values are primitives
    for obj in arr {
        if let Some(map) = obj.as_object() {
            if map.values().any(|v| v.is_object() || v.is_array()) {
                return (false, vec![]);
            }
        }
    }

    // Collect keys in first-seen order across all rows
    let mut key_order = Vec::new();
    for obj in arr {
        if let Some(map) = obj.as_object() {
            for k in map.keys() {
                if !key_order.contains(k) {
                    key_order.push(k.clone());
                }
            }
        }
    }

    (true, key_order)
}

/// Dispatch a value to the primitive / array / object encoders.
fn encode_value(
    val: &Value,
    lines: &mut Vec<String>,
    depth: usize,
    delimiter: &str,
    name: Option<&str>,
) {
    let indent = INDENT.repeat(depth);

    match val {
        Value::Null | Value::Bool(_) | Value::Number(_) | Value::String(_) => {
            let encoded = format_primitive(val);
            if let Some(n) = name {
                lines.push(format!("{}{}: {}", indent, n, encoded));
            } else {
lines.push(format!("{}{}", indent, encoded));
            }
        }
        Value::Array(arr) => {
            encode_array(arr, lines, depth, delimiter, name);
        }
        Value::Object(obj) => {
            encode_object(obj, lines, depth, delimiter, name);
        }
    }
}

/// Encode an array, choosing the best layout:
/// columnar (uniform primitive objects), inline (all primitives),
/// or a `- item` list for mixed/nested content.
fn encode_array(
    arr: &[Value],
    lines: &mut Vec<String>,
    depth: usize,
    delimiter: &str,
    name: Option<&str>,
) {
    let indent = INDENT.repeat(depth);

    if arr.is_empty() {
        if let Some(n) = name {
            lines.push(format!("{}{}[0]", indent, n));
        } else {
            lines.push(format!("{}[0]", indent));
        }
        return;
    }

    // Check for uniform objects (columnar format)
    let (is_uniform, fields) = is_uniform_array(arr);
    if is_uniform && !fields.is_empty() {
        // Columnar header
        if let Some(n) = name {
            lines.push(format!("{}{}[{}]", indent, n, arr.len()));
        } else {
            lines.push(format!("{}[{}]", indent, arr.len()));
        }

        // Output each field as one column row; missing cells stay empty.
        let total_fields = fields.len();
        for (i, field) in fields.iter().enumerate() {
            let values: Vec<String> = arr
                .iter()
                .map(|obj| {
                    obj.as_object()
                        .and_then(|m| m.get(field))
                        .map(format_primitive)
                        .unwrap_or_default()
                })
                .collect();

            // └ marks the final column so the decoder knows where to stop.
            let prefix = if i == total_fields - 1 { "└" } else { "├" };
            lines.push(format!(
                "{}{} {}: {}",
                indent,
                prefix,
                field,
                values.join(delimiter)
            ));
        }
        return;
    }

    // Primitive array (inline)
    if arr.iter().all(|v| !v.is_object() && !v.is_array()) {
        let values: Vec<String> = arr.iter().map(format_primitive).collect();
        if let Some(n) = name {
            lines.push(format!(
                "{}{}[{}]: {}",
                indent,
                n,
                arr.len(),
                values.join(delimiter)
            ));
        } else {
            lines.push(format!(
                "{}[{}]: {}",
                indent,
                arr.len(),
                values.join(delimiter)
            ));
        }
        return;
    }

    // Mixed/nested - use list item format
    if let Some(n) = name {
        lines.push(format!("{}{}[{}]:", indent, n, arr.len()));
    } else {
        lines.push(format!("{}[{}]:", indent, arr.len()));
    }
    for item in arr {
        match item {
            Value::Object(obj) => {
                encode_list_item_object(obj, lines, depth + 1, delimiter);
            }
            _ => {
                // Items sit one indent level below the header so
                // get_indent_depth (spaces / 2) places them at depth + 1.
                lines.push(format!("{}  - {}", indent, format_primitive(item)));
            }
        }
    }
}

/// Encode an object as a list item (`- key: value` format).
fn encode_list_item_object(
    obj: &Map<String, Value>,
    lines: &mut Vec<String>,
    depth: usize,
    delimiter: &str,
) {
    let indent = INDENT.repeat(depth);
    let mut first = true;

    for (k, v) in obj {
        // "- " introduces the item on its first key; later keys align under it.
        let prefix = if first {
            format!("{}- ", indent)
        } else {
            format!("{}  ", indent)
        };
        first = false;

        match v {
            Value::Object(nested) => {
                lines.push(format!("{}{}:", prefix, k));
                for (nk, nv) in nested {
                    match nv {
                        Value::Object(_) | Value::Array(_) => {
                            encode_value(nv, lines, depth + 2, delimiter, Some(nk));
                        }
                        _ => {
                            // Two levels below the item marker (depth + 2).
                            lines.push(format!(
                                "{}    {}: {}",
                                indent,
                                nk,
                                format_primitive(nv)
                            ));
                        }
                    }
                }
            }
            Value::Array(arr) => {
                lines.push(format!("{}{}:", prefix, k));
                encode_array(arr, lines, depth + 2, delimiter, None);
            }
            _ => {
                lines.push(format!("{}{}: {}", prefix, k, format_primitive(v)));
            }
        }
    }
}

/// Encode an object; a named object gains one indent level for its body.
fn encode_object(
    obj: &Map<String, Value>,
    lines: &mut Vec<String>,
    depth: usize,
    delimiter: &str,
    name: Option<&str>,
) {
    let indent = INDENT.repeat(depth);
    let mut actual_depth = depth;

    if let Some(n) = name {
        lines.push(format!("{}{}:", indent, n));
        actual_depth += 1;
    }

    let actual_indent = INDENT.repeat(actual_depth);

    for (k, v) in obj {
        match v {
            Value::Object(_) | Value::Array(_) => {
                encode_value(v, lines, actual_depth, delimiter, Some(k));
            }
            _ => {
                lines.push(format!("{}{}: {}", actual_indent, k, format_primitive(v)));
            }
        }
    }
}

// ============================================================================
// Decoding helpers
// ============================================================================

/// Indentation depth in 2-space units (leading ASCII spaces / 2).
fn get_indent_depth(line: &str) -> usize {
    let stripped = line.trim_start_matches(' ');
    (line.len() - stripped.len()) / 2
}

/// Decode a value starting at `idx`, dispatching on the line shape:
/// `[N]` arrays (columnar / inline / list) or `key: value` objects.
fn decode_value(
    lines: &[&str],
    idx: usize,
    _depth: usize,
    delimiter: &str,
) -> Result<(Value, usize)> {
    if idx >= lines.len() {
        return Ok((Value::Null, idx));
    }

    let line = lines[idx].trim();
    let base_depth = get_indent_depth(lines[idx]);

    // Check for array patterns: [N], [N]:, name[N], name[N]:
    if let Some(bracket_pos) = line.find('[') {
        if let Some(end_pos) = line.find(']') {
            if end_pos > bracket_pos {
                let name = &line[..bracket_pos];
                let count_str = &line[bracket_pos + 1..end_pos];
                if let Ok(count) = count_str.parse::<usize>() {
                    // If this is a named array (name[N]), it's part of an object
                    // Delegate to decode_object to parse the full object
                    if !name.is_empty() {
                        return decode_object(lines, idx, delimiter);
                    }

                    // Unnamed array: [N]
                    // Check if next line has ├ or └ (columnar format)
                    if idx + 1 < lines.len() {
                        let next = lines[idx + 1].trim();
                        if next.starts_with('├') || next.starts_with('└') {
                            return decode_columnar_array(lines, idx, "", count, delimiter);
                        }
                    }

                    // Check for inline primitive array: [N]: val1\tval2
                    if let Some(colon_pos) = line.find("]:") {
                        let values_str = line[colon_pos + 2..].trim();
                        if !values_str.is_empty() {
                            let values: Vec<Value> =
                                values_str.split(delimiter).map(parse_primitive).collect();
                            return Ok((Value::Array(values), idx + 1));
                        }
                        // Empty values after colon means list array: [N]:
                        return decode_list_array(lines, idx, base_depth, count, delimiter);
                    }

                    // Bare [N] with no colon - could be empty array or non-columnar array
                    if count == 0 {
                        return Ok((Value::Array(vec![]), idx + 1));
                    }
                    // Check if next line is a list item
                    if idx + 1 < lines.len() {
                        let next = lines[idx + 1].trim();
                        if next.starts_with("- ") {
                            return decode_list_array(lines, idx, base_depth, count, delimiter);
                        }
                    }
                    // No colon, no columnar, no list - it's an empty array
                    return Ok((Value::Array(vec![]), idx + 1));
                }
            }
        }
    }

    // Check for key: value
    if line.contains(':') {
        return
decode_object(lines, idx, delimiter);
    }

    Ok((Value::Null, idx + 1))
}

/// Decode a columnar array: header line, then `├ field: v1\tv2...` rows,
/// terminated by the `└` row. Columns are transposed back into row objects;
/// empty cells mean the field is absent from that row.
fn decode_columnar_array(
    lines: &[&str],
    idx: usize,
    name: &str,
    count: usize,
    delimiter: &str,
) -> Result<(Value, usize)> {
    let mut fields: Vec<String> = Vec::new();
    // Each column stores Option<Value>: None = missing, Some(v) = present (including explicit null)
    let mut columns: Vec<Vec<Option<Value>>> = Vec::new();

    let mut idx = idx + 1;

    // Parse columnar lines (├ field: val1\tval2... or └ field: val1\tval2...)
    while idx < lines.len() {
        let line = lines[idx].trim();

        let field_line = if let Some(rest) = line.strip_prefix('├') {
            Some(rest.trim())
        } else {
            line.strip_prefix('└').map(|rest| rest.trim())
        };

        if let Some(content) = field_line {
            if let Some(colon_pos) = content.find(':') {
                let field = content[..colon_pos].trim();
                // Don't strip trailing whitespace - it's part of delimiter for empty cells
                let values_str = content[colon_pos + 1..].trim_start();

                fields.push(field.to_string());

                let values: Vec<Option<Value>> = if values_str.is_empty() {
                    vec![]
                } else {
                    split_column_values(values_str, delimiter)
                        .iter()
                        .map(|s| parse_columnar_cell(s))
                        .collect()
                };
                columns.push(values);
            }
            idx += 1;

            // └ marks the last column of this array.
            if line.starts_with('└') {
                break;
            }
        } else {
            break;
        }
    }

    // Transpose columns to rows, preserving field order
    let mut result: Vec<Value> = Vec::with_capacity(count);
    for i in 0..count {
        let mut obj = Map::new();
        for (j, field) in fields.iter().enumerate() {
            if let Some(col) = columns.get(j) {
                // Only insert if value is present (Some(Some(val))), skip if missing
                if let Some(Some(val)) = col.get(i) {
                    obj.insert(field.clone(), val.clone());
                }
            }
        }
        result.push(Value::Object(obj));
    }

    let arr = Value::Array(result);
    if name.is_empty() {
        Ok((arr, idx))
    } else {
        let mut wrapper = Map::new();
        wrapper.insert(name.to_string(), arr);
        Ok((Value::Object(wrapper), idx))
    }
}

/// Split column values on `delimiter`, respecting quoted cells.
///
/// Fixes vs. the previous revision:
/// - A backslash escape inside a quoted cell (e.g. `"a\"b"`) no longer
///   terminates the quote early; the escape pair is consumed together so the
///   embedded delimiter stays inside the cell.
/// - For multi-character delimiters, a partial delimiter match previously
///   pushed the consumed characters into the cell AND fell through to push
///   the first character a second time, duplicating it; the consumed run is
///   now kept exactly once.
fn split_column_values(values_str: &str, delimiter: &str) -> Vec<String> {
    let mut result = Vec::new();
    let mut current = String::new();
    let mut in_quote = false;
    let mut chars = values_str.chars().peekable();
    let delim_chars: Vec<char> = delimiter.chars().collect();

    while let Some(c) = chars.next() {
        // Delimiters are only recognized outside quotes.
        if !in_quote && c == delim_chars[0] {
            if delim_chars.len() == 1 {
                result.push(std::mem::take(&mut current));
                continue;
            }
            // Multi-char delimiter: try to consume the remaining chars.
            let mut pending = String::new();
            pending.push(c);
            let mut matched = true;
            for &dc in &delim_chars[1..] {
                match chars.peek() {
                    Some(&nc) if nc == dc => pending.push(chars.next().unwrap()),
                    _ => {
                        matched = false;
                        break;
                    }
                }
            }
            if matched {
                result.push(std::mem::take(&mut current));
            } else {
                // Not a delimiter after all: keep the consumed run exactly once.
                current.push_str(&pending);
            }
            continue;
        }

        match c {
            '"' => {
                in_quote = !in_quote;
                current.push(c);
            }
            '\\' if in_quote => {
                // Keep the escape pair together so an escaped quote does not
                // close the cell prematurely.
                current.push(c);
                if let Some(nc) = chars.next() {
                    current.push(nc);
                }
            }
            _ => current.push(c),
        }
    }

    result.push(current);
    result
}

/// Decode `key: value` lines at a shared indent level into an object,
/// recursing for nested values and delegating `name[N]` patterns to the
/// array decoders.
fn decode_object(lines: &[&str], idx: usize, delimiter: &str) -> Result<(Value, usize)> {
    let mut result = Map::new();
    let base_depth = get_indent_depth(lines[idx]);
    let mut idx = idx;

    while idx < lines.len() {
        let line = lines[idx];
        if line.trim().is_empty() {
            idx += 1;
            continue;
        }

        let line_depth = get_indent_depth(line);
        if line_depth < base_depth {
            break;
        }

        let stripped = line.trim();

        // Check for array patterns: name[N] or name[N]: values
        if let Some(bracket_pos) = stripped.find('[') {
            if let Some(end_pos) = stripped.find(']') {
                if end_pos > bracket_pos {
                    let name = &stripped[..bracket_pos];
                    let count_str = &stripped[bracket_pos + 1..end_pos];
                    if let Ok(count) = count_str.parse::<usize>() {
                        // This is an array pattern - decode it via
decode_value
                        let (arr, new_idx) =
                            decode_array_in_object(lines, idx, name, count, delimiter)?;
                        result.insert(name.to_string(), arr);
                        idx = new_idx;
                        continue;
                    }
                }
            }
        }

        // Regular key: value parsing
        if let Some(colon_pos) = stripped.find(':') {
            let key = stripped[..colon_pos].trim();
            let val_str = stripped[colon_pos + 1..].trim();

            if !val_str.is_empty() {
                result.insert(key.to_string(), parse_primitive(val_str));
                idx += 1;
            } else {
                // Bare "key:" — value is a nested block or an empty object.
                idx += 1;
                if idx < lines.len() {
                    let next_depth = get_indent_depth(lines[idx]);
                    if next_depth > line_depth {
                        let (nested, new_idx) = decode_value(lines, idx, next_depth, delimiter)?;
                        result.insert(key.to_string(), nested);
                        idx = new_idx;
                    } else {
                        result.insert(key.to_string(), Value::Object(Map::new()));
                    }
                } else {
                    // End of file - still insert empty object
                    result.insert(key.to_string(), Value::Object(Map::new()));
                }
            }
        } else {
            break;
        }
    }

    Ok((Value::Object(result), idx))
}

/// Decode an array that appears within an object context.
fn decode_array_in_object(
    lines: &[&str],
    idx: usize,
    _name: &str,
    count: usize,
    delimiter: &str,
) -> Result<(Value, usize)> {
    let line = lines[idx].trim();
    let base_depth = get_indent_depth(lines[idx]);

    // Inline primitive array: name[N]: val1\tval2
    if let Some(colon_pos) = line.find("]:") {
        let values_str = line[colon_pos + 2..].trim();
        if !values_str.is_empty() {
            let values: Vec<Value> = values_str.split(delimiter).map(parse_primitive).collect();
            return Ok((Value::Array(values), idx + 1));
        }
    }

    // Columnar array: name[N] followed by ├/└ lines
    if idx + 1 < lines.len() {
        let next = lines[idx + 1].trim();
        if next.starts_with('├') || next.starts_with('└') {
            let (arr, new_idx) = decode_columnar_array(lines, idx, "", count, delimiter)?;
            return Ok((arr, new_idx));
        }
    }

    // List array: name[N]: followed by - items
    if line.ends_with(':') {
        return decode_list_array(lines, idx, base_depth, count, delimiter);
    }

    // Empty array
    Ok((Value::Array(vec![]), idx + 1))
}

/// Decode a list array: a `name[N]:` header followed by `- item` lines,
/// stopping after `count` items or when the indentation retreats.
fn decode_list_array(
    lines: &[&str],
    idx: usize,
    base_depth: usize,
    count: usize,
    delimiter: &str,
) -> Result<(Value, usize)> {
    let mut result: Vec<Value> = Vec::new();
    let mut idx = idx + 1;
    let item_depth = base_depth + 1;

    while idx < lines.len() && result.len() < count {
        let line = lines[idx];
        if line.trim().is_empty() {
            idx += 1;
            continue;
        }

        let line_depth = get_indent_depth(line);
        if line_depth < item_depth {
            break;
        }

        let stripped = line.trim();
        if let Some(item_str) = stripped.strip_prefix("- ") {
            // Check if it's key: value (object) or primitive
            if item_str.contains(':') {
                let (obj, new_idx) = decode_list_item_object(lines, idx, item_depth, delimiter)?;
                result.push(obj);
                idx = new_idx;
            } else {
                result.push(parse_primitive(item_str));
                idx += 1;
            }
        } else {
            break;
        }
    }

    Ok((Value::Array(result), idx))
}

/// Decode an object that starts with `- key: value`, consuming any
/// continuation lines indented under the same list item.
fn decode_list_item_object(
    lines: &[&str],
    idx: usize,
    item_depth: usize,
    delimiter: &str,
) -> Result<(Value, usize)> {
    let mut obj = Map::new();

    let first_line = lines[idx].trim();
    let first_content = first_line.strip_prefix("- ").unwrap_or(first_line).trim();

    let mut idx = idx;

    // Parse first key: value
    if let Some(colon_pos) = first_content.find(':') {
        let key = first_content[..colon_pos].trim();
        let val_str = first_content[colon_pos + 1..].trim();

        if !val_str.is_empty() {
            obj.insert(key.to_string(), parse_primitive(val_str));
            idx += 1;
        } else {
            // Nested value or empty object
            idx += 1;
            if idx < lines.len() {
                let next_depth = get_indent_depth(lines[idx]);
                if next_depth > item_depth {
                    let (nested, new_idx) = decode_value(lines, idx, next_depth, delimiter)?;
                    obj.insert(key.to_string(), nested);
                    idx = new_idx;
                } else {
                    obj.insert(key.to_string(), Value::Object(Map::new()));
                }
            } else {
                // No more lines - empty object
                obj.insert(key.to_string(), Value::Object(Map::new()));
            }
        }
    } else {
        idx += 1;
    }

    // Parse continuation lines (indented under the list item)
    while idx < lines.len() {
        let line = lines[idx];
        if line.trim().is_empty() {
            idx += 1;
            continue;
        }

        let line_depth = get_indent_depth(line);
        if line_depth <= item_depth {
            break;
        }

        let stripped = line.trim();

        // New list item at same level means end of this object
        if stripped.starts_with("- ") {
            break;
        }

        // Check for array patterns
        if let Some(bracket_pos) = stripped.find('[') {
            if let Some(end_pos) = stripped.find(']') {
                if end_pos > bracket_pos {
                    let arr_name = &stripped[..bracket_pos];
                    let count_str = &stripped[bracket_pos + 1..end_pos];
                    if let Ok(count) = count_str.parse::<usize>() {
                        let (arr, new_idx) =
                            decode_array_in_object(lines, idx, arr_name, count, delimiter)?;
                        obj.insert(arr_name.to_string(), arr);
                        idx = new_idx;
                        continue;
                    }
                }
            }
        }

        // Regular key: value
        if let Some(colon_pos) = stripped.find(':') {
            let key = stripped[..colon_pos].trim();
            let val_str = stripped[colon_pos + 1..].trim();

            if !val_str.is_empty() {
                obj.insert(key.to_string(), parse_primitive(val_str));
                idx += 1;
            } else {
                idx += 1;
                if idx < lines.len() {
                    let next_depth = get_indent_depth(lines[idx]);
                    if next_depth > line_depth {
                        let (nested, new_idx) = decode_value(lines, idx, next_depth, delimiter)?;
                        obj.insert(key.to_string(), nested);
                        idx = new_idx;
                    } else {
                        obj.insert(key.to_string(), Value::Object(Map::new()));
                    }
                } else {
                    obj.insert(key.to_string(), Value::Object(Map::new()));
                }
            }
        } else {
            idx += 1;
        }
    }

    Ok((Value::Object(obj), idx))
}

#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    // ========================================================================
    //
Encoding tests + // ======================================================================== + + #[test] + fn test_encode_columnar() { + let data = json!([ + {"id": 1, "name": "Alice"}, + {"id": 2, "name": "Bob"} + ]); + let encoded = encode(&data, false).unwrap(); + assert!(encoded.contains("[2]")); + assert!(encoded.contains("├") || encoded.contains("└")); + } + + #[test] + fn test_encode_with_header() { + let data = json!({"name": "test"}); + let encoded = encode(&data, true).unwrap(); + assert!(encoded.starts_with("@AGON columns")); + } + + #[test] + fn test_encode_without_header() { + let data = json!({"name": "test"}); + let encoded = encode(&data, false).unwrap(); + assert!(!encoded.contains("@AGON")); + } + + #[test] + fn test_encode_primitives() { + let data = json!({ + "string": "hello", + "number": 42, + "float": 3.15, + "bool_true": true, + "bool_false": false, + "null_val": null + }); + let encoded = encode(&data, false).unwrap(); + assert!(encoded.contains("string: hello")); + assert!(encoded.contains("number: 42")); + assert!(encoded.contains("bool_true: true")); + assert!(encoded.contains("null_val: null")); + } + + #[test] + fn test_encode_empty_array() { + let data = json!({"items": []}); + let encoded = encode(&data, false).unwrap(); + assert!(encoded.contains("items[0]")); + } + + #[test] + fn test_encode_primitive_array() { + let data = json!({"nums": [1, 2, 3]}); + let encoded = encode(&data, false).unwrap(); + assert!(encoded.contains("nums[3]:")); + } + + #[test] + fn test_encode_nested_object() { + let data = json!({ + "outer": { + "inner": { + "value": 42 + } + } + }); + let encoded = encode(&data, false).unwrap(); + assert!(encoded.contains("outer:")); + assert!(encoded.contains("inner:")); + assert!(encoded.contains("value: 42")); + } + + #[test] + fn test_encode_columnar_tree_chars() { + let data = json!([ + {"a": 1, "b": 2, "c": 3} + ]); + let encoded = encode(&data, false).unwrap(); + // Should have ├ for non-last and └ for last + 
assert!(encoded.contains("├") || encoded.contains("└")); + } + + // ======================================================================== + // Decoding tests + // ======================================================================== + + #[test] + fn test_decode_empty_payload() { + let result = decode(""); + assert!(result.is_err()); + } + + #[test] + fn test_decode_invalid_header() { + let result = decode("invalid header"); + assert!(result.is_err()); + } + + #[test] + fn test_decode_header_only() { + let result = decode("@AGON columns\n\n").unwrap(); + assert!(result.is_null()); + } + + #[test] + fn test_decode_simple_object() { + let payload = "@AGON columns\n\nname: Alice\nage: 30"; + let decoded = decode(payload).unwrap(); + assert_eq!(decoded["name"], "Alice"); + assert_eq!(decoded["age"], 30); + } + + #[test] + fn test_decode_columnar_array() { + let payload = "@AGON columns\n\n[2]\n├ id: 1\t2\n└ name: Alice\tBob"; + let decoded = decode(payload).unwrap(); + assert!(decoded.is_array()); + let arr = decoded.as_array().unwrap(); + assert_eq!(arr.len(), 2); + assert_eq!(arr[0]["id"], 1); + assert_eq!(arr[0]["name"], "Alice"); + assert_eq!(arr[1]["id"], 2); + assert_eq!(arr[1]["name"], "Bob"); + } + + #[test] + fn test_decode_named_columnar_array() { + let payload = "@AGON columns\n\nusers[2]\n├ id: 1\t2\n└ name: Alice\tBob"; + let decoded = decode(payload).unwrap(); + assert!(decoded.is_object()); + let users = decoded["users"].as_array().unwrap(); + assert_eq!(users.len(), 2); + } + + #[test] + fn test_decode_primitive_array() { + let payload = "@AGON columns\n\nnums[3]: 1\t2\t3"; + let decoded = decode(payload).unwrap(); + let nums = decoded["nums"].as_array().unwrap(); + assert_eq!(nums.len(), 3); + assert_eq!(nums[0], 1); + } + + #[test] + fn test_decode_empty_array() { + let payload = "@AGON columns\n\nitems[0]"; + let decoded = decode(payload).unwrap(); + let items = decoded["items"].as_array().unwrap(); + assert!(items.is_empty()); + } + + // 
======================================================================== + // Roundtrip tests + // ======================================================================== + + #[test] + fn test_roundtrip() { + let data = json!({ + "users": [ + {"id": 1, "name": "Alice"}, + {"id": 2, "name": "Bob"} + ] + }); + let encoded = encode(&data, true).unwrap(); + let decoded = decode(&encoded).unwrap(); + + assert!(decoded.is_object()); + let users = decoded.get("users").unwrap(); + assert!(users.is_array()); + assert_eq!(users.as_array().unwrap().len(), 2); + } + + #[test] + fn test_roundtrip_nested() { + let data = json!({ + "company": { + "name": "ACME", + "address": { + "city": "Seattle" + } + } + }); + let encoded = encode(&data, true).unwrap(); + let decoded = decode(&encoded).unwrap(); + assert_eq!(decoded["company"]["name"], "ACME"); + assert_eq!(decoded["company"]["address"]["city"], "Seattle"); + } + + #[test] + fn test_roundtrip_mixed_array() { + let data = json!({ + "items": [1, "two", true, null] + }); + let encoded = encode(&data, true).unwrap(); + let decoded = decode(&encoded).unwrap(); + let items = decoded["items"].as_array().unwrap(); + assert_eq!(items.len(), 4); + } + + // ======================================================================== + // Helper function tests + // ======================================================================== + + #[test] + fn test_needs_quote_empty() { + assert!(needs_quote("")); + } + + #[test] + fn test_needs_quote_whitespace() { + assert!(needs_quote(" padded ")); + assert!(needs_quote(" leading")); + } + + #[test] + fn test_needs_quote_delimiter() { + assert!(needs_quote("has\ttab")); + assert!(needs_quote("has\nnewline")); + } + + #[test] + fn test_needs_quote_special_chars() { + assert!(needs_quote("has\"quote")); + assert!(needs_quote("has\\backslash")); + } + + #[test] + fn test_needs_quote_tree_chars() { + assert!(needs_quote("├ branch")); + assert!(needs_quote("└ leaf")); + assert!(needs_quote("| pipe")); 
+ } + + #[test] + fn test_needs_quote_special_prefix() { + assert!(needs_quote("@mention")); + assert!(needs_quote("#comment")); + assert!(needs_quote("-item")); + } + + #[test] + fn test_needs_quote_primitives() { + assert!(needs_quote("true")); + assert!(needs_quote("false")); + assert!(needs_quote("null")); + assert!(needs_quote("42")); + assert!(needs_quote("3.14")); + } + + #[test] + fn test_needs_quote_normal_string() { + assert!(!needs_quote("hello")); + assert!(!needs_quote("normal string")); + } + + #[test] + fn test_format_primitive() { + assert_eq!(format_primitive(&Value::Null), "null"); + assert_eq!(format_primitive(&Value::Bool(true)), "true"); + assert_eq!(format_primitive(&Value::Bool(false)), "false"); + assert_eq!(format_primitive(&json!(42)), "42"); + assert_eq!(format_primitive(&json!("hello")), "hello"); + assert_eq!(format_primitive(&json!("42")), "\"42\""); // Quoted to preserve string type + } + + #[test] + fn test_parse_primitive_null() { + assert_eq!(parse_primitive("null"), Value::Null); + assert_eq!(parse_primitive("NULL"), Value::Null); + assert_eq!(parse_primitive(""), Value::Null); + } + + #[test] + fn test_parse_primitive_bool() { + assert_eq!(parse_primitive("true"), Value::Bool(true)); + assert_eq!(parse_primitive("false"), Value::Bool(false)); + } + + #[test] + fn test_parse_primitive_number() { + assert_eq!(parse_primitive("42"), json!(42)); + assert_eq!(parse_primitive("-17"), json!(-17)); + assert_eq!(parse_primitive("3.15"), json!(3.15)); + } + + #[test] + fn test_parse_primitive_quoted_string() { + assert_eq!( + parse_primitive("\"hello\""), + Value::String("hello".to_string()) + ); + assert_eq!( + parse_primitive("\"line\\nbreak\""), + Value::String("line\nbreak".to_string()) + ); + } + + #[test] + fn test_parse_columnar_cell_empty() { + assert_eq!(parse_columnar_cell(""), None); + assert_eq!(parse_columnar_cell(" "), None); + } + + #[test] + fn test_parse_columnar_cell_value() { + assert_eq!(parse_columnar_cell("42"), 
Some(json!(42))); + assert_eq!(parse_columnar_cell("null"), Some(Value::Null)); + } + + #[test] + fn test_is_uniform_array_empty() { + let arr: Vec = vec![]; + let (uniform, _) = is_uniform_array(&arr); + assert!(!uniform); + } + + #[test] + fn test_is_uniform_array_primitives() { + let arr = vec![json!(1), json!(2)]; + let (uniform, _) = is_uniform_array(&arr); + assert!(!uniform); + } + + #[test] + fn test_is_uniform_array_uniform_objects() { + let arr = vec![json!({"id": 1, "name": "a"}), json!({"id": 2, "name": "b"})]; + let (uniform, fields) = is_uniform_array(&arr); + assert!(uniform); + assert!(fields.contains(&"id".to_string())); + assert!(fields.contains(&"name".to_string())); + } + + #[test] + fn test_is_uniform_array_nested() { + let arr = vec![json!({"nested": {"a": 1}})]; + let (uniform, _) = is_uniform_array(&arr); + assert!(!uniform); // Contains nested object + } + + #[test] + fn test_split_column_values_simple() { + let values = split_column_values("a\tb\tc", "\t"); + assert_eq!(values, vec!["a", "b", "c"]); + } + + #[test] + fn test_split_column_values_quoted() { + let values = split_column_values("\"a\tb\"\tc", "\t"); + assert_eq!(values, vec!["\"a\tb\"", "c"]); + } + + #[test] + fn test_split_column_values_empty() { + let values = split_column_values("a\t\tc", "\t"); + assert_eq!(values, vec!["a", "", "c"]); + } + + #[test] + fn test_get_indent_depth() { + assert_eq!(get_indent_depth("no indent"), 0); + assert_eq!(get_indent_depth(" one level"), 1); + assert_eq!(get_indent_depth(" two levels"), 2); + } + + // ======================================================================== + // Edge cases + // ======================================================================== + + #[test] + fn test_unicode_strings() { + let data = json!({"text": "Hello 世界"}); + let encoded = encode(&data, true).unwrap(); + let decoded = decode(&encoded).unwrap(); + assert_eq!(decoded["text"], "Hello 世界"); + } + + #[test] + fn test_long_string() { + let long = 
"x".repeat(500); + let data = json!({"text": long}); + let encoded = encode(&data, true).unwrap(); + let decoded = decode(&encoded).unwrap(); + assert_eq!(decoded["text"].as_str().unwrap().len(), 500); + } + + #[test] + fn test_many_columns() { + let data = json!([ + {"a": 1, "b": 2, "c": 3, "d": 4, "e": 5} + ]); + let encoded = encode(&data, true).unwrap(); + let decoded = decode(&encoded).unwrap(); + assert!(decoded.is_array()); + } + + #[test] + fn test_missing_values_in_column() { + // Some objects have fewer fields + let data = json!([ + {"id": 1, "name": "Alice", "email": "a@b.com"}, + {"id": 2, "name": "Bob"} // No email + ]); + let encoded = encode(&data, true).unwrap(); + let decoded = decode(&encoded).unwrap(); + let arr = decoded.as_array().unwrap(); + assert_eq!(arr[0]["email"], "a@b.com"); + // Bob should not have email key at all (not null, just missing) + assert!(arr[1].get("email").is_none() || arr[1]["email"].is_null()); + } + + #[test] + fn test_list_array_with_objects() { + let data = json!({ + "items": [ + {"type": "a", "nested": {"x": 1}}, + {"type": "b", "nested": {"x": 2}} + ] + }); + let encoded = encode(&data, true).unwrap(); + let decoded = decode(&encoded).unwrap(); + assert!(decoded["items"].is_array()); + } +} diff --git a/crates/agon-core/src/formats/mod.rs b/crates/agon-core/src/formats/mod.rs new file mode 100644 index 0000000..9fe73fd --- /dev/null +++ b/crates/agon-core/src/formats/mod.rs @@ -0,0 +1,357 @@ +//! AGON encoding formats +//! +//! This module contains implementations of the three AGON formats: +//! - text: Row-based tabular encoding (similar to TOON) +//! - columns: Columnar encoding with type clustering +//! 
//! AGON encoding formats
//!
//! This module contains implementations of the three AGON formats:
//! - text: Row-based tabular encoding (similar to TOON)
//! - columns: Columnar encoding with type clustering
//! - struct_fmt: Template-based encoding for nested patterns

pub mod columns;
pub mod struct_fmt;
pub mod text;

use rayon::prelude::*;
use serde_json::Value as JsonValue;

use crate::error::Result;
use crate::utils::count_tokens;

/// Result of encoding with metadata
#[derive(Debug, Clone)]
pub struct EncodingResult {
    pub format: String,
    pub text: String,
    pub header: String,
    pub token_estimate: usize,
}

/// Headers for each format ("json" and unknown formats have no header)
pub fn get_header(format: &str) -> &'static str {
    match format {
        "text" => "@AGON text",
        "columns" => "@AGON columns",
        "struct" => "@AGON struct",
        "json" => "",
        _ => "",
    }
}

/// Encode data with all formats in parallel and return the best one.
///
/// `min_savings` is the minimum fractional token saving versus the JSON
/// baseline required before a non-JSON format is returned; below the
/// threshold the JSON encoding is returned instead.
///
/// NOTE(review): with `force = true` the JSON baseline stays in the
/// candidate set and the threshold check is skipped, so `force` appears to
/// mean "always take the overall-cheapest encoding" rather than
/// "force a non-JSON format" — confirm intended semantics against callers.
pub fn encode_auto_parallel(
    data: &JsonValue,
    force: bool,
    min_savings: f64,
) -> Result<EncodingResult> {
    let results = encode_all_parallel(data)?;

    // JSON baseline used to compute savings.
    let json_result = results.iter().find(|r| r.format == "json");
    let json_tokens = json_result.map(|r| r.token_estimate).unwrap_or(usize::MAX);

    // Cheapest candidate (JSON excluded unless `force`).
    let best = results
        .iter()
        .filter(|r| force || r.format != "json")
        .min_by_key(|r| r.token_estimate);

    match best {
        Some(best_result) => {
            // Check if savings meet threshold
            if !force && best_result.format != "json" {
                // `.max(1)` guards against a zero-token JSON baseline.
                let savings =
                    1.0 - (best_result.token_estimate as f64 / json_tokens.max(1) as f64);
                if savings < min_savings {
                    // Savings too small: return the JSON encoding instead.
                    return Ok(json_result.cloned().unwrap_or_else(|| EncodingResult {
                        format: "json".to_string(),
                        text: serde_json::to_string(data).unwrap_or_default(),
                        header: String::new(),
                        token_estimate: json_tokens,
                    }));
                }
            }
            Ok(best_result.clone())
        }
        None => {
            // No candidates at all: fall back to plain JSON.
            let text = serde_json::to_string(data)?;
            let tokens = count_tokens(&text);
            Ok(EncodingResult {
                format: "json".to_string(),
                text,
                header: String::new(),
                token_estimate: tokens,
            })
        }
    }
}

/// Encode data with all formats in parallel.
///
/// Formats that fail to encode are silently dropped; if every format fails,
/// a plain-JSON result is synthesized so the returned vector is never empty.
pub fn encode_all_parallel(data: &JsonValue) -> Result<Vec<EncodingResult>> {
    let formats = ["json", "text", "columns", "struct"];

    // Encode all formats in parallel, keeping only the successes.
    let mut valid_results: Vec<EncodingResult> = formats
        .par_iter()
        .filter_map(|format| encode_with_format(data, format).ok())
        .collect();

    if valid_results.is_empty() {
        // At minimum, JSON should always work.
        let text = serde_json::to_string(data)?;
        // Count tokens before `text` is moved into the struct (the previous
        // version cloned the whole string just to do this afterwards).
        let token_estimate = count_tokens(&text);
        valid_results.push(EncodingResult {
            format: "json".to_string(),
            text,
            header: String::new(),
            token_estimate,
        });
    }

    Ok(valid_results)
}

/// Encode data with a specific format.
///
/// Returns `AgonError::InvalidFormat` for unknown format names.
fn encode_with_format(data: &JsonValue, format: &str) -> Result<EncodingResult> {
    let (text, header) = match format {
        "json" => (serde_json::to_string(data)?, String::new()),
        "text" => (text::encode(data, false)?, get_header("text").to_string()),
        "columns" => (
            columns::encode(data, false)?,
            get_header("columns").to_string(),
        ),
        "struct" => (
            struct_fmt::encode(data, false)?,
            get_header("struct").to_string(),
        ),
        _ => return Err(crate::error::AgonError::InvalidFormat(format.to_string())),
    };

    Ok(EncodingResult {
        format: format.to_string(),
        // Struct fields are evaluated in order, so count before moving `text`.
        token_estimate: count_tokens(&text),
        text,
        header,
    })
}

#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    #[test]
    fn test_get_header() {
        assert_eq!(get_header("text"), "@AGON text");
        assert_eq!(get_header("columns"), "@AGON columns");
        assert_eq!(get_header("struct"), "@AGON struct");
        assert_eq!(get_header("json"), "");
        assert_eq!(get_header("unknown"), "");
    }
encode_all_parallel(&data).unwrap(); + + // Should have results for all formats + assert!(!results.is_empty()); + + // JSON should always be present + let json_result = results.iter().find(|r| r.format == "json"); + assert!(json_result.is_some()); + } + + #[test] + fn test_encode_all_parallel_array() { + let data = json!([ + {"id": 1, "name": "Alice"}, + {"id": 2, "name": "Bob"}, + {"id": 3, "name": "Carol"} + ]); + let results = encode_all_parallel(&data).unwrap(); + + // All four formats should succeed + assert_eq!(results.len(), 4); + + let formats: Vec<&str> = results.iter().map(|r| r.format.as_str()).collect(); + assert!(formats.contains(&"json")); + assert!(formats.contains(&"text")); + assert!(formats.contains(&"columns")); + assert!(formats.contains(&"struct")); + } + + #[test] + fn test_encode_auto_parallel_selects_best() { + let data = json!([ + {"id": 1, "name": "Alice", "role": "admin"}, + {"id": 2, "name": "Bob", "role": "user"}, + {"id": 3, "name": "Carol", "role": "user"} + ]); + + let result = encode_auto_parallel(&data, false, 0.0).unwrap(); + + // Should select a non-JSON format for tabular data + assert!(!result.text.is_empty()); + assert!(result.token_estimate > 0); + } + + #[test] + fn test_encode_auto_parallel_force_non_json() { + let data = json!({"simple": "data"}); + + // With force=true, should never return JSON (if alternatives exist) + let result = encode_auto_parallel(&data, true, 0.0).unwrap(); + + // Result should be valid + assert!(!result.text.is_empty()); + } + + #[test] + fn test_encode_auto_parallel_min_savings_fallback() { + let data = json!({"a": 1}); + + // With high min_savings threshold, should fall back to JSON if savings aren't met + let result = encode_auto_parallel(&data, false, 0.99).unwrap(); + + // Should get a valid result regardless + assert!(!result.text.is_empty()); + } + + #[test] + fn test_encode_with_format_json() { + let data = json!({"key": "value"}); + let result = encode_with_format(&data, "json").unwrap(); 
+ + assert_eq!(result.format, "json"); + assert!(result.header.is_empty()); + assert!(result.text.contains("key")); + } + + #[test] + fn test_encode_with_format_text() { + let data = json!({"name": "test"}); + let result = encode_with_format(&data, "text").unwrap(); + + assert_eq!(result.format, "text"); + assert_eq!(result.header, "@AGON text"); + } + + #[test] + fn test_encode_with_format_columns() { + let data = json!([{"id": 1}, {"id": 2}]); + let result = encode_with_format(&data, "columns").unwrap(); + + assert_eq!(result.format, "columns"); + assert_eq!(result.header, "@AGON columns"); + } + + #[test] + fn test_encode_with_format_struct() { + let data = json!({"a": {"fmt": "1", "raw": 1}}); + let result = encode_with_format(&data, "struct").unwrap(); + + assert_eq!(result.format, "struct"); + assert_eq!(result.header, "@AGON struct"); + } + + #[test] + fn test_encode_with_format_invalid() { + let data = json!({}); + let result = encode_with_format(&data, "invalid_format"); + + assert!(result.is_err()); + } + + #[test] + fn test_encoding_result_token_estimate() { + let data = json!([ + {"id": 1, "name": "Alice"}, + {"id": 2, "name": "Bob"} + ]); + + let results = encode_all_parallel(&data).unwrap(); + + // All results should have positive token estimates + for result in &results { + assert!( + result.token_estimate > 0, + "Format {} has zero tokens", + result.format + ); + } + } + + #[test] + fn test_empty_object() { + let data = json!({}); + let results = encode_all_parallel(&data).unwrap(); + + assert!(!results.is_empty()); + } + + #[test] + fn test_empty_array() { + let data = json!([]); + let results = encode_all_parallel(&data).unwrap(); + + assert!(!results.is_empty()); + } + + #[test] + fn test_nested_structure() { + let data = json!({ + "user": { + "name": "Alice", + "address": { + "city": "Seattle", + "zip": "98101" + } + } + }); + + let results = encode_all_parallel(&data).unwrap(); + assert!(!results.is_empty()); + + // All formats should handle 
nested structures + for result in &results { + assert!( + !result.text.is_empty(), + "Format {} produced empty text", + result.format + ); + } + } + + #[test] + fn test_primitive_values() { + let data = json!({ + "string": "hello", + "number": 42, + "float": 3.15, + "bool": true, + "null": null + }); + + let results = encode_all_parallel(&data).unwrap(); + assert!(!results.is_empty()); + } + + #[test] + fn test_mixed_array() { + let data = json!([1, "two", true, null, {"nested": "object"}]); + let results = encode_all_parallel(&data).unwrap(); + + // JSON should always handle mixed arrays + let json_result = results.iter().find(|r| r.format == "json").unwrap(); + assert!(json_result.text.contains("two")); + } +} diff --git a/crates/agon-core/src/formats/struct_fmt.rs b/crates/agon-core/src/formats/struct_fmt.rs new file mode 100644 index 0000000..ff67e7e --- /dev/null +++ b/crates/agon-core/src/formats/struct_fmt.rs @@ -0,0 +1,1504 @@ +//! AGONStruct format encoder/decoder +//! +//! Template-based encoding for repeated object structures. +//! +//! Format structure: +//! ```text +//! @AGON struct +//! +//! @StructName: field1, field2, field3 +//! +//! - key: StructName(val1, val2, val3) +//! 
``` + +use regex::Regex; +use serde_json::{Map, Value}; +use std::collections::HashMap; +use std::sync::LazyLock; + +use crate::error::{AgonError, Result}; + +const HEADER: &str = "@AGON struct"; +const INDENT: &str = " "; + +// Regex patterns +static NUMBER_RE: LazyLock = + LazyLock::new(|| Regex::new(r"^-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?$").unwrap()); +static STRUCT_DEF_RE: LazyLock = + LazyLock::new(|| Regex::new(r"^@(\w+)(?:\(([^)]+)\))?:\s*(.*)$").unwrap()); +static STRUCT_INST_RE: LazyLock = LazyLock::new(|| Regex::new(r"^(\w+)\(").unwrap()); +static KEY_VALUE_RE: LazyLock = LazyLock::new(|| Regex::new(r"^([^:]+):\s*(.*)$").unwrap()); +static ARRAY_HEADER_RE: LazyLock = + LazyLock::new(|| Regex::new(r"^(\w*)\[(\d+)\]:?").unwrap()); + +/// Struct definition stored in registry: (fields, optional_fields, parents) +type StructDef = (Vec, Vec, Vec); +type StructRegistry = HashMap; + +/// Struct definition with name for creation: (name, fields, optional_fields, parents) +#[allow(clippy::type_complexity)] +type StructDefWithName = (String, Vec, Vec, Vec); + +/// Encode data to AGONStruct format +pub fn encode(data: &Value, include_header: bool) -> Result { + let mut lines = Vec::new(); + + // Detect shapes and create struct definitions + let shapes = detect_shapes(data); + let struct_defs = create_struct_definitions(&shapes, 3, 2); + + // Build registry + let mut registry = StructRegistry::new(); + for (name, fields, optional, parents) in &struct_defs { + register_struct(&mut registry, name, fields, optional, parents)?; + } + + if include_header { + lines.push(HEADER.to_string()); + lines.push(String::new()); + } + + // Emit struct definitions + if !struct_defs.is_empty() { + for (name, fields, optional, parents) in &struct_defs { + let fields_str: Vec = fields + .iter() + .map(|f| { + if optional.contains(f) { + format!("{}?", f) + } else { + f.clone() + } + }) + .collect(); + + if parents.is_empty() { + lines.push(format!("@{}: {}", name, 
fields_str.join(", "))); + } else { + lines.push(format!( + "@{}({}): {}", + name, + parents.join(", "), + fields_str.join(", ") + )); + } + } + lines.push(String::new()); + } + + encode_value(data, &mut lines, 0, ®istry); + + Ok(lines.join("\n")) +} + +/// Decode AGONStruct payload +pub fn decode(payload: &str) -> Result { + let lines: Vec<&str> = payload.lines().collect(); + if lines.is_empty() { + return Err(AgonError::DecodingError("Empty payload".to_string())); + } + + let mut idx = 0; + + // Parse header + let header_line = lines[idx].trim(); + if !header_line.starts_with("@AGON struct") { + return Err(AgonError::DecodingError(format!( + "Invalid header: {}", + header_line + ))); + } + idx += 1; + + // Parse struct definitions + let mut registry = StructRegistry::new(); + while idx < lines.len() { + let line = lines[idx].trim(); + if line.is_empty() { + idx += 1; + continue; + } + if !line.starts_with('@') { + break; + } + if let Some(parsed) = parse_struct_def(line) { + let (name, fields, optional, parents) = parsed; + register_struct(&mut registry, &name, &fields, &optional, &parents)?; + } + idx += 1; + } + + // Skip blank lines + while idx < lines.len() && lines[idx].trim().is_empty() { + idx += 1; + } + + if idx >= lines.len() { + return Ok(Value::Null); + } + + let (result, _) = decode_value(&lines, idx, 0, ®istry)?; + Ok(result) +} + +// ============================================================================ +// Shape detection +// ============================================================================ + +/// Shape signature: sorted list of field names +type Shape = Vec; + +fn get_shape(obj: &Map) -> Shape { + let mut fields: Vec = obj + .iter() + .filter(|(_, v)| !v.is_object() && !v.is_array()) + .map(|(k, _)| k.clone()) + .collect(); + fields.sort(); + fields +} + +fn detect_shapes(data: &Value) -> HashMap { + let mut shapes = HashMap::new(); + collect_shapes(data, &mut shapes); + shapes +} + +fn collect_shapes(data: &Value, shapes: &mut 
HashMap) { + match data { + Value::Array(arr) => { + for item in arr { + collect_shapes(item, shapes); + } + } + Value::Object(obj) => { + let shape = get_shape(obj); + if !shape.is_empty() { + *shapes.entry(shape).or_insert(0) += 1; + } + for v in obj.values() { + collect_shapes(v, shapes); + } + } + _ => {} + } +} + +fn create_struct_definitions( + shapes: &HashMap, + min_occurrences: usize, + min_fields: usize, +) -> Vec { + let mut defs = Vec::new(); + let mut used_names: std::collections::HashSet = std::collections::HashSet::new(); + + for (shape, count) in shapes { + if *count >= min_occurrences && shape.len() >= min_fields { + let name = generate_struct_name(shape, &mut used_names); + defs.push((name, shape.clone(), vec![], vec![])); + } + } + + defs +} + +/// Generate a struct name from field names +/// Takes first letter of each field (up to 4), adds counter on collision +fn generate_struct_name( + fields: &[String], + used_names: &mut std::collections::HashSet, +) -> String { + // Take first letter of each field, truncate to 4 chars max + let base_name: String = fields + .iter() + .filter_map(|f| f.chars().next()) + .map(|c| c.to_ascii_uppercase()) + .take(4) + .collect(); + + // Fallback if empty + let base_name = if base_name.is_empty() { + "S".to_string() + } else { + base_name + }; + + // Add counter on collision + let mut name = base_name.clone(); + let mut counter = 1; + while used_names.contains(&name) { + counter += 1; + name = format!("{}{}", base_name, counter); + } + used_names.insert(name.clone()); + name +} + +fn register_struct( + registry: &mut StructRegistry, + name: &str, + fields: &[String], + optional: &[String], + parents: &[String], +) -> Result<()> { + let mut all_fields = Vec::new(); + + // Resolve parent fields + for parent_name in parents { + if let Some((parent_fields, _, _)) = registry.get(parent_name) { + for f in parent_fields { + if !all_fields.contains(f) { + all_fields.push(f.clone()); + } + } + } + } + + // Add own fields 
+ for f in fields { + if !all_fields.contains(f) { + all_fields.push(f.clone()); + } + } + + registry.insert( + name.to_string(), + (all_fields, optional.to_vec(), parents.to_vec()), + ); + Ok(()) +} + +// ============================================================================ +// Encoding helpers +// ============================================================================ + +fn format_primitive(val: &Value) -> String { + match val { + Value::Null => "null".to_string(), + Value::Bool(b) => if *b { "true" } else { "false" }.to_string(), + Value::Number(n) => n.to_string(), + Value::String(s) => { + // Quote if contains special chars or could be parsed as another type + if needs_quote(s) { + format!( + "\"{}\"", + s.replace('\\', "\\\\") + .replace('"', "\\\"") + .replace('\n', "\\n") + ) + } else { + s.clone() + } + } + _ => serde_json::to_string(val).unwrap_or_default(), + } +} + +/// Check if a string needs quoting to preserve its type +fn needs_quote(s: &str) -> bool { + if s.is_empty() { + return true; + } + // Strings with leading/trailing whitespace need quoting + if s != s.trim() { + return true; + } + // Struct format special chars + // ':' is included to avoid ambiguity with inline key-value parsing in lists. 
+ if s.contains(',') + || s.contains(':') + || s.contains('(') + || s.contains(')') + || s.contains('\n') + || s.contains('\\') + || s.contains('"') + { + return true; + } + // Tree chars and special prefixes + if s.starts_with('@') || s.starts_with('#') || s.starts_with('-') { + return true; + } + // Boolean/null keywords + let lower = s.to_lowercase(); + if lower == "true" || lower == "false" || lower == "null" { + return true; + } + // Looks like a number - needs quoting to preserve string type + if s.parse::().is_ok() || s.parse::().is_ok() { + return true; + } + false +} + +fn find_matching_struct(obj: &Map, registry: &StructRegistry) -> Option { + // Object must have only primitive values to use struct encoding + // If it has nested objects/arrays, we can't use struct templates + for v in obj.values() { + if v.is_object() || v.is_array() { + return None; + } + } + + let shape = get_shape(obj); + if shape.is_empty() { + return None; + } + + for (name, (fields, _, _)) in registry { + // Check if all required fields match + let mut sorted_fields = fields.clone(); + sorted_fields.sort(); + if sorted_fields == shape { + return Some(name.clone()); + } + } + None +} + +fn encode_value(val: &Value, lines: &mut Vec, depth: usize, registry: &StructRegistry) { + let indent = INDENT.repeat(depth); + + match val { + Value::Null | Value::Bool(_) | Value::Number(_) | Value::String(_) => { + lines.push(format!("{}{}", indent, format_primitive(val))); + } + Value::Array(arr) => { + encode_array(arr, lines, depth, registry); + } + Value::Object(obj) => { + encode_object(obj, lines, depth, registry, None); + } + } +} + +fn encode_array(arr: &[Value], lines: &mut Vec, depth: usize, registry: &StructRegistry) { + let indent = INDENT.repeat(depth); + + if arr.is_empty() { + lines.push(format!("{}[0]:", indent)); + return; + } + + lines.push(format!("{}[{}]:", indent, arr.len())); + + for item in arr { + if let Some(obj) = item.as_object() { + // Only use struct template if ALL 
fields are primitives (struct covers everything) + // If object has nested objects/arrays, use list item format to preserve them + let has_nested = obj.values().any(|v| v.is_object() || v.is_array()); + + if !has_nested { + if let Some(struct_name) = find_matching_struct(obj, registry) { + if let Some((fields, _, _)) = registry.get(&struct_name) { + let values: Vec = fields + .iter() + .map(|f| obj.get(f).map(format_primitive).unwrap_or_default()) + .collect(); + lines.push(format!( + "{} - {}({})", + indent, + struct_name, + values.join(", ") + )); + continue; + } + } + } + encode_list_item(obj, lines, depth + 1, registry); + } else { + lines.push(format!("{} - {}", indent, format_primitive(item))); + } + } +} + +fn encode_list_item( + obj: &Map, + lines: &mut Vec, + depth: usize, + registry: &StructRegistry, +) { + let indent = INDENT.repeat(depth); + let mut first = true; + + for (k, v) in obj { + let prefix = if first { + format!("{}- ", indent) + } else { + format!("{} ", indent) + }; + first = false; + + // Check if value can use a struct + if let Some(nested_obj) = v.as_object() { + if let Some(struct_name) = find_matching_struct(nested_obj, registry) { + if let Some((fields, _, _)) = registry.get(&struct_name) { + let values: Vec = fields + .iter() + .map(|f| nested_obj.get(f).map(format_primitive).unwrap_or_default()) + .collect(); + lines.push(format!( + "{}{}: {}({})", + prefix, + k, + struct_name, + values.join(", ") + )); + continue; + } + } + } + + match v { + Value::Object(nested) => { + lines.push(format!("{}{}:", prefix, k)); + encode_object(nested, lines, depth + 2, registry, None); + } + Value::Array(arr) => { + lines.push(format!("{}{}:", prefix, k)); + encode_array(arr, lines, depth + 2, registry); + } + _ => { + lines.push(format!("{}{}: {}", prefix, k, format_primitive(v))); + } + } + } +} + +fn encode_object( + obj: &Map, + lines: &mut Vec, + depth: usize, + registry: &StructRegistry, + name: Option<&str>, +) { + let indent = 
INDENT.repeat(depth); + let mut actual_depth = depth; + + if let Some(n) = name { + lines.push(format!("{}{}:", indent, n)); + actual_depth += 1; + } + + let actual_indent = INDENT.repeat(actual_depth); + + for (k, v) in obj { + // Check if value can use a struct + if let Some(nested_obj) = v.as_object() { + if let Some(struct_name) = find_matching_struct(nested_obj, registry) { + if let Some((fields, _, _)) = registry.get(&struct_name) { + let values: Vec = fields + .iter() + .map(|f| nested_obj.get(f).map(format_primitive).unwrap_or_default()) + .collect(); + lines.push(format!( + "{}{}: {}({})", + actual_indent, + k, + struct_name, + values.join(", ") + )); + continue; + } + } + } + + match v { + Value::Object(nested) => { + encode_object(nested, lines, actual_depth, registry, Some(k)); + } + Value::Array(arr) => { + lines.push(format!("{}{}", actual_indent, k)); + encode_array(arr, lines, actual_depth + 1, registry); + } + _ => { + lines.push(format!("{}{}: {}", actual_indent, k, format_primitive(v))); + } + } + } +} + +// ============================================================================ +// Decoding helpers +// ============================================================================ + +fn parse_struct_def(line: &str) -> Option { + let caps = STRUCT_DEF_RE.captures(line)?; + + let name = caps.get(1)?.as_str().to_string(); + let parents: Vec = caps + .get(2) + .map(|m| { + m.as_str() + .split(',') + .map(|s| s.trim().to_string()) + .collect() + }) + .unwrap_or_default(); + let fields_str = caps.get(3)?.as_str(); + + let mut fields = Vec::new(); + let mut optional = Vec::new(); + + for field in fields_str.split(',') { + let field = field.trim(); + if field.is_empty() { + continue; + } + if let Some(name) = field.strip_suffix('?') { + fields.push(name.to_string()); + optional.push(name.to_string()); + } else { + fields.push(field.to_string()); + } + } + + Some((name, fields, optional, parents)) +} + +fn parse_primitive(s: &str) -> Value { + let s = 
s.trim(); + if s.is_empty() { + return Value::Null; + } + + // Quoted string + if s.starts_with('"') && s.ends_with('"') { + let inner = &s[1..s.len() - 1]; + return Value::String( + inner + .replace("\\n", "\n") + .replace("\\\"", "\"") + .replace("\\\\", "\\"), + ); + } + + // Boolean/null + match s.to_lowercase().as_str() { + "null" => return Value::Null, + "true" => return Value::Bool(true), + "false" => return Value::Bool(false), + _ => {} + } + + // Number + if NUMBER_RE.is_match(s) { + if s.contains('.') || s.to_lowercase().contains('e') { + if let Ok(f) = s.parse::() { + if let Some(n) = serde_json::Number::from_f64(f) { + return Value::Number(n); + } + } + } else if let Ok(i) = s.parse::() { + return Value::Number(i.into()); + } + } + + Value::String(s.to_string()) +} + +fn get_indent_depth(line: &str) -> usize { + let stripped = line.trim_start_matches(' '); + let spaces = line.len() - stripped.len(); + spaces / 2 +} + +fn parse_struct_instance(s: &str, registry: &StructRegistry) -> Option { + let caps = STRUCT_INST_RE.captures(s)?; + let name = caps.get(1)?.as_str(); + + let (fields, _, _) = registry.get(name)?; + + // Find the closing paren + let start = s.find('(')? 
+ 1; + let end = s.rfind(')')?; + let values_str = &s[start..end]; + + // Split values (respecting nested parens and quotes) + let values = split_struct_values(values_str); + + let mut obj = Map::new(); + for (i, field) in fields.iter().enumerate() { + if let Some(val_str) = values.get(i) { + // Recursively parse struct instances + let val = if let Some(nested) = parse_struct_instance(val_str.trim(), registry) { + nested + } else { + parse_primitive(val_str) + }; + obj.insert(field.clone(), val); + } + } + + Some(Value::Object(obj)) +} + +fn split_struct_values(s: &str) -> Vec { + let mut values = Vec::new(); + let mut current = String::new(); + let mut paren_depth = 0; + let mut in_quote = false; + + for c in s.chars() { + match c { + '"' if !in_quote => in_quote = true, + '"' if in_quote => in_quote = false, + '(' if !in_quote => paren_depth += 1, + ')' if !in_quote => paren_depth -= 1, + ',' if !in_quote && paren_depth == 0 => { + values.push(current.trim().to_string()); + current = String::new(); + continue; + } + _ => {} + } + current.push(c); + } + + if !current.is_empty() { + values.push(current.trim().to_string()); + } + + values +} + +fn decode_value( + lines: &[&str], + idx: usize, + depth: usize, + registry: &StructRegistry, +) -> Result<(Value, usize)> { + if idx >= lines.len() { + return Ok((Value::Null, idx)); + } + + let line = lines[idx]; + let stripped = line.trim(); + + if stripped.is_empty() { + return decode_value(lines, idx + 1, depth, registry); + } + + // Check for array + if let Some(caps) = ARRAY_HEADER_RE.captures(stripped) { + let name = caps.get(1).map(|m| m.as_str()).unwrap_or(""); + if name.is_empty() { + return decode_array(lines, idx, depth, registry); + } + } + + // Check for list item + if stripped.starts_with("- ") { + return decode_array_from_items(lines, idx, depth, registry); + } + + // Check for key: value + if KEY_VALUE_RE.is_match(stripped) { + return decode_object(lines, idx, depth, registry); + } + + // Check for bare 
identifier followed by array (object with array value) + if is_bare_identifier(stripped) { + // Look ahead to see if next non-empty line is an array header + let mut next_idx = idx + 1; + while next_idx < lines.len() && lines[next_idx].trim().is_empty() { + next_idx += 1; + } + if next_idx < lines.len() && ARRAY_HEADER_RE.is_match(lines[next_idx].trim()) { + // This is an object with a key pointing to an array + return decode_object(lines, idx, depth, registry); + } + } + + // Single value + let val = if let Some(struct_val) = parse_struct_instance(stripped, registry) { + struct_val + } else { + parse_primitive(stripped) + }; + + Ok((val, idx + 1)) +} + +fn decode_array( + lines: &[&str], + idx: usize, + depth: usize, + registry: &StructRegistry, +) -> Result<(Value, usize)> { + let line = lines[idx].trim(); + let caps = ARRAY_HEADER_RE.captures(line).unwrap(); + let name = caps.get(1).map(|m| m.as_str()).unwrap_or(""); + let count: usize = caps + .get(2) + .map(|m| m.as_str()) + .unwrap_or("0") + .parse() + .unwrap_or(0); + + let mut idx = idx + 1; + let mut result = Vec::new(); + let base_depth = depth + 1; + + while idx < lines.len() && result.len() < count { + let line = lines[idx]; + if line.trim().is_empty() { + idx += 1; + continue; + } + + let line_depth = get_indent_depth(line); + if line_depth < base_depth { + break; + } + + let stripped = line.trim(); + if let Some(item_str) = stripped.strip_prefix("- ") { + let content = item_str.trim(); + // Check struct instance FIRST (struct values may contain ':' which matches KEY_VALUE_RE) + if let Some(struct_val) = parse_struct_instance(content, registry) { + result.push(struct_val); + idx += 1; + } else if is_quoted_string(content) { + // If this is a quoted string list item, treat it as a primitive. + // This avoids ambiguity with inline object syntax when the string + // contains ':' (e.g. "keyword match: foo"). 
+ result.push(parse_primitive(content)); + idx += 1; + } else if KEY_VALUE_RE.is_match(content) { + let (obj, new_idx) = decode_list_item(lines, idx, base_depth, registry)?; + result.push(obj); + idx = new_idx; + } else { + result.push(parse_primitive(item_str)); + idx += 1; + } + } else { + break; + } + } + + let arr = Value::Array(result); + if !name.is_empty() { + let mut wrapper = Map::new(); + wrapper.insert(name.to_string(), arr); + Ok((Value::Object(wrapper), idx)) + } else { + Ok((arr, idx)) + } +} + +fn decode_array_from_items( + lines: &[&str], + idx: usize, + _depth: usize, + registry: &StructRegistry, +) -> Result<(Value, usize)> { + let mut result = Vec::new(); + let base_depth = get_indent_depth(lines[idx]); + let mut idx = idx; + + while idx < lines.len() { + let line = lines[idx]; + if line.trim().is_empty() { + idx += 1; + continue; + } + + let line_depth = get_indent_depth(line); + if line_depth < base_depth { + break; + } + + let stripped = line.trim(); + if let Some(item_str) = stripped.strip_prefix("- ") { + let content = item_str.trim(); + // Check struct instance first + if let Some(struct_val) = parse_struct_instance(content, registry) { + result.push(struct_val); + idx += 1; + } else if is_quoted_string(content) { + // Quoted strings should be treated as primitives, not key-value pairs + result.push(parse_primitive(content)); + idx += 1; + } else if KEY_VALUE_RE.is_match(content) { + let (obj, new_idx) = decode_list_item(lines, idx, base_depth, registry)?; + result.push(obj); + idx = new_idx; + } else { + result.push(parse_primitive(item_str)); + idx += 1; + } + } else { + break; + } + } + + Ok((Value::Array(result), idx)) +} + +/// Check if a string is a quoted string (starts and ends with double quotes) +fn is_quoted_string(s: &str) -> bool { + s.len() >= 2 && s.starts_with('"') && s.ends_with('"') +} + +fn decode_list_item( + lines: &[&str], + idx: usize, + base_depth: usize, + registry: &StructRegistry, +) -> Result<(Value, usize)> { + 
let mut obj = Map::new(); + let item_depth = base_depth; + + let first_line = lines[idx].trim(); + let first_content = first_line.strip_prefix("- ").unwrap_or(first_line).trim(); + + let mut idx = idx; + + if let Some(caps) = KEY_VALUE_RE.captures(first_content) { + let key = caps.get(1).map(|m| m.as_str()).unwrap_or("").trim(); + let val_str = caps.get(2).map(|m| m.as_str()).unwrap_or("").trim(); + + if !val_str.is_empty() { + let val = if let Some(struct_val) = parse_struct_instance(val_str, registry) { + struct_val + } else { + parse_primitive(val_str) + }; + obj.insert(key.to_string(), val); + idx += 1; + } else { + idx += 1; + if idx < lines.len() { + let next_depth = get_indent_depth(lines[idx]); + if next_depth > item_depth + 1 { + let (nested, new_idx) = decode_value(lines, idx, next_depth, registry)?; + obj.insert(key.to_string(), nested); + idx = new_idx; + } else { + // Empty object - no nested content + obj.insert(key.to_string(), Value::Object(Map::new())); + } + } else { + obj.insert(key.to_string(), Value::Object(Map::new())); + } + } + } else { + idx += 1; + } + + // Parse continuation lines + while idx < lines.len() { + let line = lines[idx]; + if line.trim().is_empty() { + idx += 1; + continue; + } + + let line_depth = get_indent_depth(line); + if line_depth <= item_depth { + break; + } + + let stripped = line.trim(); + + // Check for new list item - this is a boundary, not a continuation + if stripped.starts_with("- ") { + break; + } + + if let Some(caps) = KEY_VALUE_RE.captures(stripped) { + let key = caps.get(1).map(|m| m.as_str()).unwrap_or("").trim(); + let val_str = caps.get(2).map(|m| m.as_str()).unwrap_or("").trim(); + + if !val_str.is_empty() { + let val = if let Some(struct_val) = parse_struct_instance(val_str, registry) { + struct_val + } else { + parse_primitive(val_str) + }; + obj.insert(key.to_string(), val); + idx += 1; + } else { + idx += 1; + if idx < lines.len() { + let next_depth = get_indent_depth(lines[idx]); + if next_depth > 
line_depth { + let (nested, new_idx) = decode_value(lines, idx, next_depth, registry)?; + obj.insert(key.to_string(), nested); + idx = new_idx; + } else { + // Empty object - no nested content + obj.insert(key.to_string(), Value::Object(Map::new())); + } + } else { + obj.insert(key.to_string(), Value::Object(Map::new())); + } + } + } else if is_bare_identifier(stripped) { + // Bare key (no colon) - check if next line is an array + let key = stripped.to_string(); + idx += 1; + + // Skip blank lines + while idx < lines.len() && lines[idx].trim().is_empty() { + idx += 1; + } + + if idx < lines.len() { + let next_line = lines[idx].trim(); + // Check if next line starts an array + if ARRAY_HEADER_RE.is_match(next_line) { + let (arr, new_idx) = decode_array(lines, idx, line_depth, registry)?; + obj.insert(key, arr); + idx = new_idx; + } else { + // Not an array, treat key as having null/empty value + obj.insert(key, Value::Null); + } + } else { + obj.insert(key, Value::Null); + } + } else { + idx += 1; + } + } + + Ok((Value::Object(obj), idx)) +} + +fn decode_object( + lines: &[&str], + idx: usize, + _depth: usize, + registry: &StructRegistry, +) -> Result<(Value, usize)> { + let mut result = Map::new(); + let base_depth = get_indent_depth(lines[idx]); + let mut idx = idx; + + while idx < lines.len() { + let line = lines[idx]; + if line.trim().is_empty() { + idx += 1; + continue; + } + + let line_depth = get_indent_depth(line); + if line_depth < base_depth { + break; + } + + let stripped = line.trim(); + + if let Some(caps) = KEY_VALUE_RE.captures(stripped) { + let key = caps.get(1).map(|m| m.as_str()).unwrap_or("").trim(); + let val_str = caps.get(2).map(|m| m.as_str()).unwrap_or("").trim(); + + if !val_str.is_empty() { + let val = if let Some(struct_val) = parse_struct_instance(val_str, registry) { + struct_val + } else { + parse_primitive(val_str) + }; + result.insert(key.to_string(), val); + idx += 1; + } else { + idx += 1; + if idx < lines.len() { + let next_depth = 
get_indent_depth(lines[idx]); + if next_depth > line_depth { + let (nested, new_idx) = decode_value(lines, idx, next_depth, registry)?; + result.insert(key.to_string(), nested); + idx = new_idx; + } else { + result.insert(key.to_string(), Value::Object(Map::new())); + } + } else { + // End of file - still insert empty object + result.insert(key.to_string(), Value::Object(Map::new())); + } + } + } else if is_bare_identifier(stripped) { + // Bare key (no colon) - check if next line is an array + let key = stripped.to_string(); + idx += 1; + + // Skip blank lines + while idx < lines.len() && lines[idx].trim().is_empty() { + idx += 1; + } + + if idx < lines.len() { + let next_line = lines[idx].trim(); + // Check if next line starts an array + if ARRAY_HEADER_RE.is_match(next_line) { + let (arr, new_idx) = decode_array(lines, idx, line_depth, registry)?; + result.insert(key, arr); + idx = new_idx; + } else { + // Not an array, treat key as having null/empty value + result.insert(key, Value::Null); + } + } else { + result.insert(key, Value::Null); + } + } else { + break; + } + } + + Ok((Value::Object(result), idx)) +} + +/// Check if a string is a bare identifier (valid key name without colon) +fn is_bare_identifier(s: &str) -> bool { + if s.is_empty() { + return false; + } + // Must not contain colon (that would be key:value) + if s.contains(':') { + return false; + } + // Must not be an array header + if ARRAY_HEADER_RE.is_match(s) { + return false; + } + // Must not start with special chars + if s.starts_with('-') || s.starts_with('@') || s.starts_with('#') { + return false; + } + // Should be a valid identifier (alphanumeric + underscore, starting with letter) + let first = s.chars().next().unwrap(); + if !first.is_alphabetic() && first != '_' { + return false; + } + s.chars().all(|c| c.is_alphanumeric() || c == '_') +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + // ======================================================================== 
+ // Encoding tests + // ======================================================================== + + #[test] + fn test_encode_with_structs() { + let data = json!({ + "items": [ + {"fmt": "1.00", "raw": 1.0}, + {"fmt": "2.00", "raw": 2.0}, + {"fmt": "3.00", "raw": 3.0} + ] + }); + let encoded = encode(&data, true).unwrap(); + assert!(encoded.starts_with("@AGON struct")); + } + + #[test] + fn test_encode_with_header() { + let data = json!({"name": "test"}); + let encoded = encode(&data, true).unwrap(); + assert!(encoded.starts_with("@AGON struct")); + } + + #[test] + fn test_encode_without_header() { + let data = json!({"name": "test"}); + let encoded = encode(&data, false).unwrap(); + assert!(!encoded.starts_with("@AGON")); + } + + #[test] + fn test_encode_primitives() { + let data = json!({ + "string": "hello", + "number": 42, + "bool_true": true, + "null_val": null + }); + let encoded = encode(&data, false).unwrap(); + assert!(encoded.contains("string: hello")); + assert!(encoded.contains("number: 42")); + assert!(encoded.contains("bool_true: true")); + assert!(encoded.contains("null_val: null")); + } + + #[test] + fn test_encode_repeated_shapes_creates_struct() { + // Three occurrences of same shape should create a struct + let data = json!({ + "price": {"fmt": "100.00", "raw": 100.0}, + "change": {"fmt": "+5.00", "raw": 5.0}, + "volume": {"fmt": "1M", "raw": 1000000} + }); + let encoded = encode(&data, false).unwrap(); + // Should have struct definition + assert!(encoded.contains("@") && encoded.contains(":")); + } + + #[test] + fn test_encode_empty_array() { + let data = json!({"items": []}); + let encoded = encode(&data, false).unwrap(); + assert!(encoded.contains("[0]:")); + } + + // ======================================================================== + // Decoding tests + // ======================================================================== + + #[test] + fn test_decode_empty_payload() { + let result = decode(""); + assert!(result.is_err()); + } + 
+ #[test] + fn test_decode_invalid_header() { + let result = decode("invalid header"); + assert!(result.is_err()); + } + + #[test] + fn test_decode_header_only() { + let result = decode("@AGON struct\n\n").unwrap(); + assert!(result.is_null()); + } + + #[test] + fn test_decode_simple_struct_instance() { + let payload = "@AGON struct\n\n@FR: fmt, raw\n\nprice: FR(\"100.00\", 100.0)"; + let decoded = decode(payload).unwrap(); + assert_eq!(decoded["price"]["fmt"], "100.00"); + assert_eq!(decoded["price"]["raw"], 100.0); + } + + #[test] + fn test_decode_multiple_struct_instances() { + let payload = + "@AGON struct\n\n@FR: fmt, raw\n\nprice: FR(\"100\", 100)\nchange: FR(\"5\", 5)"; + let decoded = decode(payload).unwrap(); + assert_eq!(decoded["price"]["fmt"], "100"); + assert_eq!(decoded["change"]["fmt"], "5"); + } + + #[test] + fn test_decode_inherited_struct() { + let payload = + "@AGON struct\n\n@FR: fmt, raw\n@FRC(FR): currency\n\nprice: FRC(\"100\", 100, USD)"; + let decoded = decode(payload).unwrap(); + assert_eq!(decoded["price"]["fmt"], "100"); + assert_eq!(decoded["price"]["raw"], 100); + assert_eq!(decoded["price"]["currency"], "USD"); + } + + #[test] + fn test_decode_optional_field_present() { + let payload = + "@AGON struct\n\n@Quote: symbol, price, volume?\n\nstock: Quote(AAPL, 150.0, 1000000)"; + let decoded = decode(payload).unwrap(); + assert_eq!(decoded["stock"]["symbol"], "AAPL"); + assert_eq!(decoded["stock"]["price"], 150.0); + assert_eq!(decoded["stock"]["volume"], 1000000); + } + + #[test] + fn test_decode_optional_field_omitted() { + let payload = "@AGON struct\n\n@Quote: symbol, price, volume?\n\nstock: Quote(AAPL, 150.0)"; + let decoded = decode(payload).unwrap(); + assert_eq!(decoded["stock"]["symbol"], "AAPL"); + assert_eq!(decoded["stock"]["price"], 150.0); + // Optional field should be absent + assert!(decoded["stock"].get("volume").is_none() || decoded["stock"]["volume"].is_null()); + } + + // 
======================================================================== + // Roundtrip tests + // ======================================================================== + + #[test] + fn test_roundtrip_financial_data() { + let data = json!({ + "symbol": "AAPL", + "price": {"fmt": "150.00", "raw": 150.0}, + "change": {"fmt": "+2.50", "raw": 2.5}, + "volume": {"fmt": "1M", "raw": 1000000} + }); + let encoded = encode(&data, true).unwrap(); + let decoded = decode(&encoded).unwrap(); + assert_eq!(decoded["symbol"], "AAPL"); + assert_eq!(decoded["price"]["fmt"], "150.00"); + } + + #[test] + fn test_roundtrip_array_of_structs() { + let data = json!([ + {"fmt": "1", "raw": 1}, + {"fmt": "2", "raw": 2}, + {"fmt": "3", "raw": 3} + ]); + let encoded = encode(&data, true).unwrap(); + let decoded = decode(&encoded).unwrap(); + assert!(decoded.is_array()); + assert_eq!(decoded.as_array().unwrap().len(), 3); + } + + #[test] + fn test_roundtrip_nested_object() { + let data = json!({ + "quote": { + "price": {"fmt": "100", "raw": 100.0}, + "change": {"fmt": "5", "raw": 5.0}, + "volume": {"fmt": "1M", "raw": 1000000} + } + }); + let encoded = encode(&data, true).unwrap(); + let decoded = decode(&encoded).unwrap(); + assert!(decoded["quote"].is_object()); + } + + // ======================================================================== + // Parse struct definition tests + // ======================================================================== + + #[test] + fn test_parse_struct_def() { + let line = "@FR: fmt, raw"; + let (name, fields, optional, parents) = parse_struct_def(line).unwrap(); + assert_eq!(name, "FR"); + assert_eq!(fields, vec!["fmt", "raw"]); + assert!(optional.is_empty()); + assert!(parents.is_empty()); + } + + #[test] + fn test_parse_struct_def_with_optional() { + let line = "@Quote: symbol, price, volume?"; + let (name, fields, optional, _) = parse_struct_def(line).unwrap(); + assert_eq!(name, "Quote"); + assert_eq!(fields, vec!["symbol", "price", "volume"]); + 
assert_eq!(optional, vec!["volume"]); + } + + #[test] + fn test_parse_struct_def_with_parent() { + let line = "@FRC(FR): currency"; + let (name, fields, _, parents) = parse_struct_def(line).unwrap(); + assert_eq!(name, "FRC"); + assert_eq!(fields, vec!["currency"]); + assert_eq!(parents, vec!["FR"]); + } + + // ======================================================================== + // Helper function tests + // ======================================================================== + + #[test] + fn test_needs_quote_empty() { + assert!(needs_quote("")); + } + + #[test] + fn test_needs_quote_whitespace() { + assert!(needs_quote(" padded ")); + assert!(needs_quote(" leading")); + } + + #[test] + fn test_needs_quote_special_chars() { + assert!(needs_quote("has,comma")); + assert!(needs_quote("has:colon")); + assert!(needs_quote("has(paren)")); + assert!(needs_quote("has\"quote")); + } + + #[test] + fn test_needs_quote_special_prefix() { + assert!(needs_quote("@mention")); + assert!(needs_quote("#comment")); + assert!(needs_quote("-item")); + } + + #[test] + fn test_needs_quote_primitives() { + assert!(needs_quote("true")); + assert!(needs_quote("false")); + assert!(needs_quote("null")); + assert!(needs_quote("42")); + assert!(needs_quote("3.14")); + } + + #[test] + fn test_needs_quote_normal_string() { + assert!(!needs_quote("hello")); + assert!(!needs_quote("normal string")); + } + + #[test] + fn test_format_primitive() { + assert_eq!(format_primitive(&Value::Null), "null"); + assert_eq!(format_primitive(&Value::Bool(true)), "true"); + assert_eq!(format_primitive(&Value::Bool(false)), "false"); + assert_eq!(format_primitive(&json!(42)), "42"); + assert_eq!(format_primitive(&json!("hello")), "hello"); + assert_eq!(format_primitive(&json!("42")), "\"42\""); // Quoted to preserve string type + } + + #[test] + fn test_get_shape() { + let obj = json!({"a": 1, "b": "two", "nested": {"x": 1}}) + .as_object() + .unwrap() + .clone(); + let shape = get_shape(&obj); + // 
Should only include primitive fields
+        assert!(shape.contains(&"a".to_string()));
+        assert!(shape.contains(&"b".to_string()));
+        assert!(!shape.contains(&"nested".to_string())); // Nested object excluded
+    }
+
+    #[test]
+    fn test_detect_shapes() {
+        let data = json!([
+            {"a": 1, "b": 2},
+            {"a": 3, "b": 4},
+            {"a": 5, "b": 6}
+        ]);
+        let shapes = detect_shapes(&data);
+        // Should have one shape with count 3
+        assert!(!shapes.is_empty());
+        let shape = vec!["a".to_string(), "b".to_string()];
+        assert_eq!(shapes.get(&shape), Some(&3));
+    }
+
+    #[test]
+    fn test_generate_struct_name() {
+        let mut used = std::collections::HashSet::new();
+        let name = generate_struct_name(&["fmt".to_string(), "raw".to_string()], &mut used);
+        assert_eq!(name, "FR");
+        assert!(used.contains(&name));
+    }
+
+    #[test]
+    fn test_generate_struct_name_collision() {
+        let mut used = std::collections::HashSet::new();
+        used.insert("FR".to_string());
+        let name = generate_struct_name(&["fmt".to_string(), "raw".to_string()], &mut used);
+        assert_eq!(name, "FR2"); // Should add counter
+    }
+
+    #[test]
+    fn test_find_matching_struct() {
+        let mut registry = StructRegistry::new();
+        registry.insert(
+            "FR".to_string(),
+            (vec!["fmt".to_string(), "raw".to_string()], vec![], vec![]),
+        );
+
+        let obj = json!({"fmt": "100", "raw": 100})
+            .as_object()
+            .unwrap()
+            .clone();
+        let matched = find_matching_struct(&obj, &registry);
+        assert_eq!(matched, Some("FR".to_string()));
+    }
+
+    #[test]
+    fn test_find_matching_struct_no_match() {
+        let mut registry = StructRegistry::new();
+        registry.insert(
+            "FR".to_string(),
+            (vec!["fmt".to_string(), "raw".to_string()], vec![], vec![]),
+        );
+
+        let obj = json!({"x": 1, "y": 2}).as_object().unwrap().clone();
+        let matched = find_matching_struct(&obj, &registry);
+        assert!(matched.is_none());
+    }
+
+    #[test]
+    fn test_find_matching_struct_with_nested_returns_none() {
+        let mut registry = StructRegistry::new();
+        registry.insert(
+            "FR".to_string(),
+            
(vec!["fmt".to_string(), "raw".to_string()], vec![], vec![]),
+        );
+
+        // Object with nested value - should not match struct
+        let obj = json!({"fmt": "100", "raw": 100, "nested": {"a": 1}})
+            .as_object()
+            .unwrap()
+            .clone();
+        let matched = find_matching_struct(&obj, &registry);
+        assert!(matched.is_none());
+    }
+
+    // ========================================================================
+    // Edge cases
+    // ========================================================================
+
+    #[test]
+    fn test_unicode_strings() {
+        let data = json!({"text": "Hello 世界"});
+        let encoded = encode(&data, true).unwrap();
+        let decoded = decode(&encoded).unwrap();
+        assert_eq!(decoded["text"], "Hello 世界");
+    }
+
+    #[test]
+    fn test_quoted_string_with_colon() {
+        // Strings containing ':' should be quoted and preserved
+        let data = json!(["keyword match: for, object, return", "plain text"]);
+        let encoded = encode(&data, true).unwrap();
+        let decoded = decode(&encoded).unwrap();
+        assert!(decoded.is_array());
+        let arr = decoded.as_array().unwrap();
+        assert_eq!(arr[0], "keyword match: for, object, return");
+        assert_eq!(arr[1], "plain text");
+    }
+
+    #[test]
+    fn test_struct_with_boolean_values() {
+        let payload = "@AGON struct\n\n@Flags: a, b\n\nitem: Flags(true, false)";
+        let decoded = decode(payload).unwrap();
+        assert_eq!(decoded["item"]["a"], true);
+        assert_eq!(decoded["item"]["b"], false);
+    }
+
+    #[test]
+    fn test_struct_with_null_values() {
+        let payload = "@AGON struct\n\n@Pair: a, b\n\nitem: Pair(, test)";
+        let decoded = decode(payload).unwrap();
+        assert!(decoded["item"]["a"].is_null());
+        assert_eq!(decoded["item"]["b"], "test");
+    }
+
+    #[test]
+    fn test_struct_with_numeric_values() {
+        let payload = "@AGON struct\n\n@Nums: int_val, float_val\n\nitem: Nums(42, 3.15)";
+        let decoded = decode(payload).unwrap();
+        assert_eq!(decoded["item"]["int_val"], 42);
+        assert_eq!(decoded["item"]["float_val"], 3.15);
+    }
+
+    #[test]
+    fn test_deeply_nested_with_structs() {
+        let data = json!({
+            "level1": {
+                "level2": {
+                    "a": {"fmt": "1", "raw": 1},
+                    "b": {"fmt": "2", "raw": 2},
+                    "c": {"fmt": "3", "raw": 3}
+                }
+            }
+        });
+        let encoded = encode(&data, true).unwrap();
+        let decoded = decode(&encoded).unwrap();
+        assert!(decoded["level1"]["level2"].is_object());
+    }
+}
diff --git a/crates/agon-core/src/formats/text.rs b/crates/agon-core/src/formats/text.rs
new file mode 100644
index 0000000..e5c9e88
--- /dev/null
+++ b/crates/agon-core/src/formats/text.rs
@@ -0,0 +1,1339 @@
+//! AGONText format encoder/decoder
+//!
+//! Row-based encoding with tabular format for arrays of uniform objects.
+//!
+//! Format structure:
+//! @AGON text
+//! @D=<delimiter>   # optional, default: \t
+//! <data>
+
+use regex::Regex;
+use serde_json::{Map, Value};
+use std::sync::LazyLock;
+
+use crate::error::{AgonError, Result};
+
+const HEADER: &str = "@AGON text";
+const DEFAULT_DELIMITER: &str = "\t";
+const INDENT: &str = "  ";
+
+// Regex patterns for parsing
+static TABULAR_HEADER_RE: LazyLock<Regex> =
+    LazyLock::new(|| Regex::new(r"^(\w*)\[(\d+)\]\{(.+)\}$").unwrap());
+static PRIMITIVE_ARRAY_RE: LazyLock<Regex> =
+    LazyLock::new(|| Regex::new(r"^(\w*)\[(\d+)\]:\s*(.*)$").unwrap());
+static LIST_ARRAY_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\w*)\[(\d+)\]:$").unwrap());
+static KEY_VALUE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^([^:]+):\s*(.*)$").unwrap());
+static NUMBER_RE: LazyLock<Regex> =
+    LazyLock::new(|| Regex::new(r"^-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?$").unwrap());
+
+/// Encode data to AGONText format
+pub fn encode(data: &Value, include_header: bool) -> Result<String> {
+    let mut lines = Vec::new();
+    let delimiter = DEFAULT_DELIMITER;
+
+    if include_header {
+        lines.push(HEADER.to_string());
+        lines.push(String::new());
+    }
+
+    encode_value(data, &mut lines, 0, delimiter, None);
+
+    Ok(lines.join("\n"))
+}
+
+/// Decode AGONText payload
+pub fn decode(payload: &str) -> Result<Value> {
+    let lines: Vec<&str> = payload.lines().collect();
+    if lines.is_empty() {
+        return 
Err(AgonError::DecodingError("Empty payload".to_string())); + } + + let mut idx = 0; + + // Parse header + let header_line = lines[idx].trim(); + if !header_line.starts_with("@AGON text") { + return Err(AgonError::DecodingError(format!( + "Invalid header: {}", + header_line + ))); + } + idx += 1; + + // Parse optional delimiter + let delimiter = if idx < lines.len() && lines[idx].starts_with("@D=") { + let d = parse_delimiter(&lines[idx][3..]); + idx += 1; + d + } else { + DEFAULT_DELIMITER.to_string() + }; + + // Skip blank lines + while idx < lines.len() && lines[idx].trim().is_empty() { + idx += 1; + } + + if idx >= lines.len() { + return Ok(Value::Null); + } + + let (result, _) = decode_value(&lines, idx, 0, &delimiter)?; + Ok(result) +} + +// ============================================================================ +// Encoding helpers +// ============================================================================ + +fn needs_quote(s: &str, delimiter: &str) -> bool { + if s.is_empty() { + return true; + } + if s.trim() != s { + return true; + } + if s.contains(delimiter) { + return true; + } + if s.contains('\n') || s.contains('\r') || s.contains('\\') || s.contains('"') { + return true; + } + let first = s.chars().next().unwrap(); + if first == '@' || first == '#' || first == '-' { + return true; + } + let lower = s.to_lowercase(); + if lower == "true" || lower == "false" || lower == "null" { + return true; + } + NUMBER_RE.is_match(s) +} + +fn quote_string(s: &str) -> String { + let escaped = s + .replace('\\', "\\\\") + .replace('"', "\\\"") + .replace('\n', "\\n") + .replace('\r', "\\r") + .replace('\t', "\\t"); + format!("\"{}\"", escaped) +} + +fn unquote_string(s: &str) -> String { + if !(s.starts_with('"') && s.ends_with('"')) { + return s.to_string(); + } + let inner = &s[1..s.len() - 1]; + let mut result = String::new(); + let mut chars = inner.chars().peekable(); + + while let Some(c) = chars.next() { + if c == '\\' { + match chars.next() { + 
Some('n') => result.push('\n'),
+                Some('r') => result.push('\r'),
+                Some('t') => result.push('\t'),
+                Some('\\') => result.push('\\'),
+                Some('"') => result.push('"'),
+                Some(other) => result.push(other),
+                None => result.push('\\'),
+            }
+        } else {
+            result.push(c);
+        }
+    }
+    result
+}
+
+fn encode_primitive(val: &Value, delimiter: &str) -> String {
+    match val {
+        Value::Null => "null".to_string(),
+        Value::Bool(b) => if *b { "true" } else { "false" }.to_string(),
+        Value::Number(n) => n.to_string(),
+        Value::String(s) => {
+            if needs_quote(s, delimiter) {
+                quote_string(s)
+            } else {
+                s.clone()
+            }
+        }
+        _ => serde_json::to_string(val).unwrap_or_default(),
+    }
+}
+
+fn parse_primitive(s: &str) -> Value {
+    let s = s.trim();
+    if s.is_empty() {
+        return Value::Null;
+    }
+
+    // Quoted string
+    if s.starts_with('"') && s.ends_with('"') {
+        return Value::String(unquote_string(s));
+    }
+
+    // Boolean/null
+    let lower = s.to_lowercase();
+    if lower == "null" {
+        return Value::Null;
+    }
+    if lower == "true" {
+        return Value::Bool(true);
+    }
+    if lower == "false" {
+        return Value::Bool(false);
+    }
+
+    // Number
+    if NUMBER_RE.is_match(s) {
+        if s.contains('.') || s.to_lowercase().contains('e') {
+            if let Ok(f) = s.parse::<f64>() {
+                if let Some(n) = serde_json::Number::from_f64(f) {
+                    return Value::Number(n);
+                }
+            }
+        } else if let Ok(i) = s.parse::<i64>() {
+            return Value::Number(i.into());
+        }
+    }
+
+    Value::String(s.to_string())
+}
+
+fn parse_delimiter(d: &str) -> String {
+    let d = d.trim();
+    match d {
+        "\\t" => "\t".to_string(),
+        "\\n" => "\n".to_string(),
+        _ => d.to_string(),
+    }
+}
+
+fn is_uniform_array(arr: &[Value]) -> (bool, Vec<String>) {
+    if arr.is_empty() {
+        return (false, vec![]);
+    }
+
+    // Check all are objects
+    if !arr.iter().all(|v| v.is_object()) {
+        return (false, vec![]);
+    }
+
+    // Check all values are primitives
+    for obj in arr {
+        if let Some(map) = obj.as_object() {
+            for v in map.values() {
+                if v.is_object() || v.is_array() {
+                    return (false, 
vec![]);
+                }
+            }
+        }
+    }
+
+    // Collect keys in order
+    let mut key_order = Vec::new();
+    for obj in arr {
+        if let Some(map) = obj.as_object() {
+            for k in map.keys() {
+                if !key_order.contains(k) {
+                    key_order.push(k.clone());
+                }
+            }
+        }
+    }
+
+    (true, key_order)
+}
+
+fn is_primitive_array(arr: &[Value]) -> bool {
+    arr.iter().all(|v| !v.is_object() && !v.is_array())
+}
+
+fn encode_value(
+    val: &Value,
+    lines: &mut Vec<String>,
+    depth: usize,
+    delimiter: &str,
+    name: Option<&str>,
+) {
+    let indent = INDENT.repeat(depth);
+
+    match val {
+        Value::Null | Value::Bool(_) | Value::Number(_) | Value::String(_) => {
+            let encoded = encode_primitive(val, delimiter);
+            if let Some(n) = name {
+                lines.push(format!("{}{}: {}", indent, n, encoded));
+            } else {
+                lines.push(format!("{}{}", indent, encoded));
+            }
+        }
+        Value::Array(arr) => {
+            encode_array(arr, lines, depth, delimiter, name);
+        }
+        Value::Object(obj) => {
+            encode_object(obj, lines, depth, delimiter, name);
+        }
+    }
+}
+
+fn encode_array(
+    arr: &[Value],
+    lines: &mut Vec<String>,
+    depth: usize,
+    delimiter: &str,
+    name: Option<&str>,
+) {
+    let indent = INDENT.repeat(depth);
+
+    if arr.is_empty() {
+        if let Some(n) = name {
+            lines.push(format!("{}{}[0]:", indent, n));
+        } else {
+            lines.push(format!("{}[0]:", indent));
+        }
+        return;
+    }
+
+    // Check for uniform objects (tabular format)
+    let (is_uniform, fields) = is_uniform_array(arr);
+    if is_uniform && !fields.is_empty() {
+        let header = fields.join(delimiter);
+        if let Some(n) = name {
+            lines.push(format!("{}{}[{}]{{{}}}", indent, n, arr.len(), header));
+        } else {
+            lines.push(format!("{}[{}]{{{}}}", indent, arr.len(), header));
+        }
+
+        for obj in arr {
+            if let Some(map) = obj.as_object() {
+                let row: Vec<String> = fields
+                    .iter()
+                    .map(|f| {
+                        map.get(f)
+                            .map(|v| encode_primitive(v, delimiter))
+                            .unwrap_or_default()
+                    })
+                    .collect();
+                lines.push(format!("{}{}", indent, row.join(delimiter)));
+            }
+        }
+        return;
+    }
+
+    // Primitive array (inline format)
+    if 
is_primitive_array(arr) {
+        let values: Vec<String> = arr.iter().map(|v| encode_primitive(v, delimiter)).collect();
+        if let Some(n) = name {
+            lines.push(format!(
+                "{}{}[{}]: {}",
+                indent,
+                n,
+                arr.len(),
+                values.join(delimiter)
+            ));
+        } else {
+            lines.push(format!(
+                "{}[{}]: {}",
+                indent,
+                arr.len(),
+                values.join(delimiter)
+            ));
+        }
+        return;
+    }
+
+    // Mixed/nested array
+    if let Some(n) = name {
+        lines.push(format!("{}{}[{}]:", indent, n, arr.len()));
+    } else {
+        lines.push(format!("{}[{}]:", indent, arr.len()));
+    }
+
+    for item in arr {
+        if item.is_object() {
+            encode_list_item_object(item.as_object().unwrap(), lines, depth + 1, delimiter);
+        } else {
+            lines.push(format!(
+                "{}  - {}",
+                indent,
+                encode_primitive(item, delimiter)
+            ));
+        }
+    }
+}
+
+fn encode_list_item_object(
+    obj: &Map<String, Value>,
+    lines: &mut Vec<String>,
+    depth: usize,
+    delimiter: &str,
+) {
+    let indent = INDENT.repeat(depth);
+    let mut first = true;
+
+    for (k, v) in obj {
+        let prefix = if first {
+            format!("{}- ", indent)
+        } else {
+            format!("{}  ", indent)
+        };
+        first = false;
+
+        match v {
+            Value::Object(nested) => {
+                lines.push(format!("{}{}:", prefix, k));
+                for (nk, nv) in nested {
+                    if nv.is_object() || nv.is_array() {
+                        encode_value(nv, lines, depth + 2, delimiter, Some(nk));
+                    } else {
+                        lines.push(format!(
+                            "{}    {}: {}",
+                            indent,
+                            nk,
+                            encode_primitive(nv, delimiter)
+                        ));
+                    }
+                }
+            }
+            Value::Array(_) => {
+                lines.push(format!("{}{}:", prefix, k));
+                encode_value(v, lines, depth + 2, delimiter, None);
+            }
+            _ => {
+                lines.push(format!(
+                    "{}{}: {}",
+                    prefix,
+                    k,
+                    encode_primitive(v, delimiter)
+                ));
+            }
+        }
+    }
+}
+
+fn encode_object(
+    obj: &Map<String, Value>,
+    lines: &mut Vec<String>,
+    depth: usize,
+    delimiter: &str,
+    name: Option<&str>,
+) {
+    let indent = INDENT.repeat(depth);
+    let mut actual_depth = depth;
+
+    if let Some(n) = name {
+        lines.push(format!("{}{}:", indent, n));
+        actual_depth += 1;
+    }
+
+    let actual_indent = INDENT.repeat(actual_depth);
+
+    for (k, v) in obj {
+        match v 
{
+            Value::Object(_) | Value::Array(_) => {
+                encode_value(v, lines, actual_depth, delimiter, Some(k));
+            }
+            _ => {
+                lines.push(format!(
+                    "{}{}: {}",
+                    actual_indent,
+                    k,
+                    encode_primitive(v, delimiter)
+                ));
+            }
+        }
+    }
+}
+
+// ============================================================================
+// Decoding helpers
+// ============================================================================
+
+fn get_indent_depth(line: &str) -> usize {
+    let stripped = line.trim_start_matches(' ');
+    let spaces = line.len() - stripped.len();
+    spaces / 2
+}
+
+fn split_row(values_str: &str, delimiter: &str) -> Vec<String> {
+    if delimiter.len() == 1 {
+        // Fast path for single-char delimiter (common case: tab)
+        let delim_char = delimiter.chars().next().unwrap();
+        let mut result = Vec::new();
+        let mut current = String::new();
+        let mut in_quote = false;
+        let mut escape_next = false;
+
+        for c in values_str.chars() {
+            if escape_next {
+                current.push(c);
+                escape_next = false;
+                continue;
+            }
+
+            if c == '\\' && in_quote {
+                escape_next = true;
+                current.push(c);
+                continue;
+            }
+
+            if c == '"' {
+                in_quote = !in_quote;
+                current.push(c);
+            } else if c == delim_char && !in_quote {
+                result.push(current);
+                current = String::new();
+            } else {
+                current.push(c);
+            }
+        }
+
+        result.push(current);
+        result
+    } else {
+        // Multi-char delimiter (less common)
+        let mut result = Vec::new();
+        let mut current = String::new();
+        let mut in_quote = false;
+        let mut i = 0;
+        let chars: Vec<char> = values_str.chars().collect();
+
+        while i < chars.len() {
+            let c = chars[i];
+
+            if c == '"' {
+                in_quote = !in_quote;
+                current.push(c);
+                i += 1;
+            } else if !in_quote && values_str[i..].starts_with(delimiter) {
+                result.push(current);
+                current = String::new();
+                i += delimiter.len();
+            } else {
+                current.push(c);
+                i += 1;
+            }
+        }
+
+        result.push(current);
+        result
+    }
+}
+
+fn decode_value(
+    lines: &[&str],
+    idx: usize,
+    depth: usize,
+    delimiter: &str,
+) -> Result<(Value, usize)> {
+    if idx >= lines.len() {
+        return Ok((Value::Null, idx));
+    }
+
+    let line = lines[idx];
+    if get_indent_depth(line) < depth {
+        return Ok((Value::Null, idx));
+    }
+
+    let stripped = line.trim();
+
+    if stripped.is_empty() || stripped.starts_with('#') {
+        return decode_value(lines, idx + 1, depth, delimiter);
+    }
+
+    // Check for tabular array
+    if let Some(caps) = TABULAR_HEADER_RE.captures(stripped) {
+        let name = caps.get(1).map(|m| m.as_str()).unwrap_or("");
+        if !name.is_empty() {
+            return decode_object(lines, idx, depth, delimiter);
+        }
+        return decode_tabular_array(lines, idx, depth, delimiter, &caps);
+    }
+
+    // Check for primitive array
+    if let Some(caps) = PRIMITIVE_ARRAY_RE.captures(stripped) {
+        let name = caps.get(1).map(|m| m.as_str()).unwrap_or("");
+        let values_part = caps.get(3).map(|m| m.as_str()).unwrap_or("").trim();
+        if !values_part.is_empty() {
+            if !name.is_empty() {
+                return decode_object(lines, idx, depth, delimiter);
+            }
+            return decode_primitive_array(&caps, delimiter, idx);
+        }
+    }
+
+    // Check for list array
+    if let Some(caps) = LIST_ARRAY_RE.captures(stripped) {
+        let name = caps.get(1).map(|m| m.as_str()).unwrap_or("");
+        if !name.is_empty() {
+            return decode_object(lines, idx, depth, delimiter);
+        }
+        return decode_list_array(lines, idx, depth, delimiter, &caps);
+    }
+
+    // Check for key:value
+    if KEY_VALUE_RE.is_match(stripped) {
+        return decode_object(lines, idx, depth, delimiter);
+    }
+
+    Err(AgonError::ParseError {
+        line: idx,
+        message: format!("Cannot parse: {}", stripped),
+    })
+}
+
+fn decode_tabular_array(
+    lines: &[&str],
+    idx: usize,
+    _depth: usize,
+    delimiter: &str,
+    caps: &regex::Captures,
+) -> Result<(Value, usize)> {
+    let name = caps.get(1).map(|m| m.as_str()).unwrap_or("");
+    let count: usize = caps
+        .get(2)
+        .map(|m| m.as_str())
+        .unwrap_or("0")
+        .parse()
+        .unwrap_or(0);
+    let fields_str = caps.get(3).map(|m| m.as_str()).unwrap_or("");
+    let fields: Vec<&str> = fields_str.split(delimiter).map(|s| 
s.trim()).collect();
+
+    let mut idx = idx + 1;
+    let mut result = Vec::new();
+
+    while idx < lines.len() && result.len() < count {
+        let row_line = lines[idx].trim();
+        if row_line.is_empty() || row_line.starts_with('#') {
+            idx += 1;
+            continue;
+        }
+
+        let values = split_row(row_line, delimiter);
+        let mut obj = Map::new();
+
+        for (i, field) in fields.iter().enumerate() {
+            if i < values.len() {
+                let raw = &values[i];
+                let val = parse_primitive(raw);
+                if !matches!(val, Value::Null) || !raw.trim().is_empty() {
+                    obj.insert(field.to_string(), val);
+                }
+            }
+        }
+
+        result.push(Value::Object(obj));
+        idx += 1;
+    }
+
+    let arr = Value::Array(result);
+    if !name.is_empty() {
+        let mut wrapper = Map::new();
+        wrapper.insert(name.to_string(), arr);
+        Ok((Value::Object(wrapper), idx))
+    } else {
+        Ok((arr, idx))
+    }
+}
+
+fn decode_primitive_array(
+    caps: &regex::Captures,
+    delimiter: &str,
+    idx: usize,
+) -> Result<(Value, usize)> {
+    let name = caps.get(1).map(|m| m.as_str()).unwrap_or("");
+    let values_str = caps.get(3).map(|m| m.as_str()).unwrap_or("");
+
+    let arr = if values_str.trim().is_empty() {
+        Value::Array(vec![])
+    } else {
+        let values = split_row(values_str, delimiter);
+        Value::Array(values.iter().map(|v| parse_primitive(v)).collect())
+    };
+
+    if !name.is_empty() {
+        let mut wrapper = Map::new();
+        wrapper.insert(name.to_string(), arr);
+        Ok((Value::Object(wrapper), idx + 1))
+    } else {
+        Ok((arr, idx + 1))
+    }
+}
+
+fn decode_list_array(
+    lines: &[&str],
+    idx: usize,
+    depth: usize,
+    delimiter: &str,
+    caps: &regex::Captures,
+) -> Result<(Value, usize)> {
+    let name = caps.get(1).map(|m| m.as_str()).unwrap_or("");
+    let count: usize = caps
+        .get(2)
+        .map(|m| m.as_str())
+        .unwrap_or("0")
+        .parse()
+        .unwrap_or(0);
+
+    let mut idx = idx + 1;
+    let mut result = Vec::new();
+    let base_depth = depth + 1;
+
+    while idx < lines.len() && result.len() < count {
+        let line = lines[idx];
+        if line.trim().is_empty() || line.trim().starts_with('#') {
+            
idx += 1; + continue; + } + + let line_depth = get_indent_depth(line); + if line_depth < base_depth { + break; + } + + let stripped = line.trim(); + if let Some(item_str) = stripped.strip_prefix("- ") { + let item_str = item_str.trim(); + if KEY_VALUE_RE.is_match(item_str) { + let (obj, new_idx) = decode_list_item_object(lines, idx, base_depth, delimiter)?; + result.push(obj); + idx = new_idx; + } else { + result.push(parse_primitive(item_str)); + idx += 1; + } + } else { + break; + } + } + + let arr = Value::Array(result); + if !name.is_empty() { + let mut wrapper = Map::new(); + wrapper.insert(name.to_string(), arr); + Ok((Value::Object(wrapper), idx)) + } else { + Ok((arr, idx)) + } +} + +fn decode_list_item_object( + lines: &[&str], + idx: usize, + base_depth: usize, + delimiter: &str, +) -> Result<(Value, usize)> { + let mut obj = Map::new(); + let item_depth = base_depth; + + // Parse first line (starts with -) + let first_line = lines[idx].trim(); + let first_content = first_line.strip_prefix("- ").unwrap_or(first_line).trim(); + + let mut idx = idx; + + if let Some(caps) = KEY_VALUE_RE.captures(first_content) { + let key = caps.get(1).map(|m| m.as_str()).unwrap_or("").trim(); + let val_str = caps.get(2).map(|m| m.as_str()).unwrap_or("").trim(); + + if !val_str.is_empty() { + obj.insert(key.to_string(), parse_primitive(val_str)); + idx += 1; + } else { + idx += 1; + if idx < lines.len() { + let next_depth = get_indent_depth(lines[idx]); + if next_depth > item_depth + 1 { + let (nested, new_idx) = decode_value(lines, idx, next_depth, delimiter)?; + obj.insert(key.to_string(), nested); + idx = new_idx; + } else { + // Empty object - no nested content at higher depth + obj.insert(key.to_string(), Value::Object(Map::new())); + } + } else { + obj.insert(key.to_string(), Value::Object(Map::new())); + } + } + } else { + idx += 1; + } + + // Parse continuation lines + while idx < lines.len() { + let line = lines[idx]; + if line.trim().is_empty() { + idx += 1; + 
continue; + } + + let line_depth = get_indent_depth(line); + if line_depth <= item_depth { + break; + } + + let stripped = line.trim(); + + if let Some(caps) = KEY_VALUE_RE.captures(stripped) { + let key = caps.get(1).map(|m| m.as_str()).unwrap_or("").trim(); + let val_str = caps.get(2).map(|m| m.as_str()).unwrap_or("").trim(); + + if !val_str.is_empty() { + obj.insert(key.to_string(), parse_primitive(val_str)); + idx += 1; + } else { + idx += 1; + if idx < lines.len() { + let next_depth = get_indent_depth(lines[idx]); + if next_depth > line_depth { + let (nested, new_idx) = decode_value(lines, idx, next_depth, delimiter)?; + obj.insert(key.to_string(), nested); + idx = new_idx; + } else { + // Empty object - no nested content + obj.insert(key.to_string(), Value::Object(Map::new())); + } + } else { + obj.insert(key.to_string(), Value::Object(Map::new())); + } + } + } else { + idx += 1; + } + } + + Ok((Value::Object(obj), idx)) +} + +fn decode_object( + lines: &[&str], + idx: usize, + _depth: usize, + delimiter: &str, +) -> Result<(Value, usize)> { + let mut result = Map::new(); + if idx >= lines.len() { + return Ok((Value::Object(result), idx)); + } + + let base_depth = get_indent_depth(lines[idx]); + let mut idx = idx; + + while idx < lines.len() { + let line = lines[idx]; + if line.trim().is_empty() || line.trim().starts_with('#') { + idx += 1; + continue; + } + + let line_depth = get_indent_depth(line); + if line_depth < base_depth { + break; + } + + let stripped = line.trim(); + + // Check for array patterns first + if let Some(caps) = TABULAR_HEADER_RE.captures(stripped) { + let (nested, new_idx) = decode_tabular_array(lines, idx, line_depth, delimiter, &caps)?; + if let Value::Object(map) = nested { + for (k, v) in map { + result.insert(k, v); + } + } + idx = new_idx; + continue; + } + + if let Some(caps) = PRIMITIVE_ARRAY_RE.captures(stripped) { + let values_part = caps.get(3).map(|m| m.as_str()).unwrap_or("").trim(); + if !values_part.is_empty() { + let 
(nested, new_idx) = decode_primitive_array(&caps, delimiter, idx)?; + if let Value::Object(map) = nested { + for (k, v) in map { + result.insert(k, v); + } + } + idx = new_idx; + continue; + } + } + + if let Some(caps) = LIST_ARRAY_RE.captures(stripped) { + let (nested, new_idx) = decode_list_array(lines, idx, line_depth, delimiter, &caps)?; + if let Value::Object(map) = nested { + for (k, v) in map { + result.insert(k, v); + } + } + idx = new_idx; + continue; + } + + if let Some(caps) = KEY_VALUE_RE.captures(stripped) { + let key = caps.get(1).map(|m| m.as_str()).unwrap_or("").trim(); + let val_str = caps.get(2).map(|m| m.as_str()).unwrap_or("").trim(); + + if !val_str.is_empty() { + result.insert(key.to_string(), parse_primitive(val_str)); + idx += 1; + } else { + idx += 1; + if idx < lines.len() { + let next_depth = get_indent_depth(lines[idx]); + if next_depth > line_depth { + let (nested, new_idx) = decode_value(lines, idx, next_depth, delimiter)?; + result.insert(key.to_string(), nested); + idx = new_idx; + } else { + result.insert(key.to_string(), Value::Object(Map::new())); + } + } else { + result.insert(key.to_string(), Value::Object(Map::new())); + } + } + } else { + break; + } + } + + Ok((Value::Object(result), idx)) +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + // ======================================================================== + // Encoding tests + // ======================================================================== + + #[test] + fn test_encode_simple_array() { + let data = json!([ + {"id": 1, "name": "Alice"}, + {"id": 2, "name": "Bob"} + ]); + let encoded = encode(&data, false).unwrap(); + assert!(encoded.contains("[2]{")); + assert!(encoded.contains("Alice")); + } + + #[test] + fn test_encode_with_header() { + let data = json!({"name": "test"}); + let encoded = encode(&data, true).unwrap(); + assert!(encoded.starts_with("@AGON text")); + } + + #[test] + fn test_encode_without_header() { + let data = 
json!({"name": "test"}); + let encoded = encode(&data, false).unwrap(); + assert!(!encoded.contains("@AGON")); + } + + #[test] + fn test_encode_primitives() { + let data = json!({ + "string": "hello", + "number": 42, + "float": 3.15, + "bool_true": true, + "bool_false": false, + "null_val": null + }); + let encoded = encode(&data, false).unwrap(); + assert!(encoded.contains("string: hello")); + assert!(encoded.contains("number: 42")); + assert!(encoded.contains("float: 3.15")); + assert!(encoded.contains("bool_true: true")); + assert!(encoded.contains("bool_false: false")); + assert!(encoded.contains("null_val: null")); + } + + #[test] + fn test_encode_empty_array() { + let data = json!({"items": []}); + let encoded = encode(&data, false).unwrap(); + assert!(encoded.contains("items[0]:")); + } + + #[test] + fn test_encode_primitive_array() { + let data = json!({"nums": [1, 2, 3]}); + let encoded = encode(&data, false).unwrap(); + assert!(encoded.contains("nums[3]:")); + } + + #[test] + fn test_encode_nested_object() { + let data = json!({ + "outer": { + "inner": { + "value": 42 + } + } + }); + let encoded = encode(&data, false).unwrap(); + assert!(encoded.contains("outer:")); + assert!(encoded.contains("inner:")); + assert!(encoded.contains("value: 42")); + } + + // ======================================================================== + // Decoding tests + // ======================================================================== + + #[test] + fn test_decode_empty_payload() { + let result = decode(""); + assert!(result.is_err()); + } + + #[test] + fn test_decode_invalid_header() { + let result = decode("invalid header"); + assert!(result.is_err()); + } + + #[test] + fn test_decode_header_only() { + let result = decode("@AGON text\n\n").unwrap(); + assert!(result.is_null()); + } + + #[test] + fn test_decode_simple_object() { + let payload = "@AGON text\n\nname: Alice\nage: 30"; + let decoded = decode(payload).unwrap(); + assert_eq!(decoded["name"], "Alice"); + 
assert_eq!(decoded["age"], 30); + } + + #[test] + fn test_decode_tabular_array() { + let payload = "@AGON text\n\n[2]{id\tname}\n1\tAlice\n2\tBob"; + let decoded = decode(payload).unwrap(); + assert!(decoded.is_array()); + let arr = decoded.as_array().unwrap(); + assert_eq!(arr.len(), 2); + assert_eq!(arr[0]["id"], 1); + assert_eq!(arr[0]["name"], "Alice"); + } + + #[test] + fn test_decode_named_tabular_array() { + let payload = "@AGON text\n\nusers[2]{id\tname}\n1\tAlice\n2\tBob"; + let decoded = decode(payload).unwrap(); + assert!(decoded.is_object()); + let users = decoded["users"].as_array().unwrap(); + assert_eq!(users.len(), 2); + } + + #[test] + fn test_decode_primitive_array() { + let payload = "@AGON text\n\nnums[3]: 1\t2\t3"; + let decoded = decode(payload).unwrap(); + let nums = decoded["nums"].as_array().unwrap(); + assert_eq!(nums.len(), 3); + assert_eq!(nums[0], 1); + assert_eq!(nums[1], 2); + assert_eq!(nums[2], 3); + } + + #[test] + fn test_decode_custom_delimiter() { + let payload = "@AGON text\n@D=\\t\n\nname: test"; + let decoded = decode(payload).unwrap(); + assert_eq!(decoded["name"], "test"); + } + + // ======================================================================== + // Roundtrip tests + // ======================================================================== + + #[test] + fn test_roundtrip() { + let data = json!({ + "users": [ + {"id": 1, "name": "Alice"}, + {"id": 2, "name": "Bob"} + ] + }); + let encoded = encode(&data, true).unwrap(); + let decoded = decode(&encoded).unwrap(); + + assert!(decoded.is_object()); + let users = decoded.get("users").unwrap(); + assert!(users.is_array()); + assert_eq!(users.as_array().unwrap().len(), 2); + } + + #[test] + fn test_roundtrip_nested_object() { + let data = json!({ + "company": { + "name": "ACME", + "address": { + "city": "Seattle" + } + } + }); + let encoded = encode(&data, true).unwrap(); + let decoded = decode(&encoded).unwrap(); + assert_eq!(decoded["company"]["name"], "ACME"); + 
assert_eq!(decoded["company"]["address"]["city"], "Seattle"); + } + + #[test] + fn test_roundtrip_empty_object() { + let data = json!({}); + let encoded = encode(&data, true).unwrap(); + let decoded = decode(&encoded).unwrap(); + assert!( + decoded.is_null() || (decoded.is_object() && decoded.as_object().unwrap().is_empty()) + ); + } + + #[test] + fn test_roundtrip_mixed_array() { + let data = json!({ + "items": [1, "two", true, null] + }); + let encoded = encode(&data, true).unwrap(); + let decoded = decode(&encoded).unwrap(); + let items = decoded["items"].as_array().unwrap(); + assert_eq!(items.len(), 4); + } + + // ======================================================================== + // Helper function tests + // ======================================================================== + + #[test] + fn test_needs_quote_empty() { + assert!(needs_quote("", "\t")); + } + + #[test] + fn test_needs_quote_whitespace() { + assert!(needs_quote(" padded ", "\t")); + assert!(needs_quote(" leading", "\t")); + assert!(needs_quote("trailing ", "\t")); + } + + #[test] + fn test_needs_quote_delimiter() { + assert!(needs_quote("has\ttab", "\t")); + assert!(needs_quote("has,comma", ",")); + } + + #[test] + fn test_needs_quote_special_chars() { + assert!(needs_quote("has\nnewline", "\t")); + assert!(needs_quote("has\"quote", "\t")); + assert!(needs_quote("has\\backslash", "\t")); + } + + #[test] + fn test_needs_quote_special_prefix() { + assert!(needs_quote("@mention", "\t")); + assert!(needs_quote("#comment", "\t")); + assert!(needs_quote("-item", "\t")); + } + + #[test] + fn test_needs_quote_looks_like_primitive() { + assert!(needs_quote("true", "\t")); + assert!(needs_quote("false", "\t")); + assert!(needs_quote("null", "\t")); + assert!(needs_quote("42", "\t")); + assert!(needs_quote("3.14", "\t")); + } + + #[test] + fn test_needs_quote_normal_string() { + assert!(!needs_quote("hello", "\t")); + assert!(!needs_quote("normal string", "\t")); + } + + #[test] + fn 
test_quote_string() {
+        assert_eq!(quote_string("hello"), "\"hello\"");
+        assert_eq!(quote_string("say \"hi\""), "\"say \\\"hi\\\"\"");
+        assert_eq!(quote_string("line\nbreak"), "\"line\\nbreak\"");
+        assert_eq!(quote_string("tab\there"), "\"tab\\there\"");
+    }
+
+    #[test]
+    fn test_unquote_string() {
+        assert_eq!(unquote_string("\"hello\""), "hello");
+        assert_eq!(unquote_string("\"say \\\"hi\\\"\""), "say \"hi\"");
+        assert_eq!(unquote_string("\"line\\nbreak\""), "line\nbreak");
+        assert_eq!(unquote_string("unquoted"), "unquoted");
+    }
+
+    #[test]
+    fn test_parse_primitive_null() {
+        assert_eq!(parse_primitive("null"), Value::Null);
+        assert_eq!(parse_primitive("NULL"), Value::Null);
+        assert_eq!(parse_primitive(""), Value::Null);
+    }
+
+    #[test]
+    fn test_parse_primitive_bool() {
+        assert_eq!(parse_primitive("true"), Value::Bool(true));
+        assert_eq!(parse_primitive("TRUE"), Value::Bool(true));
+        assert_eq!(parse_primitive("false"), Value::Bool(false));
+        assert_eq!(parse_primitive("FALSE"), Value::Bool(false));
+    }
+
+    #[test]
+    fn test_parse_primitive_number() {
+        assert_eq!(parse_primitive("42"), json!(42));
+        assert_eq!(parse_primitive("-17"), json!(-17));
+        assert_eq!(parse_primitive("3.15"), json!(3.15));
+        assert_eq!(parse_primitive("1e10"), json!(1e10));
+    }
+
+    #[test]
+    fn test_parse_primitive_string() {
+        assert_eq!(parse_primitive("hello"), Value::String("hello".to_string()));
+        assert_eq!(
+            parse_primitive("\"quoted\""),
+            Value::String("quoted".to_string())
+        );
+    }
+
+    #[test]
+    fn test_parse_delimiter() {
+        assert_eq!(parse_delimiter("\\t"), "\t");
+        assert_eq!(parse_delimiter("\\n"), "\n");
+        assert_eq!(parse_delimiter(","), ",");
+    }
+
+    #[test]
+    fn test_is_uniform_array_empty() {
+        let arr: Vec<Value> = vec![];
+        let (uniform, _) = is_uniform_array(&arr);
+        assert!(!uniform);
+    }
+
+    #[test]
+    fn test_is_uniform_array_primitives() {
+        let arr = vec![json!(1), json!(2), json!(3)];
+        let (uniform, _) = is_uniform_array(&arr);
+        assert!(!uniform);
+    }
+ + #[test] + fn test_is_uniform_array_uniform_objects() { + let arr = vec![json!({"id": 1, "name": "a"}), json!({"id": 2, "name": "b"})]; + let (uniform, fields) = is_uniform_array(&arr); + assert!(uniform); + assert!(fields.contains(&"id".to_string())); + assert!(fields.contains(&"name".to_string())); + } + + #[test] + fn test_is_uniform_array_nested_objects() { + let arr = vec![json!({"id": 1, "nested": {"a": 1}})]; + let (uniform, _) = is_uniform_array(&arr); + assert!(!uniform); // Contains nested object + } + + #[test] + fn test_is_primitive_array() { + assert!(is_primitive_array(&[json!(1), json!("two"), json!(true)])); + assert!(!is_primitive_array(&[json!({"a": 1})])); + assert!(!is_primitive_array(&[json!([1, 2])])); + } + + #[test] + fn test_split_row_simple() { + let row = split_row("a\tb\tc", "\t"); + assert_eq!(row, vec!["a", "b", "c"]); + } + + #[test] + fn test_split_row_quoted() { + let row = split_row("\"a\tb\"\tc", "\t"); + assert_eq!(row, vec!["\"a\tb\"", "c"]); + } + + #[test] + fn test_split_row_escaped_quote() { + let row = split_row("\"a\\\"b\"\tc", "\t"); + assert_eq!(row, vec!["\"a\\\"b\"", "c"]); + } + + #[test] + fn test_get_indent_depth() { + assert_eq!(get_indent_depth("no indent"), 0); + assert_eq!(get_indent_depth(" one level"), 1); + assert_eq!(get_indent_depth(" two levels"), 2); + } + + // ======================================================================== + // Edge cases + // ======================================================================== + + #[test] + fn test_encode_special_floats() { + let data = json!({ + "nan": null, // NaN becomes null in JSON + "inf": null // Infinity becomes null in JSON + }); + let encoded = encode(&data, false).unwrap(); + assert!(encoded.contains("null")); + } + + #[test] + fn test_unicode_strings() { + let data = json!({"text": "Hello 世界 🌍"}); + let encoded = encode(&data, true).unwrap(); + let decoded = decode(&encoded).unwrap(); + assert_eq!(decoded["text"], "Hello 世界 🌍"); + } + + 
#[test] + fn test_long_string() { + let long = "x".repeat(1000); + let data = json!({"text": long}); + let encoded = encode(&data, true).unwrap(); + let decoded = decode(&encoded).unwrap(); + assert_eq!(decoded["text"].as_str().unwrap().len(), 1000); + } + + #[test] + fn test_deeply_nested() { + let data = json!({ + "a": { + "b": { + "c": { + "d": "deep" + } + } + } + }); + let encoded = encode(&data, true).unwrap(); + let decoded = decode(&encoded).unwrap(); + assert_eq!(decoded["a"]["b"]["c"]["d"], "deep"); + } + + #[test] + fn test_array_of_mixed_objects() { + let data = json!([ + {"type": "a", "value": 1}, + {"type": "b", "extra": "field"} + ]); + let encoded = encode(&data, true).unwrap(); + let decoded = decode(&encoded).unwrap(); + assert!(decoded.is_array()); + assert_eq!(decoded.as_array().unwrap().len(), 2); + } +} diff --git a/crates/agon-core/src/lib.rs b/crates/agon-core/src/lib.rs new file mode 100644 index 0000000..e0b6432 --- /dev/null +++ b/crates/agon-core/src/lib.rs @@ -0,0 +1,503 @@ +//! AGON Core: Rust implementation of AGON encoding formats +//! +//! All format classes inherit from AGONFormat base class: +//! - AGONText: Row-based tabular encoding +//! - AGONColumns: Columnar encoding with type clustering +//! - AGONStruct: Template-based encoding for nested patterns + +use pyo3::exceptions::PyNotImplementedError; +use pyo3::prelude::*; +use pyo3::types::{PyDict, PyList}; +use std::collections::HashMap; + +mod error; +mod formats; +mod types; +mod utils; + +pub use error::AgonError; +pub use formats::{columns, struct_fmt, text}; +pub use types::JsonValue; + +// ============================================================================ +// AGONFormat - Abstract base class +// ============================================================================ + +/// Abstract base class for AGON format codecs. 
+///
+/// All AGON formats inherit from this class and implement:
+/// - encode(data, include_header=False) -> str
+/// - decode(payload) -> object
+/// - hint() -> str
+///
+/// Provides concrete method:
+/// - project_data(data, keep_paths) -> projected data
+#[pyclass(subclass)]
+struct AGONFormat;
+
+#[pymethods]
+impl AGONFormat {
+    #[new]
+    fn new() -> Self {
+        AGONFormat
+    }
+
+    /// Encode data to this format. Must be implemented by subclasses.
+    #[staticmethod]
+    #[pyo3(signature = (_data, _include_header = false))]
+    fn encode(_data: &Bound<'_, PyAny>, _include_header: bool) -> PyResult<String> {
+        Err(PyNotImplementedError::new_err(
+            "encode() must be implemented by subclass",
+        ))
+    }
+
+    /// Decode a payload in this format. Must be implemented by subclasses.
+    #[staticmethod]
+    fn decode(_payload: &str) -> PyResult<Py<PyAny>> {
+        Err(PyNotImplementedError::new_err(
+            "decode() must be implemented by subclass",
+        ))
+    }
+
+    /// Return a short hint describing this format. Must be implemented by subclasses.
+    #[staticmethod]
+    fn hint() -> PyResult<String> {
+        Err(PyNotImplementedError::new_err(
+            "hint() must be implemented by subclass",
+        ))
+    }
+
+    /// Project data to only keep specified fields.
+    ///
+    /// Args:
+    ///     data: List of objects to project
+    ///     keep_paths: List of field paths to keep (supports dotted paths like "user.name")
+    ///
+    /// Returns:
+    ///     Projected data with only the specified fields
+    #[staticmethod]
+    fn project_data(
+        py: Python<'_>,
+        data: &Bound<'_, PyList>,
+        keep_paths: Vec<String>,
+    ) -> PyResult<Py<PyList>> {
+        let keep_tree = build_keep_tree(&keep_paths);
+        let result = PyList::empty(py);
+
+        for item in data.iter() {
+            if item.is_instance_of::<PyDict>() {
+                let dict = item
+                    .cast::<PyDict>()
+                    .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?;
+                let projected = project_obj(py, dict, &keep_tree)?;
+                result.append(projected)?;
+            }
+        }
+
+        Ok(result.unbind())
+    }
+
+    fn __repr__(&self) -> String {
+        "AGONFormat()".to_string()
+    }
+}
+
+/// Recursive keep tree: None means "keep whole value", Some(map) means "keep these subfields"
+#[derive(Default)]
+struct KeepTree {
+    children: HashMap<String, Option<Box<KeepTree>>>,
+}
+
+// Helper: Build keep tree from dotted paths
+fn build_keep_tree(keep_paths: &[String]) -> KeepTree {
+    let mut tree = KeepTree::default();
+
+    for raw_path in keep_paths {
+        let path = raw_path.trim().trim_matches('.');
+        if path.is_empty() {
+            continue;
+        }
+        let parts: Vec<&str> = path.split('.').filter(|p| !p.is_empty()).collect();
+        if parts.is_empty() {
+            continue;
+        }
+
+        // Walk the path and build nested structure
+        let mut cur = &mut tree;
+        for (i, part) in parts.iter().enumerate() {
+            let is_last = i == parts.len() - 1;
+            let key = part.to_string();
+
+            if is_last {
+                // Leaf: set to None if not already a subtree
+                cur.children.entry(key).or_insert(None);
+            } else {
+                // Intermediate: ensure subtree exists
+                let entry = cur
+                    .children
+                    .entry(key)
+                    .or_insert_with(|| Some(Box::new(KeepTree::default())));
+                if let Some(ref mut subtree) = entry {
+                    cur = subtree.as_mut();
+                } else {
+                    // Was None (keep whole), upgrade to subtree
+                    let new_subtree = Box::new(KeepTree::default());
+                    *entry = Some(new_subtree);
+                    cur = entry.as_mut().unwrap().as_mut();
+                }
+            }
+        }
+    }
+
+    tree
+}
+
+// Helper: Project a single object recursively
+fn project_obj(
+    py: Python<'_>,
+    obj: &Bound<'_, PyDict>,
+    keep_tree: &KeepTree,
+) -> PyResult<Py<PyDict>> {
+    let out = PyDict::new(py);
+
+    for (key, sub_keep) in &keep_tree.children {
+        if let Ok(Some(value)) = obj.get_item(key) {
+            match sub_keep {
+                None => {
+                    // Leaf: keep the whole value
+                    out.set_item(key, &value)?;
+                }
+                Some(sub_tree) => {
+                    // Need to project nested structure
+                    if value.is_none() {
+                        out.set_item(key, &value)?;
+                    } else if value.is_instance_of::<PyDict>() {
+                        let nested_dict = value
+                            .cast::<PyDict>()
+                            .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?;
+                        let projected = project_obj(py, nested_dict, sub_tree)?;
+                        out.set_item(key, projected)?;
+                    } else if value.is_instance_of::<PyList>() {
+                        let nested_list = value
+                            .cast::<PyList>()
+                            .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?;
+
+                        // Check if list is empty or all items are dicts
+                        let all_dicts = nested_list
+                            .iter()
+                            .all(|item| item.is_instance_of::<PyDict>());
+
+                        if nested_list.is_empty() || all_dicts {
+                            let projected_list = PyList::empty(py);
+                            for item in nested_list.iter() {
+                                let item_dict = item.cast::<PyDict>().map_err(|e| {
+                                    pyo3::exceptions::PyValueError::new_err(e.to_string())
+                                })?;
+                                let projected = project_obj(py, item_dict, sub_tree)?;
+                                projected_list.append(projected)?;
+                            }
+                            out.set_item(key, projected_list)?;
+                        } else {
+                            // Mixed list or not all dicts: keep as-is
+                            out.set_item(key, &value)?;
+                        }
+                    } else {
+                        // Not a dict or list: keep as-is
+                        out.set_item(key, &value)?;
+                    }
+                }
+            }
+        }
+    }
+
+    Ok(out.unbind())
+}
+
+// ============================================================================
+// AGONText - Row-based tabular encoding
+// ============================================================================
+
+/// Row-based tabular encoding format.
+#[pyclass(extends=AGONFormat)]
+struct AGONText;
+
+#[pymethods]
+impl AGONText {
+    #[new]
+    fn new() -> (Self, AGONFormat) {
+        (AGONText, AGONFormat)
+    }
+
+    #[staticmethod]
+    #[pyo3(signature = (data, include_header = false))]
+    fn encode(data: &Bound<'_, PyAny>, include_header: bool) -> PyResult<String> {
+        let value = types::py_to_json(data)?;
+        text::encode(&value, include_header).map_err(|e| e.into())
+    }
+
+    #[staticmethod]
+    fn decode(py: Python<'_>, payload: &str) -> PyResult<Py<PyAny>> {
+        let value = text::decode(payload)?;
+        types::json_to_py(py, &value)
+    }
+
+    #[staticmethod]
+    fn hint() -> String {
+        "Return in AGON text format: Start with @AGON text header, encode arrays as name[N]{fields} with tab-delimited rows".to_string()
+    }
+
+    fn __repr__(&self) -> String {
+        "AGONText()".to_string()
+    }
+}
+
+// ============================================================================
+// AGONColumns - Columnar encoding
+// ============================================================================
+
+/// Columnar encoding that transposes arrays to group by field.
+#[pyclass(extends=AGONFormat)]
+struct AGONColumns;
+
+#[pymethods]
+impl AGONColumns {
+    #[new]
+    fn new() -> (Self, AGONFormat) {
+        (AGONColumns, AGONFormat)
+    }
+
+    #[staticmethod]
+    #[pyo3(signature = (data, include_header = false))]
+    fn encode(data: &Bound<'_, PyAny>, include_header: bool) -> PyResult<String> {
+        let value = types::py_to_json(data)?;
+        columns::encode(&value, include_header).map_err(|e| e.into())
+    }
+
+    #[staticmethod]
+    fn decode(py: Python<'_>, payload: &str) -> PyResult<Py<PyAny>> {
+        let value = columns::decode(payload)?;
+        types::json_to_py(py, &value)
+    }
+
+    #[staticmethod]
+    fn hint() -> String {
+        "Return in AGON columns format: Start with @AGON columns header, transpose arrays to name[N] with ├/└ field: val1, val2, ...".to_string()
+    }
+
+    fn __repr__(&self) -> String {
+        "AGONColumns()".to_string()
+    }
+}
+
+// ============================================================================
+// AGONStruct - Template-based encoding
+// ============================================================================
+
+/// Template-based encoding that detects repeated object patterns.
+#[pyclass(extends=AGONFormat)]
+struct AGONStruct;
+
+#[pymethods]
+impl AGONStruct {
+    #[new]
+    fn new() -> (Self, AGONFormat) {
+        (AGONStruct, AGONFormat)
+    }
+
+    #[staticmethod]
+    #[pyo3(signature = (data, include_header = false))]
+    fn encode(data: &Bound<'_, PyAny>, include_header: bool) -> PyResult<String> {
+        let value = types::py_to_json(data)?;
+        struct_fmt::encode(&value, include_header).map_err(|e| e.into())
+    }
+
+    #[staticmethod]
+    fn decode(py: Python<'_>, payload: &str) -> PyResult<Py<PyAny>> {
+        let value = struct_fmt::decode(payload)?;
+        types::json_to_py(py, &value)
+    }
+
+    #[staticmethod]
+    fn hint() -> String {
+        "Return in AGON struct format: Start with @AGON struct header, define templates as @Struct: fields, instantiate as Struct(v1, v2)".to_string()
+    }
+
+    fn __repr__(&self) -> String {
+        "AGONStruct()".to_string()
+    }
+}
+
+// ============================================================================
+// EncodingResult
+// ============================================================================
+
+/// Result of parallel encoding with format selection.
+#[pyclass]
+#[derive(Clone)]
+struct EncodingResult {
+    #[pyo3(get)]
+    format: String,
+    #[pyo3(get)]
+    text: String,
+    #[pyo3(get)]
+    header: String,
+    #[pyo3(get)]
+    token_estimate: usize,
+}
+
+#[pymethods]
+impl EncodingResult {
+    fn __repr__(&self) -> String {
+        format!(
+            "EncodingResult(format={:?}, len={}, tokens={})",
+            self.format,
+            self.text.len(),
+            self.token_estimate
+        )
+    }
+}
+
+// ============================================================================
+// Module-level functions
+// ============================================================================
+
+#[pyfunction]
+#[pyo3(signature = (data, force = false, min_savings = 0.10))]
+fn encode_auto_parallel(
+    data: &Bound<'_, PyAny>,
+    force: bool,
+    min_savings: f64,
+) -> PyResult<EncodingResult> {
+    let value = types::py_to_json(data)?;
+    let result = formats::encode_auto_parallel(&value, force, min_savings)?;
+    Ok(EncodingResult {
+        format: result.format,
+        text: result.text,
+        header: result.header,
+        token_estimate: result.token_estimate,
+    })
+}
+
+#[pyfunction]
+fn encode_all_parallel(data: &Bound<'_, PyAny>) -> PyResult<Vec<EncodingResult>> {
+    let value = types::py_to_json(data)?;
+    let results = formats::encode_all_parallel(&value)?;
+    Ok(results
+        .into_iter()
+        .map(|r| EncodingResult {
+            format: r.format,
+            text: r.text,
+            header: r.header,
+            token_estimate: r.token_estimate,
+        })
+        .collect())
+}
+
+// ============================================================================
+// Python module
+// ============================================================================
+
+#[pymodule]
+fn agon_core(m: &Bound<'_, PyModule>) -> PyResult<()> {
+    m.add_class::<AGONFormat>()?;
+    m.add_class::<AGONText>()?;
+    m.add_class::<AGONColumns>()?;
+    m.add_class::<AGONStruct>()?;
+    m.add_class::<EncodingResult>()?;
+    m.add_function(wrap_pyfunction!(encode_auto_parallel, m)?)?;
+    m.add_function(wrap_pyfunction!(encode_all_parallel, m)?)?;
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // ========================================================================
+    // 
build_keep_tree tests (pure Rust) + // ======================================================================== + + #[test] + fn test_build_keep_tree_single_field() { + let paths = vec!["name".to_string()]; + let tree = build_keep_tree(&paths); + assert!(tree.children.contains_key("name")); + assert!(tree.children.get("name").unwrap().is_none()); // Leaf node + } + + #[test] + fn test_build_keep_tree_multiple_fields() { + let paths = vec!["name".to_string(), "age".to_string(), "email".to_string()]; + let tree = build_keep_tree(&paths); + assert_eq!(tree.children.len(), 3); + assert!(tree.children.contains_key("name")); + assert!(tree.children.contains_key("age")); + assert!(tree.children.contains_key("email")); + } + + #[test] + fn test_build_keep_tree_nested_path() { + let paths = vec!["user.name".to_string()]; + let tree = build_keep_tree(&paths); + assert!(tree.children.contains_key("user")); + let user_subtree = tree.children.get("user").unwrap(); + assert!(user_subtree.is_some()); + let user = user_subtree.as_ref().unwrap(); + assert!(user.children.contains_key("name")); + } + + #[test] + fn test_build_keep_tree_deeply_nested() { + let paths = vec!["a.b.c.d".to_string()]; + let tree = build_keep_tree(&paths); + + let a = tree.children.get("a").unwrap().as_ref().unwrap(); + let b = a.children.get("b").unwrap().as_ref().unwrap(); + let c = b.children.get("c").unwrap().as_ref().unwrap(); + assert!(c.children.contains_key("d")); + assert!(c.children.get("d").unwrap().is_none()); // Leaf + } + + #[test] + fn test_build_keep_tree_mixed_depth() { + let paths = vec![ + "id".to_string(), + "user.name".to_string(), + "user.email".to_string(), + ]; + let tree = build_keep_tree(&paths); + + // Top-level "id" + assert!(tree.children.contains_key("id")); + assert!(tree.children.get("id").unwrap().is_none()); + + // Nested "user.name" and "user.email" + let user = tree.children.get("user").unwrap().as_ref().unwrap(); + assert!(user.children.contains_key("name")); + 
assert!(user.children.contains_key("email"));
+    }
+
+    #[test]
+    fn test_build_keep_tree_empty_paths() {
+        let paths: Vec<String> = vec![];
+        let tree = build_keep_tree(&paths);
+        assert!(tree.children.is_empty());
+    }
+
+    #[test]
+    fn test_build_keep_tree_whitespace_paths() {
+        let paths = vec![" ".to_string(), "".to_string(), "...".to_string()];
+        let tree = build_keep_tree(&paths);
+        assert!(tree.children.is_empty());
+    }
+
+    #[test]
+    fn test_build_keep_tree_leading_trailing_dots() {
+        let paths = vec![".name.".to_string()];
+        let tree = build_keep_tree(&paths);
+        assert!(tree.children.contains_key("name"));
+    }
+
+    // Note: PyO3 integration tests (py_to_json, json_to_py) are tested via Python tests
+    // since they require linking to Python runtime which isn't available in cargo test.
+}
diff --git a/crates/agon-core/src/types.rs b/crates/agon-core/src/types.rs
new file mode 100644
index 0000000..2bc03b4
--- /dev/null
+++ b/crates/agon-core/src/types.rs
@@ -0,0 +1,130 @@
+//! Type definitions and Python/JSON conversion utilities
+
+use pyo3::prelude::*;
+use pyo3::types::{PyBool, PyDict, PyFloat, PyInt, PyList, PyString};
+use serde_json::Value as SerdeValue;
+
+use crate::error::{AgonError, Result};
+
+/// Our JSON value type (re-export of serde_json::Value for convenience)
+pub type JsonValue = SerdeValue;
+
+/// Convert a Python object to a JSON Value
+pub fn py_to_json(obj: &Bound<'_, PyAny>) -> Result<JsonValue> {
+    if obj.is_none() {
+        return Ok(JsonValue::Null);
+    }
+
+    // Check bool before int (bool is subclass of int in Python)
+    if obj.is_instance_of::<PyBool>() {
+        return Ok(JsonValue::Bool(obj.extract::<bool>()?));
+    }
+
+    if obj.is_instance_of::<PyInt>() {
+        if let Ok(n) = obj.extract::<i64>() {
+            return Ok(JsonValue::Number(n.into()));
+        }
+        // Try as float if i64 doesn't work (large numbers)
+        if let Ok(f) = obj.extract::<f64>() {
+            if let Some(n) = serde_json::Number::from_f64(f) {
+                return Ok(JsonValue::Number(n));
+            }
+        }
+        return Err(AgonError::InvalidData("Integer too large".to_string()));
+    }
+
+    if obj.is_instance_of::<PyFloat>() {
+        let val: f64 = obj.extract().map_err(AgonError::from)?;
+        if let Some(n) = serde_json::Number::from_f64(val) {
+            return Ok(JsonValue::Number(n));
+        }
+        // Handle NaN/Infinity as null (JSON doesn't support them)
+        return Ok(JsonValue::Null);
+    }
+
+    if obj.is_instance_of::<PyString>() {
+        return Ok(JsonValue::String(obj.extract::<String>()?));
+    }
+
+    if obj.is_instance_of::<PyList>() {
+        let list = obj
+            .cast::<PyList>()
+            .map_err(|e| AgonError::InvalidData(e.to_string()))?;
+        let arr: Result<Vec<JsonValue>> = list.iter().map(|item| py_to_json(&item)).collect();
+        return Ok(JsonValue::Array(arr?));
+    }
+
+    if obj.is_instance_of::<PyDict>() {
+        let dict = obj
+            .cast::<PyDict>()
+            .map_err(|e| AgonError::InvalidData(e.to_string()))?;
+        let mut map = serde_json::Map::new();
+        for (key, value) in dict.iter() {
+            let key_str = key
+                .extract::<String>()
+                .map_err(|_| AgonError::InvalidData("Dict keys must be strings".to_string()))?;
+            map.insert(key_str, py_to_json(&value)?);
+        }
+        return Ok(JsonValue::Object(map));
+    }
+
+    // Try to convert via str() as fallback
+    if let Ok(s) = obj.str() {
+        return Ok(JsonValue::String(s.to_string()));
+    }
+
+    let type_name = obj
+        .get_type()
+        .name()
+        .map(|n| n.to_string())
+        .unwrap_or_else(|_| "unknown".to_string());
+    Err(AgonError::InvalidData(format!(
+        "Cannot convert {} to JSON",
+        type_name
+    )))
+}
+
+/// Convert a JSON Value to a Python object
+pub fn json_to_py(py: Python<'_>, value: &JsonValue) -> PyResult<Py<PyAny>> {
+    match value {
+        JsonValue::Null => Ok(py.None()),
+        JsonValue::Bool(b) => Ok(b.into_pyobject(py)?.to_owned().unbind().into_any()),
+        JsonValue::Number(n) => {
+            if let Some(i) = n.as_i64() {
+                Ok(i.into_pyobject(py)?.to_owned().unbind().into_any())
+            } else if let Some(f) = n.as_f64() {
+                Ok(f.into_pyobject(py)?.to_owned().unbind().into_any())
+            } else {
+                Ok(n.to_string().into_pyobject(py)?.unbind().into_any())
+            }
+        }
+        JsonValue::String(s) => Ok(s.into_pyobject(py)?.unbind().into_any()),
+        JsonValue::Array(arr) => {
+            let list = PyList::empty(py);
+            for item in arr {
+                list.append(json_to_py(py, item)?)?;
+            }
+            Ok(list.unbind().into_any())
+        }
+        JsonValue::Object(map) => {
+            let dict = PyDict::new(py);
+            for (key, val) in map {
+                dict.set_item(key, json_to_py(py, val)?)?;
+            }
+            Ok(dict.unbind().into_any())
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_json_roundtrip() {
+        let json = r#"{"name": "test", "values": [1, 2, 3], "nested": {"a": true}}"#;
+        let value: JsonValue = serde_json::from_str(json).unwrap();
+        let back = serde_json::to_string(&value).unwrap();
+        assert!(back.contains("name"));
+    }
+}
diff --git a/crates/agon-core/src/utils.rs b/crates/agon-core/src/utils.rs
new file mode 100644
index 0000000..5bbd3e6
--- /dev/null
+++ b/crates/agon-core/src/utils.rs
@@ -0,0 +1,30 @@
+//! Shared utilities for AGON encoding
+
+use std::sync::OnceLock;
+use tiktoken_rs::CoreBPE;
+
+/// Global tokenizer instance (o200k_base - used by GPT-4o/Claude)
+static TOKENIZER: OnceLock<CoreBPE> = OnceLock::new();
+
+fn get_tokenizer() -> &'static CoreBPE {
+    TOKENIZER.get_or_init(|| {
+        tiktoken_rs::o200k_base().expect("Failed to initialize o200k_base tokenizer")
+    })
+}
+
+/// Count tokens using tiktoken's o200k_base encoding
+pub fn count_tokens(text: &str) -> usize {
+    get_tokenizer().encode_ordinary(text).len()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_count_tokens() {
+        assert!(count_tokens("hello world") > 0);
+        assert!(count_tokens("a longer piece of text") > count_tokens("short"));
+        assert_eq!(count_tokens(""), 0);
+    }
+}
diff --git a/pyproject.toml b/pyproject.toml
index 855a98c..2b1e61a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,6 +29,7 @@ classifiers = [
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
     "Programming Language :: Python :: 3.13",
+    "Programming Language :: Rust",
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
     "Topic :: Software Development :: Libraries :: Python 
Modules", "Typing :: Typed", @@ -52,6 +53,7 @@ dev = [ "pytest>=8.3.5", "pytest-cov>=4.0.0", "pytest-sugar>=1.0.0", + "pytest-xdist>=3.8.0", # Code quality "ruff>=0.11.9", "basedpyright>=1.29.1", @@ -64,29 +66,30 @@ dev = [ # Documentation "mkdocs>=1.6.1", "mkdocs-material>=9.5.0", + # Build + "maturin>=1.10.2", ] [build-system] -requires = ["hatchling", "uv-dynamic-versioning"] -build-backend = "hatchling.build" +requires = ["maturin>=1.8,<2.0"] +build-backend = "maturin" -[tool.hatch.version] -source = "uv-dynamic-versioning" -[tool.uv-dynamic-versioning] -vcs = "git" -style = "pep440" -bump = true - -[tool.hatch.build.targets.wheel] -packages = ["src/agon"] +[tool.maturin] +# Build from workspace member +manifest-path = "crates/agon-core/Cargo.toml" +# Python source location +python-source = "python" +# Rust module installed as agon.agon_core (hard dependency, cleaner than agon._agon) +module-name = "agon.agon_core" +features = ["pyo3/extension-module"] [tool.ruff] line-length = 100 target-version = "py311" -src = ["src", "tests"] +src = ["python", "tests"] [tool.ruff.lint] select = [ @@ -153,10 +156,10 @@ force-sort-within-sections = true [tool.basedpyright] -include = ["src", "tests"] +include = ["python", "tests"] executionEnvironments = [ { root = "tests", reportPrivateUsage = false, reportUnknownParameterType = false }, - { root = "src" }, + { root = "python" }, ] pythonVersion = "3.11" pythonPlatform = "All" @@ -179,6 +182,7 @@ reportOptionalMemberAccess = false [tool.pytest.ini_options] minversion = "8.0" addopts = [ + "-n=auto", "--strict-markers", "--strict-config", "--cov=agon", @@ -204,7 +208,7 @@ filterwarnings = [ [tool.coverage.run] -source = ["src"] +source = ["python"] branch = true relative_files = true omit = [ @@ -215,7 +219,7 @@ omit = [ [tool.coverage.paths] source = [ - "src/", + "python/", ".nox/*/lib/python*/site-packages/", ] @@ -234,4 +238,4 @@ exclude_lines = [ ] [tool.codespell] -skip = 
"*.git,*.lock,*.json,__pycache__,*.pyc,*.egg-info,htmlcov,.coverage,coverage.xml" +skip = "*.git,*.lock,*.json,__pycache__,*.pyc,*.egg-info,htmlcov,.coverage,coverage.xml,target" diff --git a/python/agon/__init__.py b/python/agon/__init__.py new file mode 100644 index 0000000..69670f6 --- /dev/null +++ b/python/agon/__init__.py @@ -0,0 +1,32 @@ +"""AGON - Adaptive Guarded Object Notation. + +A self-describing, token-efficient data interchange format optimized for LLMs. +""" + +# Re-export Rust format classes (inherit from AGONFormat) +from agon.agon_core import ( + AGONColumns, + AGONFormat, + AGONStruct, + AGONText, + EncodingResult, + encode_all_parallel, + encode_auto_parallel, +) +from agon.core import AGON, AGONEncoding, Format +from agon.errors import AGONError + +__all__ = [ + "AGON", + "AGONColumns", + "AGONEncoding", + "AGONError", + "AGONFormat", + "AGONStruct", + "AGONText", + "EncodingResult", + "Format", + "encode_all_parallel", + "encode_auto_parallel", +] +__version__ = "0.1.0" diff --git a/src/agon/core.py b/python/agon/core.py similarity index 87% rename from src/agon/core.py rename to python/agon/core.py index 05c6dce..ff72467 100644 --- a/src/agon/core.py +++ b/python/agon/core.py @@ -20,9 +20,20 @@ import orjson +# Rust format classes (primary API - inherit from AGONFormat) +from agon.agon_core import ( + AGONColumns, + AGONFormat, + AGONStruct, + AGONText, +) +from agon.agon_core import ( + encode_auto_parallel as _rs_encode_auto_parallel, +) from agon.encoding import DEFAULT_ENCODING, count_tokens from agon.errors import AGONError -from agon.formats import AGONColumns, AGONFormat, AGONStruct, AGONText + +# Python format classes (for reference/fallback) Format = Literal["auto", "json", "text", "columns", "struct"] ConcreteFormat = Literal["json", "text", "columns", "struct"] @@ -92,14 +103,15 @@ class AGON: "struct": "@AGON struct", } - # Format registries (encode without headers - headers added separately) + # Encoders - Rust for AGON formats, 
orjson for JSON _encoders: ClassVar[dict[ConcreteFormat, Callable[[Any], str]]] = { "json": lambda data: orjson.dumps(data).decode(), - "text": lambda data: AGONText.encode(data, include_header=False), - "columns": lambda data: AGONColumns.encode(data, include_header=False), - "struct": lambda data: AGONStruct.encode(data, include_header=False), + "text": lambda data: str(AGONText.encode(data, include_header=False)), + "columns": lambda data: str(AGONColumns.encode(data, include_header=False)), + "struct": lambda data: str(AGONStruct.encode(data, include_header=False)), } + # Decoders - Rust for AGON formats _decoders: ClassVar[dict[str, Callable[[str], Any]]] = { "@AGON text": AGONText.decode, "@AGON columns": AGONColumns.decode, @@ -143,34 +155,16 @@ def encode( """ # Direct format dispatch if format != "auto": - text = AGON._encoders[format](data) + encoder = AGON._encoders[format] + text = encoder(data) header = AGON._headers[format] return AGONEncoding(format, text, header) - # format == "auto" - candidates = [ - AGONEncoding( - cast("Format", fmt), - encoder(data), - AGON._headers.get(fmt, ""), - ) - for fmt, encoder in AGON._encoders.items() - if force is False or fmt != "json" - ] - - token_counts = [count_tokens(c.text, encoding=encoding) for c in candidates] - best_idx = min(range(len(candidates)), key=lambda i: token_counts[i]) - best = candidates[best_idx] - - if not force and best.format != "json": - json_result = next(c for c in candidates if c.format == "json") - json_idx = candidates.index(json_result) - json_tokens = token_counts[json_idx] - savings = 1.0 - (token_counts[best_idx] / max(1, json_tokens)) - if savings < min_savings: - return json_result - - return best + # format == "auto": use Rust for fast parallel encoding and format selection + result = _rs_encode_auto_parallel(data, force, min_savings) + selected_format = cast("ConcreteFormat", result.format) + header = AGON._headers[selected_format] + return AGONEncoding(selected_format, 
result.text, header) @overload @staticmethod @@ -220,7 +214,7 @@ def decode( case "json": return AGON._decode_json(text) case "text" | "columns" | "struct": - header = AGON._headers[cast("ConcreteFormat", format)] + header = AGON._headers[format] if not text.startswith(header): text = f"{header}\n\n{text}" return AGON._decoders[header](text) diff --git a/src/agon/encoding.py b/python/agon/encoding.py similarity index 100% rename from src/agon/encoding.py rename to python/agon/encoding.py diff --git a/python/agon/errors.py b/python/agon/errors.py new file mode 100644 index 0000000..a91cd52 --- /dev/null +++ b/python/agon/errors.py @@ -0,0 +1,15 @@ +"""Shared error types for the AGON package. + +All public exceptions raised by AGON should inherit from `AGONError` so callers +can catch AGON failures with a single except clause. +""" + +from __future__ import annotations + + +class AGONError(ValueError): + """Base class for all AGON-related errors. + + Inherits from ValueError for compatibility with errors raised by + the Rust bindings (which raise ValueError via PyO3). + """ diff --git a/src/agon/py.typed b/python/agon/py.typed similarity index 100% rename from src/agon/py.typed rename to python/agon/py.typed diff --git a/src/agon/__init__.py b/src/agon/__init__.py deleted file mode 100644 index 0366834..0000000 --- a/src/agon/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -"""AGON - Adaptive Guarded Object Notation. - -A self-describing, token-efficient data interchange format optimized for LLMs. 
-""" - -from agon.core import AGON, AGONEncoding, Format -from agon.errors import ( - AGONColumnsError, - AGONError, - AGONStructError, - AGONTextError, -) - -__all__ = [ - "AGON", - "AGONColumnsError", - "AGONEncoding", - "AGONError", - "AGONStructError", - "AGONTextError", - "Format", -] -__version__ = "0.1.0" diff --git a/src/agon/errors.py b/src/agon/errors.py deleted file mode 100644 index 55009d4..0000000 --- a/src/agon/errors.py +++ /dev/null @@ -1,23 +0,0 @@ -"""Shared error types for the AGON package. - -All public exceptions raised by AGON should inherit from `AGONError` so callers -can catch AGON failures with a single except clause. -""" - -from __future__ import annotations - - -class AGONError(ValueError): - """Base class for all AGON-related errors.""" - - -class AGONTextError(AGONError): - """Raised when AGONText decoding fails.""" - - -class AGONColumnsError(AGONError): - """Raised when AGONColumns encoding/decoding fails.""" - - -class AGONStructError(AGONError): - """Raised when AGONStruct encoding/decoding fails.""" diff --git a/src/agon/formats/__init__.py b/src/agon/formats/__init__.py deleted file mode 100644 index 5962868..0000000 --- a/src/agon/formats/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -"""Codec implementations used by AGON. - -This package contains concrete encoders/decoders ("formats"). The public, -user-facing API lives in `agon.core`, which selects among formats. 
-""" - -from __future__ import annotations - -from agon.formats.base import AGONFormat -from agon.formats.columns import AGONColumns -from agon.formats.struct import AGONStruct -from agon.formats.text import AGONText - -__all__ = ["AGONColumns", "AGONFormat", "AGONStruct", "AGONText"] diff --git a/src/agon/formats/base.py b/src/agon/formats/base.py deleted file mode 100644 index fe01146..0000000 --- a/src/agon/formats/base.py +++ /dev/null @@ -1,93 +0,0 @@ -"""Base class for AGON format codecs.""" - -from __future__ import annotations - -from abc import ABC, abstractmethod -from typing import Any - - -class AGONFormat(ABC): - """Abstract base class for AGON format codecs. - - All AGON formats should inherit from this class and implement: - - encode(data, ...) -> str - - decode(payload, ...) -> Any - - hint() -> str - """ - - @staticmethod - @abstractmethod - def encode(data: object, *, include_header: bool = False) -> str: - """Encode data to this format.""" - ... - - @staticmethod - @abstractmethod - def decode(payload: str) -> object: - """Decode a payload in this format.""" - ... - - @staticmethod - @abstractmethod - def hint() -> str: - """Return a short hint describing this format for LLMs.""" - ... - - # ---------- Projection ---------- - - @staticmethod - def project_data(data: list[dict[str, Any]], keep_paths: list[str]) -> list[dict[str, Any]]: - """Project data to only keep specified fields. - - Args: - data: List of objects to project. - keep_paths: List of field paths to keep. Supports dotted paths - like "user.name" or "quotes.symbol". - - Returns: - Projected data with only the specified fields. 
- """ - keep_tree = AGONFormat._build_keep_tree(keep_paths) - return [AGONFormat._project_obj(r, keep_tree) for r in data] - - @staticmethod - def _build_keep_tree(keep_paths: list[str]) -> dict[str, Any]: - keep_tree: dict[str, Any] = {} - - for raw_path in keep_paths: - path = raw_path.strip().strip(".") - if not path: - continue - parts = [p for p in path.split(".") if p] - - cur: dict[str, Any] = keep_tree - for part in parts[:-1]: - nxt = cur.get(part) - if nxt is None: - nxt = {} - cur[part] = nxt - cur = nxt - - cur.setdefault(parts[-1], None) - - return keep_tree - - @staticmethod - def _project_obj(obj: dict[str, Any], keep_tree: dict[str, Any]) -> dict[str, Any]: - out: dict[str, Any] = {} - for k, sub_keep in keep_tree.items(): - if k not in obj: - continue - v = obj[k] - if v is None or sub_keep is None: - out[k] = v - continue - - # If sub_keep is not None, it is always a dict produced by _build_keep_tree(). - if isinstance(v, dict): - out[k] = AGONFormat._project_obj(v, sub_keep) - elif isinstance(v, list) and (not v or all(isinstance(x, dict) for x in v)): - out[k] = [AGONFormat._project_obj(x, sub_keep) for x in v] - else: - out[k] = v - return out diff --git a/src/agon/formats/columns.py b/src/agon/formats/columns.py deleted file mode 100644 index da2f826..0000000 --- a/src/agon/formats/columns.py +++ /dev/null @@ -1,895 +0,0 @@ -r"""AGONColumns format codec. - -AGONColumns is a columnar encoding that transposes data to group by column (type) -instead of row. This provides better token efficiency for wide tables with -many columns of the same type. 
- -Format structure: - @AGON columns - @D= # optional, default: \t - - -Example: - @AGON columns - - products[3] - ├ sku: A123, B456, C789 - ├ name: Widget, Gadget, Gizmo - └ price: 9.99, 19.99, 29.99 -""" - -from __future__ import annotations - -import re -from typing import Any - -from agon.errors import AGONColumnsError -from agon.formats.base import AGONFormat - -HEADER = "@AGON columns" -DEFAULT_DELIMITER = "\t" -INDENT = " " - -# Tree drawing characters -BRANCH = "├" # U+251C: has more siblings -LAST_BRANCH = "└" # U+2514: last sibling -ASCII_BRANCH = "|" -ASCII_LAST = "`" - -# Characters that require quoting -SPECIAL_CHARS = frozenset(["@", "#", "-", BRANCH, LAST_BRANCH, ASCII_BRANCH]) -BOOL_NULL = frozenset(["true", "false", "null"]) - -# Regex for numbers -NUMBER_RE = re.compile(r"^-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?$") - - -class AGONColumns(AGONFormat): - """AGONColumns format encoder/decoder. - - Encodes JSON data to a columnar format where array fields are transposed - to show all values of each field together, optimizing for type clustering. - """ - - @staticmethod - def hint() -> str: - """Return a short hint instructing LLMs how to generate this format.""" - return "Return in AGON columns format: Start with @AGON columns header, transpose arrays to name[N] with ├/└ field: val1, val2, ..." - - @staticmethod - def encode( - data: object, - *, - delimiter: str = DEFAULT_DELIMITER, - include_header: bool = True, - use_ascii: bool = False, - ) -> str: - """Encode data to AGONColumns format. - - Args: - data: JSON-serializable data to encode. - delimiter: Value delimiter within columns (default: ", "). - include_header: Whether to include @AGON columns header. - use_ascii: Use ASCII tree chars (|, `) instead of Unicode. - - Returns: - AGONColumns encoded string. 
- """ - lines: list[str] = [] - - if include_header: - lines.append(HEADER) - if delimiter != DEFAULT_DELIMITER: - lines.append(f"@D={_escape_delimiter(delimiter)}") - lines.append("") - - _encode_value(data, lines, depth=0, delimiter=delimiter, name=None, use_ascii=use_ascii) - - return "\n".join(lines) - - @staticmethod - def decode(payload: str, *, lenient: bool = False) -> Any: - """Decode AGONColumns payload. - - Args: - payload: AGONColumns encoded string. - lenient: If True, allow length mismatches and best-effort parsing. - - Returns: - Decoded Python value. - - Raises: - AGONColumnsError: If payload is invalid. - """ - lines = payload.splitlines() - if not lines: - raise AGONColumnsError("Empty payload") - - idx = 0 - header_line = lines[idx].strip() - if not header_line.startswith("@AGON columns"): - raise AGONColumnsError(f"Invalid header: {header_line}") - idx += 1 - - delimiter = DEFAULT_DELIMITER - if idx < len(lines) and lines[idx].startswith("@D="): - delimiter = _parse_delimiter(lines[idx][3:].strip()) - idx += 1 - - while idx < len(lines) and not lines[idx].strip(): - idx += 1 - - if idx >= len(lines): - return None - - result, _ = _decode_value(lines, idx, depth=0, delimiter=delimiter, lenient=lenient) - return result - - -def _escape_delimiter(d: str) -> str: - """Escape delimiter for @D= declaration.""" - if d == "\t": - return "\\t" - if d == "\n": - return "\\n" - # Unreachable via public API: DEFAULT_DELIMITER is ", " and encode() only calls - # _escape_delimiter() when delimiter != DEFAULT_DELIMITER. 
- if d == ", ": # pragma: no cover - return ", " - return d - - -def _parse_delimiter(d: str) -> str: - """Parse delimiter from @D= declaration.""" - if d == "\\t": - return "\t" - if d == "\\n": - return "\n" - return d - - -def _needs_quote(s: str, delimiter: str) -> bool: - """Check if string needs quoting.""" - if not s: - return True - if s != s.strip(): - return True - if delimiter in s: - return True - if "\n" in s or "\r" in s or "\\" in s or '"' in s: - return True - if s[0] in SPECIAL_CHARS: - return True - return bool(s.lower() in BOOL_NULL or NUMBER_RE.match(s)) - - -def _quote_string(s: str) -> str: - """Quote and escape a string value.""" - escaped = ( - s.replace("\\", "\\\\") - .replace('"', '\\"') - .replace("\n", "\\n") - .replace("\r", "\\r") - .replace("\t", "\\t") - ) - return f'"{escaped}"' - - -def _unquote_string(s: str) -> str: - """Unquote and unescape a string value.""" - if not (s.startswith('"') and s.endswith('"')): - return s - inner = s[1:-1] - result = [] - i = 0 - while i < len(inner): - if inner[i] == "\\" and i + 1 < len(inner): - c = inner[i + 1] - if c == "n": - result.append("\n") - elif c == "r": - result.append("\r") - elif c == "t": - result.append("\t") - elif c == "\\": - result.append("\\") - elif c == '"': - result.append('"') - else: - result.append(inner[i + 1]) - i += 2 - else: - result.append(inner[i]) - i += 1 - return "".join(result) - - -def _encode_primitive(val: Any, delimiter: str) -> str: - """Encode a primitive value to string.""" - if val is None: - return "null" - if isinstance(val, bool): - return "true" if val else "false" - if isinstance(val, int | float): - if isinstance(val, float): - if val != val: # NaN - return "" - if val == float("inf") or val == float("-inf"): - return "" - if val == 0.0 and str(val) == "-0.0": - return "0" - return str(val) - s = str(val) - if _needs_quote(s, delimiter): - return _quote_string(s) - return s - - -def _parse_primitive(s: str) -> Any: - """Parse a primitive value 
from string.""" - s = s.strip() - if not s: - return None - - if s.startswith('"') and s.endswith('"'): - return _unquote_string(s) - - lower = s.lower() - if lower == "null": - return None - if lower == "true": - return True - if lower == "false": - return False - - if NUMBER_RE.match(s): - if "." in s or "e" in s.lower(): - return float(s) - return int(s) - - return s - - -def _is_columnar_array(arr: list[Any]) -> tuple[bool, list[str]]: - """Check if array can be encoded in columnar format.""" - if not arr: - return False, [] - - if not all(isinstance(x, dict) for x in arr): - return False, [] - - all_keys: set[str] = set() - for obj in arr: - all_keys.update(obj.keys()) - - if not all_keys: - return False, [] - - for obj in arr: - for v in obj.values(): - if isinstance(v, dict | list): - return False, [] - - key_order: list[str] = [] - for obj in arr: - for k in obj: - if k not in key_order: - key_order.append(k) - - return True, key_order - - -def _is_primitive_array(arr: list[Any]) -> bool: - """Check if array contains only primitives.""" - return all(not isinstance(x, dict | list) for x in arr) - - -def _encode_value( - val: Any, - lines: list[str], - depth: int, - delimiter: str, - name: str | None, - use_ascii: bool, -) -> None: - """Encode a value, appending lines.""" - indent = INDENT * depth - - if val is None or isinstance(val, bool | int | float | str): - if name: - lines.append(f"{indent}{name}: {_encode_primitive(val, delimiter)}") - else: - lines.append(f"{indent}{_encode_primitive(val, delimiter)}") - return - - if isinstance(val, list): - _encode_array(val, lines, depth, delimiter, name, use_ascii) - return - - if isinstance(val, dict): - _encode_object(val, lines, depth, delimiter, name, use_ascii) - return - - if name: - lines.append(f"{indent}{name}: {_encode_primitive(str(val), delimiter)}") - else: - lines.append(f"{indent}{_encode_primitive(str(val), delimiter)}") - - -def _encode_array( - arr: list[Any], - lines: list[str], - depth: int, - 
delimiter: str, - name: str | None, - use_ascii: bool, -) -> None: - """Encode an array value in columnar format.""" - indent = INDENT * depth - branch = ASCII_BRANCH if use_ascii else BRANCH - last = ASCII_LAST if use_ascii else LAST_BRANCH - - if not arr: - if name: - lines.append(f"{indent}{name}[0]") - else: - lines.append(f"{indent}[0]") - return - - is_columnar, fields = _is_columnar_array(arr) - if is_columnar and fields: - if name: - lines.append(f"{indent}{name}[{len(arr)}]") - else: - lines.append(f"{indent}[{len(arr)}]") - - for i, field in enumerate(fields): - tree_char = last if i == len(fields) - 1 else branch - values = [ - (_encode_primitive(obj[field], delimiter) if field in obj else "") for obj in arr - ] - lines.append(f"{indent}{tree_char} {field}: {delimiter.join(values)}") - return - - if _is_primitive_array(arr): - values = delimiter.join(_encode_primitive(v, delimiter) for v in arr) - if name: - lines.append(f"{indent}{name}[{len(arr)}]: {values}") - else: - lines.append(f"{indent}[{len(arr)}]: {values}") - return - - if name: - lines.append(f"{indent}{name}[{len(arr)}]:") - else: - lines.append(f"{indent}[{len(arr)}]:") - - for item in arr: - if isinstance(item, dict): - _encode_list_item_object(item, lines, depth + 1, delimiter, use_ascii) - else: - lines.append(f"{indent} - {_encode_primitive(item, delimiter)}") - - -def _encode_list_item_object( - obj: dict[str, Any], - lines: list[str], - depth: int, - delimiter: str, - use_ascii: bool, -) -> None: - """Encode an object as a list item.""" - indent = INDENT * depth - first = True - - for k, v in obj.items(): - prefix = f"{indent}- " if first else f"{indent} " - first = False - - if isinstance(v, dict): - lines.append(f"{prefix}{k}:") - for nk, nv in v.items(): - if isinstance(nv, dict | list): - _encode_value(nv, lines, depth + 2, delimiter, nk, use_ascii) - else: - lines.append(f"{indent} {nk}: {_encode_primitive(nv, delimiter)}") - elif isinstance(v, list): - 
lines.append(f"{prefix}{k}:") - _encode_value(v, lines, depth + 2, delimiter, None, use_ascii) - else: - lines.append(f"{prefix}{k}: {_encode_primitive(v, delimiter)}") - - -def _encode_object( - obj: dict[str, Any], - lines: list[str], - depth: int, - delimiter: str, - name: str | None, - use_ascii: bool, -) -> None: - """Encode an object value.""" - indent = INDENT * depth - - if name: - lines.append(f"{indent}{name}:") - depth += 1 - indent = INDENT * depth - - for k, v in obj.items(): - if isinstance(v, dict | list): - _encode_value(v, lines, depth, delimiter, k, use_ascii) - else: - lines.append(f"{indent}{k}: {_encode_primitive(v, delimiter)}") - - -# Decode helpers - -ARRAY_HEADER_RE = re.compile(r"^(\w*)\[(\d+)\]$") -PRIMITIVE_ARRAY_RE = re.compile(r"^(\w*)\[(\d+)\]:\s*(.*)$") -LIST_ARRAY_RE = re.compile(r"^(\w*)\[(\d+)\]:$") -KEY_VALUE_RE = re.compile(r"^([^:]+):\s*(.*)$") -COLUMN_LINE_RE = re.compile(r"^[├└|`]\s*([^:]+):\s*(.*)$") - -_MISSING_CELL = object() - - -def _get_indent_depth(line: str) -> int: - """Get indentation depth (number of 2-space indents).""" - stripped = line.lstrip(" ") - spaces = len(line) - len(stripped) - return spaces // 2 - - -def _decode_value( - lines: list[str], - idx: int, - depth: int, - delimiter: str, - lenient: bool, -) -> tuple[Any, int]: - """Decode a value from lines starting at idx.""" - if idx >= len(lines): - return None, idx - - line = lines[idx] - stripped = line.strip() - - if not stripped or stripped.startswith("#"): - return _decode_value(lines, idx + 1, depth, delimiter, lenient) - - # Check for columnar array: name[N] (no braces, no colon with inline values) - m = ARRAY_HEADER_RE.match(stripped) - if m: - count = int(m.group(2)) - # Empty array case - if count == 0: - name = m.group(1) - if name: - return _decode_nested_object( - lines, idx, _get_indent_depth(line), delimiter, lenient - ) - return [], idx + 1 - next_idx = idx + 1 - if next_idx < len(lines): - next_line = lines[next_idx].strip() - if next_line 
and (next_line[0] in (BRANCH, LAST_BRANCH, ASCII_BRANCH, ASCII_LAST)): - if m.group(1): - return _decode_nested_object( - lines, idx, _get_indent_depth(line), delimiter, lenient - ) - return _decode_columnar_array(lines, idx, depth, delimiter, lenient, m) - - # Check for primitive array: name[N]: val1, val2, ... - m = PRIMITIVE_ARRAY_RE.match(stripped) - if m: - values_part = m.group(3).strip() - if values_part: - if m.group(1): - return _decode_nested_object( - lines, idx, _get_indent_depth(line), delimiter, lenient - ) - return _decode_primitive_array(m, delimiter, idx) - - # Check for list array: name[N]: - m = LIST_ARRAY_RE.match(stripped) - if m: - if m.group(1): - return _decode_nested_object(lines, idx, _get_indent_depth(line), delimiter, lenient) - return _decode_list_array(lines, idx, depth, delimiter, lenient, m) - - # Check for key:value (object) - m = KEY_VALUE_RE.match(stripped) - if m: - return _decode_nested_object(lines, idx, _get_indent_depth(line), delimiter, lenient) - - raise AGONColumnsError(f"Cannot parse line {idx}: {stripped}") - - -def _decode_columnar_array( - lines: list[str], - idx: int, - depth: int, - delimiter: str, - lenient: bool, - match: re.Match[str], -) -> tuple[Any, int]: - """Decode columnar array: name[N] followed by ├/└ field: values.""" - name = match.group(1) - count = int(match.group(2)) - idx += 1 - - columns: dict[str, list[Any]] = {} - field_order: list[str] = [] - - def _parse_columnar_cell(cell: str) -> Any: - # In columnar arrays, an empty cell means “key absent”, while the - # literal token "null" means the key is present with value None. 
- if not cell.strip(): - return _MISSING_CELL - return _parse_primitive(cell) - - while idx < len(lines): - line = lines[idx] - stripped = line.strip() - - if not stripped: - idx += 1 - continue - - if stripped.startswith("#"): - idx += 1 - continue - - m = COLUMN_LINE_RE.match(stripped) - if not m: - break - - field = m.group(1).strip() - # Don't strip trailing whitespace - it's part of the delimiter for empty values - values_str = m.group(2).lstrip() - - values = _split_column_values(values_str, delimiter) - columns[field] = [_parse_columnar_cell(v) for v in values] - field_order.append(field) - idx += 1 - - result: list[dict[str, Any]] = [] - for row_idx in range(count): - obj: dict[str, Any] = {} - for field in field_order: - vals = columns.get(field, []) - val = vals[row_idx] if row_idx < len(vals) else _MISSING_CELL - if val is not _MISSING_CELL: - obj[field] = val - result.append(obj) - - if name: - return {name: result}, idx - return result, idx - - -def _split_column_values(values_str: str, delimiter: str) -> list[str]: - """Split column values, respecting quotes.""" - result: list[str] = [] - current: list[str] = [] - in_quote = False - i = 0 - - while i < len(values_str): - if values_str[i : i + len(delimiter)] == delimiter and not in_quote: - result.append("".join(current)) - current = [] - i += len(delimiter) - continue - - c = values_str[i] - if c == '"' and not in_quote: - in_quote = True - current.append(c) - elif c == '"' and in_quote: - if i > 0 and values_str[i - 1] == "\\": - current.append(c) - else: - in_quote = False - current.append(c) - else: - current.append(c) - i += 1 - - result.append("".join(current)) - return result - - -def _decode_primitive_array( - match: re.Match[str], - delimiter: str, - idx: int, -) -> tuple[Any, int]: - """Decode primitive array: name[N]: val1, val2, ...""" - name = match.group(1) - values_str = match.group(3) - - if not values_str.strip(): - arr: list[Any] = [] - else: - values = 
_split_column_values(values_str, delimiter) - arr = [_parse_primitive(v) for v in values] - - if name: - return {name: arr}, idx + 1 - return arr, idx + 1 - - -def _decode_list_array( - lines: list[str], - idx: int, - depth: int, - delimiter: str, - lenient: bool, - match: re.Match[str], -) -> tuple[Any, int]: - """Decode list array: name[N]: followed by - items.""" - name = match.group(1) - count = int(match.group(2)) - idx += 1 - result: list[Any] = [] - base_depth = depth + 1 - - while idx < len(lines) and len(result) < count: - line = lines[idx] - if not line.strip() or line.strip().startswith("#"): - idx += 1 - continue - - line_depth = _get_indent_depth(line) - if line_depth < base_depth: - break - - stripped = line.strip() - if stripped.startswith("- "): - item_str = stripped[2:].strip() - kv_match = KEY_VALUE_RE.match(item_str) - if kv_match: - obj, idx = _decode_list_item_object(lines, idx, base_depth, delimiter, lenient) - result.append(obj) - else: - result.append(_parse_primitive(item_str)) - idx += 1 - else: - break - - if name: - return {name: result}, idx - return result, idx - - -def _decode_list_item_object( - lines: list[str], - idx: int, - base_depth: int, - delimiter: str, - lenient: bool, -) -> tuple[dict[str, Any], int]: - """Decode an object that starts with '- key: value'.""" - obj: dict[str, Any] = {} - item_depth = base_depth - - first_line = lines[idx].strip() - first_content = first_line[2:].strip() - m = KEY_VALUE_RE.match(first_content) - if m: - key = m.group(1).strip() - val_str = m.group(2).strip() - if val_str: - obj[key] = _parse_primitive(val_str) - else: - idx += 1 - while idx < len(lines) and ( - not lines[idx].strip() or lines[idx].strip().startswith("#") - ): - idx += 1 - if idx < len(lines): - next_line = lines[idx] - next_depth = _get_indent_depth(next_line) - if next_depth >= item_depth + 2: - val, idx = _decode_value(lines, idx, next_depth, delimiter, lenient) - obj[key] = val - else: - obj[key] = {} - else: - obj[key] = 
{} - while idx < len(lines): - line = lines[idx] - if not line.strip(): - idx += 1 - continue - line_depth = _get_indent_depth(line) - if line_depth <= item_depth: - break - stripped = line.strip() - - # Check for array patterns first - if ( - ARRAY_HEADER_RE.match(stripped) - or PRIMITIVE_ARRAY_RE.match(stripped) - or LIST_ARRAY_RE.match(stripped) - ): - nested, idx = _decode_value(lines, idx, line_depth, delimiter, lenient) - if isinstance(nested, dict): - obj.update(nested) - continue - - kv = KEY_VALUE_RE.match(stripped) - if kv: - k = kv.group(1).strip() - v_str = kv.group(2).strip() - if v_str: - obj[k] = _parse_primitive(v_str) - else: - idx += 1 - while idx < len(lines) and ( - not lines[idx].strip() or lines[idx].strip().startswith("#") - ): - idx += 1 - if idx < len(lines): - next_line = lines[idx] - next_depth = _get_indent_depth(next_line) - if next_depth > line_depth: - val, idx = _decode_value(lines, idx, next_depth, delimiter, lenient) - obj[k] = val - else: - obj[k] = {} - else: - obj[k] = {} - continue - idx += 1 - return obj, idx - idx += 1 - - while idx < len(lines): - line = lines[idx] - if not line.strip(): - idx += 1 - continue - line_depth = _get_indent_depth(line) - if line_depth <= item_depth: - break - - stripped = line.strip() - - # Check for array patterns first - m = ARRAY_HEADER_RE.match(stripped) - if m: - count = int(m.group(2)) - if count == 0: - name = m.group(1) - if name: - obj[name] = [] - idx += 1 - continue - next_idx = idx + 1 - if next_idx < len(lines): - next_line = lines[next_idx].strip() - if next_line and (next_line[0] in (BRANCH, LAST_BRANCH, ASCII_BRANCH, ASCII_LAST)): - nested, idx = _decode_columnar_array( - lines, idx, line_depth, delimiter, lenient, m - ) - if isinstance(nested, dict): - obj.update(nested) - continue - - m = LIST_ARRAY_RE.match(stripped) - if m: - nested, idx = _decode_list_array(lines, idx, line_depth, delimiter, lenient, m) - if isinstance(nested, dict): - obj.update(nested) - continue - - m = 
PRIMITIVE_ARRAY_RE.match(stripped) - if m: - nested, idx = _decode_primitive_array(m, delimiter, idx) - if isinstance(nested, dict): - obj.update(nested) - continue - - kv = KEY_VALUE_RE.match(stripped) - if kv: - key = kv.group(1).strip() - val_str = kv.group(2).strip() - if val_str: - obj[key] = _parse_primitive(val_str) - idx += 1 - else: - idx += 1 - while idx < len(lines) and ( - not lines[idx].strip() or lines[idx].strip().startswith("#") - ): - idx += 1 - if idx < len(lines): - next_line = lines[idx] - next_depth = _get_indent_depth(next_line) - if next_depth > line_depth: - val, idx = _decode_value(lines, idx, next_depth, delimiter, lenient) - obj[key] = val - else: - obj[key] = {} - else: - obj[key] = {} - else: - idx += 1 - - return obj, idx - - -def _decode_nested_object( - lines: list[str], - idx: int, - expected_depth: int, - delimiter: str, - lenient: bool, -) -> tuple[dict[str, Any], int]: - """Decode a nested object at a specific indent level.""" - obj: dict[str, Any] = {} - - while idx < len(lines): - line = lines[idx] - if not line.strip(): - idx += 1 - continue - - line_depth = _get_indent_depth(line) - if line_depth < expected_depth: - break - - stripped = line.strip() - - # Check for array patterns first - m = ARRAY_HEADER_RE.match(stripped) - if m: - nested, idx = _decode_columnar_array(lines, idx, line_depth, delimiter, lenient, m) - if isinstance(nested, dict): - obj.update(nested) - continue - - m = LIST_ARRAY_RE.match(stripped) - if m: - nested, idx = _decode_list_array(lines, idx, line_depth, delimiter, lenient, m) - if isinstance(nested, dict): - obj.update(nested) - continue - - m = PRIMITIVE_ARRAY_RE.match(stripped) - if m: - nested, idx = _decode_primitive_array(m, delimiter, idx) - if isinstance(nested, dict): - obj.update(nested) - continue - - kv = KEY_VALUE_RE.match(stripped) - if kv: - key = kv.group(1).strip() - val_str = kv.group(2).strip() - if val_str: - obj[key] = _parse_primitive(val_str) - idx += 1 - else: - idx += 1 - 
while idx < len(lines) and ( - not lines[idx].strip() or lines[idx].strip().startswith("#") - ): - idx += 1 - if idx < len(lines): - next_line = lines[idx] - next_depth = _get_indent_depth(next_line) - if next_depth > line_depth: - val, idx = _decode_value(lines, idx, next_depth, delimiter, lenient) - obj[key] = val - else: - obj[key] = {} - else: - obj[key] = {} - else: - break - - return obj, idx diff --git a/src/agon/formats/struct.py b/src/agon/formats/struct.py deleted file mode 100644 index 00efd53..0000000 --- a/src/agon/formats/struct.py +++ /dev/null @@ -1,1070 +0,0 @@ -"""AGONStruct format codec. - -AGONStruct reduces token usage by defining reusable struct templates for repeated -object structures. Instead of repeating field names for every instance, define -the shape once and instantiate with just values. - -Format structure: - @AGON struct - - @StructName: field1, field2, field3? - @ChildStruct(ParentStruct): extra_field - - - -Example: - @AGON struct - - @FR: fmt, raw - - - symbol: AAPL - regularMarketPrice: FR(150.00, 150.0) - regularMarketChange: FR(+2.50, 2.5) -""" - -from __future__ import annotations - -from collections import Counter -import re -from typing import Any - -from agon.errors import AGONStructError -from agon.formats.base import AGONFormat - -HEADER = "@AGON struct" -INDENT = " " - -# Regex patterns -NUMBER_RE = re.compile(r"^-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?$") - -# Struct definition: @StructName: field1, field2, field3? 
-# Or with inheritance: @ChildStruct(Parent1, Parent2): field1, field2 -STRUCT_DEF_RE = re.compile(r"^@(\w+)(?:\(([^)]+)\))?:\s*(.*)$") - -# Struct instantiation: StructName(val1, val2, val3) -STRUCT_INST_RE = re.compile(r"^(\w+)\(") - -# Key-value pattern -KEY_VALUE_RE = re.compile(r"^([^:]+):\s*(.*)$") - -# Array header pattern: name[N] or [N] (may have inline content after colon) -ARRAY_HEADER_RE = re.compile(r"^(\w*)\[(\d+)\]:?") - -# Struct definition storage: {name: (fields, optional_fields, parents)} -StructDef = tuple[list[str], set[str], list[str]] -StructRegistry = dict[str, StructDef] - - -class AGONStruct(AGONFormat): - """AGONStruct format encoder/decoder. - - Encodes JSON data using struct templates for repeated object structures. - Significantly reduces tokens for data with consistent nested patterns. - """ - - @staticmethod - def hint() -> str: - """Return a short hint instructing LLMs how to generate this format.""" - return "Return in AGON struct format: Start with @AGON struct header, define templates as @Struct: fields, instantiate as Struct(v1, v2)" - - @staticmethod - def encode( - data: object, - *, - include_header: bool = True, - min_occurrences: int = 3, - min_fields: int = 2, - ) -> str: - """Encode data to AGONStruct format. - - Args: - data: JSON-serializable data to encode. - include_header: Whether to include @AGON struct header. - min_occurrences: Minimum occurrences to create a struct (default: 3). - min_fields: Minimum fields for a struct to be worthwhile (default: 2). - - Returns: - AGONStruct encoded string. 
- """ - # Detect repeated object shapes - shapes = _detect_shapes(data) - struct_defs = _create_struct_definitions(shapes, min_occurrences, min_fields) - - # Build registry for encoding - registry: StructRegistry = {} - for name, fields, optional, parents in struct_defs: - _register_struct(registry, name, fields, optional, parents) - - lines: list[str] = [] - - if include_header: - lines.append(HEADER) - lines.append("") - - # Emit struct definitions even when headers are disabled. - # The header is used for auto-detect decoding, but LLM prompts need - # the struct templates to interpret instances like FR(v1, v2). - if struct_defs: - for name, fields, optional, parents in struct_defs: - fields_str = ", ".join(f + "?" if f in optional else f for f in fields) - if parents: - parents_str = ", ".join(parents) - lines.append(f"@{name}({parents_str}): {fields_str}") - else: - lines.append(f"@{name}: {fields_str}") - - lines.append("") - - _encode_value(data, lines, depth=0, registry=registry) - - return "\n".join(lines) - - @staticmethod - def decode(payload: str, *, lenient: bool = False) -> Any: - """Decode AGONStruct payload. - - Args: - payload: AGONStruct encoded string. - lenient: If True, allow best-effort parsing. - - Returns: - Decoded Python value. - - Raises: - AGONStructError: If payload is invalid. 
- """ - lines = payload.splitlines() - if not lines: - raise AGONStructError("Empty payload") - - idx = 0 - header_line = lines[idx].strip() - if not header_line.startswith("@AGON struct"): - raise AGONStructError(f"Invalid header: {header_line}") - idx += 1 - - # Parse struct definitions - registry: StructRegistry = {} - while idx < len(lines): - line = lines[idx].strip() - if not line: - idx += 1 - continue - if not line.startswith("@"): - break - # Parse struct definition - parsed = _parse_struct_def(line) - if parsed: - name, fields, optional, parents = parsed - _register_struct(registry, name, fields, optional, parents) - idx += 1 - - # Skip blank lines - while idx < len(lines) and not lines[idx].strip(): - idx += 1 - - if idx >= len(lines): - return None - - result, _ = _decode_value(lines, idx, depth=0, registry=registry, lenient=lenient) - return result - - -def _register_struct( - registry: StructRegistry, - name: str, - fields: list[str], - optional: set[str], - parents: list[str], -) -> None: - """Register a struct, resolving parent fields.""" - all_fields: list[str] = [] - all_optional: set[str] = set() - - # Resolve inherited fields from parents - for parent_name in parents: - parent = registry.get(parent_name) - if parent is None: - raise AGONStructError(f"Unknown parent struct: {parent_name}") - parent_fields, parent_optional, _ = parent - for f in parent_fields: - if f not in all_fields: - all_fields.append(f) - all_optional.update(parent_optional) - - # Add own fields - for f in fields: - if f not in all_fields: - all_fields.append(f) - all_optional.update(optional) - - registry[name] = (all_fields, all_optional, parents) - - -def _detect_shapes( - data: object, - shapes: Counter[tuple[str, ...]] | None = None, -) -> Counter[tuple[str, ...]]: - """Detect repeated object shapes in data.""" - if shapes is None: - shapes = Counter() - - if isinstance(data, dict): - # Only count shapes with primitive values - primitive_keys: tuple[str, ...] 
= tuple( - sorted(k for k, v in data.items() if not isinstance(v, dict | list)) - ) - if len(primitive_keys) >= 2: - shapes[primitive_keys] += 1 - - # Recurse into nested values - for v in data.values(): - _detect_shapes(v, shapes) - - elif isinstance(data, list): - for item in data: - _detect_shapes(item, shapes) - - return shapes - - -def _create_struct_definitions( - shapes: Counter[tuple[str, ...]], - min_occurrences: int, - min_fields: int, -) -> list[tuple[str, list[str], set[str], list[str]]]: - """Create struct definitions from detected shapes. - - Returns list of (name, fields, optional_fields, parents). - """ - struct_defs: list[tuple[str, list[str], set[str], list[str]]] = [] - used_names: set[str] = set() - - # Sort by frequency (most common first) then by field count (larger first) - sorted_shapes = sorted( - shapes.items(), - key=lambda x: (-x[1], -len(x[0])), - ) - - for fields, count in sorted_shapes: - if count < min_occurrences or len(fields) < min_fields: - continue - - name = _generate_struct_name(fields, used_names) - used_names.add(name) - struct_defs.append((name, list(fields), set(), [])) - - return struct_defs - - -def _generate_struct_name(fields: tuple[str, ...], used_names: set[str]) -> str: - """Generate a struct name from field names.""" - field_set = set(fields) - - # Common patterns - if field_set == {"fmt", "raw"}: - name = "FR" - elif field_set == {"low", "high"}: - name = "Range" - elif field_set == {"x", "y"}: - name = "Point" - elif field_set == {"lat", "lng"} or field_set == {"latitude", "longitude"}: - name = "Coord" - elif field_set == {"min", "max"}: - name = "Bounds" - elif len(fields) <= 3: - name = "".join(f[0].upper() for f in fields[:3]) - else: - name = "S" - - # Ensure unique - base_name = name - counter = 1 - while name in used_names: - name = f"{base_name}{counter}" - counter += 1 - - return name - - -def _parse_struct_def( - line: str, -) -> tuple[str, list[str], set[str], list[str]] | None: - """Parse a struct 
definition line. Returns (name, fields, optional, parents).""" - m = STRUCT_DEF_RE.match(line) - if not m: - return None - - name = m.group(1) - parents_str = m.group(2) - fields_str = m.group(3) - - parents: list[str] = [] - if parents_str: - parents = [p.strip() for p in parents_str.split(",")] - - fields: list[str] = [] - optional: set[str] = set() - - if fields_str: - for f in fields_str.split(","): - f = f.strip() - if f.endswith("?"): - f = f[:-1] - optional.add(f) - if f: - fields.append(f) - - return name, fields, optional, parents - - -def _can_use_struct(obj: dict[str, Any], fields: list[str], optional: set[str]) -> bool: - """Check if an object can be encoded as a struct instance.""" - # Object must have only primitive values - for v in obj.values(): - if isinstance(v, dict | list): - return False - - # All required fields must be present - obj_keys = set(obj.keys()) - required = set(fields) - optional - - if not required.issubset(obj_keys): - return False - - # Object can only have fields from the struct - return obj_keys.issubset(set(fields)) - - -def _find_matching_struct( - obj: dict[str, Any], - registry: StructRegistry, -) -> tuple[str, list[str], set[str]] | None: - """Find a struct that matches the object. Returns (name, fields, optional).""" - for name, (fields, optional, _) in registry.items(): - if _can_use_struct(obj, fields, optional): - return name, fields, optional - return None - - -def _needs_quoting(s: str) -> bool: - """Check if a string value needs quoting to preserve type.""" - if not s: - return False - # Quote if it looks like a number - if NUMBER_RE.match(s): - return True - # Quote if it looks like bool/null - lower = s.lower() - if lower in ("true", "false", "null"): - return True - # Quote if has leading/trailing whitespace (would be stripped on decode) - if s != s.strip(): - return True - # Quote if contains special chars - # ':' is included to avoid ambiguity with inline key-value parsing in lists. 
- return "," in s or ":" in s or "(" in s or ")" in s or "\\" in s or "\n" in s or '"' in s - - -def _quote_string(s: str) -> str: - """Quote and escape a string value.""" - escaped = s.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n").replace("\r", "\\r") - return f'"{escaped}"' - - -def _encode_primitive(val: Any, *, for_struct_instance: bool = False) -> str: - """Encode a primitive value to string. - - Args: - val: The value to encode. - for_struct_instance: If True, use empty string for None (struct instances - can omit trailing None values). If False, use "null" explicitly - to distinguish from nested objects in key-value pairs. - """ - if val is None: - return "" if for_struct_instance else "null" - if isinstance(val, bool): - return "true" if val else "false" - if isinstance(val, int | float): - if isinstance(val, float): - if val != val: # NaN - return "" if for_struct_instance else "null" - if val == float("inf") or val == float("-inf"): - return "" if for_struct_instance else "null" - if val == 0.0 and str(val) == "-0.0": - return "0" - return str(val) - s = str(val) - # Empty string must be quoted to distinguish from null - if s == "": - return '""' - if _needs_quoting(s): - return _quote_string(s) - return s - - -def _unescape_value(s: str) -> str: - """Unescape special characters from struct instance values.""" - if not s: - return s - result: list[str] = [] - i = 0 - while i < len(s): - if s[i] == "\\" and i + 1 < len(s): - c = s[i + 1] - if c == ",": - result.append(",") - elif c == "(": - result.append("(") - elif c == ")": - result.append(")") - elif c == "n": - result.append("\n") - elif c == "r": - result.append("\r") - elif c == "\\": - result.append("\\") - else: - result.append(s[i + 1]) - i += 2 - else: - result.append(s[i]) - i += 1 - return "".join(result) - - -def _encode_struct_instance( - obj: dict[str, Any], - name: str, - fields: list[str], - optional: set[str], -) -> str: - """Encode an object as a struct instance.""" - 
values: list[str] = [] - - # Find last non-optional provided value index - last_idx = -1 - for i, field in enumerate(fields): - if field not in optional or field in obj: - last_idx = i - - for i, field in enumerate(fields): - if i > last_idx: - break - val = obj.get(field) - values.append(_encode_primitive(val, for_struct_instance=True)) - - return f"{name}({', '.join(values)})" - - -def _encode_value( - val: Any, - lines: list[str], - depth: int, - registry: StructRegistry, -) -> None: - """Encode a value, appending lines.""" - indent = INDENT * depth - - if val is None or isinstance(val, bool | int | float | str): - lines.append(f"{indent}{_encode_primitive(val)}") - return - - if isinstance(val, list): - _encode_array(val, lines, depth, registry) - return - - if isinstance(val, dict): - _encode_object(val, lines, depth, registry, name=None) - return - - lines.append(f"{indent}{_encode_primitive(str(val))}") - - -def _encode_array( - arr: list[Any], - lines: list[str], - depth: int, - registry: StructRegistry, -) -> None: - """Encode an array value.""" - indent = INDENT * depth - - # Empty array - scar lesson: check count=0 - if not arr: - lines.append(f"{indent}[0]:") - return - - # Check if all items can be same struct instances - all_same_struct = True - struct_info: tuple[str, list[str], set[str]] | None = None - for item in arr: - if isinstance(item, dict): - item_struct = _find_matching_struct(item, registry) - if item_struct is None: - all_same_struct = False - break - if struct_info is None: - struct_info = item_struct - elif struct_info[0] != item_struct[0]: - all_same_struct = False - break - else: - all_same_struct = False - break - - # Inline struct instance array - if all_same_struct and struct_info is not None: - name, fields, optional = struct_info - instances = [_encode_struct_instance(item, name, fields, optional) for item in arr] - lines.append(f"{indent}[{len(arr)}]: {', '.join(instances)}") - return - - # List format - 
lines.append(f"{indent}[{len(arr)}]:") - for item in arr: - if isinstance(item, dict): - _encode_list_item_object(item, lines, depth + 1, registry) - elif isinstance(item, list): - lines.append(f"{indent} -") - _encode_array(item, lines, depth + 2, registry) - else: - lines.append(f"{indent} - {_encode_primitive(item)}") - - -def _encode_list_item_object( - obj: dict[str, Any], - lines: list[str], - depth: int, - registry: StructRegistry, -) -> None: - """Encode an object as a list item.""" - indent = INDENT * depth - first = True - - for k, v in obj.items(): - prefix = f"{indent}- " if first else f"{indent} " - first = False - - if isinstance(v, dict): - struct_info = _find_matching_struct(v, registry) - if struct_info: - name, fields, optional = struct_info - lines.append(f"{prefix}{k}: {_encode_struct_instance(v, name, fields, optional)}") - else: - lines.append(f"{prefix}{k}:") - _encode_nested_object(v, lines, depth + 2, registry) - elif isinstance(v, list): - lines.append(f"{prefix}{k}:") - _encode_array(v, lines, depth + 2, registry) - else: - lines.append(f"{prefix}{k}: {_encode_primitive(v)}") - - -def _encode_object( - obj: dict[str, Any], - lines: list[str], - depth: int, - registry: StructRegistry, - name: str | None, -) -> None: - """Encode an object value.""" - indent = INDENT * depth - - # Check if whole object can be a struct instance - struct_info = _find_matching_struct(obj, registry) - if struct_info and name: - sname, fields, optional = struct_info - lines.append(f"{indent}{name}: {_encode_struct_instance(obj, sname, fields, optional)}") - return - - if name: - lines.append(f"{indent}{name}:") - depth += 1 - indent = INDENT * depth - - for k, v in obj.items(): - if isinstance(v, dict): - nested_struct = _find_matching_struct(v, registry) - if nested_struct: - sname, fields, optional = nested_struct - lines.append(f"{indent}{k}: {_encode_struct_instance(v, sname, fields, optional)}") - else: - lines.append(f"{indent}{k}:") - 
_encode_nested_object(v, lines, depth + 1, registry) - elif isinstance(v, list): - lines.append(f"{indent}{k}:") - _encode_array(v, lines, depth + 1, registry) - else: - lines.append(f"{indent}{k}: {_encode_primitive(v)}") - - -def _encode_nested_object( - obj: dict[str, Any], - lines: list[str], - depth: int, - registry: StructRegistry, -) -> None: - """Encode a nested object without prefix.""" - indent = INDENT * depth - - for k, v in obj.items(): - if isinstance(v, dict): - struct_info = _find_matching_struct(v, registry) - if struct_info: - sname, fields, optional = struct_info - lines.append(f"{indent}{k}: {_encode_struct_instance(v, sname, fields, optional)}") - else: - lines.append(f"{indent}{k}:") - _encode_nested_object(v, lines, depth + 1, registry) - elif isinstance(v, list): - lines.append(f"{indent}{k}:") - _encode_array(v, lines, depth + 1, registry) - else: - lines.append(f"{indent}{k}: {_encode_primitive(v)}") - - -# Decode helpers - - -def _get_indent_depth(line: str) -> int: - """Get indentation depth (number of 2-space indents).""" - stripped = line.lstrip(" ") - spaces = len(line) - len(stripped) - return spaces // 2 - - -def _unquote_string(s: str) -> str: - """Unquote and unescape a quoted string value.""" - if not (s.startswith('"') and s.endswith('"')): - return s - inner = s[1:-1] - result: list[str] = [] - i = 0 - while i < len(inner): - if inner[i] == "\\" and i + 1 < len(inner): - c = inner[i + 1] - if c == "n": - result.append("\n") - elif c == "r": - result.append("\r") - elif c == "\\": - result.append("\\") - elif c == '"': - result.append('"') - else: - result.append(inner[i + 1]) - i += 2 - else: - result.append(inner[i]) - i += 1 - return "".join(result) - - -def _parse_primitive(s: str) -> Any: - """Parse a primitive value from string.""" - s = s.strip() - if not s: - return None - - # Quoted string - preserve as string - if s.startswith('"') and s.endswith('"'): - return _unquote_string(s) - - lower = s.lower() - if lower == 
"null": - return None - if lower == "true": - return True - if lower == "false": - return False - - if NUMBER_RE.match(s): - if "." in s or "e" in s.lower(): - return float(s) - return int(s) - - return s - - -def _parse_struct_instance(s: str, registry: StructRegistry) -> dict[str, Any] | None: - """Parse a struct instance like StructName(val1, val2).""" - m = STRUCT_INST_RE.match(s) - if not m: - return None - - struct_name = m.group(1) - struct_def = registry.get(struct_name) - if not struct_def: - return None - - fields, optional, _ = struct_def - - # Find matching closing paren - start = len(struct_name) + 1 # After "StructName(" - values = _parse_instance_values(s[start:]) - - # Build object from values - obj: dict[str, Any] = {} - for i, field in enumerate(fields): - if i < len(values): - val_str = values[i].strip() - if val_str: - val_str = _unescape_value(val_str) - obj[field] = _parse_primitive(val_str) - elif field not in optional: - obj[field] = None - elif field not in optional: - obj[field] = None - - return obj - - -def _parse_instance_values(s: str) -> list[str]: - """Parse comma-separated values from inside parentheses.""" - values: list[str] = [] - current: list[str] = [] - depth = 0 - in_quotes = False - i = 0 - - while i < len(s): - c = s[i] - - if c == "\\" and i + 1 < len(s): - current.append(c) - current.append(s[i + 1]) - i += 2 - continue - - if c == '"': - in_quotes = not in_quotes - current.append(c) - elif c == "(" and not in_quotes: - depth += 1 - current.append(c) - elif c == ")" and not in_quotes: - if depth == 0: - values.append("".join(current)) - break - depth -= 1 - current.append(c) - elif c == "," and depth == 0 and not in_quotes: - values.append("".join(current)) - current = [] - else: - current.append(c) - i += 1 - - return values - - -def _decode_value( - lines: list[str], - idx: int, - depth: int, - registry: StructRegistry, - lenient: bool, -) -> tuple[Any, int]: - """Decode a value from lines starting at idx.""" - if idx 
>= len(lines): - return None, idx - - line = lines[idx] - stripped = line.strip() - - if not stripped: - return _decode_value(lines, idx + 1, depth, registry, lenient) - - # Check for array header - m = ARRAY_HEADER_RE.match(stripped) - if m: - return _decode_array(lines, idx, depth, registry, lenient, m) - - # Check for bullet list item - if stripped.startswith("- "): - return _decode_list_item_object(lines, idx, depth, registry, lenient) - - # Check for key: value - m = KEY_VALUE_RE.match(stripped) - if m: - return _decode_object(lines, idx, depth, registry, lenient) - - # Try as struct instance - inst = _parse_struct_instance(stripped, registry) - if inst is not None: - return inst, idx + 1 - - return _parse_primitive(stripped), idx + 1 - - -def _decode_array( - lines: list[str], - idx: int, - depth: int, - registry: StructRegistry, - lenient: bool, - match: re.Match[str], -) -> tuple[Any, int]: - """Decode an array from [N]: header.""" - name = match.group(1) - count = int(match.group(2)) - - # Empty array - scar lesson - if count == 0: - if name: - return {name: []}, idx + 1 - return [], idx + 1 - - # Check for inline struct instances on same line - line = lines[idx].strip() - colon_idx = line.find(":") - if colon_idx >= 0: - after_colon = line[colon_idx + 1 :].strip() - if after_colon and STRUCT_INST_RE.match(after_colon): - instances = _split_struct_instances(after_colon) - result: list[Any] = [] - for inst_str in instances: - inst = _parse_struct_instance(inst_str.strip(), registry) - if inst: - result.append(inst) - else: - result.append(_parse_primitive(inst_str.strip())) - if name: - return {name: result}, idx + 1 - return result, idx + 1 - - idx += 1 - result = [] - base_depth = depth + 1 - - while idx < len(lines) and len(result) < count: - line = lines[idx] - if not line.strip(): - idx += 1 - continue - - line_depth = _get_indent_depth(line) - if line_depth < base_depth: - break - - stripped = line.strip() - - if stripped.startswith("- "): - content = 
stripped[2:].strip() - inst = _parse_struct_instance(content, registry) - if inst is not None: - result.append(inst) - idx += 1 - continue - - # If this is a quoted string list item, treat it as a primitive. - # This avoids ambiguity with inline object syntax when the string - # contains ':' (e.g. "keyword match: foo"). - if content.startswith('"') and content.endswith('"'): - result.append(_parse_primitive(content)) - idx += 1 - continue - - kv = KEY_VALUE_RE.match(content) - if kv: - obj, idx = _decode_list_item_object(lines, idx, base_depth, registry, lenient) - result.append(obj) - else: - result.append(_parse_primitive(content)) - idx += 1 - else: - inst = _parse_struct_instance(stripped, registry) - if inst is not None: - result.append(inst) - else: - result.append(_parse_primitive(stripped)) - idx += 1 - - if name: - return {name: result}, idx - return result, idx - - -def _split_struct_instances(s: str) -> list[str]: - """Split a string of struct instances like 'FR(a, b), FR(c, d)'.""" - results: list[str] = [] - current: list[str] = [] - depth = 0 - in_quotes = False - i = 0 - - while i < len(s): - c = s[i] - - # Handle escape sequences inside quotes - if c == "\\" and i + 1 < len(s) and in_quotes: - current.append(c) - current.append(s[i + 1]) - i += 2 - continue - - if c == '"': - in_quotes = not in_quotes - current.append(c) - elif c == "(" and not in_quotes: - depth += 1 - current.append(c) - elif c == ")" and not in_quotes: - depth -= 1 - current.append(c) - if depth == 0: - results.append("".join(current).strip()) - current = [] - elif c == "," and depth == 0 and not in_quotes: - if current: - results.append("".join(current).strip()) - current = [] - else: - current.append(c) - i += 1 - - if current: - text = "".join(current).strip() - if text: - results.append(text) - - return results - - -def _decode_list_item_object( - lines: list[str], - idx: int, - _base_depth: int, - registry: StructRegistry, - lenient: bool, -) -> tuple[dict[str, Any], int]: - 
"""Decode an object starting with '- key: value'.""" - obj: dict[str, Any] = {} - line_depth = _get_indent_depth(lines[idx]) - - first_line = lines[idx].strip() - content = first_line[2:].strip() if first_line.startswith("- ") else first_line - - m = KEY_VALUE_RE.match(content) - if m: - key = m.group(1).strip() - val_str = m.group(2).strip() - - if val_str: - inst = _parse_struct_instance(val_str, registry) - if inst is not None: - obj[key] = inst - else: - obj[key] = _parse_primitive(val_str) - idx += 1 - else: - idx += 1 - if idx < len(lines) and _get_indent_depth(lines[idx]) > line_depth: - nested, idx = _decode_value(lines, idx, line_depth + 1, registry, lenient) - obj[key] = nested if nested is not None else {} - else: - obj[key] = {} - else: - idx += 1 - - # Parse continuation lines - while idx < len(lines): - line = lines[idx] - if not line.strip(): - idx += 1 - continue - - cont_depth = _get_indent_depth(line) - if cont_depth <= line_depth: - break - - stripped = line.strip() - m = KEY_VALUE_RE.match(stripped) - if not m: - break - - key = m.group(1).strip() - val_str = m.group(2).strip() - - if val_str: - inst = _parse_struct_instance(val_str, registry) - if inst is not None: - obj[key] = inst - else: - obj[key] = _parse_primitive(val_str) - idx += 1 - else: - idx += 1 - if idx < len(lines) and _get_indent_depth(lines[idx]) > cont_depth: - nested, idx = _decode_value(lines, idx, cont_depth + 1, registry, lenient) - obj[key] = nested if nested is not None else {} - else: - obj[key] = {} - - return obj, idx - - -def _decode_object( - lines: list[str], - idx: int, - _depth: int, - registry: StructRegistry, - lenient: bool, -) -> tuple[dict[str, Any], int]: - """Decode an object from key: value pairs.""" - result: dict[str, Any] = {} - if idx >= len(lines): - return result, idx - - base_depth = _get_indent_depth(lines[idx]) - - while idx < len(lines): - line = lines[idx] - if not line.strip(): - idx += 1 - continue - - line_depth = _get_indent_depth(line) - 
if line_depth < base_depth: - break - - stripped = line.strip() - - # Check for array header - m = ARRAY_HEADER_RE.match(stripped) - if m: - name = m.group(1) - if name: - nested, idx = _decode_array(lines, idx, line_depth, registry, lenient, m) - if isinstance(nested, dict): - result.update(nested) - continue - - m = KEY_VALUE_RE.match(stripped) - if not m: - break - - key = m.group(1).strip() - val_str = m.group(2).strip() - - if val_str: - inst = _parse_struct_instance(val_str, registry) - if inst is not None: - result[key] = inst - else: - result[key] = _parse_primitive(val_str) - idx += 1 - else: - idx += 1 - if idx < len(lines): - next_depth = _get_indent_depth(lines[idx]) - if next_depth > line_depth: - nested, idx = _decode_value(lines, idx, next_depth, registry, lenient) - result[key] = nested - else: - result[key] = {} - else: - result[key] = {} - - return result, idx diff --git a/src/agon/formats/text.py b/src/agon/formats/text.py deleted file mode 100644 index d82551c..0000000 --- a/src/agon/formats/text.py +++ /dev/null @@ -1,837 +0,0 @@ -r"""AGONText format codec. - -AGONText is a row-based encoding with radical simplicity. -It uses indentation for hierarchy and tabular format for arrays of objects. - -Format structure: - @AGON text - @D= # optional, default: \t - - -Example: - @AGON text - - products[3]{sku name price} - A123 Widget 9.99 - B456 Gadget 19.99 - C789 Gizmo 29.99 -""" - -from __future__ import annotations - -import re -from typing import Any - -from agon.errors import AGONTextError -from agon.formats.base import AGONFormat - -HEADER = "@AGON text" -DEFAULT_DELIMITER = "\t" -INDENT = " " # 2 spaces per level - -# Characters that require quoting -SPECIAL_CHARS = frozenset(["@", "#", "-"]) -BOOL_NULL = frozenset(["true", "false", "null"]) - -# Regex for numbers -NUMBER_RE = re.compile(r"^-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?$") - - -class AGONText(AGONFormat): - """AGONText format encoder/decoder. 
- - Encodes JSON data to a row-based text format optimized for - LLM consumption with significant token savings. - """ - - @staticmethod - def hint() -> str: - """Return a short hint instructing LLMs how to generate this format.""" - return "Return in AGON text format: Start with @AGON text header, encode arrays as name[N]{fields} with tab-delimited rows" - - @staticmethod - def encode( - data: object, - *, - delimiter: str = DEFAULT_DELIMITER, - include_header: bool = True, - ) -> str: - """Encode data to AGONText format. - - Args: - data: JSON-serializable data to encode. - delimiter: Field delimiter for tabular data (default: tab). - include_header: Whether to include @AGON text header. - - Returns: - AGONText encoded string. - """ - lines: list[str] = [] - - if include_header: - lines.append(HEADER) - if delimiter != DEFAULT_DELIMITER: - lines.append(f"@D={_escape_delimiter(delimiter)}") - lines.append("") # blank line after header - - _encode_value(data, lines, depth=0, delimiter=delimiter, name=None) - - return "\n".join(lines) - - @staticmethod - def decode(payload: str, *, lenient: bool = False) -> Any: - """Decode AGONText payload. - - Args: - payload: AGONText encoded string. - lenient: If True, allow length mismatches and best-effort parsing. - - Returns: - Decoded Python value. - - Raises: - AGONTextError: If payload is invalid. 
- """ - lines = payload.splitlines() - if not lines: - raise AGONTextError("Empty payload") - - # Parse header - idx = 0 - header_line = lines[idx].strip() - if not header_line.startswith("@AGON text"): - raise AGONTextError(f"Invalid header: {header_line}") - idx += 1 - - # Parse delimiter - delimiter = DEFAULT_DELIMITER - if idx < len(lines) and lines[idx].startswith("@D="): - delimiter = _parse_delimiter(lines[idx][3:].strip()) - idx += 1 - - # Skip blank lines after header - while idx < len(lines) and not lines[idx].strip(): - idx += 1 - - if idx >= len(lines): - return None - - result, _ = _decode_value(lines, idx, depth=0, delimiter=delimiter, lenient=lenient) - return result - - -def _escape_delimiter(d: str) -> str: - """Escape delimiter for @D= declaration.""" - # Unreachable via public API: DEFAULT_DELIMITER is "\t" and encode() only calls - # _escape_delimiter() when delimiter != DEFAULT_DELIMITER. - if d == "\t": # pragma: no cover - return "\\t" - if d == "\n": - return "\\n" - return d - - -def _parse_delimiter(d: str) -> str: - """Parse delimiter from @D= declaration.""" - if d == "\\t": - return "\t" - if d == "\\n": - return "\n" - return d - - -def _needs_quote(s: str, delimiter: str) -> bool: - """Check if string needs quoting.""" - if not s: - return True - # Leading/trailing whitespace - if s != s.strip(): - return True - # Contains delimiter - if delimiter in s: - return True - # Contains newlines or special chars - if "\n" in s or "\r" in s or "\\" in s or '"' in s: - return True - # Starts with special char - if s[0] in SPECIAL_CHARS: - return True - # Looks like number/bool/null - return bool(s.lower() in BOOL_NULL or NUMBER_RE.match(s)) - - -def _quote_string(s: str) -> str: - """Quote and escape a string value.""" - escaped = ( - s.replace("\\", "\\\\") - .replace('"', '\\"') - .replace("\n", "\\n") - .replace("\r", "\\r") - .replace("\t", "\\t") - ) - return f'"{escaped}"' - - -def _unquote_string(s: str) -> str: - """Unquote and 
unescape a string value.""" - if not (s.startswith('"') and s.endswith('"')): - return s - inner = s[1:-1] - result = [] - i = 0 - while i < len(inner): - if inner[i] == "\\" and i + 1 < len(inner): - c = inner[i + 1] - if c == "n": - result.append("\n") - elif c == "r": - result.append("\r") - elif c == "t": - result.append("\t") - elif c == "\\": - result.append("\\") - elif c == '"': - result.append('"') - else: - result.append(inner[i + 1]) - i += 2 - else: - result.append(inner[i]) - i += 1 - return "".join(result) - - -def _encode_primitive(val: Any, delimiter: str) -> str: - """Encode a primitive value to string.""" - if val is None: - return "null" - if isinstance(val, bool): - return "true" if val else "false" - if isinstance(val, int | float): - # Handle special float values - if isinstance(val, float): - if val != val: # NaN - return "null" - if val == float("inf") or val == float("-inf"): - return "null" - if val == 0.0 and str(val) == "-0.0": - return "0" - return str(val) - # String - s = str(val) - if _needs_quote(s, delimiter): - return _quote_string(s) - return s - - -def _parse_primitive(s: str) -> Any: - """Parse a primitive value from string.""" - s = s.strip() - if not s: - return None - - # Quoted string - if s.startswith('"') and s.endswith('"'): - return _unquote_string(s) - - # Boolean/null - lower = s.lower() - if lower == "null": - return None - if lower == "true": - return True - if lower == "false": - return False - - # Number - if NUMBER_RE.match(s): - if "." 
in s or "e" in s.lower(): - return float(s) - return int(s) - - # Plain string - return s - - -def _is_uniform_array(arr: list[Any]) -> tuple[bool, list[str]]: - """Check if array is uniform objects, return (is_uniform, fields).""" - if not arr: - return False, [] - - if not all(isinstance(x, dict) for x in arr): - return False, [] - - # Get all keys in order - all_keys: set[str] = set() - for obj in arr: - all_keys.update(obj.keys()) - - if not all_keys: - return False, [] - - # Check all objects have only primitive values - for obj in arr: - for v in obj.values(): - if isinstance(v, dict | list): - return False, [] - - # Return keys in consistent order (first seen order from union) - key_order: list[str] = [] - for obj in arr: - for k in obj: - if k not in key_order: - key_order.append(k) - - return True, key_order - - -def _is_primitive_array(arr: list[Any]) -> bool: - """Check if array contains only primitives.""" - return all(not isinstance(x, dict | list) for x in arr) - - -def _encode_value( - val: Any, - lines: list[str], - depth: int, - delimiter: str, - name: str | None, -) -> None: - """Encode a value, appending lines.""" - indent = INDENT * depth - - if val is None or isinstance(val, bool | int | float | str): - # Primitive value - if name: - lines.append(f"{indent}{name}: {_encode_primitive(val, delimiter)}") - else: - lines.append(f"{indent}{_encode_primitive(val, delimiter)}") - return - - if isinstance(val, list): - _encode_array(val, lines, depth, delimiter, name) - return - - if isinstance(val, dict): - _encode_object(val, lines, depth, delimiter, name) - return - - # Fallback: treat as string - if name: - lines.append(f"{indent}{name}: {_encode_primitive(str(val), delimiter)}") - else: - lines.append(f"{indent}{_encode_primitive(str(val), delimiter)}") - - -def _encode_array( - arr: list[Any], - lines: list[str], - depth: int, - delimiter: str, - name: str | None, -) -> None: - """Encode an array value.""" - indent = INDENT * depth - - if not 
arr: - # Empty array - if name: - lines.append(f"{indent}{name}[0]:") - else: - lines.append(f"{indent}[0]:") - return - - # Check for uniform objects (tabular format) - is_uniform, fields = _is_uniform_array(arr) - if is_uniform and fields: - header = delimiter.join(fields) - if name: - lines.append(f"{indent}{name}[{len(arr)}]{{{header}}}") - else: - lines.append(f"{indent}[{len(arr)}]{{{header}}}") - - for obj in arr: - row_values = [] - for f in fields: - if f in obj: - row_values.append(_encode_primitive(obj[f], delimiter)) - else: - row_values.append("") - lines.append(f"{indent}{delimiter.join(row_values)}") - return - - # Check for primitive array (inline format) - if _is_primitive_array(arr): - values = delimiter.join(_encode_primitive(v, delimiter) for v in arr) - if name: - lines.append(f"{indent}{name}[{len(arr)}]: {values}") - else: - lines.append(f"{indent}[{len(arr)}]: {values}") - return - - # Mixed/nested array (list format) - if name: - lines.append(f"{indent}{name}[{len(arr)}]:") - else: - lines.append(f"{indent}[{len(arr)}]:") - - for item in arr: - if isinstance(item, dict): - # Nested object in list - encode with - prefix - _encode_list_item_object(item, lines, depth + 1, delimiter) - else: - lines.append(f"{indent} - {_encode_primitive(item, delimiter)}") - - -def _encode_list_item_object( - obj: dict[str, Any], - lines: list[str], - depth: int, - delimiter: str, -) -> None: - """Encode an object as a list item (with - prefix for first line).""" - indent = INDENT * depth - first = True - - for k, v in obj.items(): - prefix = f"{indent}- " if first else f"{indent} " - first = False - - if isinstance(v, dict): - # Nested object - lines.append(f"{prefix}{k}:") - for nk, nv in v.items(): - if isinstance(nv, dict | list): - _encode_value(nv, lines, depth + 2, delimiter, nk) - else: - lines.append(f"{indent} {nk}: {_encode_primitive(nv, delimiter)}") - elif isinstance(v, list): - lines.append(f"{prefix}{k}:") - _encode_value(v, lines, depth + 2, 
delimiter, None) - else: - lines.append(f"{prefix}{k}: {_encode_primitive(v, delimiter)}") - - -def _encode_object( - obj: dict[str, Any], - lines: list[str], - depth: int, - delimiter: str, - name: str | None, -) -> None: - """Encode an object value.""" - indent = INDENT * depth - - if name: - lines.append(f"{indent}{name}:") - depth += 1 - indent = INDENT * depth - - for k, v in obj.items(): - if isinstance(v, dict): - _encode_value(v, lines, depth, delimiter, k) - elif isinstance(v, list): - # Always encode lists (even empty ones) with array format - _encode_value(v, lines, depth, delimiter, k) - else: - lines.append(f"{indent}{k}: {_encode_primitive(v, delimiter)}") - - -# Decode helpers - -TABULAR_HEADER_RE = re.compile(r"^(\w*)\[(\d+)\]\{(.+)\}$") -PRIMITIVE_ARRAY_RE = re.compile(r"^(\w*)\[(\d+)\]:\s*(.*)$") -LIST_ARRAY_RE = re.compile(r"^(\w*)\[(\d+)\]:$") -KEY_VALUE_RE = re.compile(r"^([^:]+):\s*(.*)$") - - -def _get_indent_depth(line: str) -> int: - """Get indentation depth (number of 2-space indents).""" - stripped = line.lstrip(" ") - spaces = len(line) - len(stripped) - return spaces // 2 - - -def _decode_array_field( - lines: list[str], - idx: int, - depth: int, - delimiter: str, - lenient: bool, -) -> tuple[Any, int]: - """Try to decode a line as an array field.""" - if idx >= len(lines): - return None, idx - - line = lines[idx] - stripped = line.strip() - - m = TABULAR_HEADER_RE.match(stripped) - if m: - return _decode_tabular_array(lines, idx, depth, delimiter, lenient, m) - - m = PRIMITIVE_ARRAY_RE.match(stripped) - if m: - values_part = m.group(3).strip() - if values_part: - return _decode_primitive_array(m, delimiter, idx) - # Fall through to list array check if no inline values - - m = LIST_ARRAY_RE.match(stripped) - if m: - return _decode_list_array(lines, idx, depth, delimiter, lenient, m) - - return None, idx - - -def _decode_value( - lines: list[str], - idx: int, - depth: int, - delimiter: str, - lenient: bool, -) -> tuple[Any, int]: - 
"""Decode a value from lines starting at idx.""" - if idx >= len(lines): - return None, idx - - line = lines[idx] - - # Check indentation matches expected depth - if _get_indent_depth(line) < depth: - return None, idx - - stripped = line.strip() - - if not stripped or stripped.startswith("#"): - return _decode_value(lines, idx + 1, depth, delimiter, lenient) - - # Check for array patterns - # If it's a named array, treat as object start - m = TABULAR_HEADER_RE.match(stripped) - if m: - if m.group(1): - return _decode_object(lines, idx, depth, delimiter, lenient) - return _decode_tabular_array(lines, idx, depth, delimiter, lenient, m) - - m = PRIMITIVE_ARRAY_RE.match(stripped) - if m: - values_part = m.group(3).strip() - if values_part: - if m.group(1): - return _decode_object(lines, idx, depth, delimiter, lenient) - return _decode_primitive_array(m, delimiter, idx) - # Fall through to list array check - - m = LIST_ARRAY_RE.match(stripped) - if m: - if m.group(1): - return _decode_object(lines, idx, depth, delimiter, lenient) - return _decode_list_array(lines, idx, depth, delimiter, lenient, m) - - # Check for key:value (object) - m = KEY_VALUE_RE.match(stripped) - if m: - return _decode_object(lines, idx, depth, delimiter, lenient) - - raise AGONTextError(f"Cannot parse line {idx}: {stripped}") - - -def _decode_tabular_array( - lines: list[str], - idx: int, - depth: int, - delimiter: str, - lenient: bool, - match: re.Match[str], -) -> tuple[Any, int]: - """Decode tabular array: name[N]{fields}.""" - name = match.group(1) - count = int(match.group(2)) - fields_str = match.group(3) - fields = [f.strip() for f in fields_str.split(delimiter)] - - idx += 1 - result: list[dict[str, Any]] = [] - - while idx < len(lines) and len(result) < count: - row_line = lines[idx].strip() - if not row_line or row_line.startswith("#"): - idx += 1 - continue - - values = _split_row(row_line, delimiter) - - obj: dict[str, Any] = {} - for i, field in enumerate(fields): - if i < 
len(values): - raw = values[i] - val = _parse_primitive(raw) - if val is not None or raw.strip(): - obj[field] = val - result.append(obj) - idx += 1 - - if len(result) < count and not lenient: - raise AGONTextError(f"Expected {count} rows, got {len(result)}") - - if name: - return {name: result}, idx - return result, idx - - -def _split_row(values_str: str, delimiter: str) -> list[str]: - """Split delimiter-separated values, respecting quotes.""" - result: list[str] = [] - current: list[str] = [] - in_quote = False - i = 0 - - while i < len(values_str): - if values_str[i : i + len(delimiter)] == delimiter and not in_quote: - result.append("".join(current)) - current = [] - i += len(delimiter) - continue - - c = values_str[i] - if c == '"' and not in_quote: - in_quote = True - current.append(c) - elif c == '"' and in_quote: - if i > 0 and values_str[i - 1] == "\\": - current.append(c) - else: - in_quote = False - current.append(c) - else: - current.append(c) - i += 1 - - result.append("".join(current)) - return result - - -def _decode_primitive_array(match: re.Match[str], delimiter: str, idx: int) -> tuple[Any, int]: - """Decode primitive array: name[N]: v1v2...""" - name = match.group(1) - values_str = match.group(3) - - if not values_str.strip(): - arr: list[Any] = [] - else: - values = _split_row(values_str, delimiter) - arr = [_parse_primitive(v) for v in values] - - if name: - return {name: arr}, idx + 1 - return arr, idx + 1 - - -def _decode_list_array( - lines: list[str], - idx: int, - depth: int, - delimiter: str, - lenient: bool, - match: re.Match[str], -) -> tuple[Any, int]: - """Decode list array: name[N]: followed by - items.""" - name = match.group(1) - count = int(match.group(2)) - idx += 1 - result: list[Any] = [] - base_depth = depth + 1 - - while idx < len(lines) and len(result) < count: - line = lines[idx] - if not line.strip() or line.strip().startswith("#"): - idx += 1 - continue - - line_depth = _get_indent_depth(line) - if line_depth < 
base_depth: - break - - stripped = line.strip() - if stripped.startswith("- "): - # List item - item_str = stripped[2:].strip() - - # Check if it's a key:value (nested object) - kv_match = KEY_VALUE_RE.match(item_str) - if kv_match: - # Nested object starting with first key - obj, idx = _decode_list_item_object(lines, idx, base_depth, delimiter, lenient) - result.append(obj) - else: - result.append(_parse_primitive(item_str)) - idx += 1 - else: - break - - # If array has a name, wrap in object - if name: - return {name: result}, idx - return result, idx - - -def _decode_list_item_object( - lines: list[str], - idx: int, - base_depth: int, - delimiter: str, - lenient: bool, -) -> tuple[dict[str, Any], int]: - """Decode an object that starts with '- key: value'.""" - obj: dict[str, Any] = {} - item_depth = base_depth # The '- ' line depth - - # Parse first line (starts with -) - first_line = lines[idx].strip() - first_content = first_line[2:].strip() # Remove '- ' - m = KEY_VALUE_RE.match(first_content) - if m: - key = m.group(1).strip() - val_str = m.group(2).strip() - if val_str: - obj[key] = _parse_primitive(val_str) - else: - # Value is on subsequent lines - could be object, array, etc. 
- idx += 1 - nested_val, idx = _decode_value(lines, idx, item_depth + 2, delimiter, lenient) - obj[key] = nested_val if nested_val is not None else {} - # Continue to parse more keys at item_depth + 1 - while idx < len(lines): - line = lines[idx] - if not line.strip(): - idx += 1 - continue - line_depth = _get_indent_depth(line) - if line_depth <= item_depth: - break - stripped = line.strip() - - # Check for array patterns first - nested, new_idx = _decode_array_field(lines, idx, line_depth, delimiter, lenient) - if nested is not None: - if isinstance(nested, dict): - obj.update(nested) - idx = new_idx - continue - - kv = KEY_VALUE_RE.match(stripped) - if kv: - k = kv.group(1).strip() - v_str = kv.group(2).strip() - if v_str: - obj[k] = _parse_primitive(v_str) - else: - idx += 1 - nested, idx = _decode_value(lines, idx, line_depth + 1, delimiter, lenient) - obj[k] = nested if nested is not None else {} - continue - idx += 1 - return obj, idx - idx += 1 - - # Parse continuation lines at deeper indent - while idx < len(lines): - line = lines[idx] - if not line.strip(): - idx += 1 - continue - line_depth = _get_indent_depth(line) - if line_depth <= item_depth: - break - - stripped = line.strip() - # Check for array patterns first - nested, new_idx = _decode_array_field(lines, idx, line_depth, delimiter, lenient) - if nested is not None: - if isinstance(nested, dict): - obj.update(nested) - idx = new_idx - continue - kv = KEY_VALUE_RE.match(stripped) - if kv: - key = kv.group(1).strip() - val_str = kv.group(2).strip() - if val_str: - obj[key] = _parse_primitive(val_str) - idx += 1 - else: - # Nested value (could be object or array) - idx += 1 - nested, idx = _decode_value(lines, idx, line_depth + 1, delimiter, lenient) - obj[key] = nested if nested is not None else {} - else: - idx += 1 - - return obj, idx - - -def _decode_object( - lines: list[str], - idx: int, - depth: int, - delimiter: str, - lenient: bool, -) -> tuple[dict[str, Any], int]: - """Decode an object 
from key:value pairs.""" - result: dict[str, Any] = {} - if idx >= len(lines): - return result, idx - - base_depth = _get_indent_depth(lines[idx]) - - while idx < len(lines): - line = lines[idx] - if not line.strip() or line.strip().startswith("#"): - idx += 1 - continue - - line_depth = _get_indent_depth(line) - if line_depth < base_depth: - break - - stripped = line.strip() - - # Check for array patterns first (they can match KEY_VALUE_RE falsely) - # e.g., "filings[1]:" would match as key="filings[1]", value="" - nested, new_idx = _decode_array_field(lines, idx, line_depth, delimiter, lenient) - if nested is not None: - # If it's a named array like {name: [...]} merge it - if isinstance(nested, dict): - result.update(nested) - else: - # Shouldn't happen for named arrays, but handle gracefully - break - idx = new_idx - continue - - m = KEY_VALUE_RE.match(stripped) - if not m: - break - - key = m.group(1).strip() - val_str = m.group(2).strip() - - if val_str: - result[key] = _parse_primitive(val_str) - idx += 1 - else: - # No inline value - check for nested structure - idx += 1 - if idx < len(lines): - next_line = lines[idx] - next_depth = _get_indent_depth(next_line) - if next_depth > line_depth: - nested, idx = _decode_value(lines, idx, next_depth, delimiter, lenient) - result[key] = nested - else: - result[key] = {} - else: - result[key] = {} - - return result, idx From 001f316de01f0fffeeba58625ef1cad107df2792 Mon Sep 17 00:00:00 2001 From: harvey Date: Thu, 25 Dec 2025 12:20:10 -0500 Subject: [PATCH 2/7] tests: covers rust and removes extranneous python tests --- .gitignore | 5 ++ codecov.yml | 32 ++++++++ noxfile.py | 6 +- tests/test_columns.py | 177 ++++++++---------------------------------- tests/test_core.py | 5 +- tests/test_struct.py | 133 ++++--------------------------- tests/test_text.py | 133 +++++++------------------------ 7 files changed, 117 insertions(+), 374 deletions(-) create mode 100644 codecov.yml diff --git a/.gitignore b/.gitignore index 
ec0b010..50db2a4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +# Rust +target/ +Cargo.lock +*.lcov + # Python __pycache__/ *.py[cod] diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 0000000..b23742c --- /dev/null +++ b/codecov.yml @@ -0,0 +1,32 @@ +# Codecov configuration for combined Rust + Python coverage +# https://docs.codecov.com/docs/codecov-yaml + +coverage: + precision: 2 + round: down + range: "70...100" + + status: + project: + default: + target: auto + threshold: 5% + patch: + default: + target: 80% + +# Flag configuration for separate language tracking +flags: + python: + paths: + - python/ + carryforward: true + rust: + paths: + - crates/ + carryforward: true + +comment: + layout: "reach,diff,flags,files" + behavior: default + require_changes: false diff --git a/noxfile.py b/noxfile.py index bcf9201..58c91e7 100644 --- a/noxfile.py +++ b/noxfile.py @@ -3,7 +3,7 @@ import nox PYTHON_VERSIONS = ["3.11", "3.12", "3.13"] -LOCATIONS = ["src", "tests"] +LOCATIONS = ["python", "tests"] @nox.session(python=PYTHON_VERSIONS[-1]) @@ -14,8 +14,8 @@ def lint(session: nox.Session) -> None: session.install("ruff", "basedpyright", "codespell") session.run("ruff", "check", *LOCATIONS) session.run("ruff", "format", "--check", *LOCATIONS) - session.run("basedpyright", "src") - session.run("codespell", "src", "tests") + session.run("basedpyright", "python") + session.run("codespell", "python", "tests") @nox.session(python=PYTHON_VERSIONS) diff --git a/tests/test_columns.py b/tests/test_columns.py index 801daea..6668294 100644 --- a/tests/test_columns.py +++ b/tests/test_columns.py @@ -10,8 +10,7 @@ import pytest -from agon import AGON, AGONColumnsError -from agon.formats.columns import AGONColumns +from agon import AGON, AGONColumns class TestAGONColumnsBasic: @@ -19,7 +18,7 @@ class TestAGONColumnsBasic: def test_encode_simple_object(self) -> None: data = {"name": "Alice", "age": 30, "active": True} - encoded = AGONColumns.encode(data) + 
encoded = AGONColumns.encode(data, include_header=True) assert "@AGON columns" in encoded assert "name: Alice" in encoded assert "age: 30" in encoded @@ -27,7 +26,7 @@ def test_encode_simple_object(self) -> None: def test_encode_decode_roundtrip_simple(self) -> None: data = {"name": "Alice", "age": 30} - encoded = AGONColumns.encode(data) + encoded = AGONColumns.encode(data, include_header=True) decoded = AGONColumns.decode(encoded) assert decoded == data @@ -39,25 +38,16 @@ def test_encode_decode_roundtrip_nested(self) -> None: "city": "Seattle", }, } - encoded = AGONColumns.encode(data) + encoded = AGONColumns.encode(data, include_header=True) decoded = AGONColumns.decode(encoded) assert decoded == data - def test_encode_falls_back_to_string_for_unknown_types(self) -> None: - class Custom: - def __str__(self) -> str: # pragma: no cover - return "CUSTOM" - - encoded = AGONColumns.encode({"x": Custom()}) - decoded = AGONColumns.decode(encoded) - assert decoded == {"x": "CUSTOM"} - class TestAGONColumnsColumnar: """Tests for columnar array encoding (uniform objects).""" def test_encode_columnar_array(self, simple_data: list[dict[str, Any]]) -> None: - encoded = AGONColumns.encode(simple_data) + encoded = AGONColumns.encode(simple_data, include_header=True) assert "[3]" in encoded assert "├" in encoded or "|" in encoded assert "└" in encoded or "`" in encoded @@ -93,7 +83,7 @@ def test_decode_columnar_array_unnamed(self) -> None: assert decoded[0] == {"sku": "A123", "name": "Widget", "price": 9.99} def test_roundtrip_columnar_array(self, simple_data: list[dict[str, Any]]) -> None: - encoded = AGONColumns.encode(simple_data) + encoded = AGONColumns.encode(simple_data, include_header=True) decoded = AGONColumns.decode(encoded) assert decoded == simple_data @@ -113,22 +103,6 @@ def test_columnar_with_missing_values(self) -> None: assert users[1] == {"id": 2, "name": "Bob"} assert users[2] == {"id": 3, "name": "Carol", "email": "carol@example.com"} - def 
test_ascii_tree_chars(self) -> None: - data = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}] - encoded = AGONColumns.encode(data, use_ascii=True) - assert "|" in encoded - assert "`" in encoded - assert "├" not in encoded - assert "└" not in encoded - - def test_decode_ascii_tree_chars(self) -> None: - payload = "@AGON columns\n\nusers[2]\n| id: 1\t2\n` name: Alice\tBob\n" - decoded = AGONColumns.decode(payload) - users = decoded["users"] - assert len(users) == 2 - assert users[0] == {"id": 1, "name": "Alice"} - assert users[1] == {"id": 2, "name": "Bob"} - def test_decode_columnar_array_field_shorter_than_count(self) -> None: payload = "@AGON columns\n\nusers[2]\n└ id: 1\n" decoded = AGONColumns.decode(payload) @@ -139,11 +113,6 @@ def test_decode_columnar_array_null_cell_means_present_none(self) -> None: decoded = AGONColumns.decode(payload) assert decoded == {"users": [{"email": None}, {}]} - def test_decode_columnar_array_escaped_quote_inside_cell(self) -> None: - payload = '@AGON columns\n\nitems[2]\n└ s: "a\\"b"\t"c"\n' - decoded = AGONColumns.decode(payload) - assert decoded == {"items": [{"s": 'a"b'}, {"s": "c"}]} - class TestAGONColumnsQuotingRoundtrip: """Roundtrip tests for quoting/unquoting strings in columns format.""" @@ -159,43 +128,17 @@ def test_roundtrip_strings_requiring_quotes(self) -> None: {"s": "a\nline"}, {"s": 'quote: "x"'}, ] - encoded = AGONColumns.encode(data) - decoded = AGONColumns.decode(encoded) - assert decoded == data - - -class TestAGONColumnsDirectives: - """Tests for @D= delimiter directive parsing.""" - - def test_decode_custom_delimiter_declaration(self) -> None: - payload = '@AGON columns\n@D=\\n\n\nitems[1]\n└ s: "123"\n' - decoded = AGONColumns.decode(payload) - assert decoded == {"items": [{"s": "123"}]} - - def test_decode_tab_delimiter_declaration(self) -> None: - payload = '@AGON columns\n@D=\\t\n\nitems[2]\n└ s: "a"\t"b"\n' - decoded = AGONColumns.decode(payload) - assert decoded == {"items": [{"s": "a"}, {"s": 
"b"}]} - - def test_encode_emits_delimiter_declaration_for_non_default(self) -> None: - data = [{"id": 1}, {"id": 2}] - encoded = AGONColumns.encode(data, delimiter=",", use_ascii=True) - assert "@D=," in encoded + encoded = AGONColumns.encode(data, include_header=True) decoded = AGONColumns.decode(encoded) assert decoded == data - def test_decode_custom_comma_delimiter_splits_quoted_values(self) -> None: - payload = '@AGON columns\n@D=,\n\nitems[2]\n└ s: "a,b","c"\n' - decoded = AGONColumns.decode(payload) - assert decoded == {"items": [{"s": "a,b"}, {"s": "c"}]} - class TestAGONColumnsPrimitiveArrays: """Tests for primitive array encoding.""" def test_encode_primitive_array(self) -> None: data = {"tags": ["admin", "ops", "dev"]} - encoded = AGONColumns.encode(data) + encoded = AGONColumns.encode(data, include_header=True) assert "[3]:" in encoded def test_decode_primitive_array(self) -> None: @@ -205,7 +148,7 @@ def test_decode_primitive_array(self) -> None: def test_roundtrip_primitive_array(self) -> None: data = {"numbers": [1, 2, 3, 4, 5]} - encoded = AGONColumns.encode(data) + encoded = AGONColumns.encode(data, include_header=True) decoded = AGONColumns.decode(encoded) assert decoded == data @@ -215,7 +158,7 @@ class TestAGONColumnsMixedArrays: def test_encode_mixed_array(self) -> None: data = {"items": [42, "hello", True, None]} - encoded = AGONColumns.encode(data) + encoded = AGONColumns.encode(data, include_header=True) assert "items[4]:" in encoded def test_decode_list_array_with_objects(self) -> None: @@ -250,24 +193,9 @@ def test_decode_list_array_with_primitives(self) -> None: decoded = AGONColumns.decode(payload) assert decoded == {"items": [1, None, "x"]} - def test_decode_list_array_skips_blank_and_comment_lines(self) -> None: - payload = textwrap.dedent( - """\ - @AGON columns - - items[2]: - # comment line - - 1 - - - 2 - """ - ) - decoded = AGONColumns.decode(payload) - assert decoded == {"items": [1, 2]} - def 
test_roundtrip_list_item_object_with_nested_object(self) -> None: data = {"items": [{"id": 1, "meta": {"tags": ["a", "b"], "flag": True}}]} - encoded = AGONColumns.encode(data) + encoded = AGONColumns.encode(data, include_header=True) decoded = AGONColumns.decode(encoded) assert decoded == data @@ -277,25 +205,25 @@ class TestAGONColumnsPrimitives: def test_encode_null(self) -> None: data = {"value": None} - encoded = AGONColumns.encode(data) + encoded = AGONColumns.encode(data, include_header=True) assert "value:" in encoded def test_encode_booleans(self) -> None: data = {"active": True, "deleted": False} - encoded = AGONColumns.encode(data) + encoded = AGONColumns.encode(data, include_header=True) assert "active: true" in encoded assert "deleted: false" in encoded def test_encode_numbers(self) -> None: data = {"integer": 42, "float": 3.14, "negative": -17} - encoded = AGONColumns.encode(data) + encoded = AGONColumns.encode(data, include_header=True) assert "integer: 42" in encoded assert "float: 3.14" in encoded assert "negative: -17" in encoded def test_encode_special_floats(self) -> None: data = {"nan": float("nan"), "inf": float("inf")} - encoded = AGONColumns.encode(data) + encoded = AGONColumns.encode(data, include_header=True) assert "nan:" in encoded assert "inf:" in encoded @@ -320,59 +248,30 @@ class TestAGONColumnsQuoting: def test_quote_string_with_delimiter(self) -> None: # Tab is the delimiter, so strings containing tabs need quoting data = {"text": "hello\tworld"} - encoded = AGONColumns.encode(data) + encoded = AGONColumns.encode(data, include_header=True) assert '"hello\\tworld"' in encoded def test_quote_string_with_leading_space(self) -> None: data = {"text": " leading space"} - encoded = AGONColumns.encode(data) + encoded = AGONColumns.encode(data, include_header=True) assert '" leading space"' in encoded def test_quote_string_with_special_char(self) -> None: data = {"tag": "@mention"} - encoded = AGONColumns.encode(data) + encoded = 
AGONColumns.encode(data, include_header=True) assert '"@mention"' in encoded def test_quote_string_looks_like_number(self) -> None: data = {"code": "42"} - encoded = AGONColumns.encode(data) + encoded = AGONColumns.encode(data, include_header=True) assert '"42"' in encoded def test_roundtrip_quoted_strings(self) -> None: data = {"text": 'Say "hello"', "path": "C:\\Users"} - encoded = AGONColumns.encode(data) + encoded = AGONColumns.encode(data, include_header=True) decoded = AGONColumns.decode(encoded) assert decoded == data - def test_decode_quoted_string_with_unknown_escape(self) -> None: - payload = '@AGON columns\n\nv: "a\\q"\n' - decoded = AGONColumns.decode(payload) - assert decoded == {"v": "aq"} - - def test_unquote_string_is_noop_for_unquoted_input(self) -> None: - from agon.formats.columns import _unquote_string - - assert _unquote_string("abc") == "abc" - - -class TestAGONColumnsDelimiters: - """Tests for custom delimiters.""" - - def test_encode_with_comma_delimiter(self) -> None: - # Tab is now the default, so test with comma to verify @D= is emitted - data = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}] - encoded = AGONColumns.encode(data, delimiter=",") - assert "@D=," in encoded - - def test_decode_with_tab_delimiter(self) -> None: - # Tab is now the default, so no @D= needed - payload = "@AGON columns\n\nusers[2]\n├ id: 1\t2\n└ name: Alice\tBob\n" - decoded = AGONColumns.decode(payload) - users = decoded["users"] - assert len(users) == 2 - assert users[0] == {"id": 1, "name": "Alice"} - assert users[1] == {"id": 2, "name": "Bob"} - class TestAGONColumnsNesting: """Tests for nested structures.""" @@ -387,12 +286,12 @@ def test_nested_object(self) -> None: }, }, } - encoded = AGONColumns.encode(data) + encoded = AGONColumns.encode(data, include_header=True) decoded = AGONColumns.decode(encoded) assert decoded == data def test_array_inside_object(self, nested_data: list[dict[str, Any]]) -> None: - encoded = AGONColumns.encode(nested_data) + 
encoded = AGONColumns.encode(nested_data, include_header=True) decoded = AGONColumns.decode(encoded) assert decoded == nested_data @@ -402,32 +301,32 @@ class TestAGONColumnsEmptyAndStrings: def test_empty_array(self) -> None: data = {"items": []} - encoded = AGONColumns.encode(data) + encoded = AGONColumns.encode(data, include_header=True) assert "items[0]" in encoded decoded = AGONColumns.decode(encoded) assert decoded == {"items": []} def test_empty_object(self) -> None: data: dict[str, Any] = {} - encoded = AGONColumns.encode(data) + encoded = AGONColumns.encode(data, include_header=True) decoded = AGONColumns.decode(encoded) assert decoded == {} or decoded is None def test_single_element_array(self) -> None: data = [{"id": 1, "name": "Only"}] - encoded = AGONColumns.encode(data) + encoded = AGONColumns.encode(data, include_header=True) decoded = AGONColumns.decode(encoded) assert decoded == data def test_long_string(self) -> None: data = {"text": "x" * 1000} - encoded = AGONColumns.encode(data) + encoded = AGONColumns.encode(data, include_header=True) decoded = AGONColumns.decode(encoded) assert decoded == data def test_unicode_string(self) -> None: data = {"text": "Hello 世界 🌍"} - encoded = AGONColumns.encode(data) + encoded = AGONColumns.encode(data, include_header=True) decoded = AGONColumns.decode(encoded) assert decoded == data @@ -437,7 +336,7 @@ def test_wide_table(self) -> None: {"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6, "g": 7, "h": 8}, {"a": 10, "b": 20, "c": 30, "d": 40, "e": 50, "f": 60, "g": 70, "h": 80}, ] - encoded = AGONColumns.encode(data) + encoded = AGONColumns.encode(data, include_header=True) decoded = AGONColumns.decode(encoded) assert decoded == data @@ -475,7 +374,7 @@ def test_agon_encode_columns_format(self, simple_data: list[dict[str, Any]]) -> assert result.header == "@AGON columns" def test_agon_decode_detects_columns_format(self, simple_data: list[dict[str, Any]]) -> None: - encoded = AGONColumns.encode(simple_data) + encoded 
= AGONColumns.encode(simple_data, include_header=True) decoded = AGON.decode(encoded) assert decoded == simple_data @@ -495,23 +394,13 @@ class TestAGONColumnsErrors: """Error handling tests.""" def test_invalid_header(self) -> None: - with pytest.raises(AGONColumnsError, match="Invalid header"): + with pytest.raises(ValueError): AGONColumns.decode("not a valid header") def test_empty_payload(self) -> None: - with pytest.raises(AGONColumnsError, match="Empty payload"): + with pytest.raises(ValueError): AGONColumns.decode("") - def test_cannot_parse_line_raises(self) -> None: - payload = "@AGON columns\n\n???\n" - with pytest.raises(AGONColumnsError, match=r"Cannot parse line"): - AGONColumns.decode(payload) - - def test_array_header_without_tree_lines_raises(self) -> None: - payload = "@AGON columns\n\n[2]\nnot-a-tree\n" - with pytest.raises(AGONColumnsError, match=r"Cannot parse line"): - AGONColumns.decode(payload) - class TestAGONColumnsHint: """Test hint method.""" @@ -532,7 +421,7 @@ def test_repeated_values_in_column(self) -> None: {"status": "active", "type": "user"}, {"status": "active", "type": "admin"}, ] - encoded = AGONColumns.encode(data) + encoded = AGONColumns.encode(data, include_header=True) # Values should be grouped by column (tab-separated) assert "status: active\tactive\tactive" in encoded decoded = AGONColumns.decode(encoded) @@ -545,7 +434,7 @@ def test_numeric_sequences(self) -> None: {"price": 19.99, "qty": 20}, {"price": 29.99, "qty": 30}, ] - encoded = AGONColumns.encode(data) + encoded = AGONColumns.encode(data, include_header=True) # Values should be tab-separated assert "price: 9.99\t19.99\t29.99" in encoded assert "qty: 10\t20\t30" in encoded diff --git a/tests/test_core.py b/tests/test_core.py index c71d02f..158ef5a 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -11,8 +11,7 @@ import orjson import pytest -from agon import AGON, AGONError -from agon.formats.text import AGONText +from agon import AGON, AGONError, AGONText 
def test_encode_json_format_returns_json() -> None: @@ -52,7 +51,7 @@ def test_encode_struct_includes_definitions_without_header() -> None: def test_decode_detects_text_payload() -> None: - payload = AGONText.encode({"x": 1}) + payload = AGONText.encode({"x": 1}, include_header=True) assert AGON.decode(payload) == {"x": 1} diff --git a/tests/test_struct.py b/tests/test_struct.py index c232873..7f5f13e 100644 --- a/tests/test_struct.py +++ b/tests/test_struct.py @@ -9,8 +9,7 @@ import pytest -from agon import AGONStructError -from agon.formats.struct import AGONStruct +from agon import AGONStruct class TestAGONStructBasic: @@ -18,7 +17,7 @@ class TestAGONStructBasic: def test_encode_simple_object(self) -> None: data = {"name": "Alice", "age": 30, "active": True} - encoded = AGONStruct.encode(data) + encoded = AGONStruct.encode(data, include_header=True) assert "@AGON struct" in encoded assert "name: Alice" in encoded assert "age: 30" in encoded @@ -26,16 +25,16 @@ def test_encode_simple_object(self) -> None: def test_encode_decode_roundtrip_simple(self) -> None: data = {"name": "Alice", "age": 30} - encoded = AGONStruct.encode(data) + encoded = AGONStruct.encode(data, include_header=True) decoded = AGONStruct.decode(encoded) assert decoded == data def test_empty_payload_raises_error(self) -> None: - with pytest.raises(AGONStructError, match="Empty payload"): + with pytest.raises(ValueError): AGONStruct.decode("") def test_invalid_header_raises_error(self) -> None: - with pytest.raises(AGONStructError, match="Invalid header"): + with pytest.raises(ValueError): AGONStruct.decode("@AGON text\nfoo: bar") @@ -49,7 +48,7 @@ def test_detects_repeated_shapes(self) -> None: "change": {"fmt": "+5.00", "raw": 5.0}, "volume": {"fmt": "1M", "raw": 1000000}, } - encoded = AGONStruct.encode(data) + encoded = AGONStruct.encode(data, include_header=True) # Should detect FR struct for fmt/raw pattern assert "@FR: fmt, raw" in encoded or "@" in encoded @@ -60,7 +59,7 @@ def 
test_struct_definition_in_output(self) -> None: {"b": {"fmt": "2", "raw": 2}}, {"c": {"fmt": "3", "raw": 3}}, ] - encoded = AGONStruct.encode(data) + encoded = AGONStruct.encode(data, include_header=True) # Should have struct definition assert "@AGON struct" in encoded @@ -76,13 +75,6 @@ def test_struct_definitions_emitted_without_header(self) -> None: assert "@AGON struct" not in encoded assert "@FR: fmt, raw" in encoded - def test_no_struct_for_single_occurrence(self) -> None: - data = {"price": {"fmt": "100", "raw": 100}} - encoded = AGONStruct.encode(data, min_occurrences=3) - # Only one occurrence, no struct should be created - # Check that nested object is expanded normally - assert "fmt:" in encoded or "raw:" in encoded - class TestAGONStructInstances: """Tests for struct instance encoding/decoding.""" @@ -123,7 +115,7 @@ def test_roundtrip_with_structs(self) -> None: "change": {"fmt": "5", "raw": 5}, "volume": {"fmt": "1M", "raw": 1000000}, } - encoded = AGONStruct.encode(data) + encoded = AGONStruct.encode(data, include_header=True) decoded = AGONStruct.decode(encoded) assert decoded == data @@ -145,19 +137,6 @@ def test_decode_inherited_struct(self) -> None: decoded = AGONStruct.decode(payload) assert decoded == {"price": {"fmt": "100.00", "raw": 100.0, "currency": "USD"}} - def test_unknown_parent_raises_error(self) -> None: - payload = textwrap.dedent( - """\ - @AGON struct - - @Child(Unknown): field - - value: Child(1) - """ - ) - with pytest.raises(AGONStructError, match="Unknown parent struct"): - AGONStruct.decode(payload) - class TestAGONStructOptionalFields: """Tests for optional fields.""" @@ -189,60 +168,17 @@ def test_decode_optional_field_omitted(self) -> None: # Optional field omitted should not appear in result assert decoded == {"stock": {"symbol": "AAPL", "price": 150.0}} - def test_decode_optional_field_explicit_null(self) -> None: - payload = textwrap.dedent( - """\ - @AGON struct - - @Quote: symbol, price, volume? 
- - stock: Quote(AAPL, 150.0, ) - """ - ) - decoded = AGONStruct.decode(payload) - # Explicit empty means null for optional field (omitted) - assert decoded == {"stock": {"symbol": "AAPL", "price": 150.0}} - class TestAGONStructArrays: """Tests for arrays with struct instances.""" - def test_decode_inline_struct_array(self) -> None: - payload = textwrap.dedent( - """\ - @AGON struct - - @FR: fmt, raw - - [3]: FR("1", 1.0), FR("2", 2.0), FR("3", 3.0) - """ - ) - decoded = AGONStruct.decode(payload) - assert len(decoded) == 3 - assert decoded[0] == {"fmt": "1", "raw": 1.0} - assert decoded[1] == {"fmt": "2", "raw": 2.0} - assert decoded[2] == {"fmt": "3", "raw": 3.0} - - def test_decode_empty_array(self) -> None: - payload = textwrap.dedent( - """\ - @AGON struct - - @FR: fmt, raw - - prices[0]: - """ - ) - decoded = AGONStruct.decode(payload) - assert decoded == {"prices": []} - def test_roundtrip_array_of_structs(self) -> None: data = [ {"fmt": "1", "raw": 1}, {"fmt": "2", "raw": 2}, {"fmt": "3", "raw": 3}, ] - encoded = AGONStruct.encode(data) + encoded = AGONStruct.encode(data, include_header=True) decoded = AGONStruct.decode(encoded) assert decoded == data @@ -250,54 +186,11 @@ def test_roundtrip_array_of_strings_with_colon(self) -> None: # quoted strings containing ':' must not be parsed as # inline key-value objects when they appear as list items. 
data = ["keyword match: for, object, return", "language match"] - encoded = AGONStruct.encode(data) + encoded = AGONStruct.encode(data, include_header=True) decoded = AGONStruct.decode(encoded) assert decoded == data -class TestAGONStructEscaping: - """Tests for value escaping.""" - - def test_escape_comma_in_value(self) -> None: - payload = textwrap.dedent( - """\ - @AGON struct - - @Pair: a, b - - item: Pair(hello\\, world, test) - """ - ) - decoded = AGONStruct.decode(payload) - assert decoded == {"item": {"a": "hello, world", "b": "test"}} - - def test_escape_parentheses_in_value(self) -> None: - payload = textwrap.dedent( - """\ - @AGON struct - - @Pair: a, b - - item: Pair(func\\(x\\), result) - """ - ) - decoded = AGONStruct.decode(payload) - assert decoded == {"item": {"a": "func(x)", "b": "result"}} - - def test_escape_backslash_in_value(self) -> None: - payload = textwrap.dedent( - """\ - @AGON struct - - @Pair: a, b - - item: Pair(path\\\\file, test) - """ - ) - decoded = AGONStruct.decode(payload) - assert decoded == {"item": {"a": "path\\file", "b": "test"}} - - class TestAGONStructNestedObjects: """Tests for nested objects with structs.""" @@ -406,7 +299,7 @@ def test_roundtrip_financial_data(self) -> None: "fiftyTwoWeekHigh": {"fmt": "180.00", "raw": 180.0}, "fiftyTwoWeekLow": {"fmt": "120.00", "raw": 120.0}, } - encoded = AGONStruct.encode(data) + encoded = AGONStruct.encode(data, include_header=True) decoded = AGONStruct.decode(encoded) assert decoded == data @@ -428,7 +321,7 @@ def test_roundtrip_array_of_records(self) -> None: "change": {"fmt": "+10", "raw": 10}, }, ] - encoded = AGONStruct.encode(data) + encoded = AGONStruct.encode(data, include_header=True) decoded = AGONStruct.decode(encoded) assert decoded == data @@ -442,6 +335,6 @@ def test_roundtrip_mixed_content(self) -> None: "c": {"fmt": "3", "raw": 3}, }, } - encoded = AGONStruct.encode(data) + encoded = AGONStruct.encode(data, include_header=True) decoded = AGONStruct.decode(encoded) 
assert decoded == data diff --git a/tests/test_text.py b/tests/test_text.py index 3eea47a..7db59a0 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -10,8 +10,7 @@ import pytest -from agon import AGON, AGONTextError -from agon.formats.text import AGONText +from agon import AGON, AGONText class TestAGONTextBasic: @@ -19,7 +18,7 @@ class TestAGONTextBasic: def test_encode_simple_object(self) -> None: data = {"name": "Alice", "age": 30, "active": True} - encoded = AGONText.encode(data) + encoded = AGONText.encode(data, include_header=True) assert "@AGON text" in encoded assert "name: Alice" in encoded assert "age: 30" in encoded @@ -27,7 +26,7 @@ def test_encode_simple_object(self) -> None: def test_encode_decode_roundtrip_simple(self) -> None: data = {"name": "Alice", "age": 30} - encoded = AGONText.encode(data) + encoded = AGONText.encode(data, include_header=True) decoded = AGONText.decode(encoded) assert decoded == data @@ -39,13 +38,13 @@ def test_encode_decode_roundtrip_nested(self) -> None: "city": "Seattle", }, } - encoded = AGONText.encode(data) + encoded = AGONText.encode(data, include_header=True) decoded = AGONText.decode(encoded) assert decoded == data def test_empty_object_roundtrip(self) -> None: data: dict[str, Any] = {} - encoded = AGONText.encode(data) + encoded = AGONText.encode(data, include_header=True) decoded = AGONText.decode(encoded) assert decoded == {} or decoded is None @@ -54,7 +53,7 @@ class TestAGONTextTabular: """Tests for tabular array encoding (uniform objects).""" def test_encode_tabular_array(self, simple_data: list[dict[str, Any]]) -> None: - encoded = AGONText.encode(simple_data) + encoded = AGONText.encode(simple_data, include_header=True) assert "[3]{" in encoded # Array header with 3 elements assert "id\tname\trole" in encoded or "id" in encoded @@ -95,7 +94,7 @@ def test_decode_tabular_array_unnamed(self) -> None: assert decoded[0] == {"sku": "A123", "name": "Widget", "price": 9.99} def 
test_roundtrip_tabular_array(self, simple_data: list[dict[str, Any]]) -> None: - encoded = AGONText.encode(simple_data) + encoded = AGONText.encode(simple_data, include_header=True) decoded = AGONText.decode(encoded) assert decoded == simple_data @@ -117,29 +116,13 @@ def test_tabular_with_missing_values(self) -> None: assert users[1] == {"id": 2, "name": "Bob"} # Missing email assert users[2] == {"id": 3, "email": "carol@example.com"} # Missing name - def test_tabular_lenient_truncation_and_strict_error(self) -> None: - payload = textwrap.dedent( - """\ - @AGON text - - products[3]{sku\tname} - A123\tWidget - """ - ) - - decoded_lenient = AGONText.decode(payload, lenient=True) - assert decoded_lenient == {"products": [{"sku": "A123", "name": "Widget"}]} - - with pytest.raises(AGONTextError): - AGONText.decode(payload, lenient=False) - class TestAGONTextPrimitiveArrays: """Tests for primitive array encoding.""" def test_encode_primitive_array(self) -> None: data = {"tags": ["admin", "ops", "dev"]} - encoded = AGONText.encode(data) + encoded = AGONText.encode(data, include_header=True) assert "[3]:" in encoded def test_decode_primitive_array(self) -> None: @@ -155,7 +138,7 @@ def test_decode_primitive_array(self) -> None: def test_roundtrip_primitive_array(self) -> None: data = {"numbers": [1, 2, 3, 4, 5]} - encoded = AGONText.encode(data) + encoded = AGONText.encode(data, include_header=True) decoded = AGONText.decode(encoded) assert decoded == data @@ -171,7 +154,7 @@ def test_decode_primitive_array_with_escaped_quote(self) -> None: def test_empty_array_roundtrip(self) -> None: data = {"items": []} - encoded = AGONText.encode(data) + encoded = AGONText.encode(data, include_header=True) assert "items[0]:" in encoded decoded = AGONText.decode(encoded) assert decoded == {"items": []} @@ -182,7 +165,7 @@ class TestAGONTextMixedArrays: def test_encode_mixed_array(self) -> None: data = {"items": [42, "hello", True, None]} - encoded = AGONText.encode(data) + encoded = 
AGONText.encode(data, include_header=True) assert "items[4]:" in encoded def test_decode_list_array_with_objects(self) -> None: @@ -238,21 +221,9 @@ def test_parses_tab_delimiter_header(self) -> None: ) assert AGONText.decode(payload) == {"s": "x"} - def test_encode_writes_newline_delimiter_header_roundtrip(self) -> None: - data = {"s": "x"} - encoded = AGONText.encode(data, delimiter="\n") - assert "@D=\\n" in encoded - assert AGONText.decode(encoded) == data - - def test_encode_writes_pipe_delimiter_header_roundtrip(self) -> None: - data = {"s": "x"} - encoded = AGONText.encode(data, delimiter="|") - assert "@D=|" in encoded - assert AGONText.decode(encoded) == data - def test_quotes_strings_that_look_like_primitives(self) -> None: data = {"b": "true", "n": "123", "z": "null"} - encoded = AGONText.encode(data) + encoded = AGONText.encode(data, include_header=True) assert 'b: "true"' in encoded assert 'n: "123"' in encoded assert 'z: "null"' in encoded @@ -260,7 +231,7 @@ def test_quotes_strings_that_look_like_primitives(self) -> None: def test_special_floats_decode_as_none(self) -> None: data = {"nan": float("nan"), "inf": float("inf"), "ninf": float("-inf")} - decoded = AGONText.decode(AGONText.encode(data)) + decoded = AGONText.decode(AGONText.encode(data, include_header=True)) assert decoded["nan"] is None assert decoded["inf"] is None assert decoded["ninf"] is None @@ -271,18 +242,18 @@ class TestAGONTextPrimitives: def test_encode_null(self) -> None: data = {"value": None} - encoded = AGONText.encode(data) + encoded = AGONText.encode(data, include_header=True) assert "value: null" in encoded def test_encode_booleans(self) -> None: data = {"active": True, "deleted": False} - encoded = AGONText.encode(data) + encoded = AGONText.encode(data, include_header=True) assert "active: true" in encoded assert "deleted: false" in encoded def test_encode_numbers(self) -> None: data = {"integer": 42, "float": 3.14, "negative": -17} - encoded = AGONText.encode(data) + 
encoded = AGONText.encode(data, include_header=True) assert "integer: 42" in encoded assert "float: 3.14" in encoded assert "negative: -17" in encoded @@ -290,7 +261,7 @@ def test_encode_numbers(self) -> None: def test_encode_special_floats(self) -> None: # NaN and Infinity should become null data = {"nan": float("nan"), "inf": float("inf")} - encoded = AGONText.encode(data) + encoded = AGONText.encode(data, include_header=True) assert "nan: null" in encoded assert "inf: null" in encoded @@ -314,33 +285,33 @@ class TestAGONTextQuoting: def test_quote_string_with_delimiter(self) -> None: data = {"text": "hello\tworld"} - encoded = AGONText.encode(data) + encoded = AGONText.encode(data, include_header=True) assert '"hello\\tworld"' in encoded or '"' in encoded def test_quote_string_with_leading_space(self) -> None: data = {"text": " leading space"} - encoded = AGONText.encode(data) + encoded = AGONText.encode(data, include_header=True) assert '" leading space"' in encoded def test_quote_string_with_special_char(self) -> None: data = {"tag": "@mention"} - encoded = AGONText.encode(data) + encoded = AGONText.encode(data, include_header=True) assert '"@mention"' in encoded def test_quote_string_looks_like_number(self) -> None: data = {"code": "42"} - encoded = AGONText.encode(data) + encoded = AGONText.encode(data, include_header=True) assert '"42"' in encoded def test_roundtrip_quoted_strings(self) -> None: data = {"text": 'Say "hello"', "path": "C:\\Users"} - encoded = AGONText.encode(data) + encoded = AGONText.encode(data, include_header=True) decoded = AGONText.decode(encoded) assert decoded == data def test_long_and_unicode_string_roundtrip(self) -> None: data = {"text": "Hello 世界 🌍" + ("x" * 1000)} - encoded = AGONText.encode(data) + encoded = AGONText.encode(data, include_header=True) decoded = AGONText.decode(encoded) assert decoded == data @@ -351,53 +322,11 @@ def test_roundtrip_string_escaping_newlines_and_whitespace(self) -> None: "newline": "x\ny", 
"special": "@tag", } - encoded = AGONText.encode([data]) + encoded = AGONText.encode([data], include_header=True) decoded = AGONText.decode(encoded) assert decoded == [data] -class TestAGONTextDelimiters: - """Tests for custom delimiters.""" - - def test_encode_with_comma_delimiter(self) -> None: - data = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}] - encoded = AGONText.encode(data, delimiter=",") - assert "@D=," in encoded - - def test_decode_with_comma_delimiter(self) -> None: - payload = textwrap.dedent( - """\ - @AGON text - @D=, - - users[2]{id,name} - 1,Alice - 2,Bob - """ - ) - decoded = AGONText.decode(payload) - assert decoded == {"users": [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]} - - def test_encode_with_pipe_delimiter(self) -> None: - data = [{"id": 1, "name": "Alice"}] - encoded = AGONText.encode(data, delimiter="|") - assert "@D=|" in encoded - - def test_tabular_quotes_and_custom_delimiter_roundtrip(self) -> None: - # Custom delimiter + quoted value containing delimiter - records = [{"a": "x|y", "b": "z"}, {"a": "p", "b": "q"}] - encoded = AGONText.encode(records, delimiter="|") - decoded = AGONText.decode(encoded) - assert decoded == records - - def test_newline_delimiter_header_roundtrip_for_primitives(self) -> None: - # Newline as a field delimiter isn't practical for tabular rows, but the - # header escaping/parsing should still work for primitive-only payloads. 
- encoded = AGONText.encode({"a": 1}, delimiter="\n") - assert "@D=\\n" in encoded - assert AGONText.decode(encoded) == {"a": 1} - - class TestAGONTextNesting: """Tests for nested structures.""" @@ -411,12 +340,12 @@ def test_nested_object(self) -> None: }, }, } - encoded = AGONText.encode(data) + encoded = AGONText.encode(data, include_header=True) decoded = AGONText.decode(encoded) assert decoded == data def test_array_inside_object(self, nested_data: list[dict[str, Any]]) -> None: - encoded = AGONText.encode(nested_data) + encoded = AGONText.encode(nested_data, include_header=True) decoded = AGONText.decode(encoded) assert decoded == nested_data @@ -454,7 +383,7 @@ def test_agon_encode_text_format(self, simple_data: list[dict[str, Any]]) -> Non assert result.header == "@AGON text" def test_agon_decode_detects_text_format(self, simple_data: list[dict[str, Any]]) -> None: - encoded = AGONText.encode(simple_data) + encoded = AGONText.encode(simple_data, include_header=True) decoded = AGON.decode(encoded) assert decoded == simple_data @@ -474,17 +403,13 @@ class TestAGONTextErrors: """Error handling tests.""" def test_invalid_header(self) -> None: - with pytest.raises(AGONTextError, match="Invalid header"): + with pytest.raises(ValueError): AGONText.decode("not a valid header") def test_empty_payload(self) -> None: - with pytest.raises(AGONTextError, match="Empty payload"): + with pytest.raises(ValueError): AGONText.decode("") - def test_invalid_payload_raises(self) -> None: - with pytest.raises(AGONTextError): - AGONText.decode("@AGON text\n\n???") - class TestAGONTextHint: """Test hint method.""" From e4ab127109666c647ae80743ffb1e122ecc52dcd Mon Sep 17 00:00:00 2001 From: harvey Date: Thu, 25 Dec 2025 12:20:51 -0500 Subject: [PATCH 3/7] feat: adds rust dev workflows --- .github/workflows/ci.yml | 30 ++++++- .github/workflows/publish.yml | 125 ++++++++++++++++++++++++--- .pre-commit-config.yaml | 43 +++++++-- Makefile | 71 +++++++-------- devtools/{lint.py => 
lint_python.py} | 6 +- devtools/lint_rust.py | 73 ++++++++++++++++ 6 files changed, 284 insertions(+), 64 deletions(-) rename devtools/{lint.py => lint_python.py} (89%) create mode 100644 devtools/lint_rust.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2f7c8b7..37fd457 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -42,6 +42,9 @@ jobs: with: fetch-depth: 0 + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + - name: Set up Python ${{ matrix.python }} uses: actions/setup-python@v6 with: @@ -78,6 +81,11 @@ jobs: with: fetch-depth: 0 + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + with: + components: llvm-tools-preview + - name: Set up Python 3.13 uses: actions/setup-python@v6 with: @@ -100,12 +108,30 @@ jobs: - name: Combine coverage data and display human readable report run: nox --session=coverage - - name: Create coverage report + - name: Create Python coverage report run: nox --session=coverage -- xml - - name: Upload coverage report + # Rust coverage + - name: Install cargo-llvm-cov + uses: taiki-e/install-action@cargo-llvm-cov + + - name: Generate Rust coverage + run: cargo llvm-cov --manifest-path crates/agon-core/Cargo.toml --lcov --output-path rust-coverage.lcov + + - name: Upload Python coverage to Codecov uses: codecov/codecov-action@v5 with: + files: coverage.xml + flags: python name: agon-python token: ${{ secrets.CODECOV_TOKEN }} verbose: true + + - name: Upload Rust coverage to Codecov + uses: codecov/codecov-action@v5 + with: + files: rust-coverage.lcov + flags: rust + name: agon-rust + token: ${{ secrets.CODECOV_TOKEN }} + verbose: true diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index a788a2a..e042913 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -10,23 +10,126 @@ permissions: contents: read jobs: - build-and-publish: - runs-on: ubuntu-latest + # Build wheels for Linux + linux: + 
runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: ubuntu-latest + target: x86_64 + - runner: ubuntu-latest + target: aarch64 + steps: + - uses: actions/checkout@v6 + + - uses: actions/setup-python@v6 + with: + python-version: "3.13" + + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist + sccache: "true" + manylinux: auto + + - name: Upload wheels + uses: actions/upload-artifact@v6 + with: + name: wheels-linux-${{ matrix.platform.target }} + path: dist + + # Build wheels for macOS + macos: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: macos-13 + target: x86_64 + - runner: macos-14 + target: aarch64 steps: - - name: Checkout - uses: actions/checkout@v6 + - uses: actions/checkout@v6 + + - uses: actions/setup-python@v6 + with: + python-version: "3.13" + + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist + sccache: "true" + + - name: Upload wheels + uses: actions/upload-artifact@v6 with: - fetch-depth: 0 + name: wheels-macos-${{ matrix.platform.target }} + path: dist - - name: Install uv - uses: astral-sh/setup-uv@v7 + # Build wheels for Windows + windows: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: windows-latest + target: x64 + steps: + - uses: actions/checkout@v6 + + - uses: actions/setup-python@v6 with: - version: "0.8.3" - enable-cache: true python-version: "3.13" + architecture: ${{ matrix.platform.target }} + + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist + sccache: "true" + + - name: Upload wheels + uses: actions/upload-artifact@v6 + with: + name: wheels-windows-${{ matrix.platform.target }} + path: dist + + # Build source distribution + sdist: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 
+ + - name: Build sdist + uses: PyO3/maturin-action@v1 + with: + command: sdist + args: --out dist + + - name: Upload sdist + uses: actions/upload-artifact@v6 + with: + name: wheels-sdist + path: dist - - name: Build package - run: uv build + # Publish to PyPI + publish: + name: Publish to PyPI + runs-on: ubuntu-latest + needs: [linux, macos, windows, sdist] + steps: + - name: Download all artifacts + uses: actions/download-artifact@v7 + with: + path: dist + pattern: wheels-* + merge-multiple: true - name: Publish to PyPI uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 25d71f4..bf08ade 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,28 +16,55 @@ repos: - id: detect-private-key - id: debug-statements + # Python: codespell + - repo: https://github.com/codespell-project/codespell + rev: v2.4.1 + hooks: + - id: codespell + args: ["--write-changes", "--skip", "*.lock,*.json"] + files: ^(python|tests|devtools|docs)/|^README\.md$ + + # Python: ruff check + format - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.9 + rev: v0.14.10 hooks: - - id: ruff - args: [--fix, --exit-non-zero-on-fix] + - id: ruff-check + args: [--fix] + files: ^(python|tests|devtools)/ - id: ruff-format + files: ^(python|tests|devtools)/ + # Python: basedpyright - repo: local hooks: - id: basedpyright name: basedpyright (uv) entry: uv run basedpyright language: system - args: [src] + args: [python, tests, devtools] pass_filenames: false - - repo: https://github.com/codespell-project/codespell - rev: v2.4.1 + # Rust: cargo fmt + - repo: local hooks: - - id: codespell - args: ["--skip", "*.lock,*.json"] + - id: cargo-fmt + name: cargo fmt + entry: cargo fmt --manifest-path crates/agon-core/Cargo.toml -- + language: system + types: [rust] + pass_filenames: true + + # Rust: cargo clippy + - repo: local + hooks: + - id: cargo-clippy + name: cargo clippy + entry: cargo clippy --manifest-path 
crates/agon-core/Cargo.toml --all-targets --fix --allow-dirty --allow-staged -- -D warnings + language: system + types: [rust] + pass_filenames: false + # CI: GitHub workflows and actions - repo: https://github.com/python-jsonschema/check-jsonschema rev: 0.30.0 hooks: diff --git a/Makefile b/Makefile index 33084b9..7129fb5 100644 --- a/Makefile +++ b/Makefile @@ -5,40 +5,40 @@ .PHONY: default install fix test nox upgrade build docs clean pre-commit help -default: install fix test +default: install build fix test install: uv sync --dev +build: + uv run maturin develop --manifest-path crates/agon-core/Cargo.toml + fix: - uv run python devtools/lint.py + uv run python devtools/lint_rust.py & uv run python devtools/lint_python.py & wait -test: install - uv run pytest tests -s +test: build + @echo "🦀 Running Rust tests with coverage..." + cargo llvm-cov --manifest-path crates/agon-core/Cargo.toml --fail-under-lines 70 + cargo llvm-cov report --manifest-path crates/agon-core/Cargo.toml --lcov --output-path rust-coverage.lcov + @echo "" + @echo "🐍 Running Python tests with coverage..." + uv run pytest tests -v + @echo "" @echo "✅ All tests passed" -nox: +nox: build uv run nox upgrade: uv sync --upgrade --dev -build: - uv build - docs: install uv run mkdocs serve --livereload clean: - -rm -rf dist/ - -rm -rf *.egg-info/ - -rm -rf .pytest_cache/ - -rm -rf .mypy_cache/ - -rm -rf .nox/ - -rm -rf .venv/ - -rm -rf htmlcov/ - -rm -rf .coverage* - -rm -rf coverage.xml + -rm -rf dist/ target/ *.egg-info/ + -rm -rf .pytest_cache/ .mypy_cache/ .nox/ htmlcov/ + -rm -rf .coverage* coverage.xml rust-coverage.lcov -find . 
-type d -name "__pycache__" -exec rm -rf {} + pre-commit: @@ -48,29 +48,20 @@ pre-commit: help: @echo "AGON Development Makefile" @echo "" - @echo "🚀 Quick Start:" - @echo " make - Install deps, lint, run tests" - @echo "" - @echo "📦 Installation:" - @echo " make install - Install all dependencies" - @echo " make upgrade - Upgrade all dependencies" - @echo "" - @echo "🔍 Code Quality:" - @echo " make fix - Auto-fix linting and formatting issues" - @echo " make pre-commit - Install and run pre-commit hooks" - @echo "" - @echo "🧪 Testing:" - @echo " make test - Run all tests (single Python version)" - @echo " make test-unit - Run unit tests (single Python version)" - @echo " make nox - Run all nox sessions (all Python versions)" - @echo " make nox-unit - Run unit tests (all Python versions)" - @echo " make nox-lint - Run lint session via nox" + @echo "Quick Start:" + @echo " make - Install, build Rust, lint, test" + @echo " make test - Build Rust and run tests" @echo "" - @echo "🧹 Cleanup:" - @echo " make clean - Clean build/cache files" + @echo "Development:" + @echo " make install - Install Python dependencies" + @echo " make build - Build and install Rust extension" + @echo " make fix - Format and lint (Python + Rust)" @echo "" - @echo "🔧 Build:" - @echo " make build - Build distribution packages" + @echo "Testing:" + @echo " make test - Run Rust + Python tests with coverage" + @echo " make nox - Run nox sessions (builds Rust first)" @echo "" - @echo "📚 Docs:" - @echo " make docs - Serve docs locally (http://127.0.0.1:8000/)" + @echo "Other:" + @echo " make docs - Serve docs locally" + @echo " make clean - Clean build artifacts" + @echo " make upgrade - Upgrade dependencies" diff --git a/devtools/lint.py b/devtools/lint_python.py similarity index 89% rename from devtools/lint.py rename to devtools/lint_python.py index a27d248..99f2023 100644 --- a/devtools/lint.py +++ b/devtools/lint_python.py @@ -5,8 +5,8 @@ from rich import print as rprint # Update as needed. 
-SRC_PATHS = ["src", "tests", "devtools"] -DOC_PATHS = ["README.md"] +SRC_PATHS = ["python", "tests", "devtools"] +DOC_PATHS = ["README.md", "docs"] reconfigure(emoji=not get_console().options.legacy_windows) # No emojis on legacy windows. @@ -31,7 +31,7 @@ def main(): if errcount != 0: rprint(f"[bold red]:x: Lint failed with {errcount} errors.[/bold red]") else: - rprint("[bold green]:white_check_mark: Lint passed![/bold green]") + rprint("[bold green]:white_check_mark: Python lint passed![/bold green]") rprint() return errcount diff --git a/devtools/lint_rust.py b/devtools/lint_rust.py new file mode 100644 index 0000000..c38f4e2 --- /dev/null +++ b/devtools/lint_rust.py @@ -0,0 +1,73 @@ +import subprocess + +from funlog import log_calls +from rich import get_console, reconfigure +from rich import print as rprint + +# Rust crate paths. +MANIFEST_PATH = "crates/agon-core/Cargo.toml" + +reconfigure(emoji=not get_console().options.legacy_windows) # No emojis on legacy windows. + + +def main(): + """Run Rust linting checks and report errors. + + Returns: + int: The number of errors encountered during linting + """ + rprint() + + errcount = 0 + errcount += run(["cargo", "fmt", "--manifest-path", MANIFEST_PATH]) + errcount += run( + [ + "cargo", + "clippy", + "--manifest-path", + MANIFEST_PATH, + "--all-targets", + "--fix", + "--allow-dirty", + "--allow-staged", + "--", + "-D", + "warnings", + ] + ) + + rprint() + + if errcount != 0: + rprint(f"[bold red]:x: Rust lint failed with {errcount} errors.[/bold red]") + else: + rprint("[bold green]:white_check_mark: Rust lint passed![/bold green]") + rprint() + + return errcount + + +@log_calls(level="warning", show_timing_only=True) +def run(cmd: list[str]) -> int: + """Execute a command and handle its output. 
+ + Args: + cmd: The command to run as a list of strings + + Returns: + int: 0 if the command succeeded, 1 if it failed + """ + rprint() + rprint(f"[bold green]>> {' '.join(cmd)}[/bold green]") + errcount = 0 + try: + subprocess.run(cmd, text=True, check=True) + except subprocess.CalledProcessError as e: + rprint(f"[bold red]Error: {e}[/bold red]") + errcount = 1 + + return errcount + + +if __name__ == "__main__": + exit(main()) From 1d0d06f0a3607e99d88029cd66377b71dfc978cd Mon Sep 17 00:00:00 2001 From: harvey Date: Thu, 25 Dec 2025 12:40:13 -0500 Subject: [PATCH 4/7] refactor: renames AGONText to AGONRows --- crates/agon-core/src/formats/mod.rs | 22 +-- .../src/formats/{text.rs => rows.rs} | 26 +-- crates/agon-core/src/lib.rs | 22 +-- python/agon/__init__.py | 4 +- python/agon/core.py | 44 ++--- tests/test_benchmarks.py | 2 +- tests/test_columns.py | 2 +- tests/test_core.py | 50 ++--- tests/{test_text.py => test_rows.py} | 184 +++++++++--------- 9 files changed, 171 insertions(+), 185 deletions(-) rename crates/agon-core/src/formats/{text.rs => rows.rs} (98%) rename tests/{test_text.py => test_rows.py} (69%) diff --git a/crates/agon-core/src/formats/mod.rs b/crates/agon-core/src/formats/mod.rs index 9fe73fd..1a443e0 100644 --- a/crates/agon-core/src/formats/mod.rs +++ b/crates/agon-core/src/formats/mod.rs @@ -1,13 +1,13 @@ //! AGON encoding formats //! //! This module contains implementations of the three AGON formats: -//! - text: Row-based tabular encoding (similar to TOON) +//! - rows: Row-based tabular encoding (format name: "text") //! - columns: Columnar encoding with type clustering //! 
- struct_fmt: Template-based encoding for nested patterns pub mod columns; +pub mod rows; pub mod struct_fmt; -pub mod text; use rayon::prelude::*; use serde_json::Value as JsonValue; @@ -27,7 +27,7 @@ pub struct EncodingResult { /// Headers for each format pub fn get_header(format: &str) -> &'static str { match format { - "text" => "@AGON text", + "rows" => "@AGON rows", "columns" => "@AGON columns", "struct" => "@AGON struct", "json" => "", @@ -86,7 +86,7 @@ pub fn encode_auto_parallel( /// Encode data with all formats in parallel pub fn encode_all_parallel(data: &JsonValue) -> Result> { - let formats = ["json", "text", "columns", "struct"]; + let formats = ["json", "rows", "columns", "struct"]; // Use rayon to encode all formats in parallel let results: Vec> = formats @@ -121,7 +121,7 @@ pub fn encode_all_parallel(data: &JsonValue) -> Result> { fn encode_with_format(data: &JsonValue, format: &str) -> Result { let (text, header) = match format { "json" => (serde_json::to_string(data)?, String::new()), - "text" => (text::encode(data, false)?, get_header("text").to_string()), + "rows" => (rows::encode(data, false)?, get_header("rows").to_string()), "columns" => ( columns::encode(data, false)?, get_header("columns").to_string(), @@ -150,7 +150,7 @@ mod tests { #[test] fn test_get_header() { - assert_eq!(get_header("text"), "@AGON text"); + assert_eq!(get_header("rows"), "@AGON rows"); assert_eq!(get_header("columns"), "@AGON columns"); assert_eq!(get_header("struct"), "@AGON struct"); assert_eq!(get_header("json"), ""); @@ -184,7 +184,7 @@ mod tests { let formats: Vec<&str> = results.iter().map(|r| r.format.as_str()).collect(); assert!(formats.contains(&"json")); - assert!(formats.contains(&"text")); + assert!(formats.contains(&"rows")); assert!(formats.contains(&"columns")); assert!(formats.contains(&"struct")); } @@ -237,12 +237,12 @@ mod tests { } #[test] - fn test_encode_with_format_text() { + fn test_encode_with_format_rows() { let data = json!({"name": 
"test"}); - let result = encode_with_format(&data, "text").unwrap(); + let result = encode_with_format(&data, "rows").unwrap(); - assert_eq!(result.format, "text"); - assert_eq!(result.header, "@AGON text"); + assert_eq!(result.format, "rows"); + assert_eq!(result.header, "@AGON rows"); } #[test] diff --git a/crates/agon-core/src/formats/text.rs b/crates/agon-core/src/formats/rows.rs similarity index 98% rename from crates/agon-core/src/formats/text.rs rename to crates/agon-core/src/formats/rows.rs index e5c9e88..dd32223 100644 --- a/crates/agon-core/src/formats/text.rs +++ b/crates/agon-core/src/formats/rows.rs @@ -1,9 +1,9 @@ -//! AGONText format encoder/decoder +//! AGONRows format encoder/decoder //! //! Row-based encoding with tabular format for arrays of uniform objects. //! //! Format structure: -//! @AGON text +//! @AGON rows //! @D= # optional, default: \t //! @@ -13,7 +13,7 @@ use std::sync::LazyLock; use crate::error::{AgonError, Result}; -const HEADER: &str = "@AGON text"; +const HEADER: &str = "@AGON rows"; const DEFAULT_DELIMITER: &str = "\t"; const INDENT: &str = " "; @@ -27,7 +27,7 @@ static KEY_VALUE_RE: LazyLock = LazyLock::new(|| Regex::new(r"^([^:]+):\s static NUMBER_RE: LazyLock = LazyLock::new(|| Regex::new(r"^-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?$").unwrap()); -/// Encode data to AGONText format +/// Encode data to AGONRows format pub fn encode(data: &Value, include_header: bool) -> Result { let mut lines = Vec::new(); let delimiter = DEFAULT_DELIMITER; @@ -42,7 +42,7 @@ pub fn encode(data: &Value, include_header: bool) -> Result { Ok(lines.join("\n")) } -/// Decode AGONText payload +/// Decode AGONRows payload pub fn decode(payload: &str) -> Result { let lines: Vec<&str> = payload.lines().collect(); if lines.is_empty() { @@ -53,7 +53,7 @@ pub fn decode(payload: &str) -> Result { // Parse header let header_line = lines[idx].trim(); - if !header_line.starts_with("@AGON text") { + if !header_line.starts_with("@AGON rows") { return 
Err(AgonError::DecodingError(format!( "Invalid header: {}", header_line @@ -926,7 +926,7 @@ mod tests { fn test_encode_with_header() { let data = json!({"name": "test"}); let encoded = encode(&data, true).unwrap(); - assert!(encoded.starts_with("@AGON text")); + assert!(encoded.starts_with("@AGON rows")); } #[test] @@ -1002,13 +1002,13 @@ mod tests { #[test] fn test_decode_header_only() { - let result = decode("@AGON text\n\n").unwrap(); + let result = decode("@AGON rows\n\n").unwrap(); assert!(result.is_null()); } #[test] fn test_decode_simple_object() { - let payload = "@AGON text\n\nname: Alice\nage: 30"; + let payload = "@AGON rows\n\nname: Alice\nage: 30"; let decoded = decode(payload).unwrap(); assert_eq!(decoded["name"], "Alice"); assert_eq!(decoded["age"], 30); @@ -1016,7 +1016,7 @@ mod tests { #[test] fn test_decode_tabular_array() { - let payload = "@AGON text\n\n[2]{id\tname}\n1\tAlice\n2\tBob"; + let payload = "@AGON rows\n\n[2]{id\tname}\n1\tAlice\n2\tBob"; let decoded = decode(payload).unwrap(); assert!(decoded.is_array()); let arr = decoded.as_array().unwrap(); @@ -1027,7 +1027,7 @@ mod tests { #[test] fn test_decode_named_tabular_array() { - let payload = "@AGON text\n\nusers[2]{id\tname}\n1\tAlice\n2\tBob"; + let payload = "@AGON rows\n\nusers[2]{id\tname}\n1\tAlice\n2\tBob"; let decoded = decode(payload).unwrap(); assert!(decoded.is_object()); let users = decoded["users"].as_array().unwrap(); @@ -1036,7 +1036,7 @@ mod tests { #[test] fn test_decode_primitive_array() { - let payload = "@AGON text\n\nnums[3]: 1\t2\t3"; + let payload = "@AGON rows\n\nnums[3]: 1\t2\t3"; let decoded = decode(payload).unwrap(); let nums = decoded["nums"].as_array().unwrap(); assert_eq!(nums.len(), 3); @@ -1047,7 +1047,7 @@ mod tests { #[test] fn test_decode_custom_delimiter() { - let payload = "@AGON text\n@D=\\t\n\nname: test"; + let payload = "@AGON rows\n@D=\\t\n\nname: test"; let decoded = decode(payload).unwrap(); assert_eq!(decoded["name"], "test"); } diff --git 
a/crates/agon-core/src/lib.rs b/crates/agon-core/src/lib.rs index e0b6432..3f888a1 100644 --- a/crates/agon-core/src/lib.rs +++ b/crates/agon-core/src/lib.rs @@ -1,7 +1,7 @@ //! AGON Core: Rust implementation of AGON encoding formats //! //! All format classes inherit from AGONFormat base class: -//! - AGONText: Row-based tabular encoding +//! - AGONRows: Row-based tabular encoding //! - AGONColumns: Columnar encoding with type clustering //! - AGONStruct: Template-based encoding for nested patterns @@ -16,7 +16,7 @@ mod types; mod utils; pub use error::AgonError; -pub use formats::{columns, struct_fmt, text}; +pub use formats::{columns, rows, struct_fmt}; pub use types::JsonValue; // ============================================================================ @@ -214,40 +214,40 @@ fn project_obj( } // ============================================================================ -// AGONText - Row-based tabular encoding +// AGONRows - Row-based tabular encoding // ============================================================================ /// Row-based tabular encoding format. 
#[pyclass(extends=AGONFormat)] -struct AGONText; +struct AGONRows; #[pymethods] -impl AGONText { +impl AGONRows { #[new] fn new() -> (Self, AGONFormat) { - (AGONText, AGONFormat) + (AGONRows, AGONFormat) } #[staticmethod] #[pyo3(signature = (data, include_header = false))] fn encode(data: &Bound<'_, PyAny>, include_header: bool) -> PyResult { let value = types::py_to_json(data)?; - text::encode(&value, include_header).map_err(|e| e.into()) + rows::encode(&value, include_header).map_err(|e| e.into()) } #[staticmethod] fn decode(py: Python<'_>, payload: &str) -> PyResult> { - let value = text::decode(payload)?; + let value = rows::decode(payload)?; types::json_to_py(py, &value) } #[staticmethod] fn hint() -> String { - "Return in AGON text format: Start with @AGON text header, encode arrays as name[N]{fields} with tab-delimited rows".to_string() + "Return in AGON rows format: Start with @AGON rows header, encode arrays as name[N]{fields} with tab-delimited rows".to_string() } fn __repr__(&self) -> String { - "AGONText()".to_string() + "AGONRows()".to_string() } } @@ -400,7 +400,7 @@ fn encode_all_parallel(data: &Bound<'_, PyAny>) -> PyResult> #[pymodule] fn agon_core(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; - m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/python/agon/__init__.py b/python/agon/__init__.py index 69670f6..0729578 100644 --- a/python/agon/__init__.py +++ b/python/agon/__init__.py @@ -7,8 +7,8 @@ from agon.agon_core import ( AGONColumns, AGONFormat, + AGONRows, AGONStruct, - AGONText, EncodingResult, encode_all_parallel, encode_auto_parallel, @@ -22,8 +22,8 @@ "AGONEncoding", "AGONError", "AGONFormat", + "AGONRows", "AGONStruct", - "AGONText", "EncodingResult", "Format", "encode_all_parallel", diff --git a/python/agon/core.py b/python/agon/core.py index ff72467..61a37c3 100644 --- a/python/agon/core.py +++ b/python/agon/core.py @@ -2,12 +2,6 @@ AGON (Adaptive Guarded Object 
Notation) is a self-describing, token-efficient encoding for lists of JSON objects, optimized for LLM consumption. - -Core features: - - Key elimination: objects become positional rows with inline schema. - - Recursive encoding: nested arrays of objects are also encoded. - - Adaptive: automatically selects the best format for token efficiency. - - Self-describing: no training or config required. """ from __future__ import annotations @@ -20,23 +14,15 @@ import orjson -# Rust format classes (primary API - inherit from AGONFormat) -from agon.agon_core import ( - AGONColumns, - AGONFormat, - AGONStruct, - AGONText, -) -from agon.agon_core import ( - encode_auto_parallel as _rs_encode_auto_parallel, -) +# Rust py03 bindings +from agon.agon_core import AGONColumns, AGONFormat, AGONRows, AGONStruct +from agon.agon_core import encode_auto_parallel as _rs_encode_auto_parallel from agon.encoding import DEFAULT_ENCODING, count_tokens from agon.errors import AGONError -# Python format classes (for reference/fallback) - -Format = Literal["auto", "json", "text", "columns", "struct"] -ConcreteFormat = Literal["json", "text", "columns", "struct"] +# Type aliases +Format = Literal["auto", "json", "rows", "columns", "struct"] +ConcreteFormat = Literal["json", "rows", "columns", "struct"] @dataclass(frozen=True) @@ -84,7 +70,7 @@ class AGON: Formats: - "json": Raw JSON (baseline) - - "text": AGONText row-based format + - "rows": AGONRows row-based format - "columns": AGONColumns columnar format for wide tables - "struct": AGONStruct template format for repeated object shapes @@ -98,7 +84,7 @@ class AGON: # Format headers (for decoding) _headers: ClassVar[dict[ConcreteFormat, str]] = { "json": "", - "text": "@AGON text", + "rows": "@AGON rows", "columns": "@AGON columns", "struct": "@AGON struct", } @@ -106,14 +92,14 @@ class AGON: # Encoders - Rust for AGON formats, orjson for JSON _encoders: ClassVar[dict[ConcreteFormat, Callable[[Any], str]]] = { "json": lambda data: 
orjson.dumps(data).decode(), - "text": lambda data: str(AGONText.encode(data, include_header=False)), + "rows": lambda data: str(AGONRows.encode(data, include_header=False)), "columns": lambda data: str(AGONColumns.encode(data, include_header=False)), "struct": lambda data: str(AGONStruct.encode(data, include_header=False)), } # Decoders - Rust for AGON formats _decoders: ClassVar[dict[str, Callable[[str], Any]]] = { - "@AGON text": AGONText.decode, + "@AGON rows": AGONRows.decode, "@AGON columns": AGONColumns.decode, "@AGON struct": AGONStruct.decode, } @@ -134,7 +120,7 @@ def encode( format: Format to use: - "auto": Select best format based on token count (default) - "json": Raw JSON - - "text": AGONText row-based format + - "rows": AGONRows row-based format - "columns": AGONColumns columnar format for wide tables - "struct": AGONStruct template format for repeated shapes force: If True with format="auto", always use a non-JSON format. @@ -213,7 +199,7 @@ def decode( match format: case "json": return AGON._decode_json(text) - case "text" | "columns" | "struct": + case "rows" | "columns" | "struct": header = AGON._headers[format] if not text.startswith(header): text = f"{header}\n\n{text}" @@ -268,7 +254,7 @@ def hint(result_or_format: AGONEncoding | ConcreteFormat) -> str: Example: >>> result = AGON.encode(data, format="auto") >>> AGON.hint(result) # Generation instruction for selected format - 'Return in AGON text format: Start with @AGON text header, encode arrays as name[N]{fields} with tab-delimited rows' + 'Return in AGON rows format: Start with @AGON rows header, encode arrays as name[N]{fields} with tab-delimited rows' >>> AGON.hint("columns") # Generation instruction for columns format 'Return in AGON columns format: Start with @AGON columns header, transpose arrays to name[N] with ├/└ field: val1, val2, ...' 
""" @@ -281,8 +267,8 @@ def hint(result_or_format: AGONEncoding | ConcreteFormat) -> str: # Return hint for specific format match format_name: - case "text": - return AGONText.hint() + case "rows": + return AGONRows.hint() case "columns": return AGONColumns.hint() case "struct": diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index 71b1f8c..095fef2 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -105,7 +105,7 @@ def test_fixture_benchmark(fixture_path: Path) -> None: ] = {} # tokens, savings, encode_ms, decode_ms for fmt, encoder, decoder in [ - ("text", lambda data: AGON.encode(data, format="text"), AGON.decode), # type: ignore[misc] + ("rows", lambda data: AGON.encode(data, format="rows"), AGON.decode), # type: ignore[misc] ("columns", lambda data: AGON.encode(data, format="columns"), AGON.decode), # type: ignore[misc] ("struct", lambda data: AGON.encode(data, format="struct"), AGON.decode), # type: ignore[misc] ]: diff --git a/tests/test_columns.py b/tests/test_columns.py index 6668294..fc59a72 100644 --- a/tests/test_columns.py +++ b/tests/test_columns.py @@ -387,7 +387,7 @@ def test_agon_auto_includes_columns_in_candidates( self, simple_data: list[dict[str, Any]] ) -> None: result = AGON.encode(simple_data, format="auto") - assert result.format in ("json", "text", "columns", "struct") + assert result.format in ("json", "rows", "columns", "struct") class TestAGONColumnsErrors: diff --git a/tests/test_core.py b/tests/test_core.py index 158ef5a..4ffe452 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,7 +1,7 @@ """Tests for the AGON core API. These tests target `agon.core` behavior (format selection, dispatch, and helpers). -Format-specific behavior lives in `tests/test_text.py`. +Format-specific behavior lives in `tests/test_rows.py`. 
""" from __future__ import annotations @@ -11,7 +11,7 @@ import orjson import pytest -from agon import AGON, AGONError, AGONText +from agon import AGON, AGONError, AGONRows def test_encode_json_format_returns_json() -> None: @@ -20,11 +20,11 @@ def test_encode_json_format_returns_json() -> None: assert orjson.loads(result.text) == data -def test_encode_text_format_uses_header() -> None: +def test_encode_rows_format_uses_header() -> None: data: dict[str, Any] = {"a": 1, "b": "x"} - result = AGON.encode(data, format="text") - assert result.format == "text" - assert result.header == "@AGON text" + result = AGON.encode(data, format="rows") + assert result.format == "rows" + assert result.header == "@AGON rows" def test_encode_routes_to_specific_formats(simple_data: list[dict[str, Any]]) -> None: @@ -32,9 +32,9 @@ def test_encode_routes_to_specific_formats(simple_data: list[dict[str, Any]]) -> assert res_json.format == "json" assert isinstance(orjson.loads(res_json.text), list) - res_text = AGON.encode(simple_data, format="text") - assert res_text.format == "text" - assert res_text.header == "@AGON text" + res_rows = AGON.encode(simple_data, format="rows") + assert res_rows.format == "rows" + assert res_rows.header == "@AGON rows" def test_encode_struct_includes_definitions_without_header() -> None: @@ -50,8 +50,8 @@ def test_encode_struct_includes_definitions_without_header() -> None: assert "@FR: fmt, raw" in result.text -def test_decode_detects_text_payload() -> None: - payload = AGONText.encode({"x": 1}, include_header=True) +def test_decode_detects_rows_payload() -> None: + payload = AGONRows.encode({"x": 1}, include_header=True) assert AGON.decode(payload) == {"x": 1} @@ -73,7 +73,7 @@ def test_decode_invalid_non_json_string_raises() -> None: def test_decode_agon_encoding_directly() -> None: """Test decoding AGONEncoding directly.""" data = [{"id": 1, "name": "Alice"}] - result = AGON.encode(data, format="text") + result = AGON.encode(data, format="rows") # Decode 
AGONEncoding directly decoded = AGON.decode(result) assert decoded == data @@ -82,7 +82,7 @@ def test_decode_agon_encoding_directly() -> None: def test_decode_with_format_parameter() -> None: """Test decoding with explicit format (no header needed).""" data = [{"id": 1, "name": "Alice"}] - result = AGON.encode(data, format="text") + result = AGON.encode(data, format="rows") # Decode using format parameter decoded = AGON.decode(result.text, format=result.format) assert decoded == data @@ -96,19 +96,19 @@ def test_project_data_delegates() -> None: def test_hint_with_agon_encoding_result() -> None: """hint() should accept AGONEncoding and return prescriptive generation instructions.""" data = [{"id": 1, "name": "Alice"}] - result = AGON.encode(data, format="text") + result = AGON.encode(data, format="rows") hint = AGON.hint(result) assert isinstance(hint, str) - assert "Return in AGON text format" in hint - assert "@AGON text header" in hint + assert "Return in AGON rows format" in hint + assert "@AGON rows header" in hint -def test_hint_with_format_string_text() -> None: +def test_hint_with_format_string_rows() -> None: """hint() should accept format string and return generation instructions.""" - hint = AGON.hint("text") + hint = AGON.hint("rows") assert isinstance(hint, str) - assert "Return in AGON text format" in hint - assert "@AGON text header" in hint + assert "Return in AGON rows format" in hint + assert "@AGON rows header" in hint assert "name[N]{fields}" in hint @@ -147,7 +147,7 @@ def test_hint_matches_encoding_format() -> None: """hint() should return matching hint for different encoded formats.""" data = [{"id": 1, "name": "Alice"}] - for fmt in ["text", "columns", "struct", "json"]: + for fmt in ["rows", "columns", "struct", "json"]: result = AGON.encode(data, format=fmt) # type: ignore[arg-type] hint_from_result = AGON.hint(result) hint_from_string = AGON.hint(fmt) # type: ignore[arg-type] @@ -206,14 +206,14 @@ def test_auto_format_selects_best() -> 
None: """Auto format should choose most token-efficient option.""" data: list[dict[str, Any]] = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}] result = AGON.encode(data, format="auto") - assert result.format in ("json", "text", "columns", "struct") + assert result.format in ("json", "rows", "columns", "struct") def test_force_skips_json() -> None: """With force=True, auto should not select JSON.""" data: dict[str, Any] = {"a": 1} result = AGON.encode(data, format="auto", force=True) - assert result.format == "text" + assert result.format == "rows" def test_auto_min_savings_can_fall_back_to_json() -> None: @@ -275,9 +275,9 @@ def test_agon_encoding_repr() -> None: def test_agon_encoding_with_header() -> None: """with_header() prepends header for auto-detect decoding.""" data = [{"id": 1, "name": "Alice"}] - result = AGON.encode(data, format="text") + result = AGON.encode(data, format="rows") with_header = result.with_header() - assert with_header.startswith("@AGON text") + assert with_header.startswith("@AGON rows") # Can decode with auto-detect assert AGON.decode(with_header) == data diff --git a/tests/test_text.py b/tests/test_rows.py similarity index 69% rename from tests/test_text.py rename to tests/test_rows.py index 7db59a0..0b3d707 100644 --- a/tests/test_text.py +++ b/tests/test_rows.py @@ -1,6 +1,6 @@ -"""Tests for AGONText format. +"""Tests for AGONRows format. -Tests encoding and decoding of the AGONText row-based format. +Tests encoding and decoding of the AGONRows row-based format. 
""" from __future__ import annotations @@ -10,24 +10,24 @@ import pytest -from agon import AGON, AGONText +from agon import AGON, AGONRows -class TestAGONTextBasic: +class TestAGONRowsBasic: """Basic encoding/decoding tests.""" def test_encode_simple_object(self) -> None: data = {"name": "Alice", "age": 30, "active": True} - encoded = AGONText.encode(data, include_header=True) - assert "@AGON text" in encoded + encoded = AGONRows.encode(data, include_header=True) + assert "@AGON rows" in encoded assert "name: Alice" in encoded assert "age: 30" in encoded assert "active: true" in encoded def test_encode_decode_roundtrip_simple(self) -> None: data = {"name": "Alice", "age": 30} - encoded = AGONText.encode(data, include_header=True) - decoded = AGONText.decode(encoded) + encoded = AGONRows.encode(data, include_header=True) + decoded = AGONRows.decode(encoded) assert decoded == data def test_encode_decode_roundtrip_nested(self) -> None: @@ -38,22 +38,22 @@ def test_encode_decode_roundtrip_nested(self) -> None: "city": "Seattle", }, } - encoded = AGONText.encode(data, include_header=True) - decoded = AGONText.decode(encoded) + encoded = AGONRows.encode(data, include_header=True) + decoded = AGONRows.decode(encoded) assert decoded == data def test_empty_object_roundtrip(self) -> None: data: dict[str, Any] = {} - encoded = AGONText.encode(data, include_header=True) - decoded = AGONText.decode(encoded) + encoded = AGONRows.encode(data, include_header=True) + decoded = AGONRows.decode(encoded) assert decoded == {} or decoded is None -class TestAGONTextTabular: +class TestAGONRowsTabular: """Tests for tabular array encoding (uniform objects).""" def test_encode_tabular_array(self, simple_data: list[dict[str, Any]]) -> None: - encoded = AGONText.encode(simple_data, include_header=True) + encoded = AGONRows.encode(simple_data, include_header=True) assert "[3]{" in encoded # Array header with 3 elements assert "id\tname\trole" in encoded or "id" in encoded @@ -61,7 +61,7 @@ def 
test_decode_tabular_array(self) -> None: # Named array at root level - decodes to object with array value payload = textwrap.dedent( """\ - @AGON text + @AGON rows products[3]{sku\tname\tprice} A123\tWidget\t9.99 @@ -69,7 +69,7 @@ def test_decode_tabular_array(self) -> None: C789\tGizmo\t29.99 """ ) - decoded = AGONText.decode(payload) + decoded = AGONRows.decode(payload) assert "products" in decoded products = decoded["products"] assert len(products) == 3 @@ -81,7 +81,7 @@ def test_decode_tabular_array_unnamed(self) -> None: # Unnamed array at root - decodes to bare array payload = textwrap.dedent( """\ - @AGON text + @AGON rows [3]{sku\tname\tprice} A123\tWidget\t9.99 @@ -89,19 +89,19 @@ def test_decode_tabular_array_unnamed(self) -> None: C789\tGizmo\t29.99 """ ) - decoded = AGONText.decode(payload) + decoded = AGONRows.decode(payload) assert len(decoded) == 3 assert decoded[0] == {"sku": "A123", "name": "Widget", "price": 9.99} def test_roundtrip_tabular_array(self, simple_data: list[dict[str, Any]]) -> None: - encoded = AGONText.encode(simple_data, include_header=True) - decoded = AGONText.decode(encoded) + encoded = AGONRows.encode(simple_data, include_header=True) + decoded = AGONRows.decode(encoded) assert decoded == simple_data def test_tabular_with_missing_values(self) -> None: payload = textwrap.dedent( """\ - @AGON text + @AGON rows users[3]{id\tname\temail} 1\tAlice\talice@example.com @@ -109,7 +109,7 @@ def test_tabular_with_missing_values(self) -> None: 3\t\tcarol@example.com """ ) - decoded = AGONText.decode(payload) + decoded = AGONRows.decode(payload) users = decoded["users"] assert len(users) == 3 assert users[0] == {"id": 1, "name": "Alice", "email": "alice@example.com"} @@ -117,61 +117,61 @@ def test_tabular_with_missing_values(self) -> None: assert users[2] == {"id": 3, "email": "carol@example.com"} # Missing name -class TestAGONTextPrimitiveArrays: +class TestAGONRowsPrimitiveArrays: """Tests for primitive array encoding.""" def 
test_encode_primitive_array(self) -> None: data = {"tags": ["admin", "ops", "dev"]} - encoded = AGONText.encode(data, include_header=True) + encoded = AGONRows.encode(data, include_header=True) assert "[3]:" in encoded def test_decode_primitive_array(self) -> None: payload = textwrap.dedent( """\ - @AGON text + @AGON rows tags[4]: admin\tops\tdev\tuser """ ) - decoded = AGONText.decode(payload) + decoded = AGONRows.decode(payload) assert decoded == {"tags": ["admin", "ops", "dev", "user"]} def test_roundtrip_primitive_array(self) -> None: data = {"numbers": [1, 2, 3, 4, 5]} - encoded = AGONText.encode(data, include_header=True) - decoded = AGONText.decode(encoded) + encoded = AGONRows.encode(data, include_header=True) + decoded = AGONRows.decode(encoded) assert decoded == data def test_decode_primitive_array_with_escaped_quote(self) -> None: payload = textwrap.dedent( """\ - @AGON text + @AGON rows vals[2]: "a\\\"b"\t"c" """ ) - assert AGONText.decode(payload) == {"vals": ['a"b', "c"]} + assert AGONRows.decode(payload) == {"vals": ['a"b', "c"]} def test_empty_array_roundtrip(self) -> None: data = {"items": []} - encoded = AGONText.encode(data, include_header=True) + encoded = AGONRows.encode(data, include_header=True) assert "items[0]:" in encoded - decoded = AGONText.decode(encoded) + decoded = AGONRows.decode(encoded) assert decoded == {"items": []} -class TestAGONTextMixedArrays: +class TestAGONRowsMixedArrays: """Tests for mixed-type array encoding (list format).""" def test_encode_mixed_array(self) -> None: data = {"items": [42, "hello", True, None]} - encoded = AGONText.encode(data, include_header=True) + encoded = AGONRows.encode(data, include_header=True) assert "items[4]:" in encoded def test_decode_list_array_with_objects(self) -> None: payload = textwrap.dedent( """\ - @AGON text + @AGON rows records[2]: - name: Alice @@ -180,7 +180,7 @@ def test_decode_list_array_with_objects(self) -> None: age: 25 """ ) - decoded = AGONText.decode(payload) + decoded = 
AGONRows.decode(payload) records = decoded["records"] assert len(records) == 2 assert records[0] == {"name": "Alice", "age": 30} @@ -189,71 +189,71 @@ def test_decode_list_array_with_objects(self) -> None: def test_decode_list_array_header_with_no_inline_values(self) -> None: payload = textwrap.dedent( """\ - @AGON text + @AGON rows vals[2]: - 1 - 2 """ ) - assert AGONText.decode(payload) == {"vals": [1, 2]} + assert AGONRows.decode(payload) == {"vals": [1, 2]} def test_parses_newline_delimiter_header(self) -> None: # Delimiter may not be used in the body, but header parsing should accept it. payload = textwrap.dedent( """\ - @AGON text + @AGON rows @D=\\n s: "x" """ ) - assert AGONText.decode(payload) == {"s": "x"} + assert AGONRows.decode(payload) == {"s": "x"} def test_parses_tab_delimiter_header(self) -> None: payload = textwrap.dedent( """\ - @AGON text + @AGON rows @D=\\t s: "x" """ ) - assert AGONText.decode(payload) == {"s": "x"} + assert AGONRows.decode(payload) == {"s": "x"} def test_quotes_strings_that_look_like_primitives(self) -> None: data = {"b": "true", "n": "123", "z": "null"} - encoded = AGONText.encode(data, include_header=True) + encoded = AGONRows.encode(data, include_header=True) assert 'b: "true"' in encoded assert 'n: "123"' in encoded assert 'z: "null"' in encoded - assert AGONText.decode(encoded) == data + assert AGONRows.decode(encoded) == data def test_special_floats_decode_as_none(self) -> None: data = {"nan": float("nan"), "inf": float("inf"), "ninf": float("-inf")} - decoded = AGONText.decode(AGONText.encode(data, include_header=True)) + decoded = AGONRows.decode(AGONRows.encode(data, include_header=True)) assert decoded["nan"] is None assert decoded["inf"] is None assert decoded["ninf"] is None -class TestAGONTextPrimitives: +class TestAGONRowsPrimitives: """Tests for primitive value handling.""" def test_encode_null(self) -> None: data = {"value": None} - encoded = AGONText.encode(data, include_header=True) + encoded = 
AGONRows.encode(data, include_header=True) assert "value: null" in encoded def test_encode_booleans(self) -> None: data = {"active": True, "deleted": False} - encoded = AGONText.encode(data, include_header=True) + encoded = AGONRows.encode(data, include_header=True) assert "active: true" in encoded assert "deleted: false" in encoded def test_encode_numbers(self) -> None: data = {"integer": 42, "float": 3.14, "negative": -17} - encoded = AGONText.encode(data, include_header=True) + encoded = AGONRows.encode(data, include_header=True) assert "integer: 42" in encoded assert "float: 3.14" in encoded assert "negative: -17" in encoded @@ -261,14 +261,14 @@ def test_encode_numbers(self) -> None: def test_encode_special_floats(self) -> None: # NaN and Infinity should become null data = {"nan": float("nan"), "inf": float("inf")} - encoded = AGONText.encode(data, include_header=True) + encoded = AGONRows.encode(data, include_header=True) assert "nan: null" in encoded assert "inf: null" in encoded def test_decode_primitives(self) -> None: payload = textwrap.dedent( """\ - @AGON text + @AGON rows value: 42 name: Alice @@ -276,43 +276,43 @@ def test_decode_primitives(self) -> None: missing: null """ ) - decoded = AGONText.decode(payload) + decoded = AGONRows.decode(payload) assert decoded == {"value": 42, "name": "Alice", "active": True, "missing": None} -class TestAGONTextQuoting: +class TestAGONRowsQuoting: """Tests for string quoting rules.""" def test_quote_string_with_delimiter(self) -> None: - data = {"text": "hello\tworld"} - encoded = AGONText.encode(data, include_header=True) + data = {"rows": "hello\tworld"} + encoded = AGONRows.encode(data, include_header=True) assert '"hello\\tworld"' in encoded or '"' in encoded def test_quote_string_with_leading_space(self) -> None: - data = {"text": " leading space"} - encoded = AGONText.encode(data, include_header=True) + data = {"rows": " leading space"} + encoded = AGONRows.encode(data, include_header=True) assert '" leading 
space"' in encoded def test_quote_string_with_special_char(self) -> None: data = {"tag": "@mention"} - encoded = AGONText.encode(data, include_header=True) + encoded = AGONRows.encode(data, include_header=True) assert '"@mention"' in encoded def test_quote_string_looks_like_number(self) -> None: data = {"code": "42"} - encoded = AGONText.encode(data, include_header=True) + encoded = AGONRows.encode(data, include_header=True) assert '"42"' in encoded def test_roundtrip_quoted_strings(self) -> None: - data = {"text": 'Say "hello"', "path": "C:\\Users"} - encoded = AGONText.encode(data, include_header=True) - decoded = AGONText.decode(encoded) + data = {"rows": 'Say "hello"', "path": "C:\\Users"} + encoded = AGONRows.encode(data, include_header=True) + decoded = AGONRows.decode(encoded) assert decoded == data def test_long_and_unicode_string_roundtrip(self) -> None: - data = {"text": "Hello 世界 🌍" + ("x" * 1000)} - encoded = AGONText.encode(data, include_header=True) - decoded = AGONText.decode(encoded) + data = {"rows": "Hello 世界 🌍" + ("x" * 1000)} + encoded = AGONRows.encode(data, include_header=True) + decoded = AGONRows.decode(encoded) assert decoded == data def test_roundtrip_string_escaping_newlines_and_whitespace(self) -> None: @@ -322,12 +322,12 @@ def test_roundtrip_string_escaping_newlines_and_whitespace(self) -> None: "newline": "x\ny", "special": "@tag", } - encoded = AGONText.encode([data], include_header=True) - decoded = AGONText.decode(encoded) + encoded = AGONRows.encode([data], include_header=True) + decoded = AGONRows.decode(encoded) assert decoded == [data] -class TestAGONTextNesting: +class TestAGONRowsNesting: """Tests for nested structures.""" def test_nested_object(self) -> None: @@ -340,19 +340,19 @@ def test_nested_object(self) -> None: }, }, } - encoded = AGONText.encode(data, include_header=True) - decoded = AGONText.decode(encoded) + encoded = AGONRows.encode(data, include_header=True) + decoded = AGONRows.decode(encoded) assert decoded == 
data def test_array_inside_object(self, nested_data: list[dict[str, Any]]) -> None: - encoded = AGONText.encode(nested_data, include_header=True) - decoded = AGONText.decode(encoded) + encoded = AGONRows.encode(nested_data, include_header=True) + decoded = AGONRows.decode(encoded) assert decoded == nested_data def test_decode_object_with_named_arrays(self) -> None: payload = textwrap.dedent( """\ - @AGON text + @AGON rows root: nums[2]: 1\t2 @@ -365,7 +365,7 @@ def test_decode_object_with_named_arrays(self) -> None: z: 2 """ ) - assert AGONText.decode(payload) == { + assert AGONRows.decode(payload) == { "root": { "nums": [1, 2], "rows": [{"a": 1, "b": 2}, {"a": 3, "b": 4}], @@ -374,47 +374,47 @@ def test_decode_object_with_named_arrays(self) -> None: } -class TestAGONTextIntegration: +class TestAGONRowsIntegration: """Integration tests with AGON core.""" - def test_agon_encode_text_format(self, simple_data: list[dict[str, Any]]) -> None: - result = AGON.encode(simple_data, format="text") - assert result.format == "text" - assert result.header == "@AGON text" + def test_agon_encode_rows_format(self, simple_data: list[dict[str, Any]]) -> None: + result = AGON.encode(simple_data, format="rows") + assert result.format == "rows" + assert result.header == "@AGON rows" - def test_agon_decode_detects_text_format(self, simple_data: list[dict[str, Any]]) -> None: - encoded = AGONText.encode(simple_data, include_header=True) + def test_agon_decode_detects_rows_format(self, simple_data: list[dict[str, Any]]) -> None: + encoded = AGONRows.encode(simple_data, include_header=True) decoded = AGON.decode(encoded) assert decoded == simple_data def test_agon_decode_encoding_directly(self, simple_data: list[dict[str, Any]]) -> None: - result = AGON.encode(simple_data, format="text") + result = AGON.encode(simple_data, format="rows") decoded = AGON.decode(result) assert decoded == simple_data - def test_agon_auto_includes_text_in_candidates(self, simple_data: list[dict[str, Any]]) -> 
None: - # Encode with auto should consider text format + def test_agon_auto_includes_rows_in_candidates(self, simple_data: list[dict[str, Any]]) -> None: + # Encode with auto should consider rows format result = AGON.encode(simple_data, format="auto") - # Result could be any format, but text should have been considered - assert result.format in ("json", "text", "columns", "struct") + # Result could be any format, but rows should have been considered + assert result.format in ("json", "rows", "columns", "struct") -class TestAGONTextErrors: +class TestAGONRowsErrors: """Error handling tests.""" def test_invalid_header(self) -> None: with pytest.raises(ValueError): - AGONText.decode("not a valid header") + AGONRows.decode("not a valid header") def test_empty_payload(self) -> None: with pytest.raises(ValueError): - AGONText.decode("") + AGONRows.decode("") -class TestAGONTextHint: +class TestAGONRowsHint: """Test hint method.""" def test_hint_returns_string(self) -> None: - hint = AGONText.hint() + hint = AGONRows.hint() assert isinstance(hint, str) - assert "AGON text" in hint + assert "AGON rows" in hint From d108da1516f272353057f236d00579a7b0aebc4a Mon Sep 17 00:00:00 2001 From: harvey Date: Thu, 25 Dec 2025 15:33:44 -0500 Subject: [PATCH 5/7] fix: uses fast byte-length token estimation if encoding not specified to optimize auto --- Cargo.toml | 4 +- Makefile | 7 +- crates/agon-core/src/error.rs | 2 +- crates/agon-core/src/formats/columns.rs | 144 ++++++++++----------- crates/agon-core/src/formats/mod.rs | 63 ++++++--- crates/agon-core/src/formats/rows.rs | 8 +- crates/agon-core/src/formats/struct_fmt.rs | 105 ++++++++------- crates/agon-core/src/lib.rs | 16 ++- crates/agon-core/src/types.rs | 8 +- crates/agon-core/src/utils.rs | 70 ++++++++-- pyproject.toml | 2 +- python/agon/__init__.py | 9 +- python/agon/core.py | 100 +++++++------- python/agon/encoding.py | 21 --- tests/test_benchmarks.py | 27 ++-- tests/test_core.py | 99 ++++++++++---- 16 files changed, 409 
insertions(+), 276 deletions(-) delete mode 100644 python/agon/encoding.py diff --git a/Cargo.toml b/Cargo.toml index 6b9d10d..7cf3774 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,8 +3,8 @@ resolver = "2" members = ["crates/*"] [workspace.package] -version = "0.1.0" -edition = "2021" +version = "0.2.0" +edition = "2024" license = "MIT" authors = ["Harvey Tseng "] repository = "https://github.com/Verdenroz/agon-python" diff --git a/Makefile b/Makefile index 7129fb5..785f424 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ .DEFAULT_GOAL := default -.PHONY: default install fix test nox upgrade build docs clean pre-commit help +.PHONY: default install fix test nox upgrade build docs clean pre-commit benchmarks help default: install build fix test @@ -29,6 +29,10 @@ test: build nox: build uv run nox +benchmarks: build + @echo "📊 Running AGON benchmarks..." + uv run pytest tests/test_benchmarks.py -s --no-cov -o addopts="" + upgrade: uv sync --upgrade --dev @@ -60,6 +64,7 @@ help: @echo "Testing:" @echo " make test - Run Rust + Python tests with coverage" @echo " make nox - Run nox sessions (builds Rust first)" + @echo " make benchmarks - Run performance benchmarks" @echo "" @echo "Other:" @echo " make docs - Serve docs locally" diff --git a/crates/agon-core/src/error.rs b/crates/agon-core/src/error.rs index 337812d..d26893a 100644 --- a/crates/agon-core/src/error.rs +++ b/crates/agon-core/src/error.rs @@ -1,7 +1,7 @@ //! 
Error types for AGON encoding/decoding -use pyo3::exceptions::PyValueError; use pyo3::PyErr; +use pyo3::exceptions::PyValueError; use thiserror::Error; #[derive(Error, Debug)] diff --git a/crates/agon-core/src/formats/columns.rs b/crates/agon-core/src/formats/columns.rs index 420500f..8f09e89 100644 --- a/crates/agon-core/src/formats/columns.rs +++ b/crates/agon-core/src/formats/columns.rs @@ -156,10 +156,10 @@ fn parse_primitive(s: &str) -> Value { if let Ok(i) = s.parse::() { return Value::Number(i.into()); } - if let Ok(f) = s.parse::() { - if let Some(n) = serde_json::Number::from_f64(f) { - return Value::Number(n); - } + if let Ok(f) = s.parse::() + && let Some(n) = serde_json::Number::from_f64(f) + { + return Value::Number(n); } Value::String(s.to_string()) @@ -428,54 +428,53 @@ fn decode_value( let base_depth = get_indent_depth(lines[idx]); // Check for array patterns: [N], [N]:, name[N], name[N]: - if let Some(bracket_pos) = line.find('[') { - if let Some(end_pos) = line.find(']') { - if end_pos > bracket_pos { - let name = &line[..bracket_pos]; - let count_str = &line[bracket_pos + 1..end_pos]; - if let Ok(count) = count_str.parse::() { - // If this is a named array (name[N]), it's part of an object - // Delegate to decode_object to parse the full object - if !name.is_empty() { - return decode_object(lines, idx, delimiter); - } + if let Some(bracket_pos) = line.find('[') + && let Some(end_pos) = line.find(']') + && end_pos > bracket_pos + { + let name = &line[..bracket_pos]; + let count_str = &line[bracket_pos + 1..end_pos]; + if let Ok(count) = count_str.parse::() { + // If this is a named array (name[N]), it's part of an object + // Delegate to decode_object to parse the full object + if !name.is_empty() { + return decode_object(lines, idx, delimiter); + } - // Unnamed array: [N] - // Check if next line has ├ or └ (columnar format) - if idx + 1 < lines.len() { - let next = lines[idx + 1].trim(); - if next.starts_with('├') || next.starts_with('└') { - 
return decode_columnar_array(lines, idx, "", count, delimiter); - } - } + // Unnamed array: [N] + // Check if next line has ├ or └ (columnar format) + if idx + 1 < lines.len() { + let next = lines[idx + 1].trim(); + if next.starts_with('├') || next.starts_with('└') { + return decode_columnar_array(lines, idx, "", count, delimiter); + } + } - // Check for inline primitive array: [N]: val1\tval2 - if let Some(colon_pos) = line.find("]:") { - let values_str = line[colon_pos + 2..].trim(); - if !values_str.is_empty() { - let values: Vec = - values_str.split(delimiter).map(parse_primitive).collect(); - return Ok((Value::Array(values), idx + 1)); - } - // Empty values after colon means list array: [N]: - return decode_list_array(lines, idx, base_depth, count, delimiter); - } + // Check for inline primitive array: [N]: val1\tval2 + if let Some(colon_pos) = line.find("]:") { + let values_str = line[colon_pos + 2..].trim(); + if !values_str.is_empty() { + let values: Vec = + values_str.split(delimiter).map(parse_primitive).collect(); + return Ok((Value::Array(values), idx + 1)); + } + // Empty values after colon means list array: [N]: + return decode_list_array(lines, idx, base_depth, count, delimiter); + } - // Bare [N] with no colon - could be empty array or non-columnar array - if count == 0 { - return Ok((Value::Array(vec![]), idx + 1)); - } - // Check if next line is a list item - if idx + 1 < lines.len() { - let next = lines[idx + 1].trim(); - if next.starts_with("- ") { - return decode_list_array(lines, idx, base_depth, count, delimiter); - } - } - // No colon, no columnar, no list - it's an empty array - return Ok((Value::Array(vec![]), idx + 1)); + // Bare [N] with no colon - could be empty array or non-columnar array + if count == 0 { + return Ok((Value::Array(vec![]), idx + 1)); + } + // Check if next line is a list item + if idx + 1 < lines.len() { + let next = lines[idx + 1].trim(); + if next.starts_with("- ") { + return decode_list_array(lines, idx, 
base_depth, count, delimiter); } } + // No colon, no columnar, no list - it's an empty array + return Ok((Value::Array(vec![]), idx + 1)); } } @@ -639,20 +638,18 @@ fn decode_object(lines: &[&str], idx: usize, delimiter: &str) -> Result<(Value, let stripped = line.trim(); // Check for array patterns: name[N] or name[N]: values - if let Some(bracket_pos) = stripped.find('[') { - if let Some(end_pos) = stripped.find(']') { - if end_pos > bracket_pos { - let name = &stripped[..bracket_pos]; - let count_str = &stripped[bracket_pos + 1..end_pos]; - if let Ok(count) = count_str.parse::() { - // This is an array pattern - decode it via decode_value - let (arr, new_idx) = - decode_array_in_object(lines, idx, name, count, delimiter)?; - result.insert(name.to_string(), arr); - idx = new_idx; - continue; - } - } + if let Some(bracket_pos) = stripped.find('[') + && let Some(end_pos) = stripped.find(']') + && end_pos > bracket_pos + { + let name = &stripped[..bracket_pos]; + let count_str = &stripped[bracket_pos + 1..end_pos]; + if let Ok(count) = count_str.parse::() { + // This is an array pattern - decode it via decode_value + let (arr, new_idx) = decode_array_in_object(lines, idx, name, count, delimiter)?; + result.insert(name.to_string(), arr); + idx = new_idx; + continue; } } @@ -833,19 +830,18 @@ fn decode_list_item_object( } // Check for array patterns - if let Some(bracket_pos) = stripped.find('[') { - if let Some(end_pos) = stripped.find(']') { - if end_pos > bracket_pos { - let arr_name = &stripped[..bracket_pos]; - let count_str = &stripped[bracket_pos + 1..end_pos]; - if let Ok(count) = count_str.parse::() { - let (arr, new_idx) = - decode_array_in_object(lines, idx, arr_name, count, delimiter)?; - obj.insert(arr_name.to_string(), arr); - idx = new_idx; - continue; - } - } + if let Some(bracket_pos) = stripped.find('[') + && let Some(end_pos) = stripped.find(']') + && end_pos > bracket_pos + { + let arr_name = &stripped[..bracket_pos]; + let count_str = 
&stripped[bracket_pos + 1..end_pos]; + if let Ok(count) = count_str.parse::() { + let (arr, new_idx) = + decode_array_in_object(lines, idx, arr_name, count, delimiter)?; + obj.insert(arr_name.to_string(), arr); + idx = new_idx; + continue; } } diff --git a/crates/agon-core/src/formats/mod.rs b/crates/agon-core/src/formats/mod.rs index 1a443e0..b05992a 100644 --- a/crates/agon-core/src/formats/mod.rs +++ b/crates/agon-core/src/formats/mod.rs @@ -24,6 +24,22 @@ pub struct EncodingResult { pub token_estimate: usize, } +/// Fast byte-length estimate for format comparison (avoids expensive tokenization) +/// Actual token count can be computed later if needed +fn estimate_tokens_fast(text: &str) -> usize { + // Rough heuristic: ~4 chars per token for English text + // This is only used for relative comparison between formats + text.len() / 4 +} + +/// Count tokens - either fast estimate or accurate tiktoken with specified encoding +fn count_tokens_for_comparison(text: &str, encoding: Option<&str>) -> usize { + match encoding { + Some(enc) => count_tokens(text, enc).unwrap_or_else(|_| estimate_tokens_fast(text)), + None => estimate_tokens_fast(text), + } +} + /// Headers for each format pub fn get_header(format: &str) -> &'static str { match format { @@ -40,17 +56,18 @@ pub fn encode_auto_parallel( data: &JsonValue, force: bool, min_savings: f64, + encoding: Option<&str>, ) -> Result { - let results = encode_all_parallel(data)?; + let results = encode_all_parallel_internal(data, encoding)?; // Find JSON baseline let json_result = results.iter().find(|r| r.format == "json"); let json_tokens = json_result.map(|r| r.token_estimate).unwrap_or(usize::MAX); - // Find best non-JSON result + // Find best result (exclude JSON if force=true) let best = results .iter() - .filter(|r| force || r.format != "json") + .filter(|r| !force || r.format != "json") .min_by_key(|r| r.token_estimate); match best { @@ -73,7 +90,7 @@ pub fn encode_auto_parallel( None => { // Fallback to JSON let 
text = serde_json::to_string(data)?; - let tokens = count_tokens(&text); + let tokens = estimate_tokens_fast(&text); Ok(EncodingResult { format: "json".to_string(), text, @@ -84,14 +101,22 @@ pub fn encode_auto_parallel( } } -/// Encode data with all formats in parallel +/// Encode data with all formats in parallel (public API - uses fast estimate) pub fn encode_all_parallel(data: &JsonValue) -> Result> { + encode_all_parallel_internal(data, None) +} + +/// Internal: Encode data with all formats in parallel +fn encode_all_parallel_internal( + data: &JsonValue, + encoding: Option<&str>, +) -> Result> { let formats = ["json", "rows", "columns", "struct"]; // Use rayon to encode all formats in parallel let results: Vec> = formats .par_iter() - .map(|format| encode_with_format(data, format)) + .map(|format| encode_with_format(data, format, encoding)) .collect(); // Collect results, filtering out errors @@ -110,7 +135,7 @@ pub fn encode_all_parallel(data: &JsonValue) -> Result> { format: "json".to_string(), text: text.clone(), header: String::new(), - token_estimate: count_tokens(&text), + token_estimate: count_tokens_for_comparison(&text, encoding), }); } @@ -118,7 +143,11 @@ pub fn encode_all_parallel(data: &JsonValue) -> Result> { } /// Encode data with a specific format -fn encode_with_format(data: &JsonValue, format: &str) -> Result { +fn encode_with_format( + data: &JsonValue, + format: &str, + encoding: Option<&str>, +) -> Result { let (text, header) = match format { "json" => (serde_json::to_string(data)?, String::new()), "rows" => (rows::encode(data, false)?, get_header("rows").to_string()), @@ -133,7 +162,7 @@ fn encode_with_format(data: &JsonValue, format: &str) -> Result _ => return Err(crate::error::AgonError::InvalidFormat(format.to_string())), }; - let token_estimate = count_tokens(&text); + let token_estimate = count_tokens_for_comparison(&text, encoding); Ok(EncodingResult { format: format.to_string(), @@ -197,7 +226,7 @@ mod tests { {"id": 3, "name": 
"Carol", "role": "user"} ]); - let result = encode_auto_parallel(&data, false, 0.0).unwrap(); + let result = encode_auto_parallel(&data, false, 0.0, None).unwrap(); // Should select a non-JSON format for tabular data assert!(!result.text.is_empty()); @@ -209,7 +238,7 @@ mod tests { let data = json!({"simple": "data"}); // With force=true, should never return JSON (if alternatives exist) - let result = encode_auto_parallel(&data, true, 0.0).unwrap(); + let result = encode_auto_parallel(&data, true, 0.0, None).unwrap(); // Result should be valid assert!(!result.text.is_empty()); @@ -220,7 +249,7 @@ mod tests { let data = json!({"a": 1}); // With high min_savings threshold, should fall back to JSON if savings aren't met - let result = encode_auto_parallel(&data, false, 0.99).unwrap(); + let result = encode_auto_parallel(&data, false, 0.99, None).unwrap(); // Should get a valid result regardless assert!(!result.text.is_empty()); @@ -229,7 +258,7 @@ mod tests { #[test] fn test_encode_with_format_json() { let data = json!({"key": "value"}); - let result = encode_with_format(&data, "json").unwrap(); + let result = encode_with_format(&data, "json", None).unwrap(); assert_eq!(result.format, "json"); assert!(result.header.is_empty()); @@ -239,7 +268,7 @@ mod tests { #[test] fn test_encode_with_format_rows() { let data = json!({"name": "test"}); - let result = encode_with_format(&data, "rows").unwrap(); + let result = encode_with_format(&data, "rows", None).unwrap(); assert_eq!(result.format, "rows"); assert_eq!(result.header, "@AGON rows"); @@ -248,7 +277,7 @@ mod tests { #[test] fn test_encode_with_format_columns() { let data = json!([{"id": 1}, {"id": 2}]); - let result = encode_with_format(&data, "columns").unwrap(); + let result = encode_with_format(&data, "columns", None).unwrap(); assert_eq!(result.format, "columns"); assert_eq!(result.header, "@AGON columns"); @@ -257,7 +286,7 @@ mod tests { #[test] fn test_encode_with_format_struct() { let data = json!({"a": {"fmt": 
"1", "raw": 1}}); - let result = encode_with_format(&data, "struct").unwrap(); + let result = encode_with_format(&data, "struct", None).unwrap(); assert_eq!(result.format, "struct"); assert_eq!(result.header, "@AGON struct"); @@ -266,7 +295,7 @@ mod tests { #[test] fn test_encode_with_format_invalid() { let data = json!({}); - let result = encode_with_format(&data, "invalid_format"); + let result = encode_with_format(&data, "invalid_format", None); assert!(result.is_err()); } diff --git a/crates/agon-core/src/formats/rows.rs b/crates/agon-core/src/formats/rows.rs index dd32223..65da203 100644 --- a/crates/agon-core/src/formats/rows.rs +++ b/crates/agon-core/src/formats/rows.rs @@ -189,10 +189,10 @@ fn parse_primitive(s: &str) -> Value { // Number if NUMBER_RE.is_match(s) { if s.contains('.') || s.to_lowercase().contains('e') { - if let Ok(f) = s.parse::() { - if let Some(n) = serde_json::Number::from_f64(f) { - return Value::Number(n); - } + if let Ok(f) = s.parse::() + && let Some(n) = serde_json::Number::from_f64(f) + { + return Value::Number(n); } } else if let Ok(i) = s.parse::() { return Value::Number(i.into()); diff --git a/crates/agon-core/src/formats/struct_fmt.rs b/crates/agon-core/src/formats/struct_fmt.rs index ff67e7e..2570f68 100644 --- a/crates/agon-core/src/formats/struct_fmt.rs +++ b/crates/agon-core/src/formats/struct_fmt.rs @@ -388,22 +388,21 @@ fn encode_array(arr: &[Value], lines: &mut Vec, depth: usize, registry: // If object has nested objects/arrays, use list item format to preserve them let has_nested = obj.values().any(|v| v.is_object() || v.is_array()); - if !has_nested { - if let Some(struct_name) = find_matching_struct(obj, registry) { - if let Some((fields, _, _)) = registry.get(&struct_name) { - let values: Vec = fields - .iter() - .map(|f| obj.get(f).map(format_primitive).unwrap_or_default()) - .collect(); - lines.push(format!( - "{} - {}({})", - indent, - struct_name, - values.join(", ") - )); - continue; - } - } + if !has_nested + 
&& let Some(struct_name) = find_matching_struct(obj, registry) + && let Some((fields, _, _)) = registry.get(&struct_name) + { + let values: Vec = fields + .iter() + .map(|f| obj.get(f).map(format_primitive).unwrap_or_default()) + .collect(); + lines.push(format!( + "{} - {}({})", + indent, + struct_name, + values.join(", ") + )); + continue; } encode_list_item(obj, lines, depth + 1, registry); } else { @@ -430,23 +429,22 @@ fn encode_list_item( first = false; // Check if value can use a struct - if let Some(nested_obj) = v.as_object() { - if let Some(struct_name) = find_matching_struct(nested_obj, registry) { - if let Some((fields, _, _)) = registry.get(&struct_name) { - let values: Vec = fields - .iter() - .map(|f| nested_obj.get(f).map(format_primitive).unwrap_or_default()) - .collect(); - lines.push(format!( - "{}{}: {}({})", - prefix, - k, - struct_name, - values.join(", ") - )); - continue; - } - } + if let Some(nested_obj) = v.as_object() + && let Some(struct_name) = find_matching_struct(nested_obj, registry) + && let Some((fields, _, _)) = registry.get(&struct_name) + { + let values: Vec = fields + .iter() + .map(|f| nested_obj.get(f).map(format_primitive).unwrap_or_default()) + .collect(); + lines.push(format!( + "{}{}: {}({})", + prefix, + k, + struct_name, + values.join(", ") + )); + continue; } match v { @@ -484,23 +482,22 @@ fn encode_object( for (k, v) in obj { // Check if value can use a struct - if let Some(nested_obj) = v.as_object() { - if let Some(struct_name) = find_matching_struct(nested_obj, registry) { - if let Some((fields, _, _)) = registry.get(&struct_name) { - let values: Vec = fields - .iter() - .map(|f| nested_obj.get(f).map(format_primitive).unwrap_or_default()) - .collect(); - lines.push(format!( - "{}{}: {}({})", - actual_indent, - k, - struct_name, - values.join(", ") - )); - continue; - } - } + if let Some(nested_obj) = v.as_object() + && let Some(struct_name) = find_matching_struct(nested_obj, registry) + && let Some((fields, _, 
_)) = registry.get(&struct_name) + { + let values: Vec = fields + .iter() + .map(|f| nested_obj.get(f).map(format_primitive).unwrap_or_default()) + .collect(); + lines.push(format!( + "{}{}: {}({})", + actual_indent, + k, + struct_name, + values.join(", ") + )); + continue; } match v { @@ -584,10 +581,10 @@ fn parse_primitive(s: &str) -> Value { // Number if NUMBER_RE.is_match(s) { if s.contains('.') || s.to_lowercase().contains('e') { - if let Ok(f) = s.parse::() { - if let Some(n) = serde_json::Number::from_f64(f) { - return Value::Number(n); - } + if let Ok(f) = s.parse::() + && let Some(n) = serde_json::Number::from_f64(f) + { + return Value::Number(n); } } else if let Ok(i) = s.parse::() { return Value::Number(i.into()); diff --git a/crates/agon-core/src/lib.rs b/crates/agon-core/src/lib.rs index 3f888a1..f8adf27 100644 --- a/crates/agon-core/src/lib.rs +++ b/crates/agon-core/src/lib.rs @@ -137,7 +137,7 @@ fn build_keep_tree(keep_paths: &[String]) -> KeepTree { .children .entry(key) .or_insert_with(|| Some(Box::new(KeepTree::default()))); - if let Some(ref mut subtree) = entry { + if let Some(subtree) = entry { cur = subtree.as_mut(); } else { // Was None (keep whole), upgrade to subtree @@ -362,14 +362,15 @@ impl EncodingResult { // ============================================================================ #[pyfunction] -#[pyo3(signature = (data, force = false, min_savings = 0.10))] +#[pyo3(signature = (data, force = false, min_savings = 0.10, encoding = None))] fn encode_auto_parallel( data: &Bound<'_, PyAny>, force: bool, min_savings: f64, + encoding: Option<&str>, ) -> PyResult { let value = types::py_to_json(data)?; - let result = formats::encode_auto_parallel(&value, force, min_savings)?; + let result = formats::encode_auto_parallel(&value, force, min_savings, encoding)?; Ok(EncodingResult { format: result.format, text: result.text, @@ -378,6 +379,14 @@ fn encode_auto_parallel( }) } +/// Count tokens using tiktoken encoding +#[pyfunction] 
+#[pyo3(signature = (text, encoding = "o200k_base"))] +fn count_tokens(text: &str, encoding: &str) -> PyResult { + utils::count_tokens(text, encoding) + .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string())) +} + #[pyfunction] fn encode_all_parallel(data: &Bound<'_, PyAny>) -> PyResult> { let value = types::py_to_json(data)?; @@ -406,6 +415,7 @@ fn agon_core(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_function(wrap_pyfunction!(encode_auto_parallel, m)?)?; m.add_function(wrap_pyfunction!(encode_all_parallel, m)?)?; + m.add_function(wrap_pyfunction!(count_tokens, m)?)?; Ok(()) } diff --git a/crates/agon-core/src/types.rs b/crates/agon-core/src/types.rs index 2bc03b4..e0c95d4 100644 --- a/crates/agon-core/src/types.rs +++ b/crates/agon-core/src/types.rs @@ -25,10 +25,10 @@ pub fn py_to_json(obj: &Bound<'_, PyAny>) -> Result { return Ok(JsonValue::Number(n.into())); } // Try as float if i64 doesn't work (large numbers) - if let Ok(f) = obj.extract::() { - if let Some(n) = serde_json::Number::from_f64(f) { - return Ok(JsonValue::Number(n)); - } + if let Ok(f) = obj.extract::() + && let Some(n) = serde_json::Number::from_f64(f) + { + return Ok(JsonValue::Number(n)); } return Err(AgonError::InvalidData("Integer too large".to_string())); } diff --git a/crates/agon-core/src/utils.rs b/crates/agon-core/src/utils.rs index 5bbd3e6..ccd098c 100644 --- a/crates/agon-core/src/utils.rs +++ b/crates/agon-core/src/utils.rs @@ -1,20 +1,56 @@ //! 
Shared utilities for AGON encoding -use std::sync::OnceLock; +use std::collections::HashMap; +use std::sync::{LazyLock, RwLock}; use tiktoken_rs::CoreBPE; -/// Global tokenizer instance (o200k_base - used by GPT-4o/Claude) -static TOKENIZER: OnceLock = OnceLock::new(); +use crate::error::{AgonError, Result}; -fn get_tokenizer() -> &'static CoreBPE { - TOKENIZER.get_or_init(|| { - tiktoken_rs::o200k_base().expect("Failed to initialize o200k_base tokenizer") - }) +/// Cached tokenizer instances by encoding name +static TOKENIZERS: LazyLock>> = + LazyLock::new(|| RwLock::new(HashMap::new())); + +/// Get or create a tokenizer for the given encoding +fn get_tokenizer(encoding: &str) -> Result { + // Check cache first + { + let cache = TOKENIZERS.read().unwrap(); + if let Some(tokenizer) = cache.get(encoding) { + return Ok(tokenizer.clone()); + } + } + + // Create new tokenizer + let tokenizer = match encoding { + "o200k_base" => tiktoken_rs::o200k_base(), + "o200k_harmony" => tiktoken_rs::o200k_harmony(), + "cl100k_base" => tiktoken_rs::cl100k_base(), + "p50k_base" => tiktoken_rs::p50k_base(), + "p50k_edit" => tiktoken_rs::p50k_edit(), + "r50k_base" => tiktoken_rs::r50k_base(), + _ => { + return Err(AgonError::InvalidFormat(format!( + "Unknown encoding: {}", + encoding + ))); + } + } + .map_err(|e| AgonError::EncodingError(e.to_string()))?; + + // Cache it + { + let mut cache = TOKENIZERS.write().unwrap(); + cache.insert(encoding.to_string(), tokenizer.clone()); + } + + Ok(tokenizer) } -/// Count tokens using tiktoken's o200k_base encoding -pub fn count_tokens(text: &str) -> usize { - get_tokenizer().encode_ordinary(text).len() +/// Count tokens using the specified tiktoken encoding +/// Note: This is expensive (~1ms per 10KB). Use only when exact count is needed. 
+pub fn count_tokens(text: &str, encoding: &str) -> Result { + let tokenizer = get_tokenizer(encoding)?; + Ok(tokenizer.encode_ordinary(text).len()) } #[cfg(test)] @@ -23,8 +59,16 @@ mod tests { #[test] fn test_count_tokens() { - assert!(count_tokens("hello world") > 0); - assert!(count_tokens("a longer piece of text") > count_tokens("short")); - assert_eq!(count_tokens(""), 0); + assert!(count_tokens("hello world", "o200k_base").unwrap() > 0); + assert!( + count_tokens("a longer piece of text", "o200k_base").unwrap() + > count_tokens("short", "o200k_base").unwrap() + ); + assert_eq!(count_tokens("", "o200k_base").unwrap(), 0); + } + + #[test] + fn test_count_tokens_invalid_encoding() { + assert!(count_tokens("hello", "invalid_encoding").is_err()); } } diff --git a/pyproject.toml b/pyproject.toml index 2b1e61a..c1c2a32 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,6 @@ classifiers = [ ] dependencies = [ "orjson>=3.11.5", - "tiktoken>=0.5.0", ] @@ -54,6 +53,7 @@ dev = [ "pytest-cov>=4.0.0", "pytest-sugar>=1.0.0", "pytest-xdist>=3.8.0", + "tiktoken>=0.5.0", # For benchmark token counting # Code quality "ruff>=0.11.9", "basedpyright>=1.29.1", diff --git a/python/agon/__init__.py b/python/agon/__init__.py index 0729578..81034cc 100644 --- a/python/agon/__init__.py +++ b/python/agon/__init__.py @@ -3,7 +3,9 @@ A self-describing, token-efficient data interchange format optimized for LLMs. 
""" -# Re-export Rust format classes (inherit from AGONFormat) +from importlib.metadata import version + +# Re-export Rust format classes from agon.agon_core import ( AGONColumns, AGONFormat, @@ -13,7 +15,7 @@ encode_all_parallel, encode_auto_parallel, ) -from agon.core import AGON, AGONEncoding, Format +from agon.core import AGON, AGONEncoding, Encoding, Format from agon.errors import AGONError __all__ = [ @@ -24,9 +26,10 @@ "AGONFormat", "AGONRows", "AGONStruct", + "Encoding", "EncodingResult", "Format", "encode_all_parallel", "encode_auto_parallel", ] -__version__ = "0.1.0" +__version__ = version("agon-python") diff --git a/python/agon/core.py b/python/agon/core.py index 61a37c3..6bf5649 100644 --- a/python/agon/core.py +++ b/python/agon/core.py @@ -16,14 +16,24 @@ # Rust py03 bindings from agon.agon_core import AGONColumns, AGONFormat, AGONRows, AGONStruct +from agon.agon_core import count_tokens as _rs_count_tokens from agon.agon_core import encode_auto_parallel as _rs_encode_auto_parallel -from agon.encoding import DEFAULT_ENCODING, count_tokens from agon.errors import AGONError # Type aliases Format = Literal["auto", "json", "rows", "columns", "struct"] ConcreteFormat = Literal["json", "rows", "columns", "struct"] +# Tiktoken encodings supported by tiktoken_rs +Encoding = Literal[ + "o200k_base", # GPT-4o, o1, o3 + "o200k_harmony", # GPT-OSS + "cl100k_base", # GPT-4, GPT-3.5-turbo + "p50k_base", # Codex, text-davinci-003 + "p50k_edit", # text-davinci-edit-001 + "r50k_base", # GPT-3 (davinci, curie, babbage, ada) +] + @dataclass(frozen=True) class AGONEncoding: @@ -61,6 +71,35 @@ def with_header(self) -> str: return self.text return f"{self.header}\n\n{self.text}" + def hint(self) -> str: + """Get a prescriptive hint instructing LLMs how to generate this format. + + NOTE: LLMs have not been trained on AGON, so generation accuracy cannot + be guaranteed. Use hints when asking LLMs to return AGON-formatted data, + but validate the output. 
Prefer sending AGON to LLMs (reliable) over + asking LLMs to generate AGON (experimental). + + Returns: + A short prescriptive hint instructing how to generate the format. + + Example: + >>> result = AGON.encode(data, format="auto") + >>> result.hint() + 'Return in AGON rows format: Start with @AGON rows header...' + """ + match self.format: + case "rows": + return AGONRows.hint() + case "columns": + return AGONColumns.hint() + case "struct": + return AGONStruct.hint() + case "json": + return "JSON: Standard compact JSON encoding" + case _: + msg = f"Unknown format: {self.format}" + raise AGONError(msg) + class AGON: """Self-describing encoder/decoder for AGON formats. @@ -111,7 +150,7 @@ def encode( format: Format = "auto", force: bool = False, min_savings: float = 0.10, - encoding: str = DEFAULT_ENCODING, + encoding: Encoding | None = None, ) -> AGONEncoding: """Encode data to the most token-efficient AGON format. @@ -125,7 +164,9 @@ def encode( - "struct": AGONStruct template format for repeated shapes force: If True with format="auto", always use a non-JSON format. min_savings: Minimum token savings ratio vs JSON to use non-JSON format. - encoding: Tiktoken encoding for token counting (default: o200k_base). + encoding: Tiktoken encoding for token counting. If None (default), + uses fast byte-length estimation. Set to "o200k_base" for accurate + token counts (slower). See `Encoding` type for options. 
Returns: EncodingResult containing: @@ -147,7 +188,8 @@ def encode( return AGONEncoding(format, text, header) # format == "auto": use Rust for fast parallel encoding and format selection - result = _rs_encode_auto_parallel(data, force, min_savings) + # encoding=None means use fast byte-length estimate, otherwise use specified tiktoken encoding + result = _rs_encode_auto_parallel(data, force, min_savings, encoding) selected_format = cast("ConcreteFormat", result.format) header = AGON._headers[selected_format] return AGONEncoding(selected_format, result.text, header) @@ -236,50 +278,20 @@ def project_data(data: list[dict[str, Any]], keep_paths: list[str]) -> list[dict return AGONFormat.project_data(data, keep_paths) @staticmethod - def hint(result_or_format: AGONEncoding | ConcreteFormat) -> str: - """Get a prescriptive hint instructing LLMs how to generate AGON format. + def count_tokens(text: str, *, encoding: Encoding = "o200k_base") -> int: + """Count tokens in text using the specified tiktoken encoding. - NOTE: LLMs have not been trained on AGON, so generation accuracy cannot - be guaranteed. Use hints when asking LLMs to return AGON-formatted data, - but validate the output. Prefer sending AGON to LLMs (reliable) over - asking LLMs to generate AGON (experimental). + Uses the Rust tiktoken implementation for performance. Args: - result_or_format: AGONEncoding result or format name ("text", "columns", - "struct", "json"). Returns generation instructions for that format. + text: Text to count tokens in. + encoding: Tiktoken encoding name. See `Encoding` type for options. + Default is "o200k_base" (GPT-4o). Use "cl100k_base" for GPT-4/GPT-3.5-turbo. Returns: - A short prescriptive hint instructing how to generate the format. + Number of tokens in the text. 
- Example: - >>> result = AGON.encode(data, format="auto") - >>> AGON.hint(result) # Generation instruction for selected format - 'Return in AGON rows format: Start with @AGON rows header, encode arrays as name[N]{fields} with tab-delimited rows' - >>> AGON.hint("columns") # Generation instruction for columns format - 'Return in AGON columns format: Start with @AGON columns header, transpose arrays to name[N] with ├/└ field: val1, val2, ...' + Raises: + ValueError: If the encoding is not supported. """ - # Extract format if AGONEncoding was passed - format_name = ( - result_or_format.format - if isinstance(result_or_format, AGONEncoding) - else result_or_format - ) - - # Return hint for specific format - match format_name: - case "rows": - return AGONRows.hint() - case "columns": - return AGONColumns.hint() - case "struct": - return AGONStruct.hint() - case "json": - return "JSON: Standard compact JSON encoding" - case _: - msg = f"Unknown format: {format_name}" - raise AGONError(msg) - - @staticmethod - def count_tokens(text: str, *, encoding: str = DEFAULT_ENCODING) -> int: - """Count tokens in text using the specified encoding.""" - return count_tokens(text, encoding=encoding) + return _rs_count_tokens(text, encoding) diff --git a/python/agon/encoding.py b/python/agon/encoding.py deleted file mode 100644 index 424bd3e..0000000 --- a/python/agon/encoding.py +++ /dev/null @@ -1,21 +0,0 @@ -"""Shared token counting utilities for AGON.""" - -from __future__ import annotations - -from functools import lru_cache - -import tiktoken - -DEFAULT_ENCODING = "o200k_base" - - -@lru_cache(maxsize=16) -def get_encoding(name: str = DEFAULT_ENCODING) -> tiktoken.Encoding: - """Get a tiktoken encoding by name (cached).""" - return tiktoken.get_encoding(name) - - -def count_tokens(text: str, *, encoding: str = DEFAULT_ENCODING) -> int: - """Count tokens in text using the specified encoding.""" - enc = get_encoding(encoding) - return len(enc.encode(text)) diff --git 
a/tests/test_benchmarks.py b/tests/test_benchmarks.py index 095fef2..157d350 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -126,26 +126,33 @@ def test_fixture_benchmark(fixture_path: Path) -> None: format_results[fmt] = (tokens, savings, encode_ms, decode_ms) - # Test auto selection + # Test auto selection with timing + t0 = time.perf_counter() result = AGON.encode(records, format="auto") + auto_encode_ms = (time.perf_counter() - t0) * 1000 auto_tokens = count_tokens(result.text) auto_savings = (1 - auto_tokens / max(1, raw_tokens)) * 100 # Verify auto decode (decode AGONEncoding directly) + t0 = time.perf_counter() decoded = AGON.decode(result) + auto_decode_ms = (time.perf_counter() - t0) * 1000 assert normalize_floats(decoded) == normalize_floats(records), "auto roundtrip failed" # Print results record_count = len(records) if isinstance(records, list) else 1 - print(f"\n{'=' * 60}") + print(f"\n{'=' * 70}") print(f"FIXTURE: {label}") - print(f"Bytes: {fixture_path.stat().st_size:,} Records: {record_count:,}") + print(f"Size: {fixture_path.stat().st_size / 1024:.1f} KB Records: {record_count:,}") print(f"JSON baseline (pretty): {raw_tokens:,} tokens") - print(f"{'-' * 60}") - print(f"{'Format':<10} {'Tokens':>8} {'Savings':>10} {'Encode':>10} {'Decode':>10}") - print(f"{'-' * 60}") + print(f"{'-' * 70}") + print(f"{'Format':<10} {'Tokens':>8} {'Savings':>10} {'Encode':>12} {'Decode':>12}") + print(f"{'-' * 70}") for fmt, (tokens, savings, enc_ms, dec_ms) in format_results.items(): - print(f"{fmt:<10} {tokens:>8,} {savings:>+9.1f}% {enc_ms:>9.2f}ms {dec_ms:>9.2f}ms") - print(f"{'-' * 60}") - print(f"{'auto':<10} {auto_tokens:>8,} {auto_savings:>+9.1f}% (selected: {result.format})") - print(f"{'=' * 60}") + print(f"{fmt:<10} {tokens:>8,} {savings:>+9.1f}% {enc_ms:>11.2f}ms {dec_ms:>11.2f}ms") + print(f"{'-' * 70}") + print( + f"{'auto':<10} {auto_tokens:>8,} {auto_savings:>+9.1f}% {auto_encode_ms:>11.2f}ms {auto_decode_ms:>11.2f}ms" + ) + 
print(f"Selected: {result.format}") + print(f"{'=' * 70}") diff --git a/tests/test_core.py b/tests/test_core.py index 4ffe452..ada6d28 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -94,64 +94,62 @@ def test_project_data_delegates() -> None: def test_hint_with_agon_encoding_result() -> None: - """hint() should accept AGONEncoding and return prescriptive generation instructions.""" + """AGONEncoding.hint() should return prescriptive generation instructions.""" data = [{"id": 1, "name": "Alice"}] result = AGON.encode(data, format="rows") - hint = AGON.hint(result) + hint = result.hint() assert isinstance(hint, str) assert "Return in AGON rows format" in hint assert "@AGON rows header" in hint -def test_hint_with_format_string_rows() -> None: - """hint() should accept format string and return generation instructions.""" - hint = AGON.hint("rows") +def test_hint_rows_format() -> None: + """hint() should return rows format instructions.""" + result = AGON.encode({"a": 1}, format="rows") + hint = result.hint() assert isinstance(hint, str) assert "Return in AGON rows format" in hint assert "@AGON rows header" in hint assert "name[N]{fields}" in hint -def test_hint_with_format_string_columns() -> None: - """hint() should return prescriptive columns format instructions.""" - hint = AGON.hint("columns") +def test_hint_columns_format() -> None: + """hint() should return columns format instructions.""" + result = AGON.encode([{"id": 1}], format="columns") + hint = result.hint() assert isinstance(hint, str) assert "Return in AGON columns format" in hint assert "@AGON columns header" in hint assert "├/└" in hint -def test_hint_with_format_string_struct() -> None: - """hint() should return prescriptive struct format instructions.""" - hint = AGON.hint("struct") +def test_hint_struct_format() -> None: + """hint() should return struct format instructions.""" + result = AGON.encode({"a": {"fmt": "1", "raw": 1}}, format="struct") + hint = result.hint() assert 
isinstance(hint, str) assert "Return in AGON struct format" in hint assert "@AGON struct header" in hint assert "@Struct" in hint or "Struct(" in hint -def test_hint_with_format_string_json() -> None: +def test_hint_json_format() -> None: """hint() should return JSON format hint.""" - hint = AGON.hint("json") + result = AGON.encode({"a": 1}, format="json") + hint = result.hint() assert isinstance(hint, str) assert "JSON" in hint -def test_hint_with_unknown_format_raises() -> None: - """hint() should raise AGONError for unknown format.""" - with pytest.raises(AGONError, match="Unknown format"): - AGON.hint("invalid_format") # type: ignore[arg-type] - - -def test_hint_matches_encoding_format() -> None: - """hint() should return matching hint for different encoded formats.""" +def test_hint_matches_across_formats() -> None: + """hint() should return consistent hints for each format.""" data = [{"id": 1, "name": "Alice"}] for fmt in ["rows", "columns", "struct", "json"]: result = AGON.encode(data, format=fmt) # type: ignore[arg-type] - hint_from_result = AGON.hint(result) - hint_from_string = AGON.hint(fmt) # type: ignore[arg-type] - assert hint_from_result == hint_from_string + hint = result.hint() + assert isinstance(hint, str) + assert len(hint) > 0 def test_count_tokens_positive() -> None: @@ -249,6 +247,59 @@ def test_encode_reports_json_fallback() -> None: assert res.text.startswith("[") +def test_encode_with_encoding_none_uses_fast_estimate() -> None: + """encoding=None (default) uses fast byte-length estimate.""" + data = [{"id": i, "name": f"User{i}"} for i in range(10)] + result = AGON.encode(data, format="auto", encoding=None) + assert result.format in ("json", "rows", "columns", "struct") + assert len(result.text) > 0 + + +def test_encode_with_encoding_specified_uses_tiktoken() -> None: + """encoding='o200k_base' uses tiktoken for accurate token counting.""" + data = [{"id": i, "name": f"User{i}"} for i in range(10)] + result = AGON.encode(data, 
format="auto", encoding="o200k_base") + assert result.format in ("json", "rows", "columns", "struct") + assert len(result.text) > 0 + + +def test_encode_both_encoding_modes_produce_valid_results() -> None: + """Both encoding modes should produce decodable results.""" + data = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}] + + # Fast estimate (default) + result_fast = AGON.encode(data, format="auto", encoding=None) + decoded_fast = AGON.decode(result_fast) + assert decoded_fast == data + + # Tiktoken + result_tiktoken = AGON.encode(data, format="auto", encoding="o200k_base") + decoded_tiktoken = AGON.decode(result_tiktoken) + assert decoded_tiktoken == data + + +def test_count_tokens_with_default_encoding() -> None: + """count_tokens uses o200k_base by default.""" + tokens = AGON.count_tokens("hello world") + assert tokens > 0 + assert isinstance(tokens, int) + + +def test_count_tokens_with_different_encodings() -> None: + """count_tokens supports multiple tiktoken encodings.""" + text = "The quick brown fox jumps over the lazy dog." 
+ + # Different encodings may produce different token counts + o200k = AGON.count_tokens(text, encoding="o200k_base") + cl100k = AGON.count_tokens(text, encoding="cl100k_base") + + assert o200k > 0 + assert cl100k > 0 + # Token counts may differ between encodings + assert isinstance(o200k, int) + assert isinstance(cl100k, int) + + def test_agon_encoding_str_returns_text() -> None: """AGONEncoding str() returns the encoded text.""" data = [{"id": 1}] From 33dde812ac65391b1ca5340839dbfdaccf066851 Mon Sep 17 00:00:00 2001 From: harvey Date: Thu, 25 Dec 2025 16:56:50 -0500 Subject: [PATCH 6/7] docs: updates documentation --- README.md | 195 +++++++++++++-------- docs/api.md | 194 ++++++++++----------- docs/benchmarks.md | 185 +++++++++++++------- docs/concepts.md | 28 +-- docs/formats/columns.md | 22 +-- docs/formats/json.md | 6 +- docs/formats/{text.md => rows.md} | 62 +++---- docs/formats/struct.md | 18 +- docs/index.md | 271 +++++++++++++++++++----------- docs/javascripts/charts.js | 2 +- mkdocs.yml | 3 +- tests/test_benchmarks.py | 18 ++ 12 files changed, 602 insertions(+), 402 deletions(-) rename docs/formats/{text.md => rows.md} (84%) diff --git a/README.md b/README.md index cfed7ea..944bb61 100644 --- a/README.md +++ b/README.md @@ -5,9 +5,15 @@ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![CI](https://github.com/Verdenroz/agon-python/actions/workflows/ci.yml/badge.svg)](https://github.com/Verdenroz/agon-python/actions/workflows/ci.yml) [![codecov](https://codecov.io/gh/Verdenroz/agon-python/branch/master/graph/badge.svg)](https://codecov.io/gh/Verdenroz/agon-python) +[![Rust](https://img.shields.io/badge/rust-%23000000.svg?style=flat&logo=rust&logoColor=white)](https://www.rust-lang.org/) +[![PyO3](https://img.shields.io/badge/PyO3-v0.27-blue)](https://pyo3.rs/) 
+[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) +[![Documentation](https://img.shields.io/badge/docs-mkdocs-blue)](https://Verdenroz.github.io/agon-python/) **Adaptive Guarded Object Notation** - a self-describing, multi-format JSON encoding optimized for LLM prompts with one guarantee: **never worse than JSON**. +📚 **[Full Documentation](https://Verdenroz.github.io/agon-python/)** | 🚀 **[Quick Start](#quick-start)** | ⚡ **[Benchmarks](#benchmarks)** + ## Table of Contents - [Why AGON?](#why-agon) @@ -32,7 +38,7 @@ ```python result = AGON.encode(data, format="auto") -# Auto tries: text, columns, struct +# Auto tries: rows, columns, struct # Returns: whichever saves the most tokens # Falls back: to compact JSON if none are better ``` @@ -74,9 +80,9 @@ data = [ {"id": 3, "name": "Charlie", "role": "user"}, ] -# Encode with auto-selection (tries text/columns/struct, picks best or falls back to JSON) +# Encode with auto-selection (tries rows/columns/struct, picks best or falls back to JSON) result = AGON.encode(data, format="auto") -print(f"Selected format: {result.format}") # → "text" +print(f"Selected format: {result.format}") # → "rows" print(f"Encoded output:\n{result}") # Outputs clean format WITHOUT @AGON header: # [3]{id name role} @@ -115,11 +121,11 @@ data = [ result = AGON.encode(data, format="auto") # To ask an LLM to respond in AGON format, provide both: -# 1. Generation instructions via AGON.hint(result) +# 1. Generation instructions via result.hint() # 2. An example with header via result.with_header() prompt = f"""Analyze this user data and return enriched data in AGON format. 
-Instructions: {AGON.hint(result)} +Instructions: {result.hint()} Example output: {result.with_header()} @@ -127,7 +133,7 @@ Example output: Task: Add an is_admin boolean field and return in the same format.""" # Example LLM response (hypothetical - accuracy not guaranteed) -llm_response = """@AGON text +llm_response = """@AGON rows [3]{name role is_admin} Alice admin true @@ -146,12 +152,12 @@ print(f"Admin percentage: {admin_count / len(parsed) * 100:.1f}%") # → 33.3% ## How It Works -AGON provides three specialized **repetition-aware** encoding formats that are friendly to LLMs: +AGON provides three specialized **repetition-aware** encoding formats that are friendly to LLMs, powered by a **high-performance Rust core** for minimal latency: ### The Three Formats -1. **AGONText**: Row-based tabular encoding for arrays of uniform objects - - Similar to TOON's approach +1. **AGONRows**: Row-based tabular encoding for arrays of uniform objects + - Similar to [TOON format](https://github.com/toon-format/toon) - Best for: Uniform arrays with consistent fields - Example: User lists, transaction logs, simple metrics @@ -161,10 +167,17 @@ AGON provides three specialized **repetition-aware** encoding formats that are f - Example: Financial data with 20+ fields per record 3. 
**AGONStruct**: Template-based encoding for repeated nested patterns - - Declares struct templates (e.g., `S(...)`) once, reuses with values + - Similar to [TRON format](https://github.com/tron-format/tron-javascript) but with abbreviated struct names - Best for: Complex nested objects with repeated shapes - Example: Market data with nested `{fmt, raw}` or `{value, timestamp}` patterns +### Rust-Powered Performance + +AGON's core encoding/decoding is implemented in **Rust** with **PyO3** bindings, delivering: + +- **Parallel format selection**: Auto mode uses [Rayon](https://github.com/rayon-rs/rayon) to encode all formats concurrently +- **Native Python integration**: Format classes (`AGONRows`, `AGONColumns`, `AGONStruct`) exposed as Python objects via PyO3 + ### Adaptive Auto Mode ```python @@ -173,10 +186,10 @@ result = AGON.encode(data, format="auto") **How `auto` works:** -1. **Try all formats**: Encodes data with text, columns, struct +1. **Try all formats in parallel**: Rust encodes rows, columns, struct concurrently 2. **Count tokens**: Measures each encoding's token count 3. **Compare to JSON**: Calculates savings vs compact JSON baseline -4. **Apply threshold**: Requires minimum savings (default 5%) to use specialized format +4. **Apply threshold**: Requires minimum savings (default 10%) to use specialized format 5. **Select winner**: Returns format with best savings, or JSON if none meet threshold **The guarantee:** Auto mode *never* returns a format with more tokens than compact JSON. If all specialized formats are worse or marginally better, it returns JSON. 
@@ -184,12 +197,12 @@ result = AGON.encode(data, format="auto") **Example decision tree:** ``` Data shape analysis: - → Text: 96 tokens (30.9% better than JSON) ✅ Winner + → Rows: 96 tokens (30.9% better than JSON) ✅ Winner → Columns: 108 tokens (22.3% better than JSON) ❌ Not optimal → Struct: 130 tokens (6.5% better than JSON) ❌ Not optimal → JSON: 139 tokens (baseline) ❌ Fallback -Decision: Use text (best savings, exceeds 10% threshold) +Decision: Use rows (best savings, exceeds 10% threshold) ``` All non-JSON encodings start with an `@AGON ...` header so they can be decoded later. @@ -227,10 +240,10 @@ This example demonstrates encoding a list of hiking records with nested context | **JSON (pretty)** | 229 | — (baseline) | -64.7% 📉 | | | **JSON (compact)** | 139 | +39.3% ✅ | — (baseline) | | | **TOON** | 96 | **+58.1%** ✅ | **+30.9%** ✅ | | -| **AGON text** | 96 | **+58.1%** ✅ | **+30.9%** ✅ | Tied with TOON | +| **AGON rows** | 96 | **+58.1%** ✅ | **+30.9%** ✅ | Tied with TOON | | **AGON columns** | 108 | **+52.8%** ✅ | **+22.3%** ✅ | | | **AGON struct** | 130 | **+43.2%** ✅ | **+6.5%** ✅ | | -| **AGON auto** | **96** | **+58.1%** ✅ | **+30.9%** ✅ | **Winner** (selected `text`) | +| **AGON auto** | **96** | **+58.1%** ✅ | **+30.9%** ✅ | **Winner** (selected `rows`) | ### Format Encodings with Explanations @@ -240,18 +253,18 @@ context: task: Our favorite hikes together location: Boulder season: spring_2025 -friends[3]: ana luis sam -hikes[3]{id name distanceKm elevationGain companion wasSunny} -1 Blue Lake Trail 7.5 320 ana true -2 Ridge Overlook 9.2 540 luis false -3 Wildflower Loop 5.1 180 sam true +friends[3]: ana,luis,sam +hikes[3]{id,name,distanceKm,elevationGain,companion,wasSunny}: + 1,Blue Lake Trail,7.5,320,ana,true + 2,Ridge Overlook,9.2,540,luis,false + 3,Wildflower Loop,5.1,180,sam,true ``` -**How it works:** TOON uses YAML-like indentation for nested objects and CSV-style tab-delimited rows for arrays. 
The `[3]` declares array length and `{fields}` lists column headers—giving LLMs explicit structure to validate against. +**How it works:** TOON uses YAML-like indentation for nested objects and **comma-delimited** rows for arrays. The `[3]` declares array length and `{fields}` lists column headers—giving LLMs explicit structure to validate against. --- -**AGON text (96 tokens, +58.1% savings - identical to TOON!):** +**AGON rows (96 tokens, +58.1% savings - nearly identical to TOON!):** ``` context: task: Our favorite hikes together @@ -264,7 +277,7 @@ hikes[3]{id name distanceKm elevationGain companion wasSunny} 3 Wildflower Loop 5.1 180 sam true ``` -**Why auto selected this:** AGON's text format produces identical output to TOON for uniform arrays. Auto mode tried all three formats and chose text because it had the lowest token count (96 vs 108 for columns vs 130 for struct). +**How it works:** AGON rows uses the same structure as TOON but with **tab-delimited** rows instead of commas. Both achieve identical token counts (96 tokens) because the delimiter choice doesn't significantly affect tokenization. Auto mode chose rows because it had the lowest token count (96 vs 108 for columns vs 144 for struct). --- @@ -284,28 +297,31 @@ hikes[3] └ wasSunny: true false true ``` -**How it works:** Columnar format transposes the data, grouping same-type values together. This can be more token-efficient for wide tables (20+ columns) or numeric-heavy data where type clustering improves compression. Not selected here because text format is better for this data shape. +**How it works:** Columnar format transposes the data, grouping same-type values together. This can be more token-efficient for wide tables (20+ columns) or numeric-heavy data where type clustering improves compression. Not selected here because rows format is better for this data shape.
--- -**AGON struct (130 tokens, +43.2% savings):** +**AGON struct (144 tokens, +37.1% savings):** ``` -@S: companion, distanceKm, elevationGain, id, name, wasSunny +@CDEI: companion, distanceKm, elevationGain, id, name, wasSunny context: task: Our favorite hikes together location: Boulder season: spring_2025 -friends: +friends [3]: - ana - luis - sam -hikes: - [3]: S(ana, 7.5, 320, 1, Blue Lake Trail, true), S(luis, 9.2, 540, 2, Ridge Overlook, false), S(sam, 5.1, 180, 3, Wildflower Loop, true) +hikes + [3]: + - CDEI(ana, 7.5, 320, 1, Blue Lake Trail, true) + - CDEI(luis, 9.2, 540, 2, Ridge Overlook, false) + - CDEI(sam, 5.1, 180, 3, Wildflower Loop, true) ``` -**How it works:** Struct format declares reusable templates (`S`) once at the top, then references them with just values. Excels at deeply nested data with repeated patterns. Not optimal here because the hikes array is already flat—text format is more efficient. +**How it works:** Struct format declares reusable templates (`@CDEI: fields`) once at the top, then instantiates them with just values `CDEI(...)`. The struct name is generated from the first letter of each field (Companion, DistanceKm, ElevationGain, Id → CDEI). ### When AGON Falls Back to JSON @@ -315,14 +331,14 @@ But what about data where specialized formats don't provide enough benefit? Let' |--------|--------|------------------------|----------| | **JSON (pretty)** | 142,791 | — (baseline) | | | **JSON (compact)** | 91,634 | **+35.8%** ✅ | | -| **AGON text** | 113,132 | **+20.8%** ✅ | | +| **AGON rows** | 113,132 | **+20.8%** ✅ | | | **AGON columns** | 113,132 | **+20.8%** ✅ | | | **AGON struct** | 89,011 | **+37.7%** ✅ (best format!) | | | **AGON auto** | **91,634** | **+35.8%** (returned compact JSON) | ✅ **Safe choice** | **AGON's safety net in action:** Even though `struct` format achieved the best savings (37.7%), when compared against *compact* JSON (the real alternative), struct only saved 2.9%—below the minimum threshold (default 10%). 
Rather than risk the encoding overhead for marginal gains, `auto` returned compact JSON, guaranteeing excellent performance with zero complexity. -**Key insight:** Text/columns formats actually *hurt* compared to compact JSON (113K vs 91K tokens), but `auto` intelligently avoided them. And while struct was marginally better, the gains weren't worth the format overhead. +**Key insight:** Rows/columns formats actually *hurt* compared to compact JSON (113K vs 91K tokens), but `auto` intelligently avoided them. And while struct was marginally better, the gains weren't worth the format overhead. **With AGON:** You get compact JSON back (35.8% better than pretty), paying zero format complexity, with zero risk. @@ -336,7 +352,7 @@ AGON excels in scenarios where data structure varies and intelligent format sele **When AGON helps most:** - Repeated nested patterns (AGONStruct: up to 49% savings vs pretty JSON) -- Uniform arrays (AGONText: up to 58% savings vs pretty JSON) +- Uniform arrays (AGONRows: up to 58% savings vs pretty JSON) - Mixed data types where adaptive selection matters **When AGON helps least:** @@ -348,11 +364,16 @@ AGON excels in scenarios where data structure varies and intelligent format sele ### Encoding ```python -# Auto (recommended) +from agon import AGON, Encoding + +# Auto (recommended) - uses fast byte-length estimation result = AGON.encode(data) +# Auto with accurate token counting (slower but precise) +result = AGON.encode(data, encoding="o200k_base") # or "cl100k_base", "p50k_base", etc. 
+ # Choose a specific format -result = AGON.encode(data, format="text") +result = AGON.encode(data, format="rows") result = AGON.encode(data, format="columns") result = AGON.encode(data, format="struct") result = AGON.encode(data, format="json") @@ -365,11 +386,40 @@ result = AGON.encode(data, format="auto", min_savings=0.10) # require 10% savin ### Decoding ```python -# Auto-detect by header +# Decode AGONEncoding directly +result = AGON.encode(data, format="rows") +decoded = AGON.decode(result) + +# Decode string with auto-detection by header decoded = AGON.decode(payload_with_header) -# Or decode with an explicit format (header not required) -decoded = AGON.decode(payload_without_header, format="text") +# Decode string with explicit format (header not required) +decoded = AGON.decode(payload_without_header, format="rows") +``` + +### AGONEncoding Methods + +```python +result = AGON.encode(data, format="auto") + +# Get the encoded text (for use in LLM prompts) +text = str(result) # or just use result directly in f-strings +text = result.text # explicit access + +# Get character count +length = len(result) + +# Get format that was selected +format_used = result.format # "rows", "columns", "struct", or "json" + +# Get format header +header = result.header # "@AGON rows", "@AGON columns", etc. 
+ +# Get text with header prepended (for auto-detect decoding) +with_header = result.with_header() + +# Get generation instructions for LLMs +hint = result.hint() ``` ### Helpers @@ -378,14 +428,9 @@ decoded = AGON.decode(payload_without_header, format="text") # Keep only specific fields (supports dotted paths like "user.profile.name" or "quotes.symbol") projected = AGON.project_data(data, ["id", "name"]) -# Get prescriptive generation instructions for LLMs (when asking LLMs to return AGON format) -result = AGON.encode(data, format="auto") -hint = AGON.hint(result) # Instructions for the selected format -# or -hint = AGON.hint("text") # Instructions for a specific format - -# Token counting helper -tokens = AGON.count_tokens("hello world") +# Token counting helper (uses Rust tiktoken implementation) +tokens = AGON.count_tokens("hello world") # default: o200k_base +tokens = AGON.count_tokens("hello world", encoding="cl100k_base") # GPT-4/3.5-turbo ``` ## Development @@ -430,34 +475,48 @@ make docs ## Benchmarks -AGON's adaptive approach yields variable results depending on data structure, demonstrating its intelligent format selection: +AGON's adaptive approach yields variable results depending on data structure and format used. Benchmarks on actual test fixtures from [`tests/data/`](tests/data/). 
-### Real-World Results +### Performance -Benchmarks on actual test fixtures from [`tests/data/`](tests/data/), showing token counts for all formats: +Encoding and decoding times for all formats across all datasets: -| Dataset | Type | JSON Pretty | JSON Compact | Text | Columns | Struct | **Auto** | **Selected** | -|---------|------|-------------|--------------|------|---------|--------|----------|--------------| -| [`toon.json`](tests/data/toon.json) | Hiking records (nested) | 229 | 139 (+39.3%) | 96 (+58.1%) | 108 (+52.8%) | 130 (+43.2%) | **96** | **text** | -| [`chart.json`](tests/data/chart.json) | 1,256 candles | 101,767 | 71,623 (+29.6%) | 51,541 (+49.4%) | 51,558 (+49.3%) | 61,595 (+39.5%) | **51,541** | **text** | -| [`quote.json`](tests/data/quote.json) | Single quote (nested) | 128,981 | 85,956 (+33.4%) | 67,251 (+47.9%) | 65,586 (+49.2%) | 65,698 (+49.1%) | **65,586** | **columns** | -| [`128KB.json`](tests/data/128KB.json) | 788 employee records | 77,346 | 62,378 (+19.4%) | 54,622 (+29.4%) | 54,292 (+29.8%) | 56,772 (+26.6%) | **54,292** | **columns** | -| [`historical.json`](tests/data/historical.json) | Historical OHLCV data | 84,094 | 55,228 (+34.3%) | 70,286 (+16.4%) | 70,286 (+16.4%) | 47,713 (+43.3%) | **47,713** | **struct** | -| [`gainers.json`](tests/data/gainers.json) | 100 complex quotes | 142,791 | 91,634 (+35.8%) | 113,132 (+20.8%) | 113,132 (+20.8%) | 89,011 (+37.7%) | **91,634** | **json** ⚠️ | -| [`scars.json`](tests/data/scars.json) | Error records | 2,600 | 2,144 (+17.5%) | 2,225 (+14.4%) | 2,230 (+14.2%) | 2,437 (+6.3%) | **2,144** | **json** ⚠️ | +| Dataset | Size | Records | JSON | Rows | Columns | Struct | Auto (selected) | +|---------|------|---------|------|------|---------|--------|-----------------| +| [toon.json](tests/data/toon.json) | 0.6 KB | 1 | 0.00 / 0.01 ms | 0.10 / 0.30 ms | 0.09 / 0.12 ms | 0.14 / 0.29 ms | **0.40 / 0.48 ms** (rows) | +| [scars.json](tests/data/scars.json) | 9.8 KB | 1 | 0.01 / 0.05 ms | 0.56 / 
3.26 ms | 0.51 / 0.76 ms | 0.64 / 3.20 ms | **1.65 / 0.11 ms** (json) | +| [128KB.json](tests/data/128KB.json) | 249 KB | 788 | 0.16 / 0.91 ms | 16.82 / 22.68 ms | 14.10 / 17.28 ms | 19.49 / 60.26 ms | **27.94 / 19.91 ms** (rows) | +| [historical.json](tests/data/historical.json) | 127 KB | 1 | 1.05 / 2.50 ms | 20.72 / 131.49 ms | 21.09 / 30.78 ms | 31.90 / 68.84 ms | **36.22 / 68.35 ms** (struct) | +| [chart.json](tests/data/chart.json) | 196 KB | 1,256 | 0.50 / 1.30 ms | 26.46 / 33.20 ms | 25.27 / 31.50 ms | 35.97 / 57.79 ms | **36.55 / 33.39 ms** (rows) | +| [quote.json](tests/data/quote.json) | 283 KB | 1 | 0.62 / 1.91 ms | 47.15 / 92.92 ms | 42.86 / 52.45 ms | 67.44 / 102.22 ms | **63.21 / 45.21 ms** (columns) | +| [gainers.json](tests/data/gainers.json) | 257 KB | 100 | 0.72 / 2.06 ms | 47.46 / 241.39 ms | 42.46 / 68.67 ms | 62.38 / 139.56 ms | **71.10 / 141.88 ms** (struct) | -**Key insights from the data:** -- **text** format excels at uniform arrays (toon, chart) -- **columns** format wins for wide tables with many fields (quote, 128KB) -- **struct** format dominates deeply nested repeated patterns -- **json** fallback returns compact JSON when specialized formats don't meet `min_savings` threshold +### Token Efficiency +| Dataset | Type | JSON Pretty | JSON Compact | Rows | Columns | Struct | **Auto** | **Selected** | +|---------|------|-------------|--------------|------|---------|--------|----------|--------------| +| [toon.json](tests/data/toon.json) | Hiking records (nested) | 229 | 139 (+39.3%) | 96 (+58.1%) | 108 (+52.8%) | 144 (+37.1%) | **96** | **rows** | +| [scars.json](tests/data/scars.json) | Error records | 2,600 | 2,144 (+17.5%) | 2,225 (+14.4%) | 2,230 (+14.2%) | 2,448 (+5.8%) | **2,144** | **json** ⚠️ | +| [128KB.json](tests/data/128KB.json) | 788 employee records | 77,346 | 62,378 (+19.4%) | 54,622 (+29.4%) | 54,292 (+29.8%) | 59,926 (+22.5%) | **54,622** | **rows** | +| [historical.json](tests/data/historical.json) | Historical OHLCV 
data | 84,094 | 55,228 (+34.3%) | 70,286 (+16.4%) | 70,286 (+16.4%) | 48,969 (+41.8%) | **48,969** | **struct** | +| [chart.json](tests/data/chart.json) | 1,256 candles | 101,767 | 71,623 (+29.6%) | 51,541 (+49.4%) | 51,558 (+49.3%) | 65,364 (+35.8%) | **51,541** | **rows** | +| [quote.json](tests/data/quote.json) | Single quote (nested) | 128,981 | 85,956 (+33.4%) | 67,251 (+47.9%) | 65,586 (+49.2%) | 69,053 (+46.5%) | **65,586** | **columns** | +| [gainers.json](tests/data/gainers.json) | 100 complex quotes | 142,791 | 91,634 (+35.8%) | 113,132 (+20.8%) | 113,132 (+20.8%) | 89,012 (+37.7%) | **89,012** | **struct** | + +**Key insights:** +- **rows** format excels at uniform arrays (toon, chart, 128KB) +- **columns** format wins for wide tables with many fields (quote) +- **struct** format dominates deeply nested repeated patterns (historical, gainers) +- **json** fallback returns compact JSON when specialized formats don't meet the `min_savings` threshold, which uses compact JSON as its baseline. ### Running Benchmarks ```bash -# Print detailed token counts and savings for all test fixtures -uv run pytest tests/test_benchmarks.py -s --cov-fail-under=0 +# Run performance benchmarks (token counts + encode/decode times) +make benchmarks + +# Or directly with pytest +uv run pytest tests/test_benchmarks.py -s --no-cov -o addopts="" ``` The documentation site also includes a Benchmarks page with recent results and methodology.
@@ -468,6 +527,10 @@ The documentation site also includes a Benchmarks page with recent results and m - **Website**: [https://toonformat.dev](https://toonformat.dev/) - **Github**: [https://github.com/toon-format/toon](https://github.com/toon-format/toon) +### TRON Format +- **Website**: [https://tron-format.github.io/](https://tron-format.github.io/) +- **GitHub**: [https://github.com/tron-format/tron-javascript](https://github.com/tron-format/tron-javascript) + ### LLM Token Optimization - [Anthropic's Prompt Engineering Guide](https://docs.anthropic.com/claude/docs/prompt-engineering) - [OpenAI's Tokenizer](https://platform.openai.com/tokenizer) diff --git a/docs/api.md b/docs/api.md index f318e4c..44442cc 100644 --- a/docs/api.md +++ b/docs/api.md @@ -20,7 +20,7 @@ AGON.encode( format: Format = "auto", force: bool = False, min_savings: float = 0.10, - encoding: str = "o200k_base" + encoding: Encoding | None = None ) -> AGONEncoding ``` @@ -29,10 +29,10 @@ AGON.encode( | Parameter | Type | Default | Description | |-----------|------|---------|-------------| | `data` | `object` | *required* | JSON-serializable Python data to encode | -| `format` | `Format` | `"auto"` | Format to use: `"auto"`, `"json"`, `"text"`, `"columns"`, `"struct"` | +| `format` | `Format` | `"auto"` | Format to use: `"auto"`, `"json"`, `"rows"`, `"columns"`, `"struct"` | | `force` | `bool` | `False` | If True with `format="auto"`, never fall back to JSON | | `min_savings` | `float` | `0.10` | Minimum token savings (0.0-1.0) required to use specialized format vs JSON | -| `encoding` | `str` | `"o200k_base"` | Token encoding to use for counting (tiktoken encoding name) | +| `encoding` | `Encoding | None` | `None` | Token encoding for accurate counting (e.g., `"o200k_base"`).
If `None`, uses fast byte-length estimation | **Returns:** `AGONEncoding` - Result object with encoded text and metadata @@ -50,7 +50,7 @@ AGON.encode( # Auto-select best format result = AGON.encode(data, format="auto") - print(f"Selected: {result.format}") # → "text" + print(f"Selected: {result.format}") # → "rows" print(f"Tokens: {AGON.count_tokens(result.text)}") print(result) # Use directly in LLM prompts ``` @@ -59,7 +59,7 @@ AGON.encode( ```python # Force a specific format - result_text = AGON.encode(data, format="text") + result_rows = AGON.encode(data, format="rows") result_columns = AGON.encode(data, format="columns") result_struct = AGON.encode(data, format="struct") result_json = AGON.encode(data, format="json") @@ -108,7 +108,7 @@ AGON.decode(payload: str, format: ConcreteFormat | None = None) -> object | Parameter | Type | Default | Description | |-----------|------|---------|-------------| | `payload` | `AGONEncoding \| str` | *required* | Encoded data to decode | -| `format` | `ConcreteFormat \| None` | `None` | Optional format override (`"json"`, `"text"`, `"columns"`, `"struct"`) | +| `format` | `ConcreteFormat \| None` | `None` | Optional format override (`"json"`, `"rows"`, `"columns"`, `"struct"`) | **Returns:** `object` - Decoded Python data (list, dict, etc.) 
@@ -120,7 +120,7 @@ AGON.decode(payload: str, format: ConcreteFormat | None = None) -> object data = [{"id": 1, "name": "Alice"}] # Encode - result = AGON.encode(data, format="text") + result = AGON.encode(data, format="rows") # Decode - automatically uses result's format decoded = AGON.decode(result) @@ -131,13 +131,13 @@ AGON.decode(payload: str, format: ConcreteFormat | None = None) -> object ```python # AGON-encoded string with header - agon_string = """@AGON text + agon_string = """@AGON rows [2]{id name} 1 Alice 2 Bob""" - # Auto-detects "text" format from @AGON header + # Auto-detects "rows" format from @AGON header decoded = AGON.decode(agon_string) # → [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}] ``` @@ -146,11 +146,11 @@ AGON.decode(payload: str, format: ConcreteFormat | None = None) -> object ```python # Decode without header by specifying format - agon_text_without_header = """[2]{id name} + agon_rows_without_header = """[2]{id name} 1 Alice 2 Bob""" - decoded = AGON.decode(agon_text_without_header, format="text") + decoded = AGON.decode(agon_rows_without_header, format="rows") # → [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}] ``` @@ -242,77 +242,6 @@ AGON.project_data( --- -### AGON.hint() - -Get prescriptive generation instructions for LLMs (experimental feature for asking LLMs to return AGON-formatted data). 
- -**Signature:** - -```python -AGON.hint( - result_or_format: AGONEncoding | ConcreteFormat -) -> str -``` - -**Parameters:** - -| Parameter | Type | Description | -|-----------|------|-------------| -| `result_or_format` | `AGONEncoding \| ConcreteFormat` | Encoding result or format name (`"text"`, `"columns"`, `"struct"`, `"json"`) | - -**Returns:** `str` - Prescriptive hint instructing how to generate the format - -**Examples:** - -=== "From Encoding Result" - - ```python - data = [{"id": 1, "name": "Alice"}] - result = AGON.encode(data, format="auto") - - # Get hint for the selected format - hint = AGON.hint(result) - print(hint) - # → "Return in AGON text format: Start with @AGON text header, - # encode arrays as name[N]{fields} with tab-delimited rows" - ``` - -=== "From Format Name" - - ```python - # Get hint for specific format - hint_text = AGON.hint("text") - hint_columns = AGON.hint("columns") - hint_struct = AGON.hint("struct") - ``` - -=== "Use in LLM Prompts" - - ```python - data = [{"id": 1, "name": "Alice", "role": "admin"}] - result = AGON.encode(data, format="auto") - - # Ask LLM to respond in AGON format - prompt = f"""Analyze this data and return enriched results in AGON format. - - Instructions: {AGON.hint(result)} - - Example output: - {result.with_header()} - - Task: Add a "seniority" field (junior/mid/senior) based on role. - """ - ``` - -!!! warning "Experimental Feature" - - LLMs have **not** been trained on AGON format, so generation accuracy cannot be guaranteed. This is experimental—always validate LLM-generated AGON data. - - **Prefer:** Sending AGON to LLMs (reliable) - **Over:** Asking LLMs to generate AGON (experimental) - ---- - ### AGON.count_tokens() Count tokens in text using the specified encoding. @@ -322,7 +251,7 @@ Count tokens in text using the specified encoding. 
```python AGON.count_tokens( text: str, - encoding: str = "o200k_base" + encoding: Encoding = "o200k_base" ) -> int ``` @@ -331,7 +260,7 @@ AGON.count_tokens( | Parameter | Type | Default | Description | |-----------|------|---------|-------------| | `text` | `str` | *required* | Text to count tokens for | -| `encoding` | `str` | `"o200k_base"` | Tiktoken encoding name | +| `encoding` | `Encoding` | `"o200k_base"` | Tiktoken encoding name (`"o200k_base"`, `"cl100k_base"`, etc.) | **Returns:** `int` - Number of tokens @@ -356,9 +285,9 @@ Result object returned by `AGON.encode()`. | Attribute | Type | Description | |-----------|------|-------------| -| `format` | `ConcreteFormat` | Format used: `"json"`, `"text"`, `"columns"`, `"struct"` | +| `format` | `ConcreteFormat` | Format used: `"json"`, `"rows"`, `"columns"`, `"struct"` | | `text` | `str` | Encoded output (ready for LLM prompts) | -| `header` | `str` | Format header (e.g., `"@AGON text"`) | +| `header` | `str` | Format header (e.g., `"@AGON rows"`) | **Methods:** @@ -367,7 +296,7 @@ Result object returned by `AGON.encode()`. Returns the encoded text (without header) for direct use in prompts. ```python -result = AGON.encode(data, format="text") +result = AGON.encode(data, format="rows") prompt = f"Analyze this data:\n\n{result}" # Converts to string via __str__() ``` @@ -376,7 +305,7 @@ prompt = f"Analyze this data:\n\n{result}" # Converts to string via __str__() Returns character count of the encoded text. ```python -result = AGON.encode(data, format="text") +result = AGON.encode(data, format="rows") char_count = len(result) # Character count ``` @@ -385,9 +314,9 @@ char_count = len(result) # Character count Returns debug representation. 
```python -result = AGON.encode(data, format="text") +result = AGON.encode(data, format="rows") print(repr(result)) -# → AGONEncoding(format='text', length=45) +# → AGONEncoding(format='rows', length=45) ``` ### with_header() @@ -395,7 +324,7 @@ print(repr(result)) Returns encoded text with header prepended (for auto-detect decoding). ```python -result = AGON.encode(data, format="text") +result = AGON.encode(data, format="rows") # Without header (for sending to LLM) print(result.text) @@ -405,7 +334,7 @@ print(result.text) # With header (for decoding) print(result.with_header()) -# → @AGON text +# → @AGON rows # # [2]{id name} # 1 Alice @@ -417,32 +346,71 @@ print(result.with_header()) - **Without header** (`result.text` or `str(result)`): Send to LLM prompts - **With header** (`result.with_header()`): Store for later decoding, or ask LLM to return in same format +### hint() + +Get prescriptive generation instructions for LLMs (experimental feature for asking LLMs to return AGON-formatted data). + +```python +result = AGON.encode(data, format="auto") + +# Get hint for the selected format +hint = result.hint() +print(hint) +# → "Return in AGON rows format: Start with @AGON rows header, +# encode arrays as name[N]{fields} with tab-delimited rows" +``` + +**Example use in LLM prompts:** + +```python +data = [{"id": 1, "name": "Alice", "role": "admin"}] +result = AGON.encode(data, format="auto") + +# Ask LLM to respond in AGON format +prompt = f"""Analyze this data and return enriched results in AGON format. + +Instructions: {result.hint()} + +Example output: +{result.with_header()} + +Task: Add a "seniority" field (junior/mid/senior) based on role. +""" +``` + +!!! warning "Experimental Feature" + + LLMs have **not** been trained on AGON format, so generation accuracy cannot be guaranteed. This is experimental—always validate LLM-generated AGON data. 
+ + **Prefer:** Sending AGON to LLMs (reliable) + **Over:** Asking LLMs to generate AGON (experimental) + --- ## Format-Specific Encoders For advanced use cases, you can access format-specific encoders directly. -### AGONText +### AGONRows ```python -from agon.formats import AGONText +from agon import AGONRows # Direct encoding with custom options -encoded = AGONText.encode( +encoded = AGONRows.encode( data, delimiter="\t", # Default: tab include_header=False # Default: False ) # Direct decoding -decoded = AGONText.decode(encoded) +decoded = AGONRows.decode(encoded) ``` ### AGONColumns ```python -from agon.formats import AGONColumns +from agon import AGONColumns # Direct encoding encoded = AGONColumns.encode( @@ -457,7 +425,7 @@ decoded = AGONColumns.decode(encoded) ### AGONStruct ```python -from agon.formats import AGONStruct +from agon import AGONStruct # Direct encoding encoded = AGONStruct.encode( @@ -476,7 +444,7 @@ decoded = AGONStruct.decode(encoded) - You need format-specific options (custom delimiters) - You're benchmarking or comparing formats - For most use cases, `AGON.encode(data, format="text")` is preferred. + For most use cases, `AGON.encode(data, format="rows")` is preferred. 
--- @@ -499,17 +467,17 @@ except AGONError as e: ### Format-Specific Exceptions -- `AGONTextError` - Errors specific to AGONText format +- `AGONRowsError` - Errors specific to AGONRows format - `AGONColumnsError` - Errors specific to AGONColumns format - `AGONStructError` - Errors specific to AGONStruct format ```python -from agon import AGONTextError, AGONColumnsError, AGONStructError +from agon import AGONRowsError, AGONColumnsError, AGONStructError try: - result = AGON.decode(malformed_agon_text, format="text") -except AGONTextError as e: - print(f"Text format error: {e}") + result = AGON.decode(malformed_agon_rows, format="rows") +except AGONRowsError as e: + print(f"Rows format error: {e}") ``` --- @@ -527,13 +495,23 @@ except AGONTextError as e: ## Type Aliases ```python -from agon import Format, ConcreteFormat +from agon import Format, ConcreteFormat, Encoding # Format includes "auto" -Format = Literal["auto", "json", "text", "columns", "struct"] +Format = Literal["auto", "json", "rows", "columns", "struct"] # ConcreteFormat excludes "auto" (actual encoding formats) -ConcreteFormat = Literal["json", "text", "columns", "struct"] +ConcreteFormat = Literal["json", "rows", "columns", "struct"] + +# Encoding - supported tiktoken encodings +Encoding = Literal[ + "o200k_base", # GPT-4o, o1, o3 + "o200k_harmony", # GPT-OSS + "cl100k_base", # GPT-4, GPT-3.5-turbo + "p50k_base", # Codex, text-davinci-003 + "p50k_edit", # text-davinci-edit-001 + "r50k_base", # GPT-3 (davinci, curie, babbage, ada) +] ``` --- @@ -544,7 +522,7 @@ ConcreteFormat = Literal["json", "text", "columns", "struct"] View how JSON is used as a safety net -### [AGONText Format](formats/text.md) +### [AGONRows Format](formats/rows.md) Complete guide to row-based encoding diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 00cd72c..6082a14 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -6,7 +6,7 @@ Real-world performance data demonstrating AGON's adaptive format selection and t ## 
Overview -These benchmarks measure token counts across 6 real-world datasets using tiktoken's `o200k_base` encoding (GPT-4, GPT-4 Turbo, GPT-4o). All results are reproducible—run `uv run pytest tests/test_benchmarks.py -v` to verify. +These benchmarks measure token counts across 7 real-world datasets using tiktoken's `o200k_base` encoding (GPT-4o, o1, o3). All results are reproducible—run `make benchmarks` to verify. --- @@ -14,33 +14,91 @@ These benchmarks measure token counts across 6 real-world datasets using tiktoke | Dataset | Size | Description | Characteristics | |---------|------|-------------|-----------------| -| **toon.json** | 665 bytes | Hiking records with nested context | Uniform array (3 records, 6 fields), mixed nesting | -| **128KB.json** | 255 KB | Large structured data | Many nested arrays, wide tables | -| **chart.json** | 201 KB | Chart.js configuration | Deep nesting, array-heavy, metadata objects | -| **gainers.json** | 263 KB | Market gainers (100 quotes) | Complex irregular nested objects (20+ fields each) | -| **scars.json** | 10 KB | Error tracking data | Mixed structure, heterogeneous fields | -| **historical.json** | 130 KB | Historical time-series data | Repeated `{time, value}` pattern (struct candidate) | +| **toon.json** | 0.6 KB | Hiking records with nested context | Uniform array (3 records, 6 fields), mixed nesting | +| **scars.json** | 9.8 KB | Error tracking data | Mixed structure, heterogeneous fields | +| **128KB.json** | 249 KB | Large structured data (788 employee records) | Many nested arrays, wide tables | +| **historical.json** | 127 KB | Historical OHLCV data | Repeated `{time, value}` pattern (struct candidate) | +| **chart.json** | 196 KB | 1,256 candles | Deep nesting, array-heavy, metadata objects | +| **quote.json** | 283 KB | Single quote (nested) | Complex nested structure with 20+ fields | +| **gainers.json** | 257 KB | 100 complex quotes | Complex irregular nested objects (20+ fields each) | --- ## 
Results Summary -| Dataset | Pretty JSON | Compact JSON | AGONText | AGONColumns | AGONStruct | **Auto Selected** | **Savings** | +| Dataset | Pretty JSON | Compact JSON | AGONRows | AGONColumns | AGONStruct | **Auto Selected** | **Savings** | |---------|-------------|--------------|----------|-------------|------------|-------------------|-------------| -| **toon.json** | 229 | 139 | **96** | 108 | 130 | **text (96)** | **+58.1%** | -| **128KB.json** | 77,346 | 63,230 | 54,622 | **54,292** | 56,772 | **columns (54,292)** | **+29.8%** | -| **chart.json** | 101,767 | 71,802 | **51,541** | 51,558 | 61,595 | **text (51,541)** | **+49.4%** | -| **gainers.json** | 142,791 | **91,634** | 113,132 | 113,132 | 89,011 | **json (91,634)** | **+35.8%** | -| **scars.json** | 2,600 | **2,144** | 2,225 | 2,230 | 2,437 | **json (2,144)** | **+17.5%** | -| **historical.json** | 84,094 | 55,228 | 70,286 | 70,286 | **47,713** | **struct (47,713)** | **+43.3%** | +| **toon.json** | 229 | 139 | **96** | 108 | 144 | **rows (96)** | **+58.1%** | +| **scars.json** | 2,600 | **2,144** | 2,225 | 2,230 | 2,448 | **json (2,144)** | **+17.5%** | +| **128KB.json** | 77,346 | 62,378 | **54,622** | 54,292 | 59,926 | **rows (54,622)** | **+29.4%** | +| **historical.json** | 84,094 | 55,228 | 70,286 | 70,286 | **48,969** | **struct (48,969)** | **+41.8%** | +| **chart.json** | 101,767 | 71,623 | **51,541** | 51,558 | 65,364 | **rows (51,541)** | **+49.4%** | +| **quote.json** | 128,981 | 85,956 | 67,251 | **65,586** | 69,053 | **columns (65,586)** | **+49.2%** | +| **gainers.json** | 142,791 | 91,634 | 113,132 | 113,132 | **89,012** | **struct (89,012)** | **+37.7%** | !!! 
success "Safety Net Demonstrated" - **gainers.json** and **scars.json** show auto mode's safety guarantee in action: + **scars.json** shows auto mode's safety guarantee in action: - - Text/Columns formats made token counts **worse** than compact JSON (113K vs 91K for gainers) + - All AGON formats produce worse or marginal results compared to compact JSON - Auto mode **correctly fell back to JSON**, avoiding regression - - Auto selection uses the compact-JSON baseline for `min_savings` gating (see [AGON.encode](api.md#agonencode)), so `gainers.json` chose JSON even though savings against pretty JSON are high. + - Auto selection uses the compact-JSON baseline for `min_savings` gating (see [AGON.encode](api.md#agonencode)) + + **gainers.json** demonstrates adaptive format selection: + + - Rows/Columns formats made token counts **worse** than compact JSON (113K vs 91K) + - Auto mode selected Struct format (89,012 tokens), achieving 37.7% savings vs pretty JSON + +--- + +## Performance + +AGON's core encoding engine is built in **Rust** and exposed to Python via **PyO3**, delivering exceptional performance even on large datasets. 
+ +### Encode Times + +Time to encode data to each format (in milliseconds): + +| Dataset | Size | Records | JSON | Rows | Columns | Struct | Auto (selected) | +|---------|------|---------|------|------|---------|--------|-----------------| +| [toon.json](https://github.com/Verdenroz/agon-python/blob/master/tests/data/toon.json) | 0.6 KB | 1 | 0.00 ms | 0.10 ms | 0.09 ms | 0.14 ms | **0.40 ms** (rows) | +| [scars.json](https://github.com/Verdenroz/agon-python/blob/master/tests/data/scars.json) | 9.8 KB | 1 | 0.01 ms | 0.56 ms | 0.51 ms | 0.64 ms | **1.65 ms** (json) | +| [128KB.json](https://github.com/Verdenroz/agon-python/blob/master/tests/data/128KB.json) | 249 KB | 788 | 0.16 ms | 16.82 ms | 14.10 ms | 19.49 ms | **27.94 ms** (rows) | +| [historical.json](https://github.com/Verdenroz/agon-python/blob/master/tests/data/historical.json) | 127 KB | 1 | 1.05 ms | 20.72 ms | 21.09 ms | 31.90 ms | **36.22 ms** (struct) | +| [chart.json](https://github.com/Verdenroz/agon-python/blob/master/tests/data/chart.json) | 196 KB | 1,256 | 0.50 ms | 26.46 ms | 25.27 ms | 35.97 ms | **36.55 ms** (rows) | +| [quote.json](https://github.com/Verdenroz/agon-python/blob/master/tests/data/quote.json) | 283 KB | 1 | 0.62 ms | 47.15 ms | 42.86 ms | 67.44 ms | **63.21 ms** (columns) | +| [gainers.json](https://github.com/Verdenroz/agon-python/blob/master/tests/data/gainers.json) | 257 KB | 100 | 0.72 ms | 47.46 ms | 42.46 ms | 62.38 ms | **71.10 ms** (struct) | + +### Decode Times + +Time to decode data from each format back to Python objects (in milliseconds): + +| Dataset | Size | Records | JSON | Rows | Columns | Struct | Auto (selected) | +|---------|------|---------|------|------|---------|--------|-----------------| +| [toon.json](https://github.com/Verdenroz/agon-python/blob/master/tests/data/toon.json) | 0.6 KB | 1 | 0.01 ms | 0.30 ms | 0.12 ms | 0.29 ms | **0.48 ms** (rows) | +| [scars.json](https://github.com/Verdenroz/agon-python/blob/master/tests/data/scars.json) | 9.8 KB | 
1 | 0.05 ms | 3.26 ms | 0.76 ms | 3.20 ms | **0.11 ms** (json) | +| [128KB.json](https://github.com/Verdenroz/agon-python/blob/master/tests/data/128KB.json) | 249 KB | 788 | 0.91 ms | 22.68 ms | 17.28 ms | 60.26 ms | **19.91 ms** (rows) | +| [historical.json](https://github.com/Verdenroz/agon-python/blob/master/tests/data/historical.json) | 127 KB | 1 | 2.50 ms | 131.49 ms | 30.78 ms | 68.84 ms | **68.35 ms** (struct) | +| [chart.json](https://github.com/Verdenroz/agon-python/blob/master/tests/data/chart.json) | 196 KB | 1,256 | 1.30 ms | 33.20 ms | 31.50 ms | 57.79 ms | **33.39 ms** (rows) | +| [quote.json](https://github.com/Verdenroz/agon-python/blob/master/tests/data/quote.json) | 283 KB | 1 | 1.91 ms | 92.92 ms | 52.45 ms | 102.22 ms | **45.21 ms** (columns) | +| [gainers.json](https://github.com/Verdenroz/agon-python/blob/master/tests/data/gainers.json) | 257 KB | 100 | 2.06 ms | 241.39 ms | 68.67 ms | 139.56 ms | **141.88 ms** (struct) | + +### Rust + PyO3 Architecture + +AGON's performance comes from its **Rust core** with **zero-copy PyO3 bindings**: + +- **Parallel encoding**: Uses `rayon` for concurrent format evaluation in auto mode +- **Fast tokenization**: Rust implementation of `tiktoken` for accurate token counting +- **Memory efficient**: Minimal allocations, string operations optimized +- **Native speed**: Compiled Rust code with Python convenience + +```python +# Behind the scenes, this Rust code runs: +# - Parallel format encoding with rayon +# - Fast JSON parsing with serde_json +# - Efficient string building with zero allocations +result = AGON.encode(large_dataset, format="auto") +``` --- @@ -103,7 +161,7 @@ This encoding is used by: Each dataset tested with all formats: -1. **AGONText:** Row-based tabular encoding +1. **AGONRows:** Row-based tabular encoding 2. **AGONColumns:** Columnar transpose encoding 3. **AGONStruct:** Template-based encoding 4. 
**Auto mode:** Selects best of above or falls back to JSON @@ -125,7 +183,7 @@ savings_percent = ((baseline - agon) / baseline) * 100 View how JSON is used as a safety net -### [AGONText Format](formats/text.md) +### [AGONRows Format](formats/rows.md) Learn about the most common format @@ -146,66 +204,77 @@ window.benchmarkData = { "description": "Hiking records with nested context (3 records, 6 fields)", "pretty": 229, "compact": 139, - "text": 96, + "rows": 96, "columns": 108, - "struct": 130, - "auto_format": "text", + "struct": 144, + "auto_format": "rows", "auto_tokens": 96 }, + { + "name": "scars.json", + "description": "Error tracking data with nested structures", + "pretty": 2600, + "compact": 2144, + "rows": 2225, + "columns": 2230, + "struct": 2448, + "auto_format": "json", + "auto_tokens": 2144 + }, { "name": "128KB.json", - "description": "Large structured data (128KB)", + "description": "Large structured data (788 employee records)", "pretty": 77346, - "compact": 63230, - "text": 54622, + "compact": 62378, + "rows": 54622, "columns": 54292, - "struct": 56772, - "auto_format": "columns", - "auto_tokens": 54292 + "struct": 59926, + "auto_format": "rows", + "auto_tokens": 54622 + }, + { + "name": "historical.json", + "description": "Historical OHLCV time-series data", + "pretty": 84094, + "compact": 55228, + "rows": 70286, + "columns": 70286, + "struct": 48969, + "auto_format": "struct", + "auto_tokens": 48969 }, { "name": "chart.json", - "description": "Chart configuration with nested arrays", + "description": "Chart configuration with 1,256 candles", "pretty": 101767, - "compact": 71802, - "text": 51541, + "compact": 71623, + "rows": 51541, "columns": 51558, - "struct": 61595, - "auto_format": "text", + "struct": 65364, + "auto_format": "rows", "auto_tokens": 51541 }, + { + "name": "quote.json", + "description": "Single quote with complex nested structure", + "pretty": 128981, + "compact": 85956, + "rows": 67251, + "columns": 65586, + "struct": 69053, + 
"auto_format": "columns", + "auto_tokens": 65586 + }, { "name": "gainers.json", "description": "Market gainers with complex nested objects (100 quotes)", "pretty": 142791, "compact": 91634, - "text": 113132, + "rows": 113132, "columns": 113132, - "struct": 89011, - "auto_format": "json", - "auto_tokens": 91634 - }, - { - "name": "scars.json", - "description": "Error tracking data with nested structures", - "pretty": 2600, - "compact": 2144, - "text": 2225, - "columns": 2230, - "struct": 2437, - "auto_format": "json", - "auto_tokens": 2144 - }, - { - "name": "historical.json", - "description": "Historical time-series data", - "pretty": 84094, - "compact": 55228, - "text": 70286, - "columns": 70286, - "struct": 47713, + "struct": 89012, "auto_format": "struct", - "auto_tokens": 47713 + "auto_tokens": 89012 } ] }; diff --git a/docs/concepts.md b/docs/concepts.md index 5e006a1..38dd3c9 100644 --- a/docs/concepts.md +++ b/docs/concepts.md @@ -17,7 +17,7 @@ flowchart TD A[Start: Encode data with auto mode] A --> B[Compact JSON baseline] - A --> C[AGONText format] + A --> C[AGONRows format] A --> D[AGONColumns format] A --> E[AGONStruct format] @@ -41,7 +41,7 @@ flowchart TD **The 5-step process:** 1. **Baseline**: Encode data to compact JSON and count tokens -2. **Try specialized formats**: Encode with AGONText, AGONColumns, and AGONStruct +2. **Try specialized formats**: Encode with AGONRows, AGONColumns, and AGONStruct 3. **Measure**: Count tokens for each specialized format 4. **Compare**: Calculate savings percentage vs JSON baseline 5. 
**Decide**: @@ -121,7 +121,7 @@ AGON provides three encoding formats, each optimized for different data shapes: ### Format Comparison -#### AGONText +#### AGONRows **Row-based tabular encoding** ```agon @@ -182,7 +182,7 @@ change: FR("+5", 5.0) Different data shapes naturally favor different formats: -=== "Uniform Arrays → Text" +=== "Uniform Arrays → Rows" ```python data = [ @@ -192,7 +192,7 @@ Different data shapes naturally favor different formats: ] result = AGON.encode(data, format="auto") - # → Selects "text" format + # → Selects "rows" format # # [3]{id name score} # 1 Alice 95 @@ -200,7 +200,7 @@ Different data shapes naturally favor different formats: # 3 Charlie 92 ``` - **Why text wins:** Consistent structure, few fields, perfect for row-based encoding. + **Why rows wins:** Consistent structure, few fields, perfect for row-based encoding. === "Wide Tables → Columns" @@ -265,9 +265,9 @@ print(result.format) # → "json" |--------|--------|-------------------|--------------------| | Pretty JSON | 142,791 | baseline | -55.9% (worse) | | **Compact JSON** | **91,634** | +35.8% | **baseline** | -| AGONText | 113,132 | +20.8% | -23.4% (worse) | +| AGONRows | 113,132 | +20.8% | -23.4% (worse) | | AGONColumns | 113,132 | +20.8% | -23.4% (worse) | -| AGONStruct | 89,011 | +37.7% | **+2.9%** (below threshold) | +| AGONStruct | 89,012 | +37.7% | **+2.9%** (below threshold) | | **Auto Selection** | **91,634** | +35.8% | **0%** (safe fallback) | !!! 
success "Safety Net in Action" @@ -293,9 +293,9 @@ AGON and TOON are complementary approaches to JSON encoding: | **Best For** | Uniform arrays, consistent pipelines | Variable data shapes, risk-averse optimization | | **Token Efficiency** | 40-60% savings on good matches | 30-60% savings with safety guarantee | -### When They Produce Identical Output +### When They Produce Nearly Identical Output -For uniform arrays, `AGONText` and TOON produce **near identical output**: +For uniform arrays, `AGONRows` and TOON produce **nearly identical output**: === "TOON Output" @@ -311,7 +311,7 @@ For uniform arrays, `AGONText` and TOON produce **near identical output**: 3,Wildflower Loop,5.1,180,sam,true ``` -=== "AGON Text Output" +=== "AGON Rows Output" ```agon context: @@ -325,7 +325,7 @@ For uniform arrays, `AGONText` and TOON produce **near identical output**: 3 Wildflower Loop 5.1 180 sam true ``` -**The only difference is AGONText uses the `\t` delimiter** +**The only difference:** AGONRows uses tabs while TOON uses commas as delimiters Both: 96 tokens (+58.1% savings vs pretty JSON, +30.9% vs compact JSON) @@ -351,7 +351,7 @@ Both: 96 tokens (+58.1% savings vs pretty JSON, +30.9% vs compact JSON) AGON provides maximum value in these scenarios: - **Variable data pipelines** where data shape changes between requests -- **Uniform arrays** with consistent fields (AGONText: 40-60% savings) +- **Uniform arrays** with consistent fields (AGONRows: 40-60% savings) - **Wide tables** with 10+ columns (AGONColumns: 50-70% savings) - **Repeated nested patterns** like market data with `{fmt, raw}` everywhere (AGONStruct: 30-50% savings) - **Cost-sensitive applications** where every token counts @@ -386,7 +386,7 @@ Detailed documentation of all methods and parameters View how JSON is used as a safety net -### [AGONText Format](formats/text.md) +### [AGONRows Format](formats/rows.md) Complete guide to row-based encoding diff --git a/docs/formats/columns.md b/docs/formats/columns.md 
index 333d130..b5df467 100644 --- a/docs/formats/columns.md +++ b/docs/formats/columns.md @@ -79,7 +79,7 @@ Let's encode a simple employee table with 12 fields: |--------|--------|---------| | **Pretty JSON** | **309** | **baseline** | | Compact JSON | 190 | +38.5% | - | AGONText | 137 | +55.7% | + | AGONRows | 137 | +55.7% | | **AGONColumns** | **158** | **+48.9%** | **Why columns helps:** With 12 fields, grouping by type (all IDs together, all names together) provides better compression than row-based format. For even wider tables (20+ fields), the advantage increases. @@ -114,7 +114,7 @@ Let's encode a simple employee table with 12 fields: **ASCII mode:** ```python -from agon.formats import AGONColumns +from agon import AGONColumns # Use ASCII tree characters for compatibility encoded = AGONColumns.encode(data, use_ascii=True) @@ -132,7 +132,7 @@ encoded = AGONColumns.encode(data, use_ascii=True) **Custom delimiter:** ```python -from agon.formats import AGONColumns +from agon import AGONColumns # Use comma-space delimiter encoded = AGONColumns.encode(data, delimiter=", ") @@ -321,10 +321,10 @@ Real-world employee data: | Format | Tokens | Savings | |--------|--------|---------| | Pretty JSON | 381 | baseline | | Compact JSON | 231 | +39.4% | - | AGONText | 171 | +55.1% | + | AGONRows | 171 | +55.1% | | **AGONColumns** | **186** | **+51.2%** | - **Trade-off:** AGONText wins for this example (fewer fields), but as field count grows beyond 10, AGONColumns pulls ahead due to type clustering. + **Trade-off:** AGONRows wins for this example (fewer fields), but as field count grows beyond 10, AGONColumns pulls ahead due to type clustering. 
--- @@ -342,7 +342,7 @@ Real-world employee data: ## When AGONColumns Loses -- **Few fields** (2-5 fields) → AGONText wins with simpler row-based format +- **Few fields** (2-5 fields) → AGONRows wins with simpler row-based format - **Highly irregular structure** (fields vary between records) → JSON fallback - **Deeply nested objects** with no arrays → AGONStruct or JSON - **Heterogeneous data** per column (mixed types) → Row-based better @@ -369,7 +369,7 @@ result = AGON.encode(user_data, format="auto") For advanced use cases, use AGONColumns encoder directly: ```python -from agon.formats import AGONColumns +from agon import AGONColumns # Encode with default options encoded = AGONColumns.encode(data) @@ -469,7 +469,7 @@ assert decoded == data # Lossless For the same employee dataset with 12 fields: -=== "AGONText (Row-Based)" +=== "AGONRows (Row-Based)" ``` [3]{id name email age city state zip phone dept title salary start_date} @@ -502,7 +502,7 @@ For the same employee dataset with 12 fields: **Decision factors:** -- **2-10 fields:** Use AGONText (simpler, less overhead) +- **2-10 fields:** Use AGONRows (simpler, less overhead) - **10-15 fields:** Borderline—auto mode chooses based on data - **15+ fields:** Use AGONColumns (type clustering advantage wins) @@ -529,7 +529,7 @@ For the same employee dataset with 12 fields: Yes! Use ASCII mode for compatibility: ```python - from agon.formats import AGONColumns + from agon import AGONColumns encoded = AGONColumns.encode(data, use_ascii=True) # Uses | and ` instead of ├ and └ ``` @@ -566,7 +566,7 @@ For the same employee dataset with 12 fields: ## Next Steps -### [AGONText Format](text.md) +### [AGONRows Format](rows.md) Learn about row-based encoding for narrow tables diff --git a/docs/formats/json.md b/docs/formats/json.md index 77b48ab..89f5fbb 100644 --- a/docs/formats/json.md +++ b/docs/formats/json.md @@ -6,7 +6,7 @@ Understanding when and why AGON returns compact JSON. ## What is JSON Fallback? 
-JSON fallback is AGON's safety mechanism—when specialized formats (Text, Columns, Struct) don't provide sufficient token savings, auto mode returns **compact JSON** instead. +JSON fallback is AGON's safety mechanism—when specialized formats (Rows, Columns, Struct) don't provide sufficient token savings, auto mode returns **compact JSON** instead. This is **a feature, not a failure**. It's the guarantee that makes `format="auto"` safe to use everywhere. @@ -124,7 +124,7 @@ When should you expect JSON vs specialized formats? | Data Characteristic | Expected Format | |---------------------|-----------------| -| Uniform array, 3-10 fields | **Text** | +| Uniform array, 3-10 fields | **Rows** | | Uniform array, 10+ fields | **Columns** | | Repeated nested `{a, b}` pattern (3+ times) | **Struct** | | Mixed types, inconsistent structure | **JSON** | @@ -186,7 +186,7 @@ When should you expect JSON vs specialized formats? ## Next Steps -### [AGONText Format](text.md) +### [AGONRows Format](rows.md) Learn about the most common specialized format diff --git a/docs/formats/text.md b/docs/formats/rows.md similarity index 84% rename from docs/formats/text.md rename to docs/formats/rows.md index cb4e761..d65c414 100644 --- a/docs/formats/text.md +++ b/docs/formats/rows.md @@ -1,4 +1,4 @@ -# AGONText Format +# AGONRows Format Row-based tabular encoding for uniform arrays—AGON's most commonly selected format. @@ -6,7 +6,7 @@ Row-based tabular encoding for uniform arrays—AGON's most commonly selected fo ## Overview -AGONText is a **row-based encoding format** optimized for uniform arrays of objects with consistent field structure. It's similar to TOON's approach and produces identical output for uniform arrays. +AGONRows is a **row-based encoding format** optimized for uniform arrays of objects with consistent field structure. It's similar to TOON's approach and produces nearly identical output for uniform arrays (tabs vs commas as delimiters). 
**Best for:** @@ -31,7 +31,7 @@ Let's encode a simple user list: ] ``` -=== "Output (AGONText)" +=== "Output (AGONRows)" ``` [3]{id name role} @@ -84,10 +84,10 @@ value1 value2 value3 **Custom delimiter:** ```python -from agon.formats import AGONText +from agon import AGONRows # Use pipe delimiter instead of tab -encoded = AGONText.encode(data, delimiter="|") +encoded = AGONRows.encode(data, delimiter="|") ``` --- @@ -96,7 +96,7 @@ encoded = AGONText.encode(data, delimiter="|") ### Primitives -AGONText infers types from content—no type markers needed: +AGONRows infers types from content—no type markers needed: | Type | Example Input | Encoded Output | |------|--------------|----------------| @@ -215,7 +215,7 @@ Real-world data from `toon.json`: } ``` -=== "Output (AGONText)" +=== "Output (AGONRows)" ``` context: @@ -235,13 +235,13 @@ Real-world data from `toon.json`: |--------|--------|---------| | Pretty JSON | 229 | baseline | | Compact JSON | 139 | +39.3% | - | **AGONText** | **96** | **+58.1%** | + | **AGONRows** | **96** | **+58.1%** | **30.9% savings** vs compact JSON! 
--- -## When AGONText Wins +## When AGONRows Wins - **Uniform arrays** with 3+ records having identical field structure - **Consistent field types** (all records have same fields with same types) @@ -254,7 +254,7 @@ Real-world data from `toon.json`: --- -## When AGONText Loses +## When AGONRows Loses - **Wide tables** (10+ fields) → AGONColumns wins (type clustering) - **Irregular structure** (fields vary between records) → JSON fallback @@ -284,26 +284,26 @@ result = AGON.encode(employee_data, format="auto") ## Direct Usage -For advanced use cases, use AGONText encoder directly: +For advanced use cases, use AGONRows encoder directly: ```python -from agon.formats import AGONText +from agon import AGONRows # Encode with default options -encoded = AGONText.encode(data) +encoded = AGONRows.encode(data) # Custom delimiter -encoded = AGONText.encode(data, delimiter="|") +encoded = AGONRows.encode(data, delimiter="|") # Without header (for LLM prompts) -encoded = AGONText.encode(data, include_header=False) +encoded = AGONRows.encode(data, include_header=False) # With header (for decoding) -encoded_with_header = AGONText.encode(data, include_header=True) -# → @AGON text\n\n[3]{id...} +encoded_with_header = AGONRows.encode(data, include_header=True) +# → @AGON rows\n\n[3]{id...} # Decode -decoded = AGONText.decode(encoded) +decoded = AGONRows.decode(encoded) assert decoded == data # Lossless ``` @@ -316,7 +316,7 @@ assert decoded == data # Lossless ```python data = [] - result = AGON.encode(data, format="text") + result = AGON.encode(data, format="rows") # → [0]{} ``` @@ -325,7 +325,7 @@ assert decoded == data # Lossless ```python data = [{"id": 1, "name": "Alice"}] - result = AGON.encode(data, format="text") + result = AGON.encode(data, format="rows") # → [1]{id name} # 1 Alice ``` @@ -335,7 +335,7 @@ assert decoded == data # Lossless ```python data = [{"a": None, "b": None}] - result = AGON.encode(data, format="text") + result = AGON.encode(data, format="rows") # → [1]{a 
b} # ``` @@ -347,7 +347,7 @@ assert decoded == data # Lossless ```python data = [{"name": "Alice\tBob", "quote": "He said \"hi\""}] - result = AGON.encode(data, format="text") + result = AGON.encode(data, format="rows") # → [1]{name quote} # "Alice\tBob" "He said \"hi\"" ``` @@ -358,7 +358,7 @@ assert decoded == data # Lossless ## Comparison with TOON -For uniform arrays, AGONText and TOON produce **near identical output**: +For uniform arrays, AGONRows and TOON produce **near identical output**: === "JSON" @@ -379,7 +379,7 @@ For uniform arrays, AGONText and TOON produce **near identical output**: 3,Charlie,user ``` -=== "AGONText" +=== "AGONRows" ```agon [3]{id name role} @@ -391,7 +391,7 @@ For uniform arrays, AGONText and TOON produce **near identical output**: Both achieve the same token savings vs JSON. -**TOON and AGONText are near identical! The only difference is AGONText uses the `\t` delimiter.** +**TOON and AGONRows are nearly identical!** The only difference is that AGONRows uses tabs (`\t`) while TOON uses commas (`,`) as delimiters. --- @@ -413,14 +413,14 @@ Both achieve the same token savings vs JSON. ??? question "Can I customize the delimiter?" - Yes! Use AGONText encoder directly: + Yes! Use AGONRows encoder directly: ```python - from agon.formats import AGONText - encoded = AGONText.encode(data, delimiter="|") + from agon import AGONRows + encoded = AGONRows.encode(data, delimiter="|") ``` -??? question "Does AGONText handle nested objects?" +??? question "Does AGONRows handle nested objects?" Yes, with indentation: @@ -433,7 +433,7 @@ Both achieve the same token savings vs JSON. # age: 28 ``` -??? question "Is AGONText the same as TOON?" +??? question "Is AGONRows the same as TOON?" **For uniform arrays:** Yes, near identical output. 
@@ -453,7 +453,7 @@ Learn about template-based encoding ### [Benchmarks](../benchmarks.md) -See AGONText performance on real datasets +See AGONRows performance on real datasets ### [API Reference](../api.md) diff --git a/docs/formats/struct.md b/docs/formats/struct.md index ccd1141..a1098a9 100644 --- a/docs/formats/struct.md +++ b/docs/formats/struct.md @@ -59,7 +59,7 @@ Let's encode market data with a repeated `{fmt, raw}` pattern: |--------|--------|---------| | **Pretty JSON** | **128** | **baseline** | | Compact JSON | 75 | +41.4% | - | AGONText | 90 | +29.7% | + | AGONRows | 90 | +29.7% | | **AGONStruct** | **73** | **+43.0%** | **Why struct wins:** The repeated `{fmt, raw}` pattern appears 5 times. Traditional formats repeat both field names (`"fmt"` and `"raw"`) in every instance. AGONStruct defines the template once, then each instance only contains values—eliminating 10 redundant field name repetitions. @@ -167,7 +167,7 @@ AGONStruct automatically detects repeated object patterns with: - **Primitive values only:** Nested objects/arrays don't create structs ```python -from agon.formats import AGONStruct +from agon import AGONStruct # Detect patterns with 5+ occurrences (more aggressive) encoded = AGONStruct.encode(data, min_occurrences=5) @@ -295,7 +295,7 @@ Real-world financial quote data: |--------|--------|---------| | Pretty JSON | 285 | baseline | | Compact JSON | 167 | +41.4% | - | AGONText | 197 | +30.9% | + | AGONRows | 197 | +30.9% | | **AGONStruct** | **153** | **+46.3%** | **Why struct wins:** The `{fmt, raw}` pattern appears 7 times. Template definition costs ~10 tokens, but saves ~6 tokens per instance. At 7 instances, savings are `7 × 6 - 10 = 32 tokens`. 
@@ -321,7 +321,7 @@ Real-world financial quote data: - **Few fields** (1 field objects) → No savings from template - **Irregular nested structures** → Can't identify consistent pattern - **Deeply nested objects** → Struct only works for shallow primitives -- **Array-heavy data** → AGONText or AGONColumns better +- **Array-heavy data** → AGONRows or AGONColumns better **Example where template overhead hurts:** @@ -348,7 +348,7 @@ result = AGON.encode(data, format="auto") For advanced use cases, use AGONStruct encoder directly: ```python -from agon.formats import AGONStruct +from agon import AGONStruct # Encode with default options (min_occurrences=3, min_fields=2) encoded = AGONStruct.encode(data) @@ -489,7 +489,7 @@ assert decoded == data # Lossless For the same market data with 5 `{fmt, raw}` instances: -=== "AGONText (No Template)" +=== "AGONRows (No Template)" ``` price: @@ -525,7 +525,7 @@ For the same market data with 5 `{fmt, raw}` instances: **Tokens:** 73 (`fmt` and `raw` defined once in template, eliminated from instances) -**Key difference:** AGONText repeats field names in every nested object. AGONStruct defines template once, then instances reference it by name with positional values. +**Key difference:** AGONRows repeats field names in every nested object. AGONStruct defines template once, then instances reference it by name with positional values. --- @@ -554,7 +554,7 @@ For the same market data with 5 `{fmt, raw}` instances: Yes! 
Use the encoder directly: ```python - from agon.formats import AGONStruct + from agon import AGONStruct # Lower threshold (detect patterns with 2+ occurrences) encoded = AGONStruct.encode(data, min_occurrences=2) @@ -607,7 +607,7 @@ For the same market data with 5 `{fmt, raw}` instances: ## Next Steps -### [AGONText Format](text.md) +### [AGONRows Format](rows.md) Learn about row-based encoding for flat arrays diff --git a/docs/index.md b/docs/index.md index 5d623e0..5876c09 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,6 +1,6 @@ # AGON -**Adaptive Guarded Object Notation** — Self-describing, token-efficient JSON encodings optimized for LLM prompts. +**Adaptive Guarded Object Notation** — Self-describing, token-efficient JSON encodings optimized for LLM prompts with one guarantee: **never worse than compact JSON**. ## What is AGON? @@ -16,65 +16,33 @@ Adaptive encoding with safety guarantees: ```python result = AGON.encode(data, format="auto") -# Auto tries: text, columns, struct +# Auto tries: rows, columns, struct # Returns: whichever saves the most tokens # Falls back: to compact JSON if none are better ``` ---- - -## Quick Example - -Here's AGON in action with a simple user list: - -=== "Python Code" - - ```python - from agon import AGON - - # Sample data - list of objects with repeated structure - data = [ - {"id": 1, "name": "Alice", "role": "admin"}, - {"id": 2, "name": "Bob", "role": "user"}, - {"id": 3, "name": "Charlie", "role": "user"}, - ] - - # Encode with auto-selection - result = AGON.encode(data, format="auto") - print(f"Selected format: {result.format}") # → "text" - - # Use directly in LLM prompts - prompt = f"""Analyze this user data: - - {result} - - What percentage are admins?""" +!!! 
info "Rust-Powered Performance" - # Verify lossless round-trip - decoded = AGON.decode(result) - assert decoded == data # Perfect reconstruction - ``` + AGON's core encoding engine is built in **Rust** with **PyO3 bindings**, delivering: -=== "AGONText Output" + - **Parallel format evaluation** using rayon for auto mode + - **Native speed** with Python convenience - ```agon - [3]{id name role} - 1 Alice admin - 2 Bob user - 3 Charlie user - ``` + See [Performance Benchmarks](benchmarks.md#performance) for detailed metrics. - Clean, tab-delimited format with array length `[3]` and field headers `{id name role}`. No `@AGON` header needed when sending to LLMs—only required for decoding. +--- -=== "Token Comparison" +## Quick Comparison: AGON vs TOON - | Format | Tokens | Savings | - |--------|--------|---------| - | **Pretty JSON** | 62 | baseline | - | **Compact JSON** | 37 | +40% | - | **AGON Text** | **26** | **+58%** | +| Aspect | TOON | AGON | +|--------|------|------| +| **Approach** | Single unified format | Multiple adaptive formats + JSON fallback | +| **Risk** | Can be worse than JSON on irregular data | **Never worse than JSON** (guaranteed) | +| **Format Selection** | Always applies TOON encoding | Auto-selects best format or falls back to JSON | +| **Best For** | Uniform arrays, consistent pipelines | Variable data shapes, risk-averse optimization | +| **Philosophy** | "One format for all JSON" | "Best format for each data shape, or JSON" | - AGON's adaptive selection identified that this uniform array is ideal for AGONText format, achieving **58% token savings** compared to pretty JSON, and **30% savings** even against compact JSON. +For uniform arrays, `AGONRows` produces nearly identical output to TOON (tabs vs commas as delimiters, achieving the same token count). For everything else, AGON's adaptive approach ensures you always get the best result. 
--- @@ -94,61 +62,107 @@ uv add agon-python --- -## How It Works +## Quick Start -AGON provides **three specialized formats**, each optimized for different data shapes: +### Basic Usage -### 1. AGONText - Row-Based Encoding +```python +from agon import AGON -Best for **uniform arrays** of objects with consistent fields. +# Sample data - list of objects with repeated structure +data = [ + {"id": 1, "name": "Alice", "role": "admin"}, + {"id": 2, "name": "Bob", "role": "user"}, + {"id": 3, "name": "Charlie", "role": "user"}, +] -```agon -[3]{id name role} -1 Alice admin -2 Bob user -3 Charlie user -``` +# Encode with auto-selection (tries rows/columns/struct, picks best or falls back to JSON) +result = AGON.encode(data, format="auto") +print(f"Selected format: {result.format}") # → "rows" -**Ideal for:** User lists, transaction logs, simple metrics -**Token savings:** 40-60% vs pretty JSON +# Use directly in LLM prompts - no header needed +prompt = f"""Analyze this user data: -### 2. AGONColumns - Columnar Encoding +{result} -Best for **wide tables** (many columns) or numeric-heavy data. +What percentage are admins?""" -```agon -users[3] -├ id: 1 2 3 -├ name: Alice Bob Charlie -└ role: admin user user +# Verify lossless round-trip +decoded = AGON.decode(result) +assert decoded == data # ✅ Perfect reconstruction ``` -**Ideal for:** Financial data (20+ fields), analytics tables -**Token savings:** 50-70% vs pretty JSON +### Format Outputs -### 3. AGONStruct - Template-Based Encoding +AGON provides **three specialized formats**, each optimized for different data shapes. Auto mode tries all formats in parallel and selects the best: -Best for **repeated nested patterns** like `{fmt, raw}` or `{value, timestamp}`. 
+=== "AGONRows" -```agon -@FR: fmt, raw + **Best for:** Uniform arrays of objects with consistent fields -price: FR("$100.00", 100.0) -change: FR("+5.00", 5.0) -``` + ```agon + [3]{id name role} + 1 Alice admin + 2 Bob user + 3 Charlie user + ``` + + **Ideal for:** User lists, transaction logs, simple metrics -**Ideal for:** Market data, API responses with nested structures -**Token savings:** 30-50% vs pretty JSON + **Similar to:** [TOON format](https://toonformat.dev) (tabs instead of commas) + +=== "AGONColumns" + + **Best for:** Wide tables (many columns) or numeric-heavy data + + ```agon + users[3] + ├ id: 1 2 3 + ├ name: Alice Bob Charlie + └ role: admin user user + ``` + + **Ideal for:** Financial data (20+ fields), analytics tables + +=== "AGONStruct" + + **Best for:** Repeated nested patterns like `{fmt, raw}` or `{value, timestamp}` + + ```agon + @FR: fmt, raw + + price: FR("$100.00", 100.0) + change: FR("+5.00", 5.0) + volume: FR("1.2M", 1200000) + ``` + + **Ideal for:** Market data, API responses with nested structures + + **Similar to:** [TRON format](https://tron-format.github.io/) (abbreviated struct names) + +=== "Token Comparison" + + Token counts for the user data example above: + + | Format | Tokens | Savings vs Pretty | Savings vs Compact | + |--------|--------|-------------------|---------------------| + | **Pretty JSON** | 62 | baseline | -68% | + | **Compact JSON** | 37 | +40% | baseline | + | **AGON Rows** | **26** | **+58%** | **+30%** ✅ | + | **AGON Columns** | 28 | +55% | +24% | + | **AGON Struct** | 35 | +44% | +5% | + + Auto mode selected **AGONRows** (best savings, exceeds 10% threshold vs compact JSON) --- ### Adaptive Auto Mode -The `format="auto"` mode tries all three formats and selects the winner: +The `format="auto"` mode tries all three formats in parallel and selects the winner: ```mermaid graph TD - A[Encode data with auto mode] --> B[Try AGONText] + A[Encode data with auto mode] --> B[Try AGONRows] A --> C[Try AGONColumns] A 
--> D[Try AGONStruct] A --> E[Compact JSON baseline] @@ -163,32 +177,91 @@ graph TD I --> K[Safe JSON fallback] ``` +**The guarantee:** Auto mode *never* returns a format with more tokens than compact JSON. If all specialized formats are worse or marginally better, it returns JSON. + --- -## Why Not Just Use a Fixed Format? +## Use Cases -!!! warning "The Fixed-Format Problem" +AGON excels in scenarios where data structure varies and intelligent format selection provides value: - Fixed-format encoders like TOON can be worse than JSON on irregular data: +### When AGON Helps Most - ```python - # Fixed format: Always applies encoding - toon_result = TOON.encode(complex_data) # Might be worse than JSON! +- **Variable data pipelines**: Data that changes shape (sometimes uniform arrays, sometimes nested objects) where auto-mode selects the optimal format +- **Uniform arrays**: Lists of consistent objects (AGONRows: up to 58% savings vs pretty JSON) +- **Wide tables**: Financial records, analytics data with 10+ columns (AGONColumns: up to 70% savings) +- **Repeated nested patterns**: Market data with `{fmt, raw}` structures (AGONStruct: up to 49% savings) +- **Cost-sensitive applications**: Where every token counts and honest fallback prevents wasted overhead +- **Data projection workflows**: Use cases where filtering fields before encoding is important (`AGON.project_data`) - # AGON: Adaptive with safety guarantee - agon_result = AGON.encode(complex_data, format="auto") # Never worse than JSON - ``` +### When AGON Helps Least + +- **Tiny payloads**: <50 tokens where encoding overhead exceeds savings +- **Highly irregular objects**: No repetition or consistent structure (auto-mode falls back to JSON) +- **Single-use data**: Unpredictable, one-off structures with no patterns + +!!! tip "When in doubt, use `format='auto'`" + + The safety guarantee means you can use auto mode everywhere. If specialized formats don't help, you'll get compact JSON—no harm done. 
+ +--- + +## Experimental: Asking LLMs to Generate AGON + +**⚠️ Note:** LLMs have NOT been trained on AGON format, so accuracy cannot be guaranteed. This is an experimental feature. For production use, prefer **sending AGON to LLMs** (reliable) over **asking LLMs to generate AGON** (experimental, requires validation). + +```python +from agon import AGON + +data = [ + {"id": 1, "name": "Alice", "role": "admin"}, + {"id": 2, "name": "Bob", "role": "user"}, +] + +result = AGON.encode(data, format="auto") + +# To ask an LLM to respond in AGON format, provide both: +# 1. Generation instructions via result.hint() +# 2. An example with header via result.with_header() +prompt = f"""Analyze this user data and return enriched data in AGON format. + +Instructions: {result.hint()} + +Example output: +{result.with_header()} + +Task: Add an is_admin boolean field and return in the same format.""" + +# Decode LLM response using header to auto-detect format +# parsed = AGON.decode(llm_response) +``` + +See the [API Reference](api.md#hint) for details on `hint()` and `with_header()` methods. + +--- + +## Next Steps + +### [Core Concepts](concepts.md) + +Understand AGON's adaptive approach, format selection, and design principles + +### [API Reference](api.md) + +Complete documentation of all methods, parameters, and classes - **AGON's auto mode** guarantees you'll never regret using it. If specialized formats don't save enough tokens, it returns compact JSON +### [Format Documentation](formats/rows.md) -!!! 
success "AGON vs TOON" +Detailed guides for each specialized format: - AGON and TOON are complementary: +- [AGONRows Format](formats/rows.md) - Row-based tabular encoding +- [AGONColumns Format](formats/columns.md) - Columnar transpose encoding +- [AGONStruct Format](formats/struct.md) - Template-based encoding +- [JSON Fallback](formats/json.md) - When and why AGON returns JSON - - **TOON**: Single unified format, predictable encoding, great for uniform arrays - - **AGON**: Multiple adaptive formats + JSON fallback, best format per data shape +### [Benchmarks](benchmarks.md) - For uniform arrays, `AGONText` produces identical output to TOON. For everything else, AGON's adaptive approach ensures you always get the best result. +Real-world token savings and performance metrics across 7 datasets --- @@ -196,9 +269,9 @@ graph TD AGON is open source and welcomes contributions! -- [Report issues](https://github.com/Verdenroz/agon-python/issues) -- [Request features](https://github.com/Verdenroz/agon-python/issues/new) -- [Contribute](https://github.com/Verdenroz/agon-python/blob/master/CONTRIBUTING.md) +- **Issues**: [Report bugs or request features](https://github.com/Verdenroz/agon-python/issues) +- **Contributing**: See [development guide](https://github.com/Verdenroz/agon-python#development) +- **Related projects**: [TOON](https://toonformat.dev) | [TRON](https://tron-format.github.io/) --- diff --git a/docs/javascripts/charts.js b/docs/javascripts/charts.js index 6a02f3c..4a83b92 100644 --- a/docs/javascripts/charts.js +++ b/docs/javascripts/charts.js @@ -59,7 +59,7 @@ document.addEventListener('DOMContentLoaded', function() { stack: 'baseline' }, { - label: 'AGONText', + label: 'AGONRows', data: textSavings, backgroundColor: textColors, borderColor: colors.text, diff --git a/mkdocs.yml b/mkdocs.yml index 23bf3b6..cd1f0c1 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -40,7 +40,7 @@ nav: - API: api.md - Formats: - JSON: formats/json.md - - Text: formats/text.md + - 
Rows: formats/rows.md - Columns: formats/columns.md - Struct: formats/struct.md - Benchmarks: benchmarks.md @@ -60,7 +60,6 @@ markdown_extensions: custom_fences: - name: mermaid class: mermaid - format: pymdownx.superfences.fence_code_format - pymdownx.tabbed: alternate_style: true - pymdownx.details diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index 157d350..93bf0b0 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -99,11 +99,29 @@ def test_fixture_benchmark(fixture_path: Path) -> None: raw_json = orjson.dumps(records, option=orjson.OPT_INDENT_2).decode() raw_tokens = count_tokens(raw_json) + # Test compact JSON (baseline for comparison table) + compact_json = orjson.dumps(records).decode() + compact_tokens = count_tokens(compact_json) + + t0 = time.perf_counter() + orjson.dumps(records) + compact_encode_ms = (time.perf_counter() - t0) * 1000 + + t0 = time.perf_counter() + orjson.loads(orjson.dumps(records)) + compact_decode_ms = (time.perf_counter() - t0) * 1000 + + # Calculate compact JSON savings vs pretty JSON + compact_savings = (1 - compact_tokens / max(1, raw_tokens)) * 100 + # Test each format individually format_results: dict[ str, tuple[int, float, float, float] ] = {} # tokens, savings, encode_ms, decode_ms + # Add compact JSON as baseline + format_results["json"] = (compact_tokens, compact_savings, compact_encode_ms, compact_decode_ms) + for fmt, encoder, decoder in [ ("rows", lambda data: AGON.encode(data, format="rows"), AGON.decode), # type: ignore[misc] ("columns", lambda data: AGON.encode(data, format="columns"), AGON.decode), # type: ignore[misc] From c307a94ccc73d165b11f8011bab5a34b44c746b7 Mon Sep 17 00:00:00 2001 From: harvey Date: Thu, 25 Dec 2025 17:15:10 -0500 Subject: [PATCH 7/7] fix: adds pytest-xdist and tiktoken to unit test session --- noxfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index 58c91e7..b80ca85 100644 --- a/noxfile.py +++ 
b/noxfile.py @@ -21,7 +21,7 @@ def lint(session: nox.Session) -> None: @nox.session(python=PYTHON_VERSIONS) def unit(session: nox.Session) -> None: """Run unit tests.""" - session.install(".", "pytest", "pytest-cov", "pytest-sugar") + session.install(".", "pytest", "pytest-cov", "pytest-sugar", "pytest-xdist", "tiktoken") session.run( "pytest", "--cov=agon",