feat: Add UNNEST parser support and test data for FIX allocations

TimelordUK · claude · TimelordUK · commit cba8a51b8daa · 2025-10-05T15:20:49.000+01:00
Add parser support for UNNEST syntax and comprehensive design documentation for the row expansion feature to handle FIX repeated groups. Parser Changes: - Add parse_unnest() function to primary expression parser - Parses UNNEST(column_expr, 'delimiter') syntax - Creates SqlExpression::Unnest with column and delimiter - Validates delimiter is a string literal - Currently gets parsed as FunctionCall "UNNEST" due to parser architecture - Will be handled specially in evaluator/executor Test Data: - data/fix_allocations.csv - FIX message example data - Contains pipe-delimited accounts and comma-delimited amounts - Three test rows, all with matching list lengths (the mismatched-length case is illustrated in the design doc) Documentation: - docs/UNNEST_DESIGN.md - Complete design specification - Input/output examples with expected results - NULL padding behavior for mismatched lengths - Implementation requirements for evaluator and executor Next Steps: - Handle UNNEST in arithmetic evaluator (return array of split values) - Update query executor to detect UNNEST and multiply rows - Implement NULL padding for mismatched array lengths All tests passing (397 passed) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/data/fix_allocations.csv b/data/fix_allocations.csv
@@ -0,0 +1,4 @@
+msg_type,order_id,symbol,accounts,amounts
+AS,ORD001,ZX5Y,ACC_1|ACC_2|ACC_3,"200,200,200"
+AS,ORD002,ABCD,ACC_4|ACC_5,"300,700"
+8,ORD003,WXYZ,ACC_1,"1000"
diff --git a/docs/UNNEST_DESIGN.md b/docs/UNNEST_DESIGN.md
@@ -0,0 +1,74 @@
+# UNNEST Goal - FIX Allocation Example
+
+## Input Data (fix_allocations.csv)
+```csv
+msg_type,order_id,symbol,accounts,amounts
+AS,ORD001,ZX5Y,ACC_1|ACC_2|ACC_3,"200,200,200"
+AS,ORD002,ABCD,ACC_4|ACC_5,"300,700"
+8,ORD003,WXYZ,ACC_1,"1000"
+```
+
+## Desired Query
+```sql
+SELECT
+  msg_type,
+  order_id,
+  symbol,
+  UNNEST(accounts, '|') AS account,
+  UNNEST(amounts, ',') AS amount
+FROM fix_allocations;
+```
+
+## Expected Output
+```
++----------+----------+--------+---------+--------+
+| msg_type | order_id | symbol | account | amount |
++----------+----------+--------+---------+--------+
+| AS       | ORD001   | ZX5Y   | ACC_1   | 200    |
+| AS       | ORD001   | ZX5Y   | ACC_2   | 200    |
+| AS       | ORD001   | ZX5Y   | ACC_3   | 200    |
+| AS       | ORD002   | ABCD   | ACC_4   | 300    |
+| AS       | ORD002   | ABCD   | ACC_5   | 700    |
+| 8        | ORD003   | WXYZ   | ACC_1   | 1000   |
++----------+----------+--------+---------+--------+
+```
+
+## How It Works
+
+1. **Row ORD001**: `accounts` has 3 items, `amounts` has 3 items
+   - Creates 3 output rows (the larger of the two list lengths)
+   - Each regular column (msg_type, order_id, symbol) is replicated 3 times
+   - UNNEST columns get their respective split values
+
+2. **Row ORD002**: `accounts` has 2 items, `amounts` has 2 items
+   - Creates 2 output rows
+   - Values aligned by index
+
+3. **Row ORD003**: `accounts` has 1 item, `amounts` has 1 item
+   - Creates 1 output row (no expansion needed)
+
+## Mismatched Length Example (NULL Padding)
+
+If we had mismatched data:
+```csv
+AS,ORD004,TEST,ACC_1|ACC_2|ACC_3,"100,200"
+```
+
+Query result would be:
+```
+| AS | ORD004 | TEST | ACC_1 | 100  |
+| AS | ORD004 | TEST | ACC_2 | 200  |
+| AS | ORD004 | TEST | ACC_3 | NULL |  <- NULL padding
+```
+
+## Implementation Requirements
+
+1. **Parser**: Recognize `UNNEST(column_expr, 'delimiter')`
+2. **Evaluator**: Return the array of split values for each UNNEST expression; row expansion itself is left to the query executor
+3. **Query Executor**:
+   - Detect all UNNEST expressions in SELECT
+   - For each input row:
+     - Evaluate each UNNEST → get arrays
+     - Find max array length
+     - Generate N output rows, where N is that max array length
+     - Fill in values (NULL if array exhausted)
diff --git a/src/sql/parser/expressions/primary.rs b/src/sql/parser/expressions/primary.rs
@@ -61,6 +61,11 @@ where
             parse_datetime_constructor(parser)
         }
 
+        Token::Unnest => {
+            debug!("Parsing UNNEST expression");
+            parse_unnest(parser)
+        }
+
         Token::Identifier(id) => {
             let id_upper = id.to_uppercase();
             let id_clone = id.clone();
@@ -442,6 +447,46 @@ where
     }
 }
 
+/// Parses an `UNNEST(column_expr, 'delimiter')` call into `SqlExpression::Unnest`.
+///
+/// The current token is skipped as the `UNNEST` keyword. The first argument is read
+/// with `parse_logical_or`; the second must be a string literal and becomes the
+/// delimiter. Propagates any error from `consume` or `parse_logical_or`, and returns
+/// `Err` when the token in the delimiter position is not a string literal.
+fn parse_unnest<P>(parser: &mut P) -> Result<SqlExpression, String>
+where
+    P: ParsePrimary + ExpressionParser + ?Sized,
+{
+    debug!("parse_unnest: starting");
+
+    // Skip the UNNEST keyword itself, then require the opening parenthesis.
+    ExpressionParser::advance(parser);
+    ExpressionParser::consume(parser, Token::LeftParen)?;
+
+    // First argument: the expression whose value will later be split.
+    let source_expr = parser.parse_logical_or()?;
+    debug!("parse_unnest: parsed column expression");
+    ExpressionParser::consume(parser, Token::Comma)?;
+
+    // Second argument: only a string literal is accepted as the delimiter.
+    let current = ExpressionParser::current_token(parser);
+    let separator = if let Token::StringLiteral(text) = current {
+        text.to_owned()
+    } else {
+        return Err(String::from("UNNEST delimiter must be a string literal"));
+    };
+    ExpressionParser::advance(parser); // step past the literal just captured
+    debug!(delimiter = %separator, "parse_unnest: parsed delimiter");
+
+    ExpressionParser::consume(parser, Token::RightParen)?;
+    debug!("parse_unnest: complete");
+
+    Ok(SqlExpression::Unnest {
+        column: Box::new(source_expr),
+        delimiter: separator,
+    })
+}
+
 /// Trait that parsers must implement to use primary expression parsing
 pub trait ParsePrimary {
     fn current_token(&self) -> &Token;