diff --git a/README.md b/README.md index 2738c7a..8d2fd8d 100644 --- a/README.md +++ b/README.md @@ -174,6 +174,7 @@ $ cat events.csv \ | `-H`, `--header` | Print column names as the first output row | | `--json` | Output results as a JSON array of objects (mutually exclusive with `-H`) | | `--max-rows <n>` | Stop if more than `n` data rows are read (exit 1) | +| `--columns` | Read the CSV header row, print each column name on its own line, and exit 0. With `-v`/`--verbose`, also shows the inferred type per column (`name INTEGER`). Respects `--delimiter` and `--tsv`. Mutually exclusive with a query argument. | | `-v`, `--verbose` | Print `Loaded <n> rows in <t>s` to stderr after loading (always on TTY; forced with flag) | | `-h`, `--help` | Show usage help and exit | | `-V`, `--version` | Print version and exit | diff --git a/build.zig b/build.zig index 179d80b..657196d 100644 --- a/build.zig +++ b/build.zig @@ -353,6 +353,80 @@ pub fn build(b: *std.Build) void { test_verbose_short.step.dependOn(b.getInstallStep()); test_step.dependOn(&test_verbose_short.step); + // Integration test 33: --columns prints column names one per line and exits 0 + const test_columns_basic = b.addSystemCommand(&.{ + "bash", "-c", + \\printf 'id,region,amount\n1,east,100\n' | ./zig-out/bin/sql-pipe --columns | diff - <(printf 'id\nregion\namount\n') + }); + test_columns_basic.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_columns_basic.step); + + // Integration test 34: --columns --verbose prints "name TYPE" lines + const test_columns_verbose = b.addSystemCommand(&.{ + "bash", "-c", + \\printf 'id,name,amount\n1,Alice,3.14\n2,Bob,2.72\n' | ./zig-out/bin/sql-pipe --columns --verbose | diff - <(printf 'id INTEGER\nname TEXT\namount REAL\n') + }); + test_columns_verbose.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_columns_verbose.step); + + // Integration test 35: --columns works with --delimiter + const test_columns_delimiter = b.addSystemCommand(&.{ + "bash", "-c", + 
\\printf 'a|b|c\n1|2|3\n' | ./zig-out/bin/sql-pipe --columns -d '|' | diff - <(printf 'a\nb\nc\n') + }); + test_columns_delimiter.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_columns_delimiter.step); + + // Integration test 36: --columns works with --tsv + const test_columns_tsv = b.addSystemCommand(&.{ + "bash", "-c", + \\printf 'col1\tcol2\tcol3\n' | ./zig-out/bin/sql-pipe --columns --tsv | diff - <(printf 'col1\ncol2\ncol3\n') + }); + test_columns_tsv.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_columns_tsv.step); + + // Integration test 37: --columns combined with a query argument exits 1 with error + const test_columns_with_query = b.addSystemCommand(&.{ + "bash", "-c", + \\msg=$(printf 'a,b\n1,2\n' | ./zig-out/bin/sql-pipe --columns 'SELECT * FROM t' 2>&1 >/dev/null; echo "EXIT:$?") + \\echo "$msg" | grep -q 'error: --columns cannot be combined with a query argument' && echo "$msg" | grep -q 'EXIT:1' + }); + test_columns_with_query.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_columns_with_query.step); + + // Integration test 38: --columns --verbose with malformed CSV exits 2 + const test_columns_verbose_bad_csv = b.addSystemCommand(&.{ + "bash", "-c", + \\msg=$(printf 'a,b\n"unterminated' | ./zig-out/bin/sql-pipe --columns --verbose 2>&1 >/dev/null; echo "EXIT:$?") + \\echo "$msg" | grep -q 'row 2: unterminated quoted field' && echo "$msg" | grep -q 'EXIT:2' + }); + test_columns_verbose_bad_csv.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_columns_verbose_bad_csv.step); + + // Integration test 39: --columns --verbose with header-only (no data rows) → all TEXT + const test_columns_verbose_no_data = b.addSystemCommand(&.{ + "bash", "-c", + \\printf 'id,name\n' | ./zig-out/bin/sql-pipe --columns --verbose | diff - <(printf 'id TEXT\nname TEXT\n') + }); + test_columns_verbose_no_data.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_columns_verbose_no_data.step); + + // Integration 
test 40: --columns with empty stdin exits 2 + const test_columns_empty_stdin = b.addSystemCommand(&.{ + "bash", "-c", + \\printf '' | ./zig-out/bin/sql-pipe --columns 2>/dev/null; test $? -eq 2 + }); + test_columns_empty_stdin.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_columns_empty_stdin.step); + + // Integration test 41: -v is a valid alias for --verbose with --columns + const test_columns_short_verbose = b.addSystemCommand(&.{ + "bash", "-c", + \\printf 'id,name\n1,Alice\n' | ./zig-out/bin/sql-pipe --columns -v | diff - <(printf 'id INTEGER\nname TEXT\n') + }); + test_columns_short_verbose.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_columns_short_verbose.step); + // Unit tests for the RFC 4180 CSV parser (src/csv.zig) const unit_tests = b.addTest(.{ .root_module = b.createModule(.{ diff --git a/docs/sql-pipe.1.scd b/docs/sql-pipe.1.scd index 14288ae..b350b31 100644 --- a/docs/sql-pipe.1.scd +++ b/docs/sql-pipe.1.scd @@ -5,6 +5,7 @@ NAME SYNOPSIS *sql-pipe* [OPTIONS] <query> + *sql-pipe* --columns [OPTIONS] DESCRIPTION sql-pipe reads CSV data from standard input, loads it into an in-memory SQLite @@ -57,6 +58,13 @@ OPTIONS code 1 and an error message. Use this to guard against accidentally piping extremely large files into memory. + *--columns* + Read the CSV header row, print each column name on its own line to + standard output, and exit with code 0. When combined with *-v* / + *--verbose*, also shows the inferred type (INTEGER, REAL, or TEXT) + for each column, using the first 100 data rows for inference. Respects + *--delimiter* and *--tsv*. Mutually exclusive with a query argument. + *-h, --help* Print the help message and exit with code 0. 
diff --git a/src/main.zig b/src/main.zig index c2f118f..8d005aa 100644 --- a/src/main.zig +++ b/src/main.zig @@ -17,6 +17,7 @@ const SqlPipeError = error{ MissingQuery, InvalidDelimiter, IncompatibleFlags, + ColumnsWithQuery, InvalidMaxRows, OpenDbFailed, EmptyInput, @@ -73,6 +74,14 @@ const ParsedArgs = struct { verbose: bool, }; +/// Arguments for `--columns` mode. +const ColumnsArgs = struct { + /// CSV field delimiter (default: ','). + delimiter: u8, + /// Show inferred type alongside name when true. + verbose: bool, +}; + /// Result of argument parsing — either parsed arguments or a special action. const ArgsResult = union(enum) { /// Normal execution: run the query. @@ -81,6 +90,8 @@ const ArgsResult = union(enum) { help, /// User requested --version / -V. version, + /// User requested --columns: list column names and exit. + columns: ColumnsArgs, }; // ─── Extracted functions ────────────────────────────── @@ -103,6 +114,9 @@ fn printUsage(writer: *std.Io.Writer) !void { \\ --json Output results as a JSON array of objects \\ --max-rows Stop if more than data rows are read (exit 1) \\ -v, --verbose Force row count to stderr (shown automatically on TTY) + \\ With --columns: show inferred type per column + \\ --columns List column names from header (one per line) and exit + \\ Combine with -v/--verbose to include inferred types \\ -h, --help Show this help message and exit \\ -V, --version Show version and exit \\ @@ -151,6 +165,7 @@ fn parseArgs(args: []const [:0]const u8) SqlPipeError!ArgsResult { var explicit_tsv = false; var max_rows: ?usize = null; var verbose = false; + var list_columns = false; // Loop invariant I: all args[1..i] have been processed; // query holds the first non-flag argument seen, or null; @@ -197,6 +212,8 @@ fn parseArgs(args: []const [:0]const u8) SqlPipeError!ArgsResult { if (max_rows.? 
== 0) return error.InvalidMaxRows; } else if (std.mem.eql(u8, arg, "--verbose") or std.mem.eql(u8, arg, "-v")) { verbose = true; + } else if (std.mem.eql(u8, arg, "--columns")) { + list_columns = true; } else { if (query == null) query = arg; } @@ -206,6 +223,14 @@ fn parseArgs(args: []const [:0]const u8) SqlPipeError!ArgsResult { if (json and header) return error.IncompatibleFlags; + // --columns is mutually exclusive with a query argument + if (list_columns and query != null) + return error.ColumnsWithQuery; + + // --columns mode: list headers and exit + if (list_columns) + return .{ .columns = ColumnsArgs{ .delimiter = delimiter, .verbose = verbose } }; + return .{ .parsed = ParsedArgs{ .query = query orelse return error.MissingQuery, .type_inference = type_inference, @@ -781,7 +806,7 @@ fn levenshteinDistance(a: []const u8, b: []const u8) usize { const cost: usize = if (a[i] == b[j]) 0 else 1; curr[j + 1] = @min(curr[j] + 1, @min(prev[j + 1] + 1, prev[j] + cost)); } - @memcpy(prev[0..b_len + 1], curr[0..b_len + 1]); + @memcpy(prev[0 .. b_len + 1], curr[0 .. 
b_len + 1]); } return prev[b_len]; } @@ -936,75 +961,99 @@ fn fatalSqlWithContext( std.process.exit(@intFromEnum(ExitCode.sql_error)); } -pub fn main(init: std.process.Init.Minimal) void { - var gpa: std.heap.DebugAllocator(.{}) = .init; - defer _ = gpa.deinit(); - const allocator = gpa.allocator(); - - var io = std.Io.Threaded.init_single_threaded; - - var stderr_buf: [1024]u8 = undefined; - var stderr_file_writer = std.Io.File.writer(std.Io.File.stderr(), io.io(), &stderr_buf); - const stderr_writer: *std.Io.Writer = &stderr_file_writer.interface; - - var stdout_buf: [4096]u8 = undefined; - var stdout_file_writer = std.Io.File.writer(std.Io.File.stdout(), io.io(), &stdout_buf); - const stdout_writer: *std.Io.Writer = &stdout_file_writer.interface; +/// runColumns(args, allocator, io, stderr_writer, stdout_writer) → void +/// Pre: args.delimiter is valid; allocator and writers are valid +/// Post: column names from stdin CSV header row are written to stdout, one per line; +/// when args.verbose is true, each line has format "<name> <TYPE>" where TYPE +/// is inferred from the first 100 data rows (INTEGER, REAL, or TEXT) +/// error messages go to stderr; process exits 0 on success, 2 on CSV error +fn runColumns( + args: ColumnsArgs, + allocator: std.mem.Allocator, + io: std.Io, + stderr_writer: *std.Io.Writer, + stdout_writer: *std.Io.Writer, +) void { + var stdin_buf: [4096]u8 = undefined; + var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); + var csv_reader = csv.csvReaderWithDelimiter(allocator, &stdin_file_reader.interface, args.delimiter); - var args_arena = std.heap.ArenaAllocator.init(allocator); - defer args_arena.deinit(); - const args = init.args.toSlice(args_arena.allocator()) catch - fatal("failed to read process arguments", stderr_writer, .usage, .{}); + const header_record = csv_reader.nextRecord() catch |err| switch (err) { + error.UnterminatedQuotedField => fatal("row 1: unterminated quoted field", stderr_writer, .csv_error, .{}), 
+ else => fatal("row 1: failed to parse CSV header", stderr_writer, .csv_error, .{}), + } orelse fatal("empty input (no header row)", stderr_writer, .csv_error, .{}); + defer csv_reader.freeRecord(header_record); - const args_result = parseArgs(args) catch |err| { + const cols = parseHeader(allocator, header_record, stderr_writer) catch |err| { switch (err) { - error.IncompatibleFlags => { - stderr_writer.writeAll("error: --json cannot be combined with --header\n") catch |werr| { - std.log.err("failed to write error message: {}", .{werr}); - }; - stderr_writer.flush() catch |ferr| std.log.err("failed to flush: {}", .{ferr}); - std.process.exit(@intFromEnum(ExitCode.usage)); - }, - error.InvalidMaxRows => { - stderr_writer.writeAll("error: --max-rows must be a positive integer\n") catch |werr| { - std.log.err("failed to write error message: {}", .{werr}); - }; - stderr_writer.flush() catch |ferr| std.log.err("failed to flush: {}", .{ferr}); - std.process.exit(@intFromEnum(ExitCode.usage)); - }, - else => {}, + error.EmptyColumnName => fatal("row 1: empty column name in header", stderr_writer, .csv_error, .{}), + error.NoColumns => fatal("row 1: no columns found in header", stderr_writer, .csv_error, .{}), + else => fatal("row 1: failed to parse header", stderr_writer, .csv_error, .{}), } - printUsage(stderr_writer) catch |werr| { - std.log.err("failed to write usage: {}", .{werr}); - }; - stderr_writer.flush() catch |ferr| std.log.err("failed to flush: {}", .{ferr}); - std.process.exit(@intFromEnum(ExitCode.usage)); }; + defer { + for (cols) |col| allocator.free(col); + allocator.free(cols); + } + // {A1: cols is a non-empty slice of trimmed, BOM-free column names} + + if (args.verbose) { + // Read up to inference_buffer_size rows for type inference + var row_buffer: std.ArrayList([][]u8) = .empty; + defer { + for (row_buffer.items) |row| csv_reader.freeRecord(row); + row_buffer.deinit(allocator); + } + var data_row: usize = 1; // row 1 = header already read; data 
rows start at 2 + // Loop invariant I: row_buffer.items.len ≤ inference_buffer_size + // all items are valid parsed CSV records + // data_row = 1 + number of data rows attempted so far + // Bounding function: inference_buffer_size - row_buffer.items.len + // (decreases for each non-empty row appended; empty rows are counted by + // data_row but do not move the buffer toward the bound — stream must + // be finite for termination) + while (row_buffer.items.len < inference_buffer_size) { + data_row += 1; + const rec = csv_reader.nextRecord() catch |err| switch (err) { + error.UnterminatedQuotedField => fatal( + "row {d}: unterminated quoted field", + stderr_writer, + .csv_error, + .{data_row}, + ), + else => fatal( + "row {d}: failed to parse CSV", + stderr_writer, + .csv_error, + .{data_row}, + ), + } orelse break; + if (rec.len == 0) { + csv_reader.freeRecord(rec); + continue; + } + row_buffer.append(allocator, rec) catch + fatal("out of memory while buffering rows", stderr_writer, .csv_error, .{}); + } + const types = inferTypes(allocator, row_buffer.items, cols.len) catch + fatal("out of memory during type inference", stderr_writer, .csv_error, .{}); + defer allocator.free(types); - switch (args_result) { - .help => { - printUsage(stderr_writer) catch |err| { - std.log.err("failed to write usage: {}", .{err}); - }; - stderr_writer.flush() catch |err| std.log.err("failed to flush: {}", .{err}); - std.process.exit(@intFromEnum(ExitCode.success)); - }, - .version => { - stderr_writer.print("sql-pipe {s}\n", .{VERSION}) catch |err| { - std.log.err("failed to write version: {}", .{err}); + // Loop invariant I: cols[0..i] have been written with type annotation to stdout + // Bounding function: cols.len - i + for (cols, types) |col, t| { + stdout_writer.print("{s} {s}\n", .{ col, @tagName(t) }) catch |err| { + std.log.err("failed to write output: {}", .{err}); }; - stderr_writer.flush() catch |err| std.log.err("failed to flush: {}", .{err}); - 
std.process.exit(@intFromEnum(ExitCode.success)); - }, - .parsed => |parsed| { - run(parsed, allocator, io.io(), stderr_writer, stdout_writer); - stdout_file_writer.flush() catch |err| { - std.log.err("failed to flush stdout: {}", .{err}); - }; - stderr_file_writer.flush() catch |err| { - std.log.err("failed to flush stderr: {}", .{err}); + } + } else { + // Loop invariant I: cols[0..i] have been written to stdout + // Bounding function: cols.len - i + for (cols) |col| { + stdout_writer.print("{s}\n", .{col}) catch |err| { + std.log.err("failed to write output: {}", .{err}); }; - }, + } } } @@ -1206,3 +1255,91 @@ fn run( }; // {A10: all result rows written to stdout as CSV lines} } + +pub fn main(init: std.process.Init.Minimal) void { + var gpa: std.heap.DebugAllocator(.{}) = .init; + defer _ = gpa.deinit(); + const allocator = gpa.allocator(); + + var io = std.Io.Threaded.init_single_threaded; + + var stderr_buf: [1024]u8 = undefined; + var stderr_file_writer = std.Io.File.writer(std.Io.File.stderr(), io.io(), &stderr_buf); + const stderr_writer: *std.Io.Writer = &stderr_file_writer.interface; + + var stdout_buf: [4096]u8 = undefined; + var stdout_file_writer = std.Io.File.writer(std.Io.File.stdout(), io.io(), &stdout_buf); + const stdout_writer: *std.Io.Writer = &stdout_file_writer.interface; + + var args_arena = std.heap.ArenaAllocator.init(allocator); + defer args_arena.deinit(); + const args = init.args.toSlice(args_arena.allocator()) catch + fatal("failed to read process arguments", stderr_writer, .usage, .{}); + + const args_result = parseArgs(args) catch |err| { + switch (err) { + error.IncompatibleFlags => { + stderr_writer.writeAll("error: --json cannot be combined with --header\n") catch |werr| { + std.log.err("failed to write error message: {}", .{werr}); + }; + stderr_writer.flush() catch |ferr| std.log.err("failed to flush: {}", .{ferr}); + std.process.exit(@intFromEnum(ExitCode.usage)); + }, + error.InvalidMaxRows => { + 
stderr_writer.writeAll("error: --max-rows must be a positive integer\n") catch |werr| { + std.log.err("failed to write error message: {}", .{werr}); + }; + stderr_writer.flush() catch |ferr| std.log.err("failed to flush: {}", .{ferr}); + std.process.exit(@intFromEnum(ExitCode.usage)); + }, + error.ColumnsWithQuery => { + stderr_writer.writeAll("error: --columns cannot be combined with a query argument\n") catch |werr| { + std.log.err("failed to write error message: {}", .{werr}); + }; + stderr_writer.flush() catch |ferr| std.log.err("failed to flush: {}", .{ferr}); + std.process.exit(@intFromEnum(ExitCode.usage)); + }, + else => {}, + } + printUsage(stderr_writer) catch |werr| { + std.log.err("failed to write usage: {}", .{werr}); + }; + stderr_writer.flush() catch |ferr| std.log.err("failed to flush: {}", .{ferr}); + std.process.exit(@intFromEnum(ExitCode.usage)); + }; + + switch (args_result) { + .help => { + printUsage(stderr_writer) catch |err| { + std.log.err("failed to write usage: {}", .{err}); + }; + stderr_writer.flush() catch |err| std.log.err("failed to flush: {}", .{err}); + std.process.exit(@intFromEnum(ExitCode.success)); + }, + .version => { + stderr_writer.print("sql-pipe {s}\n", .{VERSION}) catch |err| { + std.log.err("failed to write version: {}", .{err}); + }; + stderr_writer.flush() catch |err| std.log.err("failed to flush: {}", .{err}); + std.process.exit(@intFromEnum(ExitCode.success)); + }, + .columns => |col_args| { + runColumns(col_args, allocator, io.io(), stderr_writer, stdout_writer); + stdout_file_writer.flush() catch |err| { + std.log.err("failed to flush stdout: {}", .{err}); + }; + stderr_file_writer.flush() catch |err| { + std.log.err("failed to flush stderr: {}", .{err}); + }; + }, + .parsed => |parsed| { + run(parsed, allocator, io.io(), stderr_writer, stdout_writer); + stdout_file_writer.flush() catch |err| { + std.log.err("failed to flush stdout: {}", .{err}); + }; + stderr_file_writer.flush() catch |err| { + std.log.err("failed 
to flush stderr: {}", .{err}); + }; + }, + } +}