Skip to content

Commit 5ee841b

Browse files
TimelordUK and claude
committed
feat: Add comment-aware tokenization foundation
Add comment tokens to lexer while maintaining backwards compatibility with existing parser. This is phase 1 of comment preservation support for formatting. Changes: - Add Token::LineComment and Token::BlockComment variants - Add next_token_with_comments() method that preserves comments - Add tokenize_all_with_comments() for formatters - Keep next_token() unchanged (skips comments) for parser compatibility - Add helper methods read_line_comment() and read_block_comment() - Update text_navigation to handle new comment token variants - Add comprehensive tests for comment tokenization Testing: - 4 new lexer tests (line/block/multiple comments, backwards compat) - All 397 existing tests pass - Verified parser still skips comments (backwards compatible) Architecture: This establishes the foundation for comment preservation in formatters. Future work: - Attach comments to AST nodes (leading/trailing/inline) - Update AST formatter to emit comments - Remove workarounds in nvim plugin Related issue: Formatter currently strips all comments 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 5c35f40 commit 5ee841b

2 files changed

Lines changed: 282 additions & 0 deletions

File tree

src/sql/parser/lexer.rs

Lines changed: 280 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,10 @@ pub enum Token {
9898
// String operators
9999
Concat, // || for string concatenation
100100

101+
// Comments (preserved for formatting)
102+
LineComment(String), // -- comment text (without the -- prefix)
103+
BlockComment(String), // /* comment text */ (without delimiters)
104+
101105
// Special
102106
Eof,
103107
}
@@ -284,6 +288,51 @@ impl Lexer {
284288
}
285289
}
286290

291+
/// Read a line comment and return its content (without the -- prefix)
292+
fn read_line_comment(&mut self) -> String {
293+
let mut result = String::new();
294+
295+
// Skip '--'
296+
self.advance();
297+
self.advance();
298+
299+
// Read until end of line or EOF
300+
while let Some(ch) = self.current_char {
301+
if ch == '\n' {
302+
self.advance(); // consume the newline
303+
break;
304+
}
305+
result.push(ch);
306+
self.advance();
307+
}
308+
309+
result
310+
}
311+
312+
/// Read a block comment and return its content (without /* */ delimiters)
313+
fn read_block_comment(&mut self) -> String {
314+
let mut result = String::new();
315+
316+
// Skip '/*'
317+
self.advance();
318+
self.advance();
319+
320+
// Read until we find '*/'
321+
while let Some(ch) = self.current_char {
322+
if ch == '*' && self.peek(1) == Some('/') {
323+
self.advance(); // skip '*'
324+
self.advance(); // skip '/'
325+
break;
326+
}
327+
result.push(ch);
328+
self.advance();
329+
}
330+
331+
result
332+
}
333+
334+
/// Skip whitespace and comments (for backwards compatibility with parser)
335+
/// This is the old behavior that discards comments
287336
fn skip_whitespace_and_comments(&mut self) {
288337
loop {
289338
// Skip whitespace
@@ -398,6 +447,147 @@ impl Lexer {
398447
result
399448
}
400449

450+
/// Get next token while preserving comments as tokens.
///
/// This is the comment-aware counterpart of `next_token()`: it skips
/// whitespace only, so `--` and `/* */` comments surface as
/// `Token::LineComment` / `Token::BlockComment` instead of being
/// discarded. Intended for formatters; the parser keeps using
/// `next_token()`.
///
/// NOTE(review): the arm ORDER in this match is load-bearing — the
/// comment arms must precede the plain `/` (Divide) and `-` (Minus)
/// arms, and the negative-number arm must precede the bare `-` arm.
pub fn next_token_with_comments(&mut self) -> Token {
    // Only skip whitespace, NOT comments
    self.skip_whitespace();

    match self.current_char {
        None => Token::Eof,
        // Handle comments as tokens: "--" line comment, "/*" block comment.
        Some('-') if self.peek(1) == Some('-') => {
            let comment_text = self.read_line_comment();
            Token::LineComment(comment_text)
        }
        Some('/') if self.peek(1) == Some('*') => {
            let comment_text = self.read_block_comment();
            Token::BlockComment(comment_text)
        }
        // Single-character arithmetic / punctuation tokens.
        Some('*') => {
            self.advance();
            Token::Star
        }
        Some('+') => {
            self.advance();
            Token::Plus
        }
        Some('/') => {
            // Regular division (comment case handled above)
            self.advance();
            Token::Divide
        }
        Some('%') => {
            self.advance();
            Token::Modulo
        }
        Some('.') => {
            self.advance();
            Token::Dot
        }
        Some(',') => {
            self.advance();
            Token::Comma
        }
        Some(':') => {
            self.advance();
            Token::Colon
        }
        Some('(') => {
            self.advance();
            Token::LeftParen
        }
        Some(')') => {
            self.advance();
            Token::RightParen
        }
        Some('=') => {
            self.advance();
            Token::Equal
        }
        // Two-character comparison operators: "<=", "<>" (not-equal), "<".
        Some('<') => {
            self.advance();
            if self.current_char == Some('=') {
                self.advance();
                Token::LessThanOrEqual
            } else if self.current_char == Some('>') {
                self.advance();
                Token::NotEqual
            } else {
                Token::LessThan
            }
        }
        Some('>') => {
            self.advance();
            if self.current_char == Some('=') {
                self.advance();
                Token::GreaterThanOrEqual
            } else {
                Token::GreaterThan
            }
        }
        // "!=" — a bare '!' falls through to the catch-all arm below.
        Some('!') if self.peek(1) == Some('=') => {
            self.advance();
            self.advance();
            Token::NotEqual
        }
        // "||" string concatenation.
        Some('|') if self.peek(1) == Some('|') => {
            self.advance();
            self.advance();
            Token::Concat
        }
        // Double-quoted identifier — read_string is shared with the
        // single-quote arm below; presumably it handles both quote
        // styles based on the current character (verify in read_string).
        Some('"') => {
            let ident_val = self.read_string();
            Token::QuotedIdentifier(ident_val)
        }
        // "$JSON$..." embedded JSON block; any other '$' is lexed as an
        // identifier (assumes read_identifier accepts a leading '$' —
        // TODO confirm).
        Some('$') => {
            if self.peek_string(6) == "$JSON$" {
                let json_content = self.read_json_block();
                Token::JsonBlock(json_content)
            } else {
                let ident = self.read_identifier();
                Token::Identifier(ident)
            }
        }
        Some('\'') => {
            let string_val = self.read_string();
            Token::StringLiteral(string_val)
        }
        // '-' directly followed by a digit lexes as a negative number
        // literal. NOTE(review): this means "1-2" (no space) lexes as
        // NumberLiteral("1"), NumberLiteral("-2") rather than a Minus —
        // confirm this matches next_token()'s behavior.
        Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
            self.advance();
            let num = self.read_number();
            Token::NumberLiteral(format!("-{num}"))
        }
        Some('-') => {
            self.advance();
            Token::Minus
        }
        Some(ch) if ch.is_numeric() => {
            let num = self.read_number();
            Token::NumberLiteral(num)
        }
        // '#' prefix (temp-table style names): keep the '#' fused onto
        // the identifier; a lone '#' becomes the identifier "#".
        Some('#') => {
            self.advance();
            let table_name = self.read_identifier();
            if table_name.is_empty() {
                Token::Identifier("#".to_string())
            } else {
                Token::Identifier(format!("#{}", table_name))
            }
        }
        // Keywords and plain identifiers.
        Some(ch) if ch.is_alphabetic() || ch == '_' => {
            let ident = self.read_identifier();
            Token::from_keyword(&ident).unwrap_or_else(|| Token::Identifier(ident))
        }
        // Catch-all: any unrecognized character becomes a one-character
        // Identifier token instead of an error.
        Some(ch) => {
            self.advance();
            Token::Identifier(ch.to_string())
        }
    }
}
588+
589+
/// Get next token (backwards compatible - skips comments)
590+
/// This is the old behavior for existing parser
401591
pub fn next_token(&mut self) -> Token {
402592
self.skip_whitespace_and_comments();
403593

@@ -663,4 +853,94 @@ impl Lexer {
663853
}
664854
tokens
665855
}
856+
857+
/// Tokenize all tokens including comments
858+
/// This is useful for formatting tools that need to preserve comments
859+
pub fn tokenize_all_with_comments(&mut self) -> Vec<Token> {
860+
let mut tokens = Vec::new();
861+
loop {
862+
let token = self.next_token_with_comments();
863+
if matches!(token, Token::Eof) {
864+
tokens.push(token);
865+
break;
866+
}
867+
tokens.push(token);
868+
}
869+
tokens
870+
}
871+
}
872+
873+
#[cfg(test)]
mod tests {
    use super::*;

    /// Line comments should surface as `Token::LineComment` with the
    /// `--` prefix stripped.
    #[test]
    fn test_line_comment_tokenization() {
        let sql = "SELECT col1, -- this is a comment\ncol2 FROM table";
        let mut lexer = Lexer::new(sql);
        let tokens = lexer.tokenize_all_with_comments();

        // Pull out the (single) line-comment token's text, if any.
        let comment_text = tokens.iter().find_map(|t| match t {
            Token::LineComment(text) => Some(text),
            _ => None,
        });
        assert!(comment_text.is_some(), "Should find line comment token");
        if let Some(text) = comment_text {
            assert_eq!(text.trim(), "this is a comment");
        }
    }

    /// Block comments should surface as `Token::BlockComment` with the
    /// `/* */` delimiters stripped.
    #[test]
    fn test_block_comment_tokenization() {
        let sql = "SELECT /* block comment */ col1 FROM table";
        let mut lexer = Lexer::new(sql);
        let tokens = lexer.tokenize_all_with_comments();

        let comment_text = tokens.iter().find_map(|t| match t {
            Token::BlockComment(text) => Some(text),
            _ => None,
        });
        assert!(comment_text.is_some(), "Should find block comment token");
        if let Some(text) = comment_text {
            assert_eq!(text.trim(), "block comment");
        }
    }

    /// Mixed input: every comment (leading, inline, interior) must be
    /// preserved as its own token.
    #[test]
    fn test_multiple_comments() {
        let sql = "-- First comment\nSELECT col1, /* inline */ col2\n-- Second comment\nFROM table";
        let mut lexer = Lexer::new(sql);
        let tokens = lexer.tokenize_all_with_comments();

        let line_count = tokens
            .iter()
            .filter(|t| matches!(t, Token::LineComment(_)))
            .count();
        let block_count = tokens
            .iter()
            .filter(|t| matches!(t, Token::BlockComment(_)))
            .count();

        assert_eq!(line_count, 2, "Should find 2 line comments");
        assert_eq!(block_count, 1, "Should find 1 block comment");
    }

    /// The legacy `next_token()` path (via `tokenize_all`) must keep
    /// discarding comments so the existing parser is unaffected.
    #[test]
    fn test_backwards_compatibility() {
        // Test that next_token() still skips comments
        let sql = "SELECT -- comment\ncol1 FROM table";
        let mut lexer = Lexer::new(sql);
        let tokens = lexer.tokenize_all();

        // Should NOT contain any comment tokens
        for token in &tokens {
            assert!(
                !matches!(token, Token::LineComment(_) | Token::BlockComment(_)),
                "next_token() should skip comments for backwards compatibility"
            );
        }

        // Should still parse correctly
        assert!(tokens.iter().any(|t| matches!(t, Token::Select)));
        assert!(tokens.iter().any(|t| matches!(t, Token::From)));
    }
}

src/text_navigation.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,8 @@ impl TextNavigator {
245245
Token::Cross => "CROSS",
246246
Token::Outer => "OUTER",
247247
Token::On => "ON",
248+
Token::LineComment(text) => text,
249+
Token::BlockComment(text) => text,
248250
Token::Eof => "EOF",
249251
}
250252
}

0 commit comments

Comments
 (0)