@@ -98,6 +98,10 @@ pub enum Token {
9898 // String operators
9999 Concat , // || for string concatenation
100100
101+ // Comments (preserved for formatting)
102+ LineComment ( String ) , // -- comment text (without the -- prefix)
103+ BlockComment ( String ) , // /* comment text */ (without delimiters)
104+
101105 // Special
102106 Eof ,
103107}
@@ -284,6 +288,51 @@ impl Lexer {
284288 }
285289 }
286290
291+ /// Read a line comment and return its content (without the -- prefix)
292+ fn read_line_comment ( & mut self ) -> String {
293+ let mut result = String :: new ( ) ;
294+
295+ // Skip '--'
296+ self . advance ( ) ;
297+ self . advance ( ) ;
298+
299+ // Read until end of line or EOF
300+ while let Some ( ch) = self . current_char {
301+ if ch == '\n' {
302+ self . advance ( ) ; // consume the newline
303+ break ;
304+ }
305+ result. push ( ch) ;
306+ self . advance ( ) ;
307+ }
308+
309+ result
310+ }
311+
312+ /// Read a block comment and return its content (without /* */ delimiters)
313+ fn read_block_comment ( & mut self ) -> String {
314+ let mut result = String :: new ( ) ;
315+
316+ // Skip '/*'
317+ self . advance ( ) ;
318+ self . advance ( ) ;
319+
320+ // Read until we find '*/'
321+ while let Some ( ch) = self . current_char {
322+ if ch == '*' && self . peek ( 1 ) == Some ( '/' ) {
323+ self . advance ( ) ; // skip '*'
324+ self . advance ( ) ; // skip '/'
325+ break ;
326+ }
327+ result. push ( ch) ;
328+ self . advance ( ) ;
329+ }
330+
331+ result
332+ }
333+
334+ /// Skip whitespace and comments (for backwards compatibility with parser)
335+ /// This is the old behavior that discards comments
287336 fn skip_whitespace_and_comments ( & mut self ) {
288337 loop {
289338 // Skip whitespace
@@ -398,6 +447,147 @@ impl Lexer {
398447 result
399448 }
400449
450+ /// Get next token while preserving comments as tokens
451+ /// This is the new behavior for comment-aware formatting
452+ pub fn next_token_with_comments ( & mut self ) -> Token {
453+ // Only skip whitespace, NOT comments
454+ self . skip_whitespace ( ) ;
455+
456+ match self . current_char {
457+ None => Token :: Eof ,
458+ // Handle comments as tokens
459+ Some ( '-' ) if self . peek ( 1 ) == Some ( '-' ) => {
460+ let comment_text = self . read_line_comment ( ) ;
461+ Token :: LineComment ( comment_text)
462+ }
463+ Some ( '/' ) if self . peek ( 1 ) == Some ( '*' ) => {
464+ let comment_text = self . read_block_comment ( ) ;
465+ Token :: BlockComment ( comment_text)
466+ }
467+ Some ( '*' ) => {
468+ self . advance ( ) ;
469+ Token :: Star
470+ }
471+ Some ( '+' ) => {
472+ self . advance ( ) ;
473+ Token :: Plus
474+ }
475+ Some ( '/' ) => {
476+ // Regular division (comment case handled above)
477+ self . advance ( ) ;
478+ Token :: Divide
479+ }
480+ Some ( '%' ) => {
481+ self . advance ( ) ;
482+ Token :: Modulo
483+ }
484+ Some ( '.' ) => {
485+ self . advance ( ) ;
486+ Token :: Dot
487+ }
488+ Some ( ',' ) => {
489+ self . advance ( ) ;
490+ Token :: Comma
491+ }
492+ Some ( ':' ) => {
493+ self . advance ( ) ;
494+ Token :: Colon
495+ }
496+ Some ( '(' ) => {
497+ self . advance ( ) ;
498+ Token :: LeftParen
499+ }
500+ Some ( ')' ) => {
501+ self . advance ( ) ;
502+ Token :: RightParen
503+ }
504+ Some ( '=' ) => {
505+ self . advance ( ) ;
506+ Token :: Equal
507+ }
508+ Some ( '<' ) => {
509+ self . advance ( ) ;
510+ if self . current_char == Some ( '=' ) {
511+ self . advance ( ) ;
512+ Token :: LessThanOrEqual
513+ } else if self . current_char == Some ( '>' ) {
514+ self . advance ( ) ;
515+ Token :: NotEqual
516+ } else {
517+ Token :: LessThan
518+ }
519+ }
520+ Some ( '>' ) => {
521+ self . advance ( ) ;
522+ if self . current_char == Some ( '=' ) {
523+ self . advance ( ) ;
524+ Token :: GreaterThanOrEqual
525+ } else {
526+ Token :: GreaterThan
527+ }
528+ }
529+ Some ( '!' ) if self . peek ( 1 ) == Some ( '=' ) => {
530+ self . advance ( ) ;
531+ self . advance ( ) ;
532+ Token :: NotEqual
533+ }
534+ Some ( '|' ) if self . peek ( 1 ) == Some ( '|' ) => {
535+ self . advance ( ) ;
536+ self . advance ( ) ;
537+ Token :: Concat
538+ }
539+ Some ( '"' ) => {
540+ let ident_val = self . read_string ( ) ;
541+ Token :: QuotedIdentifier ( ident_val)
542+ }
543+ Some ( '$' ) => {
544+ if self . peek_string ( 6 ) == "$JSON$" {
545+ let json_content = self . read_json_block ( ) ;
546+ Token :: JsonBlock ( json_content)
547+ } else {
548+ let ident = self . read_identifier ( ) ;
549+ Token :: Identifier ( ident)
550+ }
551+ }
552+ Some ( '\'' ) => {
553+ let string_val = self . read_string ( ) ;
554+ Token :: StringLiteral ( string_val)
555+ }
556+ Some ( '-' ) if self . peek ( 1 ) . is_some_and ( char:: is_numeric) => {
557+ self . advance ( ) ;
558+ let num = self . read_number ( ) ;
559+ Token :: NumberLiteral ( format ! ( "-{num}" ) )
560+ }
561+ Some ( '-' ) => {
562+ self . advance ( ) ;
563+ Token :: Minus
564+ }
565+ Some ( ch) if ch. is_numeric ( ) => {
566+ let num = self . read_number ( ) ;
567+ Token :: NumberLiteral ( num)
568+ }
569+ Some ( '#' ) => {
570+ self . advance ( ) ;
571+ let table_name = self . read_identifier ( ) ;
572+ if table_name. is_empty ( ) {
573+ Token :: Identifier ( "#" . to_string ( ) )
574+ } else {
575+ Token :: Identifier ( format ! ( "#{}" , table_name) )
576+ }
577+ }
578+ Some ( ch) if ch. is_alphabetic ( ) || ch == '_' => {
579+ let ident = self . read_identifier ( ) ;
580+ Token :: from_keyword ( & ident) . unwrap_or_else ( || Token :: Identifier ( ident) )
581+ }
582+ Some ( ch) => {
583+ self . advance ( ) ;
584+ Token :: Identifier ( ch. to_string ( ) )
585+ }
586+ }
587+ }
588+
589+ /// Get next token (backwards compatible - skips comments)
590+ /// This is the old behavior for existing parser
401591 pub fn next_token ( & mut self ) -> Token {
402592 self . skip_whitespace_and_comments ( ) ;
403593
@@ -663,4 +853,94 @@ impl Lexer {
663853 }
664854 tokens
665855 }
856+
857+ /// Tokenize all tokens including comments
858+ /// This is useful for formatting tools that need to preserve comments
859+ pub fn tokenize_all_with_comments ( & mut self ) -> Vec < Token > {
860+ let mut tokens = Vec :: new ( ) ;
861+ loop {
862+ let token = self . next_token_with_comments ( ) ;
863+ if matches ! ( token, Token :: Eof ) {
864+ tokens. push ( token) ;
865+ break ;
866+ }
867+ tokens. push ( token) ;
868+ }
869+ tokens
870+ }
871+ }
872+
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_line_comment_tokenization() {
        let sql = "SELECT col1, -- this is a comment\n col2 FROM table";
        let tokens = Lexer::new(sql).tokenize_all_with_comments();

        // The comment body must survive, minus the "--" prefix.
        match tokens.iter().find(|t| matches!(t, Token::LineComment(_))) {
            Some(Token::LineComment(text)) => assert_eq!(text.trim(), "this is a comment"),
            _ => panic!("Should find line comment token"),
        }
    }

    #[test]
    fn test_block_comment_tokenization() {
        let sql = "SELECT /* block comment */ col1 FROM table";
        let tokens = Lexer::new(sql).tokenize_all_with_comments();

        // The comment body must survive, minus the "/*" "*/" delimiters.
        match tokens.iter().find(|t| matches!(t, Token::BlockComment(_))) {
            Some(Token::BlockComment(text)) => assert_eq!(text.trim(), "block comment"),
            _ => panic!("Should find block comment token"),
        }
    }

    #[test]
    fn test_multiple_comments() {
        let sql = "-- First comment\n SELECT col1, /* inline */ col2\n -- Second comment\n FROM table";
        let tokens = Lexer::new(sql).tokenize_all_with_comments();

        let line_comment_count = tokens
            .iter()
            .filter(|t| matches!(t, Token::LineComment(_)))
            .count();
        let block_comment_count = tokens
            .iter()
            .filter(|t| matches!(t, Token::BlockComment(_)))
            .count();

        assert_eq!(line_comment_count, 2, "Should find 2 line comments");
        assert_eq!(block_comment_count, 1, "Should find 1 block comment");
    }

    #[test]
    fn test_backwards_compatibility() {
        // next_token() (via tokenize_all) must still discard comments.
        let sql = "SELECT -- comment\n col1 FROM table";
        let tokens = Lexer::new(sql).tokenize_all();

        // No comment token may appear in the output.
        assert!(
            !tokens
                .iter()
                .any(|t| matches!(t, Token::LineComment(_) | Token::BlockComment(_))),
            "next_token() should skip comments for backwards compatibility"
        );

        // The surrounding statement must still tokenize correctly.
        assert!(tokens.iter().any(|t| matches!(t, Token::Select)));
        assert!(tokens.iter().any(|t| matches!(t, Token::From)));
    }
}
0 commit comments