@@ -810,6 +810,8 @@ pub struct Tokenizer<'a> {
810810 /// If true (the default), the tokenizer will un-escape literal
811811 /// SQL strings. See [`Tokenizer::with_unescape`] for more details.
812812 unescape : bool ,
813+ /// Tokens injected back into the stream (e.g. from MySQL C-style hints)
814+ pending_tokens : Vec < Token > ,
813815}
814816
815817impl < ' a > Tokenizer < ' a > {
@@ -834,6 +836,7 @@ impl<'a> Tokenizer<'a> {
834836 dialect,
835837 query,
836838 unescape : true ,
839+ pending_tokens : Vec :: new ( ) ,
837840 }
838841 }
839842
@@ -936,10 +939,16 @@ impl<'a> Tokenizer<'a> {
936939
937940 /// Get the next token or return None
938941 fn next_token (
939- & self ,
942+ & mut self ,
940943 chars : & mut State ,
941944 prev_token : Option < & Token > ,
942945 ) -> Result < Option < Token > , TokenizerError > {
946+ // Return any previously injected tokens first
947+ {
948+ if let Some ( tok) = self . pending_tokens . pop ( ) {
949+ return Ok ( Some ( tok) ) ;
950+ }
951+ }
943952 match chars. peek ( ) {
944953 Some ( & ch) => match ch {
945954 ' ' => self . consume_and_return ( chars, Token :: Whitespace ( Whitespace :: Space ) ) ,
@@ -2102,14 +2111,14 @@ impl<'a> Tokenizer<'a> {
21022111 }
21032112
21042113 fn tokenize_multiline_comment (
2105- & self ,
2114+ & mut self ,
21062115 chars : & mut State ,
21072116 ) -> Result < Option < Token > , TokenizerError > {
21082117 let mut s = String :: new ( ) ;
21092118 let mut nested = 1 ;
21102119 let mut c_style_comments = false ;
21112120 let supports_nested_comments = self . dialect . supports_nested_comments ( ) ;
2112- let supports_c_style_comments = self . dialect . supports_c_style_comments ( ) ;
2121+ let supports_c_style_comments = self . dialect . supports_c_style_hints ( ) ;
21132122 loop {
21142123 match chars. next ( ) {
21152124 Some ( '/' ) if matches ! ( chars. peek( ) , Some ( '*' ) ) && supports_nested_comments => {
@@ -2120,37 +2129,21 @@ impl<'a> Tokenizer<'a> {
21202129 }
21212130 Some ( '!' ) if supports_c_style_comments => {
21222131 c_style_comments = true ;
2123- // consume the optional version digits and whitespace
2132+ // consume only version digits (leave following whitespace/content intact)
21242133 while let Some ( & c) = chars. peek ( ) {
2125- if c. is_ascii_digit ( ) || c . is_whitespace ( ) {
2134+ if c. is_ascii_digit ( ) {
21262135 chars. next ( ) ;
21272136 } else {
21282137 break ;
21292138 }
21302139 }
21312140 }
2132- // consume all leading whitespaces until the '*/' character if in a C-style comment
2133- Some ( ch) if ch. is_whitespace ( ) && c_style_comments => {
2134- let mut tmp_s = String :: new ( ) ;
2135- while let Some ( c) = chars. next ( ) {
2136- if c. is_whitespace ( ) {
2137- tmp_s. push ( c) ;
2138- } else if c == '*' && chars. peek ( ) == Some ( & '/' ) {
2139- chars. next ( ) ; // consume the '/'
2140- return Ok ( Some ( Token :: make_word ( & s, None ) ) ) ;
2141- } else {
2142- tmp_s. push ( c) ;
2143- s. push_str ( & tmp_s) ;
2144- break ;
2145- }
2146- }
2147- }
21482141 Some ( '*' ) if matches ! ( chars. peek( ) , Some ( '/' ) ) => {
21492142 chars. next ( ) ; // consume the '/'
21502143 nested -= 1 ;
21512144 if nested == 0 {
21522145 if c_style_comments {
2153- break Ok ( Some ( Token :: make_word ( & s , None ) ) ) ;
2146+ break self . inject_tokens_from_c_style_hints_and_return_first ( s ) ;
21542147 }
21552148 break Ok ( Some ( Token :: Whitespace ( Whitespace :: MultiLineComment ( s) ) ) ) ;
21562149 }
@@ -2170,6 +2163,26 @@ impl<'a> Tokenizer<'a> {
21702163 }
21712164 }
21722165
2166+ /// Tokenize the given string using the same dialect/unescape settings and inject
2167+ /// the resulting tokens back into this tokenizer so they are returned before
2168+ /// any further characters from the main stream. Returns the first injected token.
2169+ fn inject_tokens_from_c_style_hints_and_return_first (
2170+ & mut self ,
2171+ inner_sql : String ,
2172+ ) -> Result < Option < Token > , TokenizerError > {
2173+ let trimmed = inner_sql. trim ( ) ;
2174+ if trimmed. is_empty ( ) {
2175+ return Ok ( None ) ;
2176+ }
2177+ let mut inner = Tokenizer :: new ( self . dialect , trimmed) . with_unescape ( self . unescape ) ;
2178+ let tokens = inner. tokenize ( ) ?;
2179+ // push in reverse so we can pop from the end efficiently
2180+ for t in tokens. into_iter ( ) . rev ( ) {
2181+ self . pending_tokens . push ( t) ;
2182+ }
2183+ Ok ( self . pending_tokens . pop ( ) )
2184+ }
2185+
21732186 fn parse_quoted_ident ( & self , chars : & mut State , quote_end : char ) -> ( String , Option < char > ) {
21742187 let mut last_char = None ;
21752188 let mut s = String :: new ( ) ;
@@ -4121,17 +4134,22 @@ mod tests {
41214134
41224135 #[ test]
41234136 fn tokenize_multiline_comment_with_c_style_comment_and_version ( ) {
4124- let sql = String :: from ( "0/*!8000000 word */1" ) ;
4125-
4137+ let sql_multi = String :: from ( "0 /*!50110 KEY_BLOCK_SIZE = 1024*/ 1" ) ;
41264138 let dialect = MySqlDialect { } ;
4127- let tokens = Tokenizer :: new ( & dialect, & sql ) . tokenize ( ) . unwrap ( ) ;
4139+ let tokens = Tokenizer :: new ( & dialect, & sql_multi ) . tokenize ( ) . unwrap ( ) ;
41284140 let expected = vec ! [
41294141 Token :: Number ( "0" . to_string( ) , false ) ,
4142+ Token :: Whitespace ( Whitespace :: Space ) ,
41304143 Token :: Word ( Word {
4131- value: "word " . to_string( ) ,
4144+ value: "KEY_BLOCK_SIZE " . to_string( ) ,
41324145 quote_style: None ,
4133- keyword: Keyword :: NoKeyword ,
4146+ keyword: Keyword :: KEY_BLOCK_SIZE ,
41344147 } ) ,
4148+ Token :: Whitespace ( Whitespace :: Space ) ,
4149+ Token :: Eq ,
4150+ Token :: Whitespace ( Whitespace :: Space ) ,
4151+ Token :: Number ( "1024" . to_string( ) , false ) ,
4152+ Token :: Whitespace ( Whitespace :: Space ) ,
41354153 Token :: Number ( "1" . to_string( ) , false ) ,
41364154 ] ;
41374155 compare ( expected, tokens) ;
0 commit comments