Skip to content

Commit f96249c

Browse files
Added pending tokens for c-style hints
Added a pending-tokens structure so the tokenizer properly returns all tokens found inside a C-style hint comment.
1 parent 015aee8 commit f96249c

4 files changed

Lines changed: 48 additions & 30 deletions

File tree

src/dialect/generic.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ impl Dialect for GenericDialect {
156156
true
157157
}
158158

159-
fn supports_c_style_comments(&self) -> bool {
159+
fn supports_c_style_hints(&self) -> bool {
160160
true
161161
}
162162

src/dialect/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -900,7 +900,7 @@ pub trait Dialect: Debug + Any {
900900

901901
/// Returns true if the dialect supports hints embedded in C-style comments,
902902
/// e.g. `/*! hint */`
903-
fn supports_c_style_comments(&self) -> bool {
903+
fn supports_c_style_hints(&self) -> bool {
904904
false
905905
}
906906

src/dialect/mysql.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ impl Dialect for MySqlDialect {
8585
}
8686

8787
/// see <https://dev.mysql.com/doc/refman/8.4/en/comments.html>
88-
fn supports_c_style_comments(&self) -> bool {
88+
fn supports_c_style_hints(&self) -> bool {
8989
true
9090
}
9191

src/tokenizer.rs

Lines changed: 45 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -810,6 +810,8 @@ pub struct Tokenizer<'a> {
810810
/// If true (the default), the tokenizer will un-escape literal
811811
/// SQL strings See [`Tokenizer::with_unescape`] for more details.
812812
unescape: bool,
813+
/// Tokens injected back into the stream (e.g. from MySQL C-style hints)
814+
pending_tokens: Vec<Token>,
813815
}
814816

815817
impl<'a> Tokenizer<'a> {
@@ -834,6 +836,7 @@ impl<'a> Tokenizer<'a> {
834836
dialect,
835837
query,
836838
unescape: true,
839+
pending_tokens: Vec::new(),
837840
}
838841
}
839842

@@ -936,10 +939,16 @@ impl<'a> Tokenizer<'a> {
936939

937940
/// Get the next token or return None
938941
fn next_token(
939-
&self,
942+
&mut self,
940943
chars: &mut State,
941944
prev_token: Option<&Token>,
942945
) -> Result<Option<Token>, TokenizerError> {
946+
// Return any previously injected token (e.g. from a C-style hint) first
947+
{
948+
if let Some(tok) = self.pending_tokens.pop() {
949+
return Ok(Some(tok));
950+
}
951+
}
943952
match chars.peek() {
944953
Some(&ch) => match ch {
945954
' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
@@ -2102,14 +2111,14 @@ impl<'a> Tokenizer<'a> {
21022111
}
21032112

21042113
fn tokenize_multiline_comment(
2105-
&self,
2114+
&mut self,
21062115
chars: &mut State,
21072116
) -> Result<Option<Token>, TokenizerError> {
21082117
let mut s = String::new();
21092118
let mut nested = 1;
21102119
let mut c_style_comments = false;
21112120
let supports_nested_comments = self.dialect.supports_nested_comments();
2112-
let supports_c_style_comments = self.dialect.supports_c_style_comments();
2121+
let supports_c_style_comments = self.dialect.supports_c_style_hints();
21132122
loop {
21142123
match chars.next() {
21152124
Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
@@ -2120,37 +2129,21 @@ impl<'a> Tokenizer<'a> {
21202129
}
21212130
Some('!') if supports_c_style_comments => {
21222131
c_style_comments = true;
2123-
// consume the optional version digits and whitespace
2132+
// consume only version digits (leave following whitespace/content intact)
21242133
while let Some(&c) = chars.peek() {
2125-
if c.is_ascii_digit() || c.is_whitespace() {
2134+
if c.is_ascii_digit() {
21262135
chars.next();
21272136
} else {
21282137
break;
21292138
}
21302139
}
21312140
}
2132-
// consume all leading whitespaces until the '*/' character if in a C-style comment
2133-
Some(ch) if ch.is_whitespace() && c_style_comments => {
2134-
let mut tmp_s = String::new();
2135-
while let Some(c) = chars.next() {
2136-
if c.is_whitespace() {
2137-
tmp_s.push(c);
2138-
} else if c == '*' && chars.peek() == Some(&'/') {
2139-
chars.next(); // consume the '/'
2140-
return Ok(Some(Token::make_word(&s, None)));
2141-
} else {
2142-
tmp_s.push(c);
2143-
s.push_str(&tmp_s);
2144-
break;
2145-
}
2146-
}
2147-
}
21482141
Some('*') if matches!(chars.peek(), Some('/')) => {
21492142
chars.next(); // consume the '/'
21502143
nested -= 1;
21512144
if nested == 0 {
21522145
if c_style_comments {
2153-
break Ok(Some(Token::make_word(&s, None)));
2146+
break self.inject_tokens_from_c_style_hints_and_return_first(s);
21542147
}
21552148
break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
21562149
}
@@ -2170,6 +2163,26 @@ impl<'a> Tokenizer<'a> {
21702163
}
21712164
}
21722165

2166+
/// Tokenize the given string using the same dialect/unescape settings and inject
2167+
/// the resulting tokens back into this tokenizer so they are returned before
2168+
/// any further characters from the main stream. Returns the first injected token.
2169+
fn inject_tokens_from_c_style_hints_and_return_first(
2170+
&mut self,
2171+
inner_sql: String,
2172+
) -> Result<Option<Token>, TokenizerError> {
2173+
let trimmed = inner_sql.trim();
2174+
if trimmed.is_empty() {
2175+
return Ok(None);
2176+
}
2177+
let mut inner = Tokenizer::new(self.dialect, trimmed).with_unescape(self.unescape);
2178+
let tokens = inner.tokenize()?;
2179+
// push in reverse so we can pop from the end efficiently
2180+
for t in tokens.into_iter().rev() {
2181+
self.pending_tokens.push(t);
2182+
}
2183+
Ok(self.pending_tokens.pop())
2184+
}
2185+
21732186
fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
21742187
let mut last_char = None;
21752188
let mut s = String::new();
@@ -4121,17 +4134,22 @@ mod tests {
41214134

41224135
#[test]
41234136
fn tokenize_multiline_comment_with_c_style_comment_and_version() {
4124-
let sql = String::from("0/*!8000000 word */1");
4125-
4137+
let sql_multi = String::from("0 /*!50110 KEY_BLOCK_SIZE = 1024*/ 1");
41264138
let dialect = MySqlDialect {};
4127-
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
4139+
let tokens = Tokenizer::new(&dialect, &sql_multi).tokenize().unwrap();
41284140
let expected = vec![
41294141
Token::Number("0".to_string(), false),
4142+
Token::Whitespace(Whitespace::Space),
41304143
Token::Word(Word {
4131-
value: "word".to_string(),
4144+
value: "KEY_BLOCK_SIZE".to_string(),
41324145
quote_style: None,
4133-
keyword: Keyword::NoKeyword,
4146+
keyword: Keyword::KEY_BLOCK_SIZE,
41344147
}),
4148+
Token::Whitespace(Whitespace::Space),
4149+
Token::Eq,
4150+
Token::Whitespace(Whitespace::Space),
4151+
Token::Number("1024".to_string(), false),
4152+
Token::Whitespace(Whitespace::Space),
41354153
Token::Number("1".to_string(), false),
41364154
];
41374155
compare(expected, tokens);

0 commit comments

Comments
 (0)