• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use crate::lexer::lexer_impl::Lexer;
2 use crate::lexer::lexer_impl::LexerError;
3 use crate::lexer::loc::Loc;
4 use crate::lexer::parser_language::ParserLanguage;
5 use crate::lexer::str_lit::StrLit;
6 use crate::lexer::str_lit::StrLitDecodeError;
7 use crate::lexer::token::Token;
8 use crate::lexer::token::TokenWithLocation;
9 
/// Errors produced while tokenizing parser input.
#[derive(Debug, thiserror::Error)]
pub enum TokenizerError {
    /// Error bubbled up from the underlying lexer.
    #[error(transparent)]
    LexerError(#[from] LexerError),
    /// Error bubbled up from string-literal decoding.
    #[error(transparent)]
    StrLitDecodeError(#[from] StrLitDecodeError),
    /// Tokenizer invariant violation (e.g. `advance` called without a prior lookahead).
    #[error("Internal tokenizer error")]
    InternalError,
    // TODO: too broad
    #[error("Incorrect input")]
    IncorrectInput,
    /// Input that is valid elsewhere but not allowed in the current context.
    #[error("Not allowed in this context: {0}")]
    NotAllowedInThisContext(&'static str),
    #[error("Unexpected end of input")]
    UnexpectedEof,
    #[error("Expecting string literal")]
    ExpectStrLit,
    #[error("Expecting int literal")]
    ExpectIntLit,
    #[error("Expecting float literal")]
    ExpectFloatLit,
    #[error("Expecting identifier")]
    ExpectIdent,
    /// Expected the specific identifier carried in the payload.
    #[error("Expecting identifier `{}`", .0)]
    ExpectNamedIdent(String),
    /// Expected char `.0` while parsing the construct described by `.1`.
    #[error("While parsing {}, expecting char `{}`", .1, .0)]
    ExpectChar(char, &'static str),
    /// Expected any one of the chars in the payload.
    #[error("Expecting any char of: {}", .0.iter().map(|c| format!("`{}`", c)).collect::<Vec<_>>().join(", "))]
    ExpectAnyChar(Vec<char>),
}
40 
/// Convenience alias for results of tokenizer operations.
pub type TokenizerResult<R> = Result<R, TokenizerError>;
42 
/// A single-token-lookahead buffer over [`Lexer`].
#[derive(Clone)]
pub struct Tokenizer<'a> {
    // Underlying lexer producing tokens from the input string.
    lexer: Lexer<'a>,
    // Buffered token after a lookahead; `None` when not yet fetched or consumed.
    next_token: Option<TokenWithLocation>,
    // Location of the most recently fetched token, kept for error reporting
    // after the token itself has been consumed.
    last_token_loc: Option<Loc>,
}
49 
50 impl<'a> Tokenizer<'a> {
new(input: &'a str, comment_style: ParserLanguage) -> Tokenizer<'a>51     pub fn new(input: &'a str, comment_style: ParserLanguage) -> Tokenizer<'a> {
52         Tokenizer {
53             lexer: Lexer::new(input, comment_style),
54             next_token: None,
55             last_token_loc: None,
56         }
57     }
58 
loc(&self) -> Loc59     pub fn loc(&self) -> Loc {
60         // After lookahead return the location of the next token
61         self.next_token
62             .as_ref()
63             .map(|t| t.loc.clone())
64             // After token consumed return the location of that token
65             .or(self.last_token_loc.clone())
66             // Otherwise return the position of lexer
67             .unwrap_or(self.lexer.loc)
68     }
69 
lookahead_loc(&mut self) -> Loc70     pub fn lookahead_loc(&mut self) -> Loc {
71         drop(self.lookahead());
72         // TODO: does not handle EOF properly
73         self.loc()
74     }
75 
lookahead(&mut self) -> TokenizerResult<Option<&Token>>76     fn lookahead(&mut self) -> TokenizerResult<Option<&Token>> {
77         Ok(match self.next_token {
78             Some(ref token) => Some(&token.token),
79             None => {
80                 self.next_token = self.lexer.next_token()?;
81                 self.last_token_loc = self.next_token.as_ref().map(|t| t.loc.clone());
82                 match self.next_token {
83                     Some(ref token) => Some(&token.token),
84                     None => None,
85                 }
86             }
87         })
88     }
89 
lookahead_some(&mut self) -> TokenizerResult<&Token>90     pub fn lookahead_some(&mut self) -> TokenizerResult<&Token> {
91         match self.lookahead()? {
92             Some(token) => Ok(token),
93             None => Err(TokenizerError::UnexpectedEof),
94         }
95     }
96 
next(&mut self) -> TokenizerResult<Option<Token>>97     fn next(&mut self) -> TokenizerResult<Option<Token>> {
98         self.lookahead()?;
99         Ok(self
100             .next_token
101             .take()
102             .map(|TokenWithLocation { token, .. }| token))
103     }
104 
next_some(&mut self) -> TokenizerResult<Token>105     pub fn next_some(&mut self) -> TokenizerResult<Token> {
106         match self.next()? {
107             Some(token) => Ok(token),
108             None => Err(TokenizerError::UnexpectedEof),
109         }
110     }
111 
112     /// Can be called only after lookahead, otherwise it's error
advance(&mut self) -> TokenizerResult<Token>113     pub fn advance(&mut self) -> TokenizerResult<Token> {
114         self.next_token
115             .take()
116             .map(|TokenWithLocation { token, .. }| token)
117             .ok_or(TokenizerError::InternalError)
118     }
119 
120     /// No more tokens
syntax_eof(&mut self) -> TokenizerResult<bool>121     pub fn syntax_eof(&mut self) -> TokenizerResult<bool> {
122         Ok(self.lookahead()?.is_none())
123     }
124 
next_token_if_map<P, R>(&mut self, p: P) -> TokenizerResult<Option<R>> where P: FnOnce(&Token) -> Option<R>,125     pub fn next_token_if_map<P, R>(&mut self, p: P) -> TokenizerResult<Option<R>>
126     where
127         P: FnOnce(&Token) -> Option<R>,
128     {
129         self.lookahead()?;
130         let v = match self.next_token {
131             Some(ref token) => match p(&token.token) {
132                 Some(v) => v,
133                 None => return Ok(None),
134             },
135             _ => return Ok(None),
136         };
137         self.next_token = None;
138         Ok(Some(v))
139     }
140 
next_token_check_map<P, R, E>(&mut self, p: P) -> Result<R, E> where P: FnOnce(&Token) -> Result<R, E>, E: From<TokenizerError>,141     pub fn next_token_check_map<P, R, E>(&mut self, p: P) -> Result<R, E>
142     where
143         P: FnOnce(&Token) -> Result<R, E>,
144         E: From<TokenizerError>,
145     {
146         self.lookahead()?;
147         let r = match self.next_token {
148             Some(ref token) => p(&token.token)?,
149             None => return Err(TokenizerError::UnexpectedEof.into()),
150         };
151         self.next_token = None;
152         Ok(r)
153     }
154 
next_token_if<P>(&mut self, p: P) -> TokenizerResult<Option<Token>> where P: FnOnce(&Token) -> bool,155     fn next_token_if<P>(&mut self, p: P) -> TokenizerResult<Option<Token>>
156     where
157         P: FnOnce(&Token) -> bool,
158     {
159         self.next_token_if_map(|token| if p(token) { Some(token.clone()) } else { None })
160     }
161 
next_ident_if_in(&mut self, idents: &[&str]) -> TokenizerResult<Option<String>>162     pub fn next_ident_if_in(&mut self, idents: &[&str]) -> TokenizerResult<Option<String>> {
163         let v = match self.lookahead()? {
164             Some(&Token::Ident(ref next)) => {
165                 if idents.into_iter().find(|&i| i == next).is_some() {
166                     next.clone()
167                 } else {
168                     return Ok(None);
169                 }
170             }
171             _ => return Ok(None),
172         };
173         self.advance()?;
174         Ok(Some(v))
175     }
176 
next_ident_if_eq(&mut self, word: &str) -> TokenizerResult<bool>177     pub fn next_ident_if_eq(&mut self, word: &str) -> TokenizerResult<bool> {
178         Ok(self.next_ident_if_in(&[word])? != None)
179     }
180 
next_ident_expect_eq(&mut self, word: &str) -> TokenizerResult<()>181     pub fn next_ident_expect_eq(&mut self, word: &str) -> TokenizerResult<()> {
182         if self.next_ident_if_eq(word)? {
183             Ok(())
184         } else {
185             Err(TokenizerError::ExpectNamedIdent(word.to_owned()))
186         }
187     }
188 
next_ident_if_eq_error(&mut self, word: &'static str) -> TokenizerResult<()>189     pub fn next_ident_if_eq_error(&mut self, word: &'static str) -> TokenizerResult<()> {
190         if self.clone().next_ident_if_eq(word)? {
191             // TODO: which context?
192             return Err(TokenizerError::NotAllowedInThisContext(word));
193         }
194         Ok(())
195     }
196 
next_symbol_if_eq(&mut self, symbol: char) -> TokenizerResult<bool>197     pub fn next_symbol_if_eq(&mut self, symbol: char) -> TokenizerResult<bool> {
198         Ok(self.next_token_if(|token| match token {
199             &Token::Symbol(c) if c == symbol => true,
200             _ => false,
201         })? != None)
202     }
203 
next_symbol_expect_eq( &mut self, symbol: char, desc: &'static str, ) -> TokenizerResult<()>204     pub fn next_symbol_expect_eq(
205         &mut self,
206         symbol: char,
207         desc: &'static str,
208     ) -> TokenizerResult<()> {
209         if self.lookahead_is_symbol(symbol)? {
210             self.advance()?;
211             Ok(())
212         } else {
213             Err(TokenizerError::ExpectChar(symbol, desc))
214         }
215     }
216 
next_symbol_expect_eq_oneof(&mut self, symbols: &[char]) -> TokenizerResult<char>217     pub fn next_symbol_expect_eq_oneof(&mut self, symbols: &[char]) -> TokenizerResult<char> {
218         for symbol in symbols {
219             if let Ok(()) = self.next_symbol_expect_eq(*symbol, "ignored") {
220                 return Ok(*symbol);
221             }
222         }
223         Err(TokenizerError::ExpectAnyChar(symbols.to_owned()))
224     }
225 
lookahead_is_str_lit(&mut self) -> TokenizerResult<bool>226     pub fn lookahead_is_str_lit(&mut self) -> TokenizerResult<bool> {
227         Ok(match self.lookahead()? {
228             Some(&Token::StrLit(..)) => true,
229             _ => false,
230         })
231     }
232 
lookahead_is_int_lit(&mut self) -> TokenizerResult<bool>233     pub fn lookahead_is_int_lit(&mut self) -> TokenizerResult<bool> {
234         Ok(match self.lookahead()? {
235             Some(&Token::IntLit(..)) => true,
236             _ => false,
237         })
238     }
239 
lookahead_is_json_number(&mut self) -> TokenizerResult<bool>240     pub fn lookahead_is_json_number(&mut self) -> TokenizerResult<bool> {
241         Ok(match self.lookahead()? {
242             Some(&Token::JsonNumber(..)) => true,
243             _ => false,
244         })
245     }
246 
lookahead_if_symbol(&mut self) -> TokenizerResult<Option<char>>247     pub fn lookahead_if_symbol(&mut self) -> TokenizerResult<Option<char>> {
248         Ok(match self.lookahead()? {
249             Some(&Token::Symbol(c)) => Some(c),
250             _ => None,
251         })
252     }
253 
lookahead_is_symbol(&mut self, symbol: char) -> TokenizerResult<bool>254     pub fn lookahead_is_symbol(&mut self, symbol: char) -> TokenizerResult<bool> {
255         Ok(self.lookahead_if_symbol()? == Some(symbol))
256     }
257 
lookahead_is_ident(&mut self, ident: &str) -> TokenizerResult<bool>258     pub fn lookahead_is_ident(&mut self, ident: &str) -> TokenizerResult<bool> {
259         Ok(match self.lookahead()? {
260             Some(Token::Ident(i)) => i == ident,
261             _ => false,
262         })
263     }
264 
next_ident(&mut self) -> TokenizerResult<String>265     pub fn next_ident(&mut self) -> TokenizerResult<String> {
266         self.next_token_check_map(|token| match token {
267             &Token::Ident(ref ident) => Ok(ident.clone()),
268             _ => Err(TokenizerError::ExpectIdent),
269         })
270     }
271 
next_str_lit(&mut self) -> TokenizerResult<StrLit>272     pub fn next_str_lit(&mut self) -> TokenizerResult<StrLit> {
273         self.next_token_check_map(|token| match token {
274             &Token::StrLit(ref str_lit) => Ok(str_lit.clone()),
275             _ => Err(TokenizerError::ExpectStrLit),
276         })
277     }
278 
next_int_lit(&mut self) -> TokenizerResult<u64>279     pub fn next_int_lit(&mut self) -> TokenizerResult<u64> {
280         self.next_token_check_map(|token| match token {
281             &Token::IntLit(v) => Ok(v),
282             _ => Err(TokenizerError::ExpectIntLit),
283         })
284     }
285 
next_float_lit(&mut self) -> TokenizerResult<f64>286     pub fn next_float_lit(&mut self) -> TokenizerResult<f64> {
287         self.next_token_check_map(|token| match token {
288             &Token::FloatLit(v) => Ok(v),
289             _ => Err(TokenizerError::ExpectFloatLit),
290         })
291     }
292 }
293 
#[cfg(test)]
mod test {

    use super::*;

    /// Run `what` against a tokenizer over `input`, then assert that the
    /// whole input was consumed.
    fn tokenize<P, R>(input: &str, what: P) -> R
    where
        P: FnOnce(&mut Tokenizer) -> TokenizerResult<R>,
    {
        let mut tokenizer = Tokenizer::new(input, ParserLanguage::Proto);
        // `unwrap_or_else` builds the panic message lazily, unlike
        // `expect(&format!(..))` which formats even on success
        // (clippy: expect_fun_call).
        let r = what(&mut tokenizer)
            .unwrap_or_else(|e| panic!("parse failed at {}: {:?}", tokenizer.loc(), e));
        let eof = tokenizer
            .syntax_eof()
            .unwrap_or_else(|e| panic!("check eof failed at {}: {:?}", tokenizer.loc(), e));
        assert!(eof, "{}", tokenizer.loc());
        r
    }

    #[test]
    fn test_ident() {
        let msg = r#"  aabb_c  "#;
        // `next_ident` already returns an owned `String`; the previous
        // `.map(|s| s.to_owned())` was a redundant clone.
        let mess = tokenize(msg, |p| p.next_ident());
        assert_eq!("aabb_c", mess);
    }

    #[test]
    fn test_str_lit() {
        let msg = r#"  "a\nb"  "#;
        let mess = tokenize(msg, |p| p.next_str_lit());
        assert_eq!(
            StrLit {
                escaped: r#"a\nb"#.to_owned()
            },
            mess
        );
    }
}
331