1 use crate::lexer::lexer_impl::Lexer; 2 use crate::lexer::lexer_impl::LexerError; 3 use crate::lexer::loc::Loc; 4 use crate::lexer::parser_language::ParserLanguage; 5 use crate::lexer::str_lit::StrLit; 6 use crate::lexer::str_lit::StrLitDecodeError; 7 use crate::lexer::token::Token; 8 use crate::lexer::token::TokenWithLocation; 9 10 #[derive(Debug, thiserror::Error)] 11 pub enum TokenizerError { 12 #[error(transparent)] 13 LexerError(#[from] LexerError), 14 #[error(transparent)] 15 StrLitDecodeError(#[from] StrLitDecodeError), 16 #[error("Internal tokenizer error")] 17 InternalError, 18 // TODO: too broad 19 #[error("Incorrect input")] 20 IncorrectInput, 21 #[error("Not allowed in this context: {0}")] 22 NotAllowedInThisContext(&'static str), 23 #[error("Unexpected end of input")] 24 UnexpectedEof, 25 #[error("Expecting string literal")] 26 ExpectStrLit, 27 #[error("Expecting int literal")] 28 ExpectIntLit, 29 #[error("Expecting float literal")] 30 ExpectFloatLit, 31 #[error("Expecting identifier")] 32 ExpectIdent, 33 #[error("Expecting identifier `{}`", .0)] 34 ExpectNamedIdent(String), 35 #[error("While parsing {}, expecting char `{}`", .1, .0)] 36 ExpectChar(char, &'static str), 37 #[error("Expecting any char of: {}", .0.iter().map(|c| format!("`{}`", c)).collect::<Vec<_>>().join(", "))] 38 ExpectAnyChar(Vec<char>), 39 } 40 41 pub type TokenizerResult<R> = Result<R, TokenizerError>; 42 43 #[derive(Clone)] 44 pub struct Tokenizer<'a> { 45 lexer: Lexer<'a>, 46 next_token: Option<TokenWithLocation>, 47 last_token_loc: Option<Loc>, 48 } 49 50 impl<'a> Tokenizer<'a> { new(input: &'a str, comment_style: ParserLanguage) -> Tokenizer<'a>51 pub fn new(input: &'a str, comment_style: ParserLanguage) -> Tokenizer<'a> { 52 Tokenizer { 53 lexer: Lexer::new(input, comment_style), 54 next_token: None, 55 last_token_loc: None, 56 } 57 } 58 loc(&self) -> Loc59 pub fn loc(&self) -> Loc { 60 // After lookahead return the location of the next token 61 self.next_token 62 .as_ref() 63 .map(|t| t.loc.clone()) 64 // After token consumed return the location of that token 65 .or(self.last_token_loc.clone()) 66 // Otherwise return the position of lexer 67 .unwrap_or(self.lexer.loc) 68 } 69 lookahead_loc(&mut self) -> Loc70 pub fn lookahead_loc(&mut self) -> Loc { 71 drop(self.lookahead()); 72 // TODO: does not handle EOF properly 73 self.loc() 74 } 75 lookahead(&mut self) -> TokenizerResult<Option<&Token>>76 fn lookahead(&mut self) -> TokenizerResult<Option<&Token>> { 77 Ok(match self.next_token { 78 Some(ref token) => Some(&token.token), 79 None => { 80 self.next_token = self.lexer.next_token()?; 81 self.last_token_loc = self.next_token.as_ref().map(|t| t.loc.clone()); 82 match self.next_token { 83 Some(ref token) => Some(&token.token), 84 None => None, 85 } 86 } 87 }) 88 } 89 lookahead_some(&mut self) -> TokenizerResult<&Token>90 pub fn lookahead_some(&mut self) -> TokenizerResult<&Token> { 91 match self.lookahead()? { 92 Some(token) => Ok(token), 93 None => Err(TokenizerError::UnexpectedEof), 94 } 95 } 96 next(&mut self) -> TokenizerResult<Option<Token>>97 fn next(&mut self) -> TokenizerResult<Option<Token>> { 98 self.lookahead()?; 99 Ok(self 100 .next_token 101 .take() 102 .map(|TokenWithLocation { token, .. }| token)) 103 } 104 next_some(&mut self) -> TokenizerResult<Token>105 pub fn next_some(&mut self) -> TokenizerResult<Token> { 106 match self.next()? { 107 Some(token) => Ok(token), 108 None => Err(TokenizerError::UnexpectedEof), 109 } 110 } 111 112 /// Can be called only after lookahead, otherwise it's error advance(&mut self) -> TokenizerResult<Token>113 pub fn advance(&mut self) -> TokenizerResult<Token> { 114 self.next_token 115 .take() 116 .map(|TokenWithLocation { token, .. }| token) 117 .ok_or(TokenizerError::InternalError) 118 } 119 120 /// No more tokens syntax_eof(&mut self) -> TokenizerResult<bool>121 pub fn syntax_eof(&mut self) -> TokenizerResult<bool> { 122 Ok(self.lookahead()?.is_none()) 123 } 124 next_token_if_map<P, R>(&mut self, p: P) -> TokenizerResult<Option<R>> where P: FnOnce(&Token) -> Option<R>,125 pub fn next_token_if_map<P, R>(&mut self, p: P) -> TokenizerResult<Option<R>> 126 where 127 P: FnOnce(&Token) -> Option<R>, 128 { 129 self.lookahead()?; 130 let v = match self.next_token { 131 Some(ref token) => match p(&token.token) { 132 Some(v) => v, 133 None => return Ok(None), 134 }, 135 _ => return Ok(None), 136 }; 137 self.next_token = None; 138 Ok(Some(v)) 139 } 140 next_token_check_map<P, R, E>(&mut self, p: P) -> Result<R, E> where P: FnOnce(&Token) -> Result<R, E>, E: From<TokenizerError>,141 pub fn next_token_check_map<P, R, E>(&mut self, p: P) -> Result<R, E> 142 where 143 P: FnOnce(&Token) -> Result<R, E>, 144 E: From<TokenizerError>, 145 { 146 self.lookahead()?; 147 let r = match self.next_token { 148 Some(ref token) => p(&token.token)?, 149 None => return Err(TokenizerError::UnexpectedEof.into()), 150 }; 151 self.next_token = None; 152 Ok(r) 153 } 154 next_token_if<P>(&mut self, p: P) -> TokenizerResult<Option<Token>> where P: FnOnce(&Token) -> bool,155 fn next_token_if<P>(&mut self, p: P) -> TokenizerResult<Option<Token>> 156 where 157 P: FnOnce(&Token) -> bool, 158 { 159 self.next_token_if_map(|token| if p(token) { Some(token.clone()) } else { None }) 160 } 161 next_ident_if_in(&mut self, idents: &[&str]) -> TokenizerResult<Option<String>>162 pub fn next_ident_if_in(&mut self, idents: &[&str]) -> TokenizerResult<Option<String>> { 163 let v = match self.lookahead()? { 164 Some(&Token::Ident(ref next)) => { 165 if idents.into_iter().find(|&i| i == next).is_some() { 166 next.clone() 167 } else { 168 return Ok(None); 169 } 170 } 171 _ => return Ok(None), 172 }; 173 self.advance()?; 174 Ok(Some(v)) 175 } 176 next_ident_if_eq(&mut self, word: &str) -> TokenizerResult<bool>177 pub fn next_ident_if_eq(&mut self, word: &str) -> TokenizerResult<bool> { 178 Ok(self.next_ident_if_in(&[word])? != None) 179 } 180 next_ident_expect_eq(&mut self, word: &str) -> TokenizerResult<()>181 pub fn next_ident_expect_eq(&mut self, word: &str) -> TokenizerResult<()> { 182 if self.next_ident_if_eq(word)? { 183 Ok(()) 184 } else { 185 Err(TokenizerError::ExpectNamedIdent(word.to_owned())) 186 } 187 } 188 next_ident_if_eq_error(&mut self, word: &'static str) -> TokenizerResult<()>189 pub fn next_ident_if_eq_error(&mut self, word: &'static str) -> TokenizerResult<()> { 190 if self.clone().next_ident_if_eq(word)? { 191 // TODO: which context? 192 return Err(TokenizerError::NotAllowedInThisContext(word)); 193 } 194 Ok(()) 195 } 196 next_symbol_if_eq(&mut self, symbol: char) -> TokenizerResult<bool>197 pub fn next_symbol_if_eq(&mut self, symbol: char) -> TokenizerResult<bool> { 198 Ok(self.next_token_if(|token| match token { 199 &Token::Symbol(c) if c == symbol => true, 200 _ => false, 201 })? != None) 202 } 203 next_symbol_expect_eq( &mut self, symbol: char, desc: &'static str, ) -> TokenizerResult<()>204 pub fn next_symbol_expect_eq( 205 &mut self, 206 symbol: char, 207 desc: &'static str, 208 ) -> TokenizerResult<()> { 209 if self.lookahead_is_symbol(symbol)? { 210 self.advance()?; 211 Ok(()) 212 } else { 213 Err(TokenizerError::ExpectChar(symbol, desc)) 214 } 215 } 216 next_symbol_expect_eq_oneof(&mut self, symbols: &[char]) -> TokenizerResult<char>217 pub fn next_symbol_expect_eq_oneof(&mut self, symbols: &[char]) -> TokenizerResult<char> { 218 for symbol in symbols { 219 if let Ok(()) = self.next_symbol_expect_eq(*symbol, "ignored") { 220 return Ok(*symbol); 221 } 222 } 223 Err(TokenizerError::ExpectAnyChar(symbols.to_owned())) 224 } 225 lookahead_is_str_lit(&mut self) -> TokenizerResult<bool>226 pub fn lookahead_is_str_lit(&mut self) -> TokenizerResult<bool> { 227 Ok(match self.lookahead()? { 228 Some(&Token::StrLit(..)) => true, 229 _ => false, 230 }) 231 } 232 lookahead_is_int_lit(&mut self) -> TokenizerResult<bool>233 pub fn lookahead_is_int_lit(&mut self) -> TokenizerResult<bool> { 234 Ok(match self.lookahead()? { 235 Some(&Token::IntLit(..)) => true, 236 _ => false, 237 }) 238 } 239 lookahead_is_json_number(&mut self) -> TokenizerResult<bool>240 pub fn lookahead_is_json_number(&mut self) -> TokenizerResult<bool> { 241 Ok(match self.lookahead()? { 242 Some(&Token::JsonNumber(..)) => true, 243 _ => false, 244 }) 245 } 246 lookahead_if_symbol(&mut self) -> TokenizerResult<Option<char>>247 pub fn lookahead_if_symbol(&mut self) -> TokenizerResult<Option<char>> { 248 Ok(match self.lookahead()? { 249 Some(&Token::Symbol(c)) => Some(c), 250 _ => None, 251 }) 252 } 253 lookahead_is_symbol(&mut self, symbol: char) -> TokenizerResult<bool>254 pub fn lookahead_is_symbol(&mut self, symbol: char) -> TokenizerResult<bool> { 255 Ok(self.lookahead_if_symbol()? == Some(symbol)) 256 } 257 lookahead_is_ident(&mut self, ident: &str) -> TokenizerResult<bool>258 pub fn lookahead_is_ident(&mut self, ident: &str) -> TokenizerResult<bool> { 259 Ok(match self.lookahead()? { 260 Some(Token::Ident(i)) => i == ident, 261 _ => false, 262 }) 263 } 264 next_ident(&mut self) -> TokenizerResult<String>265 pub fn next_ident(&mut self) -> TokenizerResult<String> { 266 self.next_token_check_map(|token| match token { 267 &Token::Ident(ref ident) => Ok(ident.clone()), 268 _ => Err(TokenizerError::ExpectIdent), 269 }) 270 } 271 next_str_lit(&mut self) -> TokenizerResult<StrLit>272 pub fn next_str_lit(&mut self) -> TokenizerResult<StrLit> { 273 self.next_token_check_map(|token| match token { 274 &Token::StrLit(ref str_lit) => Ok(str_lit.clone()), 275 _ => Err(TokenizerError::ExpectStrLit), 276 }) 277 } 278 next_int_lit(&mut self) -> TokenizerResult<u64>279 pub fn next_int_lit(&mut self) -> TokenizerResult<u64> { 280 self.next_token_check_map(|token| match token { 281 &Token::IntLit(v) => Ok(v), 282 _ => Err(TokenizerError::ExpectIntLit), 283 }) 284 } 285 next_float_lit(&mut self) -> TokenizerResult<f64>286 pub fn next_float_lit(&mut self) -> TokenizerResult<f64> { 287 self.next_token_check_map(|token| match token { 288 &Token::FloatLit(v) => Ok(v), 289 _ => Err(TokenizerError::ExpectFloatLit), 290 }) 291 } 292 } 293 294 #[cfg(test)] 295 mod test { 296 297 use super::*; 298 tokenize<P, R>(input: &str, what: P) -> R where P: FnOnce(&mut Tokenizer) -> TokenizerResult<R>,299 fn tokenize<P, R>(input: &str, what: P) -> R 300 where 301 P: FnOnce(&mut Tokenizer) -> TokenizerResult<R>, 302 { 303 let mut tokenizer = Tokenizer::new(input, ParserLanguage::Proto); 304 let r = what(&mut tokenizer).expect(&format!("parse failed at {}", tokenizer.loc())); 305 let eof = tokenizer 306 .syntax_eof() 307 .expect(&format!("check eof failed at {}", tokenizer.loc())); 308 assert!(eof, "{}", tokenizer.loc()); 309 r 310 } 311 312 #[test] test_ident()313 fn test_ident() { 314 let msg = r#" aabb_c "#; 315 let mess = tokenize(msg, |p| p.next_ident().map(|s| s.to_owned())); 316 assert_eq!("aabb_c", mess); 317 } 318 319 #[test] test_str_lit()320 fn test_str_lit() { 321 let msg = r#" "a\nb" "#; 322 let mess = tokenize(msg, |p| p.next_str_lit()); 323 assert_eq!( 324 StrLit { 325 escaped: r#"a\nb"#.to_owned() 326 }, 327 mess 328 ); 329 } 330 } 331