use std::char; use std::convert::TryFrom; use std::f64; use std::fmt; use std::num::ParseFloatError; use std::num::ParseIntError; use super::float; use super::loc::Loc; use super::loc::FIRST_COL; use super::str_lit::StrLit; use super::str_lit::StrLitDecodeError; use super::token::Token; use super::token::TokenWithLocation; use super::ParserLanguage; use crate::text_format::lexer::JsonNumberLit; #[derive(Debug)] pub enum LexerError { IncorrectInput, // TODO: something better than this UnexpectedEof, ExpectChar(char), ParseIntError, ParseFloatError, IncorrectFloatLit, // TODO: how it is different from ParseFloatError? IncorrectJsonEscape, IncorrectJsonNumber, IncorrectUnicodeChar, ExpectHexDigit, ExpectOctDigit, ExpectDecDigit, StrLitDecodeError(StrLitDecodeError), ExpectedIdent, } impl fmt::Display for LexerError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { LexerError::IncorrectInput => write!(f, "Incorrect input"), LexerError::UnexpectedEof => write!(f, "Unexpected EOF"), LexerError::ExpectChar(c) => write!(f, "Expecting char: {}", c), LexerError::ParseIntError => write!(f, "Parse int error"), LexerError::ParseFloatError => write!(f, "Parse float error"), LexerError::IncorrectFloatLit => write!(f, "Incorrect float literal"), LexerError::IncorrectJsonEscape => write!(f, "Incorrect JSON escape"), LexerError::IncorrectJsonNumber => write!(f, "Incorrect JSON number"), LexerError::IncorrectUnicodeChar => write!(f, "Incorrect Unicode char"), LexerError::ExpectHexDigit => write!(f, "Expecting hex digit"), LexerError::ExpectOctDigit => write!(f, "Expecting oct digit"), LexerError::ExpectDecDigit => write!(f, "Expecting dec digit"), LexerError::StrLitDecodeError(e) => write!(f, "{}", e), LexerError::ExpectedIdent => write!(f, "Expecting identifier"), } } } impl std::error::Error for LexerError {} pub type LexerResult = Result; impl From for LexerError { fn from(e: StrLitDecodeError) -> Self { LexerError::StrLitDecodeError(e) } } impl From for LexerError { fn from(_: ParseIntError) -> Self { LexerError::ParseIntError } } impl From for LexerError { fn from(_: ParseFloatError) -> Self { LexerError::ParseFloatError } } impl From for LexerError { fn from(_: float::ProtobufFloatParseError) -> Self { LexerError::IncorrectFloatLit } } #[derive(Copy, Clone)] pub struct Lexer<'a> { language: ParserLanguage, input: &'a str, pos: usize, pub loc: Loc, } fn is_letter(c: char) -> bool { c.is_alphabetic() || c == '_' } impl<'a> Lexer<'a> { pub fn new(input: &'a str, language: ParserLanguage) -> Lexer<'a> { Lexer { language, input, pos: 0, loc: Loc::start(), } } /// No more chars pub fn eof(&self) -> bool { self.pos == self.input.len() } /// Remaining chars fn rem_chars(&self) -> &'a str { &self.input[self.pos..] } pub fn lookahead_char_is bool>(&self, p: P) -> bool { self.lookahead_char().map_or(false, p) } fn lookahead_char_is_in(&self, alphabet: &str) -> bool { self.lookahead_char_is(|c| alphabet.contains(c)) } fn next_char_opt(&mut self) -> Option { let rem = self.rem_chars(); if rem.is_empty() { None } else { let mut char_indices = rem.char_indices(); let (_, c) = char_indices.next().unwrap(); let c_len = char_indices.next().map(|(len, _)| len).unwrap_or(rem.len()); self.pos += c_len; if c == '\n' { self.loc.line += 1; self.loc.col = FIRST_COL; } else { self.loc.col += 1; } Some(c) } } fn next_char(&mut self) -> LexerResult { self.next_char_opt().ok_or(LexerError::UnexpectedEof) } /// Skip whitespaces fn skip_whitespaces(&mut self) { self.take_while(|c| c.is_whitespace()); } fn skip_c_comment(&mut self) -> LexerResult<()> { if self.skip_if_lookahead_is_str("/*") { let end = "*/"; match self.rem_chars().find(end) { None => Err(LexerError::UnexpectedEof), Some(len) => { let new_pos = self.pos + len + end.len(); self.skip_to_pos(new_pos); Ok(()) } } } else { Ok(()) } } fn skip_cpp_comment(&mut self) { if self.skip_if_lookahead_is_str("//") { loop { match self.next_char_opt() { Some('\n') | None => break, _ => {} } } } } fn skip_sh_comment(&mut self) { if self.skip_if_lookahead_is_str("#") { loop { match self.next_char_opt() { Some('\n') | None => break, _ => {} } } } } fn skip_comment(&mut self) -> LexerResult<()> { match self.language { ParserLanguage::Proto => { self.skip_c_comment()?; self.skip_cpp_comment(); } ParserLanguage::TextFormat => { self.skip_sh_comment(); } ParserLanguage::Json => {} } Ok(()) } pub fn skip_ws(&mut self) -> LexerResult<()> { loop { let pos = self.pos; self.skip_whitespaces(); self.skip_comment()?; if pos == self.pos { // Did not advance return Ok(()); } } } pub fn take_while(&mut self, f: F) -> &'a str where F: Fn(char) -> bool, { let start = self.pos; while self.lookahead_char().map(&f) == Some(true) { self.next_char_opt().unwrap(); } let end = self.pos; &self.input[start..end] } fn lookahead_char(&self) -> Option { self.clone().next_char_opt() } fn lookahead_is_str(&self, s: &str) -> bool { self.rem_chars().starts_with(s) } fn skip_if_lookahead_is_str(&mut self, s: &str) -> bool { if self.lookahead_is_str(s) { let new_pos = self.pos + s.len(); self.skip_to_pos(new_pos); true } else { false } } fn next_char_if

(&mut self, p: P) -> Option where P: FnOnce(char) -> bool, { let mut clone = self.clone(); match clone.next_char_opt() { Some(c) if p(c) => { *self = clone; Some(c) } _ => None, } } pub fn next_char_if_eq(&mut self, expect: char) -> bool { self.next_char_if(|c| c == expect) != None } fn next_char_if_in(&mut self, alphabet: &str) -> Option { for c in alphabet.chars() { if self.next_char_if_eq(c) { return Some(c); } } None } fn next_char_expect_eq(&mut self, expect: char) -> LexerResult<()> { if self.next_char_if_eq(expect) { Ok(()) } else { Err(LexerError::ExpectChar(expect)) } } fn next_char_expect

(&mut self, expect: P, err: LexerError) -> LexerResult where P: FnOnce(char) -> bool, { self.next_char_if(expect).ok_or(err) } // str functions /// properly update line and column fn skip_to_pos(&mut self, new_pos: usize) -> &'a str { assert!(new_pos >= self.pos); assert!(new_pos <= self.input.len()); let pos = self.pos; while self.pos != new_pos { self.next_char_opt().unwrap(); } &self.input[pos..new_pos] } // Protobuf grammar // char functions // letter = "A" … "Z" | "a" … "z" // https://github.com/google/protobuf/issues/4565 fn next_letter_opt(&mut self) -> Option { self.next_char_if(is_letter) } // capitalLetter = "A" … "Z" fn _next_capital_letter_opt(&mut self) -> Option { self.next_char_if(|c| c >= 'A' && c <= 'Z') } fn next_ident_part(&mut self) -> Option { self.next_char_if(|c| c.is_ascii_alphanumeric() || c == '_') } // Identifiers // ident = letter { letter | decimalDigit | "_" } fn next_ident_opt(&mut self) -> LexerResult> { if let Some(c) = self.next_letter_opt() { let mut ident = String::new(); ident.push(c); while let Some(c) = self.next_ident_part() { ident.push(c); } Ok(Some(ident)) } else { Ok(None) } } // Integer literals // hexLit = "0" ( "x" | "X" ) hexDigit { hexDigit } fn next_hex_lit_opt(&mut self) -> LexerResult> { Ok( if self.skip_if_lookahead_is_str("0x") || self.skip_if_lookahead_is_str("0X") { let s = self.take_while(|c| c.is_ascii_hexdigit()); Some(u64::from_str_radix(s, 16)? as u64) } else { None }, ) } // decimalLit = ( "1" … "9" ) { decimalDigit } // octalLit = "0" { octalDigit } fn next_decimal_octal_lit_opt(&mut self) -> LexerResult> { // do not advance on number parse error let mut clone = self.clone(); let pos = clone.pos; Ok(if clone.next_char_if(|c| c.is_ascii_digit()) != None { clone.take_while(|c| c.is_ascii_digit()); let value = clone.input[pos..clone.pos].parse()?; *self = clone; Some(value) } else { None }) } // hexDigit = "0" … "9" | "A" … "F" | "a" … "f" fn next_hex_digit(&mut self) -> LexerResult { let mut clone = self.clone(); let r = match clone.next_char()? { c if c >= '0' && c <= '9' => c as u32 - b'0' as u32, c if c >= 'A' && c <= 'F' => c as u32 - b'A' as u32 + 10, c if c >= 'a' && c <= 'f' => c as u32 - b'a' as u32 + 10, _ => return Err(LexerError::ExpectHexDigit), }; *self = clone; Ok(r) } // octalDigit = "0" … "7" fn next_octal_digit(&mut self) -> LexerResult { self.next_char_expect(|c| c >= '0' && c <= '9', LexerError::ExpectOctDigit) .map(|c| c as u32 - '0' as u32) } // decimalDigit = "0" … "9" fn next_decimal_digit(&mut self) -> LexerResult { self.next_char_expect(|c| c >= '0' && c <= '9', LexerError::ExpectDecDigit) .map(|c| c as u32 - '0' as u32) } // decimals = decimalDigit { decimalDigit } fn next_decimal_digits(&mut self) -> LexerResult<()> { self.next_decimal_digit()?; self.take_while(|c| c >= '0' && c <= '9'); Ok(()) } // intLit = decimalLit | octalLit | hexLit pub fn next_int_lit_opt(&mut self) -> LexerResult> { assert_ne!(ParserLanguage::Json, self.language); self.skip_ws()?; if let Some(i) = self.next_hex_lit_opt()? { return Ok(Some(i)); } if let Some(i) = self.next_decimal_octal_lit_opt()? { return Ok(Some(i)); } Ok(None) } // Floating-point literals // exponent = ( "e" | "E" ) [ "+" | "-" ] decimals fn next_exponent_opt(&mut self) -> LexerResult> { if self.next_char_if_in("eE") != None { self.next_char_if_in("+-"); self.next_decimal_digits()?; Ok(Some(())) } else { Ok(None) } } // floatLit = ( decimals "." [ decimals ] [ exponent ] | decimals exponent | "."decimals [ exponent ] ) | "inf" | "nan" fn next_float_lit(&mut self) -> LexerResult<()> { assert_ne!(ParserLanguage::Json, self.language); // "inf" and "nan" are handled as part of ident if self.next_char_if_eq('.') { self.next_decimal_digits()?; self.next_exponent_opt()?; } else { self.next_decimal_digits()?; if self.next_char_if_eq('.') { self.next_decimal_digits()?; self.next_exponent_opt()?; } else { if self.next_exponent_opt()? == None { return Err(LexerError::IncorrectFloatLit); } } } Ok(()) } // String literals // charValue = hexEscape | octEscape | charEscape | /[^\0\n\\]/ // hexEscape = '\' ( "x" | "X" ) hexDigit hexDigit // https://github.com/google/protobuf/issues/4560 // octEscape = '\' octalDigit octalDigit octalDigit // charEscape = '\' ( "a" | "b" | "f" | "n" | "r" | "t" | "v" | '\' | "'" | '"' ) // quote = "'" | '"' pub fn next_byte_value(&mut self) -> LexerResult { match self.next_char()? { '\\' => { match self.next_char()? { '\'' => Ok(b'\''), '"' => Ok(b'"'), '\\' => Ok(b'\\'), 'a' => Ok(b'\x07'), 'b' => Ok(b'\x08'), 'f' => Ok(b'\x0c'), 'n' => Ok(b'\n'), 'r' => Ok(b'\r'), 't' => Ok(b'\t'), 'v' => Ok(b'\x0b'), 'x' => { let d1 = self.next_hex_digit()? as u8; let d2 = self.next_hex_digit()? as u8; Ok(((d1 << 4) | d2) as u8) } d if d >= '0' && d <= '7' => { let mut r = d as u8 - b'0'; for _ in 0..2 { match self.next_octal_digit() { Err(_) => break, Ok(d) => r = (r << 3) + d as u8, } } Ok(r) } // https://github.com/google/protobuf/issues/4562 // TODO: overflow c => Ok(c as u8), } } '\n' | '\0' => Err(LexerError::IncorrectInput), // TODO: check overflow c => Ok(c as u8), } } fn char_try_from(i: u32) -> LexerResult { char::try_from(i).map_err(|_| LexerError::IncorrectUnicodeChar) } pub fn next_json_char_value(&mut self) -> LexerResult { match self.next_char()? { '\\' => match self.next_char()? { '"' => Ok('"'), '\'' => Ok('\''), '\\' => Ok('\\'), '/' => Ok('/'), 'b' => Ok('\x08'), 'f' => Ok('\x0c'), 'n' => Ok('\n'), 'r' => Ok('\r'), 't' => Ok('\t'), 'u' => { let mut v = 0; for _ in 0..4 { let digit = self.next_hex_digit()?; v = v * 16 + digit; } Self::char_try_from(v) } _ => Err(LexerError::IncorrectJsonEscape), }, c => Ok(c), } } // https://github.com/google/protobuf/issues/4564 // strLit = ( "'" { charValue } "'" ) | ( '"' { charValue } '"' ) fn next_str_lit_raw(&mut self) -> LexerResult { let mut raw = String::new(); let mut first = true; loop { if !first { self.skip_ws()?; } let start = self.pos; let q = match self.next_char_if_in("'\"") { Some(q) => q, None if !first => break, None => return Err(LexerError::IncorrectInput), }; first = false; while self.lookahead_char() != Some(q) { self.next_byte_value()?; } self.next_char_expect_eq(q)?; raw.push_str(&self.input[start + 1..self.pos - 1]); } Ok(raw) } fn next_str_lit_raw_opt(&mut self) -> LexerResult> { if self.lookahead_char_is_in("'\"") { Ok(Some(self.next_str_lit_raw()?)) } else { Ok(None) } } /// Parse next token as JSON number fn next_json_number_opt(&mut self) -> LexerResult> { assert_eq!(ParserLanguage::Json, self.language); fn is_digit(c: char) -> bool { c >= '0' && c <= '9' } fn is_digit_1_9(c: char) -> bool { c >= '1' && c <= '9' } if !self.lookahead_char_is_in("-0123456789") { return Ok(None); } let mut s = String::new(); if self.next_char_if_eq('-') { s.push('-'); } if self.next_char_if_eq('0') { s.push('0'); } else { s.push(self.next_char_expect(is_digit_1_9, LexerError::IncorrectJsonNumber)?); while let Some(c) = self.next_char_if(is_digit) { s.push(c); } } if self.next_char_if_eq('.') { s.push('.'); s.push(self.next_char_expect(is_digit, LexerError::IncorrectJsonNumber)?); while let Some(c) = self.next_char_if(is_digit) { s.push(c); } } if let Some(c) = self.next_char_if_in("eE") { s.push(c); if let Some(c) = self.next_char_if_in("+-") { s.push(c); } s.push(self.next_char_expect(is_digit_1_9, LexerError::IncorrectJsonNumber)?); while let Some(c) = self.next_char_if(is_digit) { s.push(c); } } Ok(Some(JsonNumberLit(s))) } fn next_token_inner(&mut self) -> LexerResult { if self.language == ParserLanguage::Json { if let Some(v) = self.next_json_number_opt()? { return Ok(Token::JsonNumber(v)); } } if let Some(ident) = self.next_ident_opt()? { let token = if self.language != ParserLanguage::Json && ident == float::PROTOBUF_NAN { Token::FloatLit(f64::NAN) } else if self.language != ParserLanguage::Json && ident == float::PROTOBUF_INF { Token::FloatLit(f64::INFINITY) } else { Token::Ident(ident.to_owned()) }; return Ok(token); } if self.language != ParserLanguage::Json { let mut clone = self.clone(); let pos = clone.pos; if let Ok(_) = clone.next_float_lit() { let f = float::parse_protobuf_float(&self.input[pos..clone.pos])?; *self = clone; return Ok(Token::FloatLit(f)); } if let Some(lit) = self.next_int_lit_opt()? { return Ok(Token::IntLit(lit)); } } if let Some(escaped) = self.next_str_lit_raw_opt()? { return Ok(Token::StrLit(StrLit { escaped })); } // This branch must be after str lit if let Some(c) = self.next_char_if(|c| c.is_ascii_punctuation()) { return Ok(Token::Symbol(c)); } if let Some(ident) = self.next_ident_opt()? { return Ok(Token::Ident(ident)); } Err(LexerError::IncorrectInput) } pub fn next_token(&mut self) -> LexerResult> { self.skip_ws()?; let loc = self.loc; Ok(if self.eof() { None } else { let token = self.next_token_inner()?; // Skip whitespace here to update location // to the beginning of the next token self.skip_ws()?; Some(TokenWithLocation { token, loc }) }) } } #[cfg(test)] mod test { use super::*; fn lex(input: &str, parse_what: P) -> R where P: FnOnce(&mut Lexer) -> LexerResult, { let mut lexer = Lexer::new(input, ParserLanguage::Proto); let r = parse_what(&mut lexer).expect(&format!("lexer failed at {}", lexer.loc)); assert!(lexer.eof(), "check eof failed at {}", lexer.loc); r } fn lex_opt(input: &str, parse_what: P) -> R where P: FnOnce(&mut Lexer) -> LexerResult>, { let mut lexer = Lexer::new(input, ParserLanguage::Proto); let o = parse_what(&mut lexer).expect(&format!("lexer failed at {}", lexer.loc)); let r = o.expect(&format!("lexer returned none at {}", lexer.loc)); assert!(lexer.eof(), "check eof failed at {}", lexer.loc); r } #[test] fn test_lexer_int_lit() { let msg = r#"10"#; let mess = lex_opt(msg, |p| p.next_int_lit_opt()); assert_eq!(10, mess); } #[test] fn test_lexer_float_lit() { let msg = r#"12.3"#; let mess = lex(msg, |p| p.next_token_inner()); assert_eq!(Token::FloatLit(12.3), mess); } }