1 //! Lexing `&str` into a sequence of Rust tokens. 2 //! 3 //! Note that strictly speaking the parser in this crate is not required to work 4 //! on tokens which originated from text. Macros, eg, can synthesize tokens out 5 //! of thin air. So, ideally, lexer should be an orthogonal crate. It is however 6 //! convenient to include a text-based lexer here! 7 //! 8 //! Note that these tokens, unlike the tokens we feed into the parser, do 9 //! include info about comments and whitespace. 10 11 use std::ops; 12 13 use crate::{ 14 SyntaxKind::{self, *}, 15 T, 16 }; 17 18 pub struct LexedStr<'a> { 19 text: &'a str, 20 kind: Vec<SyntaxKind>, 21 start: Vec<u32>, 22 error: Vec<LexError>, 23 } 24 25 struct LexError { 26 msg: String, 27 token: u32, 28 } 29 30 impl<'a> LexedStr<'a> { new(text: &'a str) -> LexedStr<'a>31 pub fn new(text: &'a str) -> LexedStr<'a> { 32 let mut conv = Converter::new(text); 33 if let Some(shebang_len) = rustc_lexer::strip_shebang(text) { 34 conv.res.push(SHEBANG, conv.offset); 35 conv.offset = shebang_len; 36 }; 37 38 for token in rustc_lexer::tokenize(&text[conv.offset..]) { 39 let token_text = &text[conv.offset..][..token.len as usize]; 40 41 conv.extend_token(&token.kind, token_text); 42 } 43 44 conv.finalize_with_eof() 45 } 46 single_token(text: &'a str) -> Option<(SyntaxKind, Option<String>)>47 pub fn single_token(text: &'a str) -> Option<(SyntaxKind, Option<String>)> { 48 if text.is_empty() { 49 return None; 50 } 51 52 let token = rustc_lexer::tokenize(text).next()?; 53 if token.len as usize != text.len() { 54 return None; 55 } 56 57 let mut conv = Converter::new(text); 58 conv.extend_token(&token.kind, text); 59 match &*conv.res.kind { 60 [kind] => Some((*kind, conv.res.error.pop().map(|it| it.msg))), 61 _ => None, 62 } 63 } 64 as_str(&self) -> &str65 pub fn as_str(&self) -> &str { 66 self.text 67 } 68 len(&self) -> usize69 pub fn len(&self) -> usize { 70 self.kind.len() - 1 71 } 72 is_empty(&self) -> bool73 pub fn is_empty(&self) -> bool { 74 self.len() == 0 75 } 76 kind(&self, i: usize) -> SyntaxKind77 pub fn kind(&self, i: usize) -> SyntaxKind { 78 assert!(i < self.len()); 79 self.kind[i] 80 } 81 text(&self, i: usize) -> &str82 pub fn text(&self, i: usize) -> &str { 83 self.range_text(i..i + 1) 84 } 85 range_text(&self, r: ops::Range<usize>) -> &str86 pub fn range_text(&self, r: ops::Range<usize>) -> &str { 87 assert!(r.start < r.end && r.end <= self.len()); 88 let lo = self.start[r.start] as usize; 89 let hi = self.start[r.end] as usize; 90 &self.text[lo..hi] 91 } 92 93 // Naming is hard. text_range(&self, i: usize) -> ops::Range<usize>94 pub fn text_range(&self, i: usize) -> ops::Range<usize> { 95 assert!(i < self.len()); 96 let lo = self.start[i] as usize; 97 let hi = self.start[i + 1] as usize; 98 lo..hi 99 } text_start(&self, i: usize) -> usize100 pub fn text_start(&self, i: usize) -> usize { 101 assert!(i <= self.len()); 102 self.start[i] as usize 103 } text_len(&self, i: usize) -> usize104 pub fn text_len(&self, i: usize) -> usize { 105 assert!(i < self.len()); 106 let r = self.text_range(i); 107 r.end - r.start 108 } 109 error(&self, i: usize) -> Option<&str>110 pub fn error(&self, i: usize) -> Option<&str> { 111 assert!(i < self.len()); 112 let err = self.error.binary_search_by_key(&(i as u32), |i| i.token).ok()?; 113 Some(self.error[err].msg.as_str()) 114 } 115 errors(&self) -> impl Iterator<Item = (usize, &str)> + '_116 pub fn errors(&self) -> impl Iterator<Item = (usize, &str)> + '_ { 117 self.error.iter().map(|it| (it.token as usize, it.msg.as_str())) 118 } 119 push(&mut self, kind: SyntaxKind, offset: usize)120 fn push(&mut self, kind: SyntaxKind, offset: usize) { 121 self.kind.push(kind); 122 self.start.push(offset as u32); 123 } 124 } 125 126 struct Converter<'a> { 127 res: LexedStr<'a>, 128 offset: usize, 129 } 130 131 impl<'a> Converter<'a> { new(text: &'a str) -> Self132 fn new(text: &'a str) -> Self { 133 Self { 134 res: LexedStr { text, kind: Vec::new(), start: Vec::new(), error: Vec::new() }, 135 offset: 0, 136 } 137 } 138 finalize_with_eof(mut self) -> LexedStr<'a>139 fn finalize_with_eof(mut self) -> LexedStr<'a> { 140 self.res.push(EOF, self.offset); 141 self.res 142 } 143 push(&mut self, kind: SyntaxKind, len: usize, err: Option<&str>)144 fn push(&mut self, kind: SyntaxKind, len: usize, err: Option<&str>) { 145 self.res.push(kind, self.offset); 146 self.offset += len; 147 148 if let Some(err) = err { 149 let token = self.res.len() as u32; 150 let msg = err.to_string(); 151 self.res.error.push(LexError { msg, token }); 152 } 153 } 154 extend_token(&mut self, kind: &rustc_lexer::TokenKind, token_text: &str)155 fn extend_token(&mut self, kind: &rustc_lexer::TokenKind, token_text: &str) { 156 // A note on an intended tradeoff: 157 // We drop some useful information here (see patterns with double dots `..`) 158 // Storing that info in `SyntaxKind` is not possible due to its layout requirements of 159 // being `u16` that come from `rowan::SyntaxKind`. 160 let mut err = ""; 161 162 let syntax_kind = { 163 match kind { 164 rustc_lexer::TokenKind::LineComment { doc_style: _ } => COMMENT, 165 rustc_lexer::TokenKind::BlockComment { doc_style: _, terminated } => { 166 if !terminated { 167 err = "Missing trailing `*/` symbols to terminate the block comment"; 168 } 169 COMMENT 170 } 171 172 rustc_lexer::TokenKind::Whitespace => WHITESPACE, 173 174 rustc_lexer::TokenKind::Ident if token_text == "_" => UNDERSCORE, 175 rustc_lexer::TokenKind::Ident => { 176 SyntaxKind::from_keyword(token_text).unwrap_or(IDENT) 177 } 178 rustc_lexer::TokenKind::InvalidIdent => { 179 err = "Ident contains invalid characters"; 180 IDENT 181 } 182 183 rustc_lexer::TokenKind::RawIdent => IDENT, 184 rustc_lexer::TokenKind::Literal { kind, .. } => { 185 self.extend_literal(token_text.len(), kind); 186 return; 187 } 188 189 rustc_lexer::TokenKind::Lifetime { starts_with_number } => { 190 if *starts_with_number { 191 err = "Lifetime name cannot start with a number"; 192 } 193 LIFETIME_IDENT 194 } 195 196 rustc_lexer::TokenKind::Semi => T![;], 197 rustc_lexer::TokenKind::Comma => T![,], 198 rustc_lexer::TokenKind::Dot => T![.], 199 rustc_lexer::TokenKind::OpenParen => T!['('], 200 rustc_lexer::TokenKind::CloseParen => T![')'], 201 rustc_lexer::TokenKind::OpenBrace => T!['{'], 202 rustc_lexer::TokenKind::CloseBrace => T!['}'], 203 rustc_lexer::TokenKind::OpenBracket => T!['['], 204 rustc_lexer::TokenKind::CloseBracket => T![']'], 205 rustc_lexer::TokenKind::At => T![@], 206 rustc_lexer::TokenKind::Pound => T![#], 207 rustc_lexer::TokenKind::Tilde => T![~], 208 rustc_lexer::TokenKind::Question => T![?], 209 rustc_lexer::TokenKind::Colon => T![:], 210 rustc_lexer::TokenKind::Dollar => T![$], 211 rustc_lexer::TokenKind::Eq => T![=], 212 rustc_lexer::TokenKind::Bang => T![!], 213 rustc_lexer::TokenKind::Lt => T![<], 214 rustc_lexer::TokenKind::Gt => T![>], 215 rustc_lexer::TokenKind::Minus => T![-], 216 rustc_lexer::TokenKind::And => T![&], 217 rustc_lexer::TokenKind::Or => T![|], 218 rustc_lexer::TokenKind::Plus => T![+], 219 rustc_lexer::TokenKind::Star => T![*], 220 rustc_lexer::TokenKind::Slash => T![/], 221 rustc_lexer::TokenKind::Caret => T![^], 222 rustc_lexer::TokenKind::Percent => T![%], 223 rustc_lexer::TokenKind::Unknown => ERROR, 224 rustc_lexer::TokenKind::UnknownPrefix => { 225 err = "unknown literal prefix"; 226 IDENT 227 } 228 rustc_lexer::TokenKind::Eof => EOF, 229 } 230 }; 231 232 let err = if err.is_empty() { None } else { Some(err) }; 233 self.push(syntax_kind, token_text.len(), err); 234 } 235 extend_literal(&mut self, len: usize, kind: &rustc_lexer::LiteralKind)236 fn extend_literal(&mut self, len: usize, kind: &rustc_lexer::LiteralKind) { 237 let mut err = ""; 238 239 let syntax_kind = match *kind { 240 rustc_lexer::LiteralKind::Int { empty_int, base: _ } => { 241 if empty_int { 242 err = "Missing digits after the integer base prefix"; 243 } 244 INT_NUMBER 245 } 246 rustc_lexer::LiteralKind::Float { empty_exponent, base: _ } => { 247 if empty_exponent { 248 err = "Missing digits after the exponent symbol"; 249 } 250 FLOAT_NUMBER 251 } 252 rustc_lexer::LiteralKind::Char { terminated } => { 253 if !terminated { 254 err = "Missing trailing `'` symbol to terminate the character literal"; 255 } 256 CHAR 257 } 258 rustc_lexer::LiteralKind::Byte { terminated } => { 259 if !terminated { 260 err = "Missing trailing `'` symbol to terminate the byte literal"; 261 } 262 BYTE 263 } 264 rustc_lexer::LiteralKind::Str { terminated } => { 265 if !terminated { 266 err = "Missing trailing `\"` symbol to terminate the string literal"; 267 } 268 STRING 269 } 270 rustc_lexer::LiteralKind::ByteStr { terminated } => { 271 if !terminated { 272 err = "Missing trailing `\"` symbol to terminate the byte string literal"; 273 } 274 BYTE_STRING 275 } 276 rustc_lexer::LiteralKind::CStr { terminated } => { 277 if !terminated { 278 err = "Missing trailing `\"` symbol to terminate the string literal"; 279 } 280 C_STRING 281 } 282 rustc_lexer::LiteralKind::RawStr { n_hashes } => { 283 if n_hashes.is_none() { 284 err = "Invalid raw string literal"; 285 } 286 STRING 287 } 288 rustc_lexer::LiteralKind::RawByteStr { n_hashes } => { 289 if n_hashes.is_none() { 290 err = "Invalid raw string literal"; 291 } 292 BYTE_STRING 293 } 294 rustc_lexer::LiteralKind::RawCStr { n_hashes } => { 295 if n_hashes.is_none() { 296 err = "Invalid raw string literal"; 297 } 298 C_STRING 299 } 300 }; 301 302 let err = if err.is_empty() { None } else { Some(err) }; 303 self.push(syntax_kind, len, err); 304 } 305 } 306