• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
//! Lexing `&str` into a sequence of Rust tokens.
//!
//! Note that, strictly speaking, the parser in this crate is not required to work
//! on tokens which originated from text. Macros, e.g., can synthesize tokens out
//! of thin air. So, ideally, the lexer should be an orthogonal crate. It is, however,
//! convenient to include a text-based lexer here!
//!
//! Note that these tokens, unlike the tokens we feed into the parser, do
//! include info about comments and whitespace.
10 
11 use std::ops;
12 
13 use crate::{
14     SyntaxKind::{self, *},
15     T,
16 };
17 
18 pub struct LexedStr<'a> {
19     text: &'a str,
20     kind: Vec<SyntaxKind>,
21     start: Vec<u32>,
22     error: Vec<LexError>,
23 }
24 
/// A single lexing problem attached to one token.
struct LexError {
    /// Human-readable description of the problem.
    msg: String,
    /// Index of the token the message refers to.
    token: u32,
}
29 
30 impl<'a> LexedStr<'a> {
new(text: &'a str) -> LexedStr<'a>31     pub fn new(text: &'a str) -> LexedStr<'a> {
32         let mut conv = Converter::new(text);
33         if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
34             conv.res.push(SHEBANG, conv.offset);
35             conv.offset = shebang_len;
36         };
37 
38         for token in rustc_lexer::tokenize(&text[conv.offset..]) {
39             let token_text = &text[conv.offset..][..token.len as usize];
40 
41             conv.extend_token(&token.kind, token_text);
42         }
43 
44         conv.finalize_with_eof()
45     }
46 
single_token(text: &'a str) -> Option<(SyntaxKind, Option<String>)>47     pub fn single_token(text: &'a str) -> Option<(SyntaxKind, Option<String>)> {
48         if text.is_empty() {
49             return None;
50         }
51 
52         let token = rustc_lexer::tokenize(text).next()?;
53         if token.len as usize != text.len() {
54             return None;
55         }
56 
57         let mut conv = Converter::new(text);
58         conv.extend_token(&token.kind, text);
59         match &*conv.res.kind {
60             [kind] => Some((*kind, conv.res.error.pop().map(|it| it.msg))),
61             _ => None,
62         }
63     }
64 
as_str(&self) -> &str65     pub fn as_str(&self) -> &str {
66         self.text
67     }
68 
len(&self) -> usize69     pub fn len(&self) -> usize {
70         self.kind.len() - 1
71     }
72 
is_empty(&self) -> bool73     pub fn is_empty(&self) -> bool {
74         self.len() == 0
75     }
76 
kind(&self, i: usize) -> SyntaxKind77     pub fn kind(&self, i: usize) -> SyntaxKind {
78         assert!(i < self.len());
79         self.kind[i]
80     }
81 
text(&self, i: usize) -> &str82     pub fn text(&self, i: usize) -> &str {
83         self.range_text(i..i + 1)
84     }
85 
range_text(&self, r: ops::Range<usize>) -> &str86     pub fn range_text(&self, r: ops::Range<usize>) -> &str {
87         assert!(r.start < r.end && r.end <= self.len());
88         let lo = self.start[r.start] as usize;
89         let hi = self.start[r.end] as usize;
90         &self.text[lo..hi]
91     }
92 
93     // Naming is hard.
text_range(&self, i: usize) -> ops::Range<usize>94     pub fn text_range(&self, i: usize) -> ops::Range<usize> {
95         assert!(i < self.len());
96         let lo = self.start[i] as usize;
97         let hi = self.start[i + 1] as usize;
98         lo..hi
99     }
text_start(&self, i: usize) -> usize100     pub fn text_start(&self, i: usize) -> usize {
101         assert!(i <= self.len());
102         self.start[i] as usize
103     }
text_len(&self, i: usize) -> usize104     pub fn text_len(&self, i: usize) -> usize {
105         assert!(i < self.len());
106         let r = self.text_range(i);
107         r.end - r.start
108     }
109 
error(&self, i: usize) -> Option<&str>110     pub fn error(&self, i: usize) -> Option<&str> {
111         assert!(i < self.len());
112         let err = self.error.binary_search_by_key(&(i as u32), |i| i.token).ok()?;
113         Some(self.error[err].msg.as_str())
114     }
115 
errors(&self) -> impl Iterator<Item = (usize, &str)> + '_116     pub fn errors(&self) -> impl Iterator<Item = (usize, &str)> + '_ {
117         self.error.iter().map(|it| (it.token as usize, it.msg.as_str()))
118     }
119 
push(&mut self, kind: SyntaxKind, offset: usize)120     fn push(&mut self, kind: SyntaxKind, offset: usize) {
121         self.kind.push(kind);
122         self.start.push(offset as u32);
123     }
124 }
125 
126 struct Converter<'a> {
127     res: LexedStr<'a>,
128     offset: usize,
129 }
130 
131 impl<'a> Converter<'a> {
new(text: &'a str) -> Self132     fn new(text: &'a str) -> Self {
133         Self {
134             res: LexedStr { text, kind: Vec::new(), start: Vec::new(), error: Vec::new() },
135             offset: 0,
136         }
137     }
138 
finalize_with_eof(mut self) -> LexedStr<'a>139     fn finalize_with_eof(mut self) -> LexedStr<'a> {
140         self.res.push(EOF, self.offset);
141         self.res
142     }
143 
push(&mut self, kind: SyntaxKind, len: usize, err: Option<&str>)144     fn push(&mut self, kind: SyntaxKind, len: usize, err: Option<&str>) {
145         self.res.push(kind, self.offset);
146         self.offset += len;
147 
148         if let Some(err) = err {
149             let token = self.res.len() as u32;
150             let msg = err.to_string();
151             self.res.error.push(LexError { msg, token });
152         }
153     }
154 
extend_token(&mut self, kind: &rustc_lexer::TokenKind, token_text: &str)155     fn extend_token(&mut self, kind: &rustc_lexer::TokenKind, token_text: &str) {
156         // A note on an intended tradeoff:
157         // We drop some useful information here (see patterns with double dots `..`)
158         // Storing that info in `SyntaxKind` is not possible due to its layout requirements of
159         // being `u16` that come from `rowan::SyntaxKind`.
160         let mut err = "";
161 
162         let syntax_kind = {
163             match kind {
164                 rustc_lexer::TokenKind::LineComment { doc_style: _ } => COMMENT,
165                 rustc_lexer::TokenKind::BlockComment { doc_style: _, terminated } => {
166                     if !terminated {
167                         err = "Missing trailing `*/` symbols to terminate the block comment";
168                     }
169                     COMMENT
170                 }
171 
172                 rustc_lexer::TokenKind::Whitespace => WHITESPACE,
173 
174                 rustc_lexer::TokenKind::Ident if token_text == "_" => UNDERSCORE,
175                 rustc_lexer::TokenKind::Ident => {
176                     SyntaxKind::from_keyword(token_text).unwrap_or(IDENT)
177                 }
178                 rustc_lexer::TokenKind::InvalidIdent => {
179                     err = "Ident contains invalid characters";
180                     IDENT
181                 }
182 
183                 rustc_lexer::TokenKind::RawIdent => IDENT,
184                 rustc_lexer::TokenKind::Literal { kind, .. } => {
185                     self.extend_literal(token_text.len(), kind);
186                     return;
187                 }
188 
189                 rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
190                     if *starts_with_number {
191                         err = "Lifetime name cannot start with a number";
192                     }
193                     LIFETIME_IDENT
194                 }
195 
196                 rustc_lexer::TokenKind::Semi => T![;],
197                 rustc_lexer::TokenKind::Comma => T![,],
198                 rustc_lexer::TokenKind::Dot => T![.],
199                 rustc_lexer::TokenKind::OpenParen => T!['('],
200                 rustc_lexer::TokenKind::CloseParen => T![')'],
201                 rustc_lexer::TokenKind::OpenBrace => T!['{'],
202                 rustc_lexer::TokenKind::CloseBrace => T!['}'],
203                 rustc_lexer::TokenKind::OpenBracket => T!['['],
204                 rustc_lexer::TokenKind::CloseBracket => T![']'],
205                 rustc_lexer::TokenKind::At => T![@],
206                 rustc_lexer::TokenKind::Pound => T![#],
207                 rustc_lexer::TokenKind::Tilde => T![~],
208                 rustc_lexer::TokenKind::Question => T![?],
209                 rustc_lexer::TokenKind::Colon => T![:],
210                 rustc_lexer::TokenKind::Dollar => T![$],
211                 rustc_lexer::TokenKind::Eq => T![=],
212                 rustc_lexer::TokenKind::Bang => T![!],
213                 rustc_lexer::TokenKind::Lt => T![<],
214                 rustc_lexer::TokenKind::Gt => T![>],
215                 rustc_lexer::TokenKind::Minus => T![-],
216                 rustc_lexer::TokenKind::And => T![&],
217                 rustc_lexer::TokenKind::Or => T![|],
218                 rustc_lexer::TokenKind::Plus => T![+],
219                 rustc_lexer::TokenKind::Star => T![*],
220                 rustc_lexer::TokenKind::Slash => T![/],
221                 rustc_lexer::TokenKind::Caret => T![^],
222                 rustc_lexer::TokenKind::Percent => T![%],
223                 rustc_lexer::TokenKind::Unknown => ERROR,
224                 rustc_lexer::TokenKind::UnknownPrefix => {
225                     err = "unknown literal prefix";
226                     IDENT
227                 }
228                 rustc_lexer::TokenKind::Eof => EOF,
229             }
230         };
231 
232         let err = if err.is_empty() { None } else { Some(err) };
233         self.push(syntax_kind, token_text.len(), err);
234     }
235 
extend_literal(&mut self, len: usize, kind: &rustc_lexer::LiteralKind)236     fn extend_literal(&mut self, len: usize, kind: &rustc_lexer::LiteralKind) {
237         let mut err = "";
238 
239         let syntax_kind = match *kind {
240             rustc_lexer::LiteralKind::Int { empty_int, base: _ } => {
241                 if empty_int {
242                     err = "Missing digits after the integer base prefix";
243                 }
244                 INT_NUMBER
245             }
246             rustc_lexer::LiteralKind::Float { empty_exponent, base: _ } => {
247                 if empty_exponent {
248                     err = "Missing digits after the exponent symbol";
249                 }
250                 FLOAT_NUMBER
251             }
252             rustc_lexer::LiteralKind::Char { terminated } => {
253                 if !terminated {
254                     err = "Missing trailing `'` symbol to terminate the character literal";
255                 }
256                 CHAR
257             }
258             rustc_lexer::LiteralKind::Byte { terminated } => {
259                 if !terminated {
260                     err = "Missing trailing `'` symbol to terminate the byte literal";
261                 }
262                 BYTE
263             }
264             rustc_lexer::LiteralKind::Str { terminated } => {
265                 if !terminated {
266                     err = "Missing trailing `\"` symbol to terminate the string literal";
267                 }
268                 STRING
269             }
270             rustc_lexer::LiteralKind::ByteStr { terminated } => {
271                 if !terminated {
272                     err = "Missing trailing `\"` symbol to terminate the byte string literal";
273                 }
274                 BYTE_STRING
275             }
276             rustc_lexer::LiteralKind::CStr { terminated } => {
277                 if !terminated {
278                     err = "Missing trailing `\"` symbol to terminate the string literal";
279                 }
280                 C_STRING
281             }
282             rustc_lexer::LiteralKind::RawStr { n_hashes } => {
283                 if n_hashes.is_none() {
284                     err = "Invalid raw string literal";
285                 }
286                 STRING
287             }
288             rustc_lexer::LiteralKind::RawByteStr { n_hashes } => {
289                 if n_hashes.is_none() {
290                     err = "Invalid raw string literal";
291                 }
292                 BYTE_STRING
293             }
294             rustc_lexer::LiteralKind::RawCStr { n_hashes } => {
295                 if n_hashes.is_none() {
296                     err = "Invalid raw string literal";
297                 }
298                 C_STRING
299             }
300         };
301 
302         let err = if err.is_empty() { None } else { Some(err) };
303         self.push(syntax_kind, len, err);
304     }
305 }
306