• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2023 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "src/trace_processor/sqlite/sqlite_tokenizer.h"
18 
19 #include <ctype.h>
20 #include <sqlite3.h>
21 #include <cstdint>
22 #include <optional>
23 #include <string_view>
24 
25 #include "perfetto/base/compiler.h"
26 #include "perfetto/base/logging.h"
27 
28 namespace perfetto {
29 namespace trace_processor {
30 
31 // The contents of this file are ~copied from SQLite with some modifications to
32 // minimize the amount copied: i.e. if we can call a libc function/public SQLite
33 // API instead of a private one.
34 //
35 // The changes are as follows:
36 // 1. Remove all ifdefs to only keep branches we actually use
37 // 2. Change handling of |CC_KYWD0| to remove distinction between different
38 //    SQLite keywords, reducing how many things we need to copy over.
39 // 3. Constants are changed from being macro defines to being values in
40 //    |SqliteTokenType|.
41 
42 namespace {
43 
// Per-byte character property bitmask, indexed by the unsigned byte value.
//
// Bits, as populated in the table below:
//   0x01  whitespace: tab, LF, VT, FF, CR and space
//   0x02  ASCII letters
//   0x04  decimal digits
//   0x08  hexadecimal digits (0-9, a-f, A-F)
//   0x20  lower-case ASCII letters
//   0x40  '$', '_' and every byte >= 0x80
//   0x80  quote characters: '"', '\'', '[' and '`'
//
// In this file the table is only consulted through |IdChar|.
constexpr unsigned char sqlite3CtypeMap[256] = {
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  // 00..07
    0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,  // 08..0f
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  // 10..17
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  // 18..1f
    0x01, 0x00, 0x80, 0x00, 0x40, 0x00, 0x00, 0x80,  // 20..27   !"#$%&'
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  // 28..2f  ()*+,-./
    0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c,  // 30..37  01234567
    0x0c, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  // 38..3f  89:;<=>?

    0x00, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x02,  // 40..47  @ABCDEFG
    0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,  // 48..4f  HIJKLMNO
    0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,  // 50..57  PQRSTUVW
    0x02, 0x02, 0x02, 0x80, 0x00, 0x00, 0x00, 0x40,  // 58..5f  XYZ[\]^_
    0x80, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x22,  // 60..67  `abcdefg
    0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,  // 68..6f  hijklmno
    0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,  // 70..77  pqrstuvw
    0x22, 0x22, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00,  // 78..7f  xyz{|}~

    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 80..87
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 88..8f
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 90..97
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 98..9f
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // a0..a7
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // a8..af
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // b0..b7
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // b8..bf

    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // c0..c7
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // c8..cf
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // d0..d7
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // d8..df
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // e0..e7
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // e8..ef
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // f0..f7
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40   // f8..ff
};
81 
// Character classes assigned to each input byte by |aiClass|. The tokenizer
// switches on the class of the first byte of a token.
//
// Class value 28 also appears in |aiClass| (for bytes that cannot start a
// token) but has no named constant here; it is handled by the |default| case
// of the tokenizer switch.
constexpr int CC_X = 0;          // The letter 'x', or start of BLOB literal
constexpr int CC_KYWD0 = 1;      // First letter of a keyword
constexpr int CC_KYWD = 2;       // Alphabetics or '_'.  Usable in a keyword
constexpr int CC_DIGIT = 3;      // Digits
constexpr int CC_DOLLAR = 4;     // '$'
constexpr int CC_VARALPHA = 5;   // '@', '#', ':'.  Alphabetic SQL variables
constexpr int CC_VARNUM = 6;     // '?'.  Numeric SQL variables
constexpr int CC_SPACE = 7;      // Space characters
constexpr int CC_QUOTE = 8;      // '"', '\'', or '`'.  Strings, quoted ids
constexpr int CC_QUOTE2 = 9;     // '['.  [...] style quoted ids
constexpr int CC_PIPE = 10;      // '|'.  Bitwise OR or concatenate
constexpr int CC_MINUS = 11;     // '-'.  Minus or SQL-style comment
constexpr int CC_LT = 12;        // '<'.  Part of < or <= or <>
constexpr int CC_GT = 13;        // '>'.  Part of > or >=
constexpr int CC_EQ = 14;        // '='.  Part of = or ==
constexpr int CC_BANG = 15;      // '!'.  Part of !=
constexpr int CC_SLASH = 16;     // '/'.  / or c-style comment
constexpr int CC_LP = 17;        // '('
constexpr int CC_RP = 18;        // ')'
constexpr int CC_SEMI = 19;      // ';'
constexpr int CC_PLUS = 20;      // '+'
constexpr int CC_STAR = 21;      // '*'
constexpr int CC_PERCENT = 22;   // '%'
constexpr int CC_COMMA = 23;     // ','
constexpr int CC_AND = 24;       // '&'
constexpr int CC_TILDA = 25;     // '~'
constexpr int CC_DOT = 26;       // '.'
constexpr int CC_ID = 27;        // Unicode characters usable in IDs
constexpr int CC_NUL = 29;       // 0x00
constexpr int CC_BOM = 30;       // First byte of UTF8 BOM:  0xEF 0xBB 0xBF
112 
// Maps every possible input byte to its character class (one of the CC_*
// values; 28 marks bytes that cannot start a token). Rows are the high nibble
// of the byte and columns are the low nibble.
// clang-format off
constexpr unsigned char aiClass[256] = {
//          x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  xa  xb  xc  xd  xe  xf
/* 0x */    29, 28, 28, 28, 28, 28, 28, 28, 28,  7,  7, 28,  7,  7, 28, 28,
/* 1x */    28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
/* 2x */     7, 15,  8,  5,  4, 22, 24,  8, 17, 18, 21, 20, 23, 11, 26, 16,
/* 3x */     3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  5, 19, 12, 14, 13,  6,
/* 4x */     5,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
/* 5x */     1,  1,  1,  1,  1,  1,  1,  1,  0,  2,  2,  9, 28, 28, 28,  2,
/* 6x */     8,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
/* 7x */     1,  1,  1,  1,  1,  1,  1,  1,  0,  2,  2, 28, 10, 28, 25, 28,
/* 8x */    27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
/* 9x */    27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
/* Ax */    27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
/* Bx */    27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
/* Cx */    27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
/* Dx */    27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
/* Ex */    27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 30,
/* Fx */    27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27
};
// clang-format on
134 
135 #define IdChar(C) ((sqlite3CtypeMap[static_cast<unsigned char>(C)] & 0x46) != 0)
136 
137 // Copy of |sqlite3GetToken| for use by the PerfettoSql transpiler.
138 //
139 // We copy this function because |sqlite3GetToken| is static to sqlite3.c
140 // in most distributions of SQLite so we cannot call it from our code.
141 //
142 // While we could redefine SQLITE_PRIVATE, pragmatically that will not fly in
143 // all the places we build trace processor so we need to resort to making a
144 // copy.
GetSqliteToken(const unsigned char * z,SqliteTokenType * tokenType)145 int GetSqliteToken(const unsigned char* z, SqliteTokenType* tokenType) {
146   int i, c;
147   switch (aiClass[*z]) { /* Switch on the character-class of the first byte
148                          ** of the token. See the comment on the CC_ defines
149                          ** above. */
150     case CC_SPACE: {
151       for (i = 1; isspace(z[i]); i++) {
152       }
153       *tokenType = SqliteTokenType::TK_SPACE;
154       return i;
155     }
156     case CC_MINUS: {
157       if (z[1] == '-') {
158         for (i = 2; (c = z[i]) != 0 && c != '\n'; i++) {
159         }
160         *tokenType = SqliteTokenType::TK_SPACE; /* IMP: R-22934-25134 */
161         return i;
162       } else if (z[1] == '>') {
163         *tokenType = SqliteTokenType::TK_PTR;
164         return 2 + (z[2] == '>');
165       }
166       *tokenType = SqliteTokenType::TK_MINUS;
167       return 1;
168     }
169     case CC_LP: {
170       *tokenType = SqliteTokenType::TK_LP;
171       return 1;
172     }
173     case CC_RP: {
174       *tokenType = SqliteTokenType::TK_RP;
175       return 1;
176     }
177     case CC_SEMI: {
178       *tokenType = SqliteTokenType::TK_SEMI;
179       return 1;
180     }
181     case CC_PLUS: {
182       *tokenType = SqliteTokenType::TK_PLUS;
183       return 1;
184     }
185     case CC_STAR: {
186       *tokenType = SqliteTokenType::TK_STAR;
187       return 1;
188     }
189     case CC_SLASH: {
190       if (z[1] != '*' || z[2] == 0) {
191         *tokenType = SqliteTokenType::TK_SLASH;
192         return 1;
193       }
194       for (i = 3, c = z[2]; (c != '*' || z[i] != '/') && (c = z[i]) != 0; i++) {
195       }
196       if (c)
197         i++;
198       *tokenType = SqliteTokenType::TK_SPACE; /* IMP: R-22934-25134 */
199       return i;
200     }
201     case CC_PERCENT: {
202       *tokenType = SqliteTokenType::TK_REM;
203       return 1;
204     }
205     case CC_EQ: {
206       *tokenType = SqliteTokenType::TK_EQ;
207       return 1 + (z[1] == '=');
208     }
209     case CC_LT: {
210       if ((c = z[1]) == '=') {
211         *tokenType = SqliteTokenType::TK_LE;
212         return 2;
213       } else if (c == '>') {
214         *tokenType = SqliteTokenType::TK_NE;
215         return 2;
216       } else if (c == '<') {
217         *tokenType = SqliteTokenType::TK_LSHIFT;
218         return 2;
219       } else {
220         *tokenType = SqliteTokenType::TK_LT;
221         return 1;
222       }
223     }
224     case CC_GT: {
225       if ((c = z[1]) == '=') {
226         *tokenType = SqliteTokenType::TK_GE;
227         return 2;
228       } else if (c == '>') {
229         *tokenType = SqliteTokenType::TK_RSHIFT;
230         return 2;
231       } else {
232         *tokenType = SqliteTokenType::TK_GT;
233         return 1;
234       }
235     }
236     case CC_BANG: {
237       if (z[1] != '=') {
238         *tokenType = SqliteTokenType::TK_ILLEGAL;
239         return 1;
240       } else {
241         *tokenType = SqliteTokenType::TK_NE;
242         return 2;
243       }
244     }
245     case CC_PIPE: {
246       if (z[1] != '|') {
247         *tokenType = SqliteTokenType::TK_BITOR;
248         return 1;
249       } else {
250         *tokenType = SqliteTokenType::TK_CONCAT;
251         return 2;
252       }
253     }
254     case CC_COMMA: {
255       *tokenType = SqliteTokenType::TK_COMMA;
256       return 1;
257     }
258     case CC_AND: {
259       *tokenType = SqliteTokenType::TK_BITAND;
260       return 1;
261     }
262     case CC_TILDA: {
263       *tokenType = SqliteTokenType::TK_BITNOT;
264       return 1;
265     }
266     case CC_QUOTE: {
267       int delim = z[0];
268       for (i = 1; (c = z[i]) != 0; i++) {
269         if (c == delim) {
270           if (z[i + 1] == delim) {
271             i++;
272           } else {
273             break;
274           }
275         }
276       }
277       if (c == '\'') {
278         *tokenType = SqliteTokenType::TK_STRING;
279         return i + 1;
280       } else if (c != 0) {
281         *tokenType = SqliteTokenType::TK_ID;
282         return i + 1;
283       } else {
284         *tokenType = SqliteTokenType::TK_ILLEGAL;
285         return i;
286       }
287     }
288     case CC_DOT: {
289       if (!isdigit(z[1])) {
290         *tokenType = SqliteTokenType::TK_DOT;
291         return 1;
292       }
293       [[fallthrough]];
294     }
295     case CC_DIGIT: {
296       *tokenType = SqliteTokenType::TK_INTEGER;
297       if (z[0] == '0' && (z[1] == 'x' || z[1] == 'X') && isxdigit(z[2])) {
298         for (i = 3; isxdigit(z[i]); i++) {
299         }
300         return i;
301       }
302       for (i = 0; isxdigit(z[i]); i++) {
303       }
304       if (z[i] == '.') {
305         i++;
306         while (isxdigit(z[i])) {
307           i++;
308         }
309         *tokenType = SqliteTokenType::TK_FLOAT;
310       }
311       if ((z[i] == 'e' || z[i] == 'E') &&
312           (isdigit(z[i + 1]) ||
313            ((z[i + 1] == '+' || z[i + 1] == '-') && isdigit(z[i + 2])))) {
314         i += 2;
315         while (isdigit(z[i])) {
316           i++;
317         }
318         *tokenType = SqliteTokenType::TK_FLOAT;
319       }
320       while (IdChar(z[i])) {
321         *tokenType = SqliteTokenType::TK_ILLEGAL;
322         i++;
323       }
324       return i;
325     }
326     case CC_QUOTE2: {
327       for (i = 1, c = z[0]; c != ']' && (c = z[i]) != 0; i++) {
328       }
329       *tokenType =
330           c == ']' ? SqliteTokenType::TK_ID : SqliteTokenType::TK_ILLEGAL;
331       return i;
332     }
333     case CC_VARNUM: {
334       *tokenType = SqliteTokenType::TK_VARIABLE;
335       for (i = 1; isdigit(z[i]); i++) {
336       }
337       return i;
338     }
339     case CC_DOLLAR:
340     case CC_VARALPHA: {
341       int n = 0;
342       *tokenType = SqliteTokenType::TK_VARIABLE;
343       for (i = 1; (c = z[i]) != 0; i++) {
344         if (IdChar(c)) {
345           n++;
346         } else if (c == '(' && n > 0) {
347           do {
348             i++;
349           } while ((c = z[i]) != 0 && !isspace(c) && c != ')');
350           if (c == ')') {
351             i++;
352           } else {
353             *tokenType = SqliteTokenType::TK_ILLEGAL;
354           }
355           break;
356         } else if (c == ':' && z[i + 1] == ':') {
357           i++;
358         } else {
359           break;
360         }
361       }
362       if (n == 0)
363         *tokenType = SqliteTokenType::TK_ILLEGAL;
364       return i;
365     }
366     case CC_KYWD0: {
367       for (i = 1; aiClass[z[i]] <= CC_KYWD; i++) {
368       }
369       if (IdChar(z[i])) {
370         /* This token started out using characters that can appear in keywords,
371         ** but z[i] is a character not allowed within keywords, so this must
372         ** be an identifier instead */
373         i++;
374         break;
375       }
376       if (sqlite3_keyword_check(reinterpret_cast<const char*>(z), i)) {
377         *tokenType = SqliteTokenType::TK_GENERIC_KEYWORD;
378       } else {
379         *tokenType = SqliteTokenType::TK_ID;
380       }
381       return i;
382     }
383     case CC_X: {
384       if (z[1] == '\'') {
385         *tokenType = SqliteTokenType::TK_BLOB;
386         for (i = 2; isdigit(z[i]); i++) {
387         }
388         if (z[i] != '\'' || i % 2) {
389           *tokenType = SqliteTokenType::TK_ILLEGAL;
390           while (z[i] && z[i] != '\'') {
391             i++;
392           }
393         }
394         if (z[i])
395           i++;
396         return i;
397       }
398       [[fallthrough]];
399     }
400     case CC_KYWD:
401     case CC_ID: {
402       i = 1;
403       break;
404     }
405     case CC_BOM: {
406       if (z[1] == 0xbb && z[2] == 0xbf) {
407         *tokenType = SqliteTokenType::TK_SPACE;
408         return 3;
409       }
410       i = 1;
411       break;
412     }
413     case CC_NUL: {
414       *tokenType = SqliteTokenType::TK_ILLEGAL;
415       return 0;
416     }
417     default: {
418       *tokenType = SqliteTokenType::TK_ILLEGAL;
419       return 1;
420     }
421   }
422   while (IdChar(z[i])) {
423     i++;
424   }
425   *tokenType = SqliteTokenType::TK_ID;
426   return i;
427 }
428 
429 }  // namespace
430 
SqliteTokenizer(SqlSource sql)431 SqliteTokenizer::SqliteTokenizer(SqlSource sql) : source_(std::move(sql)) {}
432 
Next()433 SqliteTokenizer::Token SqliteTokenizer::Next() {
434   Token token;
435   const char* start = source_.sql().data() + offset_;
436   int n = GetSqliteToken(reinterpret_cast<const unsigned char*>(start),
437                          &token.token_type);
438   offset_ += static_cast<uint32_t>(n);
439   token.str = std::string_view(start, static_cast<uint32_t>(n));
440   return token;
441 }
442 
NextNonWhitespace()443 SqliteTokenizer::Token SqliteTokenizer::NextNonWhitespace() {
444   Token t;
445   for (t = Next(); t.token_type == SqliteTokenType::TK_SPACE; t = Next()) {
446   }
447   return t;
448 }
449 
NextTerminal()450 SqliteTokenizer::Token SqliteTokenizer::NextTerminal() {
451   Token tok = Next();
452   while (!tok.IsTerminal()) {
453     tok = Next();
454   }
455   return tok;
456 }
457 
Substr(const Token & start,const Token & end) const458 SqlSource SqliteTokenizer::Substr(const Token& start, const Token& end) const {
459   uint32_t offset =
460       static_cast<uint32_t>(start.str.data() - source_.sql().c_str());
461   uint32_t len = static_cast<uint32_t>(end.str.data() - start.str.data());
462   return source_.Substr(offset, len);
463 }
464 
SubstrToken(const Token & token) const465 SqlSource SqliteTokenizer::SubstrToken(const Token& token) const {
466   uint32_t offset =
467       static_cast<uint32_t>(token.str.data() - source_.sql().c_str());
468   uint32_t len = static_cast<uint32_t>(token.str.size());
469   return source_.Substr(offset, len);
470 }
471 
AsTraceback(const Token & token) const472 std::string SqliteTokenizer::AsTraceback(const Token& token) const {
473   PERFETTO_CHECK(source_.sql().c_str() <= token.str.data());
474   PERFETTO_CHECK(token.str.data() <=
475                  source_.sql().c_str() + source_.sql().size());
476   uint32_t offset =
477       static_cast<uint32_t>(token.str.data() - source_.sql().c_str());
478   return source_.AsTraceback(offset);
479 }
480 
Rewrite(SqlSource::Rewriter & rewriter,const Token & start,const Token & end,SqlSource rewrite,EndToken end_token) const481 void SqliteTokenizer::Rewrite(SqlSource::Rewriter& rewriter,
482                               const Token& start,
483                               const Token& end,
484                               SqlSource rewrite,
485                               EndToken end_token) const {
486   uint32_t s_off =
487       static_cast<uint32_t>(start.str.data() - source_.sql().c_str());
488   uint32_t e_off =
489       static_cast<uint32_t>(end.str.data() - source_.sql().c_str());
490   uint32_t e_diff = end_token == EndToken::kInclusive
491                         ? static_cast<uint32_t>(end.str.size())
492                         : 0;
493   rewriter.Rewrite(s_off, e_off + e_diff, std::move(rewrite));
494 }
495 
RewriteToken(SqlSource::Rewriter & rewriter,const Token & token,SqlSource rewrite) const496 void SqliteTokenizer::RewriteToken(SqlSource::Rewriter& rewriter,
497                                    const Token& token,
498                                    SqlSource rewrite) const {
499   uint32_t s_off =
500       static_cast<uint32_t>(token.str.data() - source_.sql().c_str());
501   uint32_t e_off = static_cast<uint32_t>(token.str.data() + token.str.size() -
502                                          source_.sql().c_str());
503   rewriter.Rewrite(s_off, e_off, std::move(rewrite));
504 }
505 
506 }  // namespace trace_processor
507 }  // namespace perfetto
508