1 /*
2 * Copyright (C) 2023 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "src/trace_processor/sqlite/sqlite_tokenizer.h"
18
19 #include <ctype.h>
20 #include <sqlite3.h>
21 #include <cstdint>
22 #include <optional>
23 #include <string_view>
24
25 #include "perfetto/base/compiler.h"
26 #include "perfetto/base/logging.h"
27
28 namespace perfetto {
29 namespace trace_processor {
30
31 // The contents of this file are ~copied from SQLite with some modifications to
32 // minimize the amount copied: i.e. if we can call a libc function/public SQLite
33 // API instead of a private one.
34 //
35 // The changes are as follows:
36 // 1. Remove all ifdefs to only keep branches we actually use
// 2. Change handling of |CC_KYWD0| to remove the distinction between different
//    SQLite keywords, reducing how many things we need to copy over.
// 3. Constants are changed from being macro defines to being values in
//    |SqliteTokenType|.
41
42 namespace {
43
// Per-byte character property flags, indexed by the raw byte value. The bits
// that occur in the entries below are:
//   0x01  tab, LF, VT, FF, CR and space
//   0x02  ASCII letters
//   0x04  decimal digits
//   0x08  hexadecimal digits (0-9, A-F, a-f)
//   0x20  lowercase ASCII letters
//   0x40  '$', '_' and every byte >= 0x80
//   0x80  the quote characters '"', '\'', '[' and '`'
// clang-format off
const unsigned char sqlite3CtypeMap[256] = {
    /*         x0    x1    x2    x3    x4    x5    x6    x7    x8    x9    xa    xb    xc    xd    xe    xf */
    /* 0x */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,
    /* 1x */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    /* 2x */ 0x01, 0x00, 0x80, 0x00, 0x40, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    /* 3x */ 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    /* 4x */ 0x00, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
    /* 5x */ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x80, 0x00, 0x00, 0x00, 0x40,
    /* 6x */ 0x80, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
    /* 7x */ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00,
    /* 8x */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 9x */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* Ax */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* Bx */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* Cx */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* Dx */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* Ex */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* Fx */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40
};
// clang-format on
81
// Character classes: the values stored in |aiClass| and switched on by the
// tokenizer to decide how a token starting with a given byte is scanned.
//
// These are enumerators rather than macros so that they are scoped to this
// namespace and so that |CC_ILLEGAL| can be named (for documentation of the
// value 28 in |aiClass|) without tripping unused-macro diagnostics.
enum : unsigned char {
  CC_X = 0,         // The letter 'x', or start of BLOB literal
  CC_KYWD0 = 1,     // First letter of a keyword
  CC_KYWD = 2,      // Alphabetics or '_'. Usable in a keyword
  CC_DIGIT = 3,     // Digits
  CC_DOLLAR = 4,    // '$'
  CC_VARALPHA = 5,  // '@', '#', ':'. Alphabetic SQL variables
  CC_VARNUM = 6,    // '?'. Numeric SQL variables
  CC_SPACE = 7,     // Space characters
  CC_QUOTE = 8,     // '"', '\'', or '`'. String literals, quoted ids
  CC_QUOTE2 = 9,    // '['. [...] style quoted ids
  CC_PIPE = 10,     // '|'. Bitwise OR or concatenate
  CC_MINUS = 11,    // '-'. Minus or SQL-style comment
  CC_LT = 12,       // '<'. Part of < or <= or <>
  CC_GT = 13,       // '>'. Part of > or >=
  CC_EQ = 14,       // '='. Part of = or ==
  CC_BANG = 15,     // '!'. Part of !=
  CC_SLASH = 16,    // '/'. / or c-style comment
  CC_LP = 17,       // '('
  CC_RP = 18,       // ')'
  CC_SEMI = 19,     // ';'
  CC_PLUS = 20,     // '+'
  CC_STAR = 21,     // '*'
  CC_PERCENT = 22,  // '%'
  CC_COMMA = 23,    // ','
  CC_AND = 24,      // '&'
  CC_TILDA = 25,    // '~'
  CC_DOT = 26,      // '.'
  CC_ID = 27,       // unicode characters usable in IDs
  CC_ILLEGAL = 28,  // Bytes that cannot start a token (handled by the
                    // tokenizer's default branch)
  CC_NUL = 29,      // 0x00
  CC_BOM = 30,      // First byte of UTF8 BOM: 0xEF 0xBB 0xBF
};
112
// Maps every possible byte value to the character class (one of the CC_*
// values defined above) that decides how a token starting with that byte is
// scanned. 28 marks bytes that cannot start a token: they are not matched by
// any CC_* case and end up in the tokenizer's default branch.
// clang-format off
static const unsigned char aiClass[] = {
/*         x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  xa  xb  xc  xd  xe  xf */
/* 0x */   29, 28, 28, 28, 28, 28, 28, 28, 28,  7,  7, 28,  7,  7, 28, 28,
/* 1x */   28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
/* 2x */    7, 15,  8,  5,  4, 22, 24,  8, 17, 18, 21, 20, 23, 11, 26, 16,
/* 3x */    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  5, 19, 12, 14, 13,  6,
/* 4x */    5,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
/* 5x */    1,  1,  1,  1,  1,  1,  1,  1,  0,  2,  2,  9, 28, 28, 28,  2,
/* 6x */    8,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
/* 7x */    1,  1,  1,  1,  1,  1,  1,  1,  0,  2,  2, 28, 10, 28, 25, 28,
/* 8x */   27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
/* 9x */   27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
/* Ax */   27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
/* Bx */   27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
/* Cx */   27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
/* Dx */   27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
/* Ex */   27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 30,
/* Fx */   27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27
};
// clang-format on

// The table is indexed with arbitrary bytes of the input, so a row lost or
// duplicated while editing would turn into an out-of-bounds read or a silent
// misclassification. Catch that at compile time.
static_assert(sizeof(aiClass) / sizeof(aiClass[0]) == 256,
              "aiClass must have exactly one entry per byte value");
134
135 #define IdChar(C) ((sqlite3CtypeMap[static_cast<unsigned char>(C)] & 0x46) != 0)
136
137 // Copy of |sqlite3GetToken| for use by the PerfettoSql transpiler.
138 //
139 // We copy this function because |sqlite3GetToken| is static to sqlite3.c
140 // in most distributions of SQLite so we cannot call it from our code.
141 //
142 // While we could redefine SQLITE_PRIVATE, pragmatically that will not fly in
143 // all the places we build trace processor so we need to resort to making a
144 // copy.
GetSqliteToken(const unsigned char * z,SqliteTokenType * tokenType)145 int GetSqliteToken(const unsigned char* z, SqliteTokenType* tokenType) {
146 int i, c;
147 switch (aiClass[*z]) { /* Switch on the character-class of the first byte
148 ** of the token. See the comment on the CC_ defines
149 ** above. */
150 case CC_SPACE: {
151 for (i = 1; isspace(z[i]); i++) {
152 }
153 *tokenType = SqliteTokenType::TK_SPACE;
154 return i;
155 }
156 case CC_MINUS: {
157 if (z[1] == '-') {
158 for (i = 2; (c = z[i]) != 0 && c != '\n'; i++) {
159 }
160 *tokenType = SqliteTokenType::TK_SPACE; /* IMP: R-22934-25134 */
161 return i;
162 } else if (z[1] == '>') {
163 *tokenType = SqliteTokenType::TK_PTR;
164 return 2 + (z[2] == '>');
165 }
166 *tokenType = SqliteTokenType::TK_MINUS;
167 return 1;
168 }
169 case CC_LP: {
170 *tokenType = SqliteTokenType::TK_LP;
171 return 1;
172 }
173 case CC_RP: {
174 *tokenType = SqliteTokenType::TK_RP;
175 return 1;
176 }
177 case CC_SEMI: {
178 *tokenType = SqliteTokenType::TK_SEMI;
179 return 1;
180 }
181 case CC_PLUS: {
182 *tokenType = SqliteTokenType::TK_PLUS;
183 return 1;
184 }
185 case CC_STAR: {
186 *tokenType = SqliteTokenType::TK_STAR;
187 return 1;
188 }
189 case CC_SLASH: {
190 if (z[1] != '*' || z[2] == 0) {
191 *tokenType = SqliteTokenType::TK_SLASH;
192 return 1;
193 }
194 for (i = 3, c = z[2]; (c != '*' || z[i] != '/') && (c = z[i]) != 0; i++) {
195 }
196 if (c)
197 i++;
198 *tokenType = SqliteTokenType::TK_SPACE; /* IMP: R-22934-25134 */
199 return i;
200 }
201 case CC_PERCENT: {
202 *tokenType = SqliteTokenType::TK_REM;
203 return 1;
204 }
205 case CC_EQ: {
206 *tokenType = SqliteTokenType::TK_EQ;
207 return 1 + (z[1] == '=');
208 }
209 case CC_LT: {
210 if ((c = z[1]) == '=') {
211 *tokenType = SqliteTokenType::TK_LE;
212 return 2;
213 } else if (c == '>') {
214 *tokenType = SqliteTokenType::TK_NE;
215 return 2;
216 } else if (c == '<') {
217 *tokenType = SqliteTokenType::TK_LSHIFT;
218 return 2;
219 } else {
220 *tokenType = SqliteTokenType::TK_LT;
221 return 1;
222 }
223 }
224 case CC_GT: {
225 if ((c = z[1]) == '=') {
226 *tokenType = SqliteTokenType::TK_GE;
227 return 2;
228 } else if (c == '>') {
229 *tokenType = SqliteTokenType::TK_RSHIFT;
230 return 2;
231 } else {
232 *tokenType = SqliteTokenType::TK_GT;
233 return 1;
234 }
235 }
236 case CC_BANG: {
237 if (z[1] != '=') {
238 *tokenType = SqliteTokenType::TK_ILLEGAL;
239 return 1;
240 } else {
241 *tokenType = SqliteTokenType::TK_NE;
242 return 2;
243 }
244 }
245 case CC_PIPE: {
246 if (z[1] != '|') {
247 *tokenType = SqliteTokenType::TK_BITOR;
248 return 1;
249 } else {
250 *tokenType = SqliteTokenType::TK_CONCAT;
251 return 2;
252 }
253 }
254 case CC_COMMA: {
255 *tokenType = SqliteTokenType::TK_COMMA;
256 return 1;
257 }
258 case CC_AND: {
259 *tokenType = SqliteTokenType::TK_BITAND;
260 return 1;
261 }
262 case CC_TILDA: {
263 *tokenType = SqliteTokenType::TK_BITNOT;
264 return 1;
265 }
266 case CC_QUOTE: {
267 int delim = z[0];
268 for (i = 1; (c = z[i]) != 0; i++) {
269 if (c == delim) {
270 if (z[i + 1] == delim) {
271 i++;
272 } else {
273 break;
274 }
275 }
276 }
277 if (c == '\'') {
278 *tokenType = SqliteTokenType::TK_STRING;
279 return i + 1;
280 } else if (c != 0) {
281 *tokenType = SqliteTokenType::TK_ID;
282 return i + 1;
283 } else {
284 *tokenType = SqliteTokenType::TK_ILLEGAL;
285 return i;
286 }
287 }
288 case CC_DOT: {
289 if (!isdigit(z[1])) {
290 *tokenType = SqliteTokenType::TK_DOT;
291 return 1;
292 }
293 [[fallthrough]];
294 }
295 case CC_DIGIT: {
296 *tokenType = SqliteTokenType::TK_INTEGER;
297 if (z[0] == '0' && (z[1] == 'x' || z[1] == 'X') && isxdigit(z[2])) {
298 for (i = 3; isxdigit(z[i]); i++) {
299 }
300 return i;
301 }
302 for (i = 0; isxdigit(z[i]); i++) {
303 }
304 if (z[i] == '.') {
305 i++;
306 while (isxdigit(z[i])) {
307 i++;
308 }
309 *tokenType = SqliteTokenType::TK_FLOAT;
310 }
311 if ((z[i] == 'e' || z[i] == 'E') &&
312 (isdigit(z[i + 1]) ||
313 ((z[i + 1] == '+' || z[i + 1] == '-') && isdigit(z[i + 2])))) {
314 i += 2;
315 while (isdigit(z[i])) {
316 i++;
317 }
318 *tokenType = SqliteTokenType::TK_FLOAT;
319 }
320 while (IdChar(z[i])) {
321 *tokenType = SqliteTokenType::TK_ILLEGAL;
322 i++;
323 }
324 return i;
325 }
326 case CC_QUOTE2: {
327 for (i = 1, c = z[0]; c != ']' && (c = z[i]) != 0; i++) {
328 }
329 *tokenType =
330 c == ']' ? SqliteTokenType::TK_ID : SqliteTokenType::TK_ILLEGAL;
331 return i;
332 }
333 case CC_VARNUM: {
334 *tokenType = SqliteTokenType::TK_VARIABLE;
335 for (i = 1; isdigit(z[i]); i++) {
336 }
337 return i;
338 }
339 case CC_DOLLAR:
340 case CC_VARALPHA: {
341 int n = 0;
342 *tokenType = SqliteTokenType::TK_VARIABLE;
343 for (i = 1; (c = z[i]) != 0; i++) {
344 if (IdChar(c)) {
345 n++;
346 } else if (c == '(' && n > 0) {
347 do {
348 i++;
349 } while ((c = z[i]) != 0 && !isspace(c) && c != ')');
350 if (c == ')') {
351 i++;
352 } else {
353 *tokenType = SqliteTokenType::TK_ILLEGAL;
354 }
355 break;
356 } else if (c == ':' && z[i + 1] == ':') {
357 i++;
358 } else {
359 break;
360 }
361 }
362 if (n == 0)
363 *tokenType = SqliteTokenType::TK_ILLEGAL;
364 return i;
365 }
366 case CC_KYWD0: {
367 for (i = 1; aiClass[z[i]] <= CC_KYWD; i++) {
368 }
369 if (IdChar(z[i])) {
370 /* This token started out using characters that can appear in keywords,
371 ** but z[i] is a character not allowed within keywords, so this must
372 ** be an identifier instead */
373 i++;
374 break;
375 }
376 if (sqlite3_keyword_check(reinterpret_cast<const char*>(z), i)) {
377 *tokenType = SqliteTokenType::TK_GENERIC_KEYWORD;
378 } else {
379 *tokenType = SqliteTokenType::TK_ID;
380 }
381 return i;
382 }
383 case CC_X: {
384 if (z[1] == '\'') {
385 *tokenType = SqliteTokenType::TK_BLOB;
386 for (i = 2; isdigit(z[i]); i++) {
387 }
388 if (z[i] != '\'' || i % 2) {
389 *tokenType = SqliteTokenType::TK_ILLEGAL;
390 while (z[i] && z[i] != '\'') {
391 i++;
392 }
393 }
394 if (z[i])
395 i++;
396 return i;
397 }
398 [[fallthrough]];
399 }
400 case CC_KYWD:
401 case CC_ID: {
402 i = 1;
403 break;
404 }
405 case CC_BOM: {
406 if (z[1] == 0xbb && z[2] == 0xbf) {
407 *tokenType = SqliteTokenType::TK_SPACE;
408 return 3;
409 }
410 i = 1;
411 break;
412 }
413 case CC_NUL: {
414 *tokenType = SqliteTokenType::TK_ILLEGAL;
415 return 0;
416 }
417 default: {
418 *tokenType = SqliteTokenType::TK_ILLEGAL;
419 return 1;
420 }
421 }
422 while (IdChar(z[i])) {
423 i++;
424 }
425 *tokenType = SqliteTokenType::TK_ID;
426 return i;
427 }
428
429 } // namespace
430
SqliteTokenizer(SqlSource sql)431 SqliteTokenizer::SqliteTokenizer(SqlSource sql) : source_(std::move(sql)) {}
432
Next()433 SqliteTokenizer::Token SqliteTokenizer::Next() {
434 Token token;
435 const char* start = source_.sql().data() + offset_;
436 int n = GetSqliteToken(reinterpret_cast<const unsigned char*>(start),
437 &token.token_type);
438 offset_ += static_cast<uint32_t>(n);
439 token.str = std::string_view(start, static_cast<uint32_t>(n));
440 return token;
441 }
442
NextNonWhitespace()443 SqliteTokenizer::Token SqliteTokenizer::NextNonWhitespace() {
444 Token t;
445 for (t = Next(); t.token_type == SqliteTokenType::TK_SPACE; t = Next()) {
446 }
447 return t;
448 }
449
NextTerminal()450 SqliteTokenizer::Token SqliteTokenizer::NextTerminal() {
451 Token tok = Next();
452 while (!tok.IsTerminal()) {
453 tok = Next();
454 }
455 return tok;
456 }
457
Substr(const Token & start,const Token & end) const458 SqlSource SqliteTokenizer::Substr(const Token& start, const Token& end) const {
459 uint32_t offset =
460 static_cast<uint32_t>(start.str.data() - source_.sql().c_str());
461 uint32_t len = static_cast<uint32_t>(end.str.data() - start.str.data());
462 return source_.Substr(offset, len);
463 }
464
SubstrToken(const Token & token) const465 SqlSource SqliteTokenizer::SubstrToken(const Token& token) const {
466 uint32_t offset =
467 static_cast<uint32_t>(token.str.data() - source_.sql().c_str());
468 uint32_t len = static_cast<uint32_t>(token.str.size());
469 return source_.Substr(offset, len);
470 }
471
AsTraceback(const Token & token) const472 std::string SqliteTokenizer::AsTraceback(const Token& token) const {
473 PERFETTO_CHECK(source_.sql().c_str() <= token.str.data());
474 PERFETTO_CHECK(token.str.data() <=
475 source_.sql().c_str() + source_.sql().size());
476 uint32_t offset =
477 static_cast<uint32_t>(token.str.data() - source_.sql().c_str());
478 return source_.AsTraceback(offset);
479 }
480
Rewrite(SqlSource::Rewriter & rewriter,const Token & start,const Token & end,SqlSource rewrite,EndToken end_token) const481 void SqliteTokenizer::Rewrite(SqlSource::Rewriter& rewriter,
482 const Token& start,
483 const Token& end,
484 SqlSource rewrite,
485 EndToken end_token) const {
486 uint32_t s_off =
487 static_cast<uint32_t>(start.str.data() - source_.sql().c_str());
488 uint32_t e_off =
489 static_cast<uint32_t>(end.str.data() - source_.sql().c_str());
490 uint32_t e_diff = end_token == EndToken::kInclusive
491 ? static_cast<uint32_t>(end.str.size())
492 : 0;
493 rewriter.Rewrite(s_off, e_off + e_diff, std::move(rewrite));
494 }
495
RewriteToken(SqlSource::Rewriter & rewriter,const Token & token,SqlSource rewrite) const496 void SqliteTokenizer::RewriteToken(SqlSource::Rewriter& rewriter,
497 const Token& token,
498 SqlSource rewrite) const {
499 uint32_t s_off =
500 static_cast<uint32_t>(token.str.data() - source_.sql().c_str());
501 uint32_t e_off = static_cast<uint32_t>(token.str.data() + token.str.size() -
502 source_.sql().c_str());
503 rewriter.Rewrite(s_off, e_off, std::move(rewrite));
504 }
505
506 } // namespace trace_processor
507 } // namespace perfetto
508