/*
 * *****************************************************************************
 *
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2018-2021 Gavin D. Howard and contributors.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * *****************************************************************************
 *
 * The lexer for bc.
 *
 */
35
36 #if BC_ENABLED
37
38 #include <assert.h>
39 #include <ctype.h>
40 #include <string.h>
41
42 #include <bc.h>
43 #include <vm.h>
44
45 /**
46 * Lexes an identifier, which may be a keyword.
47 * @param l The lexer.
48 */
bc_lex_identifier(BcLex * l)49 static void bc_lex_identifier(BcLex *l) {
50
51 // We already passed the first character, so we need to be sure to include
52 // it.
53 const char *buf = l->buf + l->i - 1;
54 size_t i;
55
56 // This loop is simply checking for keywords.
57 for (i = 0; i < bc_lex_kws_len; ++i) {
58
59 const BcLexKeyword *kw = bc_lex_kws + i;
60 size_t n = BC_LEX_KW_LEN(kw);
61
62 if (!strncmp(buf, kw->name, n) && !isalnum(buf[n]) && buf[n] != '_') {
63
64 // If the keyword has been redefined, and redefinition is allowed
65 // (it is not allowed for builtin libraries), break out of the loop
66 // and use it as a name. This depends on the argument parser to
67 // ensure that only non-POSIX keywords get redefined.
68 if (!vm.no_redefine && vm.redefined_kws[i]) break;
69
70 l->t = BC_LEX_KW_AUTO + (BcLexType) i;
71
72 // Warn or error, as appropriate for the mode, if the keyword is not
73 // in the POSIX standard.
74 if (!BC_LEX_KW_POSIX(kw)) bc_lex_verr(l, BC_ERR_POSIX_KW, kw->name);
75
76 // We minus 1 because the index has already been incremented.
77 l->i += n - 1;
78
79 // Already have the token; bail.
80 return;
81 }
82 }
83
84 // If not a keyword, parse the name.
85 bc_lex_name(l);
86
87 // POSIX doesn't allow identifiers that are more than one character, so we
88 // might have to warn or error here too.
89 if (BC_ERR(l->str.len - 1 > 1))
90 bc_lex_verr(l, BC_ERR_POSIX_NAME_LEN, l->str.v);
91 }
92
93 /**
94 * Parses a bc string. This is separate from dc strings because dc strings need
95 * to be balanced.
96 * @param l The lexer.
97 */
bc_lex_string(BcLex * l)98 static void bc_lex_string(BcLex *l) {
99
100 // We need to keep track of newlines to increment them properly.
101 size_t len, nlines, i;
102 const char *buf;
103 char c;
104 bool got_more;
105
106 l->t = BC_LEX_STR;
107
108 do {
109
110 nlines = 0;
111 buf = l->buf;
112 got_more = false;
113
114 assert(!vm.is_stdin || buf == vm.buffer.v);
115
116 // Fortunately for us, bc doesn't escape quotes. Instead, the equivalent
117 // is '\q', which makes this loop simpler.
118 for (i = l->i; (c = buf[i]) && c != '"'; ++i) nlines += (c == '\n');
119
120 if (BC_ERR(c == '\0') && !vm.eof && l->is_stdin)
121 got_more = bc_lex_readLine(l);
122
123 } while (got_more && c != '"');
124
125 // If the string did not end properly, barf.
126 if (c != '"') {
127 l->i = i;
128 bc_lex_err(l, BC_ERR_PARSE_STRING);
129 }
130
131 // Set the temp string to the parsed string.
132 len = i - l->i;
133 bc_vec_string(&l->str, len, l->buf + l->i);
134
135 l->i = i + 1;
136 l->line += nlines;
137 }
138
139 /**
140 * This function takes a lexed operator and checks to see if it's the assignment
141 * version, setting the token appropriately.
142 * @param l The lexer.
143 * @param with The token to assign if it is an assignment operator.
144 * @param without The token to assign if it is not an assignment operator.
145 */
bc_lex_assign(BcLex * l,BcLexType with,BcLexType without)146 static void bc_lex_assign(BcLex *l, BcLexType with, BcLexType without) {
147 if (l->buf[l->i] == '=') {
148 l->i += 1;
149 l->t = with;
150 }
151 else l->t = without;
152 }
153
/**
 * Lexes one token from the lexer's buffer. The character at the current index
 * is consumed and dispatched on; each case either sets the token type directly
 * or hands off to a helper.
 * @param l  The lexer.
 */
void bc_lex_token(BcLex *l) {

	// The character is consumed before dispatching. Everything below must
	// take that into account; for example, identifier lexing has to back up
	// one character or the first character of every identifier would be lost.
	const char c = l->buf[l->i];

	l->i += 1;

	// This is the workhorse of the lexer.
	switch (c) {

		// End of input and whitespace are handled by the shared lexer code.
		case '\0': case '\n': case '\t': case '\v':
		case '\f': case '\r': case ' ':
		{
			bc_lex_commonTokens(l, c);
			break;
		}

		case '!':
		{
			// "!=" is not an assignment, but the helper does what is needed.
			bc_lex_assign(l, BC_LEX_OP_REL_NE, BC_LEX_OP_BOOL_NOT);

			// A lone "!" is boolean not, which POSIX doesn't allow.
			if (l->t == BC_LEX_OP_BOOL_NOT)
				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "!");

			break;
		}

		case '"':
		{
			bc_lex_string(l);
			break;
		}

		case '#':
		{
			// Line comments are not allowed by POSIX.
			bc_lex_err(l, BC_ERR_POSIX_COMMENT);
			bc_lex_lineComment(l);
			break;
		}

		case '%':
		{
			bc_lex_assign(l, BC_LEX_OP_ASSIGN_MODULUS, BC_LEX_OP_MODULUS);
			break;
		}

		case '&':
		{
			const char next = l->buf[l->i];

			// Only "&&" is valid, and POSIX doesn't allow even that.
			if (BC_NO_ERR(next == '&')) {

				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "&&");

				l->i += 1;
				l->t = BC_LEX_OP_BOOL_AND;

				break;
			}

			// A single '&' is not a token.
			bc_lex_invalidChar(l, c);
			break;
		}
#if BC_ENABLE_EXTRA_MATH
		case '$':
		{
			l->t = BC_LEX_OP_TRUNC;
			break;
		}

		case '@':
		{
			bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLACES, BC_LEX_OP_PLACES);
			break;
		}
#endif // BC_ENABLE_EXTRA_MATH
		case '(': case ')':
		{
			// The token is derived from the character's distance from '('.
			// NOTE(review): this relies on the BcLexType layout (declared
			// elsewhere) matching that distance; confirm before reordering.
			l->t = (BcLexType) (c - '(' + BC_LEX_LPAREN);
			break;
		}

		case '*':
		{
			bc_lex_assign(l, BC_LEX_OP_ASSIGN_MULTIPLY, BC_LEX_OP_MULTIPLY);
			break;
		}

		case '+':
		{
			// "++" has to be recognized before "+=" and "+".
			if (l->buf[l->i] == '+') {
				l->i += 1;
				l->t = BC_LEX_OP_INC;
				break;
			}

			bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLUS, BC_LEX_OP_PLUS);
			break;
		}

		case ',':
		{
			l->t = BC_LEX_COMMA;
			break;
		}

		case '-':
		{
			// "--" has to be recognized before "-=" and "-".
			if (l->buf[l->i] == '-') {
				l->i += 1;
				l->t = BC_LEX_OP_DEC;
				break;
			}

			bc_lex_assign(l, BC_LEX_OP_ASSIGN_MINUS, BC_LEX_OP_MINUS);
			break;
		}

		case '.':
		{
			const char next = l->buf[l->i];

			// A dot followed by a number character starts a number.
			if (BC_LEX_NUM_CHAR(next, true, false)) {
				bc_lex_number(l, c);
				break;
			}

			// A dot on its own is an alias for last, which is reported as a
			// POSIX violation.
			l->t = BC_LEX_KW_LAST;
			bc_lex_err(l, BC_ERR_POSIX_DOT);

			break;
		}

		case '/':
		{
			// "/*" opens a comment; anything else is division.
			if (l->buf[l->i] == '*') {
				bc_lex_comment(l);
				break;
			}

			bc_lex_assign(l, BC_LEX_OP_ASSIGN_DIVIDE, BC_LEX_OP_DIVIDE);
			break;
		}

		case '0': case '1': case '2': case '3': case '4':
		case '5': case '6': case '7': case '8': case '9':
		case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
		// Apparently, GNU bc (and maybe others) allows any uppercase letter as
		// a number. As single digits, they act like the ones above. In
		// multi-digit numbers, any letter above the input base is automatically
		// set to the biggest allowable digit in the input base.
		case 'G': case 'H': case 'I': case 'J': case 'K':
		case 'L': case 'M': case 'N': case 'O': case 'P':
		case 'Q': case 'R': case 'S': case 'T': case 'U':
		case 'V': case 'W': case 'X': case 'Y': case 'Z':
		{
			bc_lex_number(l, c);
			break;
		}

		case ';':
		{
			l->t = BC_LEX_SCOLON;
			break;
		}

		case '<':
		{
#if BC_ENABLE_EXTRA_MATH
			// "<<" is a left shift, possibly in its assignment form.
			if (l->buf[l->i] == '<') {
				l->i += 1;
				bc_lex_assign(l, BC_LEX_OP_ASSIGN_LSHIFT, BC_LEX_OP_LSHIFT);
				break;
			}
#endif // BC_ENABLE_EXTRA_MATH
			bc_lex_assign(l, BC_LEX_OP_REL_LE, BC_LEX_OP_REL_LT);
			break;
		}

		case '=':
		{
			// "==" is the relational operator; a lone "=" is assignment.
			bc_lex_assign(l, BC_LEX_OP_REL_EQ, BC_LEX_OP_ASSIGN);
			break;
		}

		case '>':
		{
#if BC_ENABLE_EXTRA_MATH
			// ">>" is a right shift, possibly in its assignment form.
			if (l->buf[l->i] == '>') {
				l->i += 1;
				bc_lex_assign(l, BC_LEX_OP_ASSIGN_RSHIFT, BC_LEX_OP_RSHIFT);
				break;
			}
#endif // BC_ENABLE_EXTRA_MATH
			bc_lex_assign(l, BC_LEX_OP_REL_GE, BC_LEX_OP_REL_GT);
			break;
		}

		case '[': case ']':
		{
			// The token is derived from the character's distance from '['.
			// NOTE(review): this relies on the BcLexType layout (declared
			// elsewhere) matching that distance; confirm before reordering.
			l->t = (BcLexType) (c - '[' + BC_LEX_LBRACKET);
			break;
		}

		case '\\':
		{
			const char next = l->buf[l->i];

			// In bc, a backslash followed by a newline is whitespace.
			if (BC_NO_ERR(next == '\n')) {
				l->i += 1;
				l->t = BC_LEX_WHITESPACE;
				break;
			}

			// Any other use of a backslash is invalid.
			bc_lex_invalidChar(l, c);
			break;
		}

		case '^':
		{
			bc_lex_assign(l, BC_LEX_OP_ASSIGN_POWER, BC_LEX_OP_POWER);
			break;
		}

		// Lowercase letters start identifiers (which include keywords).
		case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
		case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
		case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
		case 's': case 't': case 'u': case 'v': case 'w': case 'x':
		case 'y': case 'z':
		{
			bc_lex_identifier(l);
			break;
		}

		case '{': case '}':
		{
			// The token is derived from the character's distance from '{'.
			// NOTE(review): this relies on the BcLexType layout (declared
			// elsewhere) matching that distance; confirm before reordering.
			l->t = (BcLexType) (c - '{' + BC_LEX_LBRACE);
			break;
		}

		case '|':
		{
			const char next = l->buf[l->i];

			// Only "||" is valid, and once again POSIX doesn't allow it.
			if (BC_NO_ERR(next == '|')) {

				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "||");

				l->i += 1;
				l->t = BC_LEX_OP_BOOL_OR;

				break;
			}

			// A single '|' is not a token.
			bc_lex_invalidChar(l, c);
			break;
		}

		default:
		{
			// Anything not handled above is not part of the language.
			bc_lex_invalidChar(l, c);
			break;
		}
	}
}
479 #endif // BC_ENABLED
480