1 /*
2 * *****************************************************************************
3 *
4 * SPDX-License-Identifier: BSD-2-Clause
5 *
6 * Copyright (c) 2018-2021 Gavin D. Howard and contributors.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions are met:
10 *
11 * * Redistributions of source code must retain the above copyright notice, this
12 * list of conditions and the following disclaimer.
13 *
14 * * Redistributions in binary form must reproduce the above copyright notice,
15 * this list of conditions and the following disclaimer in the documentation
16 * and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 *
30 * *****************************************************************************
31 *
32 * Common code for the lexers.
33 *
34 */
35
36 #include <assert.h>
37 #include <ctype.h>
38 #include <stdbool.h>
39 #include <string.h>
40
41 #include <lex.h>
42 #include <vm.h>
43 #include <bc.h>
44
/**
 * Flags the current token as invalid and reports the offending character.
 * @param l  The lexer.
 * @param c  The character that could not be lexed.
 */
void bc_lex_invalidChar(BcLex *l, char c) {

	// Record the failure in the token type before raising the error.
	l->t = BC_LEX_INVALID;

	bc_lex_verr(l, BC_ERR_PARSE_CHAR, c);
}
49
/**
 * Skips a line comment. The comment is reported as whitespace, and the lexer
 * index stops on the terminating newline (which is not consumed) or at the
 * end of the text, whichever comes first.
 * @param l  The lexer.
 */
void bc_lex_lineComment(BcLex *l) {

	l->t = BC_LEX_WHITESPACE;

	// Advance one character at a time until a newline or the end of the text.
	for (; l->i < l->len; l->i += 1) {
		if (l->buf[l->i] == '\n') break;
	}
}
54
/**
 * Skips a block comment. The comment is reported as whitespace. If the end of
 * the buffer is reached before the closing marker and the lexer is reading
 * stdin, more data is requested through bc_lex_readLine() and the scan is
 * retried. An unterminated comment is reported with BC_ERR_PARSE_COMMENT.
 * @param l  The lexer.
 */
void bc_lex_comment(BcLex *l) {

	size_t i, nlines = 0;
	const char *buf;
	bool end = false, got_more;
	char c;

	// Step over the character at the current index before scanning.
	l->i += 1;
	l->t = BC_LEX_WHITESPACE;

	// This loop is complex because it might need to request more data from
	// stdin if the comment is not ended. This loop is taken until the comment
	// is finished or we have EOF.
	do {

		// Reload the buffer pointer; bc_lex_readLine() may have replaced it.
		buf = l->buf;
		got_more = false;

		// Every pass rescans from l->i, so the newline count must restart too.
		// Otherwise, newlines seen on an earlier pass would be counted again
		// and the line number would drift.
		nlines = 0;

		// If we are in stdin mode, the buffer must be the one used for stdin.
		assert(!vm.is_stdin || buf == vm.buffer.v);

		// Find the end of the comment.
		for (i = l->i; !end; i += !end) {

			// While we don't have an asterisk, eat, but increment nlines.
			for (; (c = buf[i]) && c != '*'; ++i) nlines += (c == '\n');

			// If this is true, we need to request more data.
			if (BC_ERR(!c || buf[i + 1] == '\0')) {

				// Read more.
				if (!vm.eof && l->is_stdin) got_more = bc_lex_readLine(l);

				break;
			}

			// If this turns true, we found the end. Yay!
			end = (buf[i + 1] == '/');
		}

	} while (got_more && !end);

	// If we didn't find the end, barf.
	if (!end) {
		l->i = i;
		bc_lex_err(l, BC_ERR_PARSE_COMMENT);
	}

	// Skip the two-character closing marker and account for the lines eaten.
	l->i = i + 2;
	l->line += nlines;
}
106
/**
 * Skips a run of whitespace, stopping at the first newline or the first
 * character that is not whitespace. The run is reported as whitespace.
 * @param l  The lexer.
 */
void bc_lex_whitespace(BcLex *l) {

	char c;

	l->t = BC_LEX_WHITESPACE;

	// Eat. We don't eat newlines because they can be special. The cast is
	// required because passing a negative char value to isspace() is undefined
	// behavior, and the text may contain bytes with the high bit set.
	for (c = l->buf[l->i]; c != '\n' && isspace((unsigned char) c);
	     c = l->buf[++l->i]);
}
116
/**
 * Handles the tokens shared by the lexers: a nul byte becomes EOF, a newline
 * becomes a newline token, and anything else is handed to
 * bc_lex_whitespace().
 * @param l  The lexer.
 * @param c  The character to classify.
 */
void bc_lex_commonTokens(BcLex *l, char c) {

	switch (c) {

		case '\0':
			l->t = BC_LEX_EOF;
			break;

		case '\n':
			l->t = BC_LEX_NLINE;
			break;

		default:
			bc_lex_whitespace(l);
			break;
	}
}
122
123 /**
124 * Parses a number.
125 * @param l The lexer.
126 * @param start The start character.
127 * @param int_only Whether this function should only look for an integer. This
128 * is used to implement the exponent of scientific notation.
129 */
bc_lex_num(BcLex * l,char start,bool int_only)130 static size_t bc_lex_num(BcLex *l, char start, bool int_only) {
131
132 const char *buf = l->buf + l->i;
133 size_t i;
134 char c;
135 bool last_pt, pt = (start == '.');
136
137 // This loop looks complex. It is not. It is asking if the character is not
138 // a nul byte and it if it a valid num character based on what we have found
139 // thus far, or whether it is a backslash followed by a newline. I can do
140 // i+1 on the buffer because the buffer must have a nul byte.
141 for (i = 0; (c = buf[i]) && (BC_LEX_NUM_CHAR(c, pt, int_only) ||
142 (c == '\\' && buf[i + 1] == '\n')); ++i)
143 {
144 // I don't need to test that the next character is a newline because
145 // the loop condition above ensures that.
146 if (c == '\\') {
147
148 i += 2;
149
150 // Make sure to eat whitespace at the beginning of the line.
151 while(isspace(buf[i]) && buf[i] != '\n') i += 1;
152
153 c = buf[i];
154
155 // If the next character is not a number character, bail.
156 if (!BC_LEX_NUM_CHAR(c, pt, int_only)) break;
157 }
158
159 // Did we find the radix point?
160 last_pt = (c == '.');
161
162 // If we did, and we already have one, then break because it's not part
163 // of this number.
164 if (pt && last_pt) break;
165
166 // Set whether we have found a radix point.
167 pt = pt || last_pt;
168
169 bc_vec_push(&l->str, &c);
170 }
171
172 return i;
173 }
174
/**
 * Lexes a number into the lexer's string, which ends up nul-terminated. When
 * BC_ENABLE_EXTRA_MATH is set, an 'e' directly after the digits starts an
 * exponent, which may begin with BC_LEX_NEG_CHAR and must contain an integer.
 * @param l      The lexer.
 * @param start  The first character of the number.
 */
void bc_lex_number(BcLex *l, char start) {

	size_t eaten;

	l->t = BC_LEX_NUMBER;

	// Begin from an empty string that holds only the first character.
	bc_vec_popAll(&l->str);
	bc_vec_push(&l->str, &start);

	// Lex the part before any exponent.
	eaten = bc_lex_num(l, start, false);
	l->i += eaten;

#if BC_ENABLE_EXTRA_MATH
	{
		char c = l->buf[l->i];

		// An 'e' here means scientific notation.
		if (c == 'e') {

#if BC_ENABLED
			// Exponents are an error in POSIX mode.
			if (BC_IS_POSIX) bc_lex_err(l, BC_ERR_POSIX_EXP_NUM);
#endif // BC_ENABLED

			// Keep the 'e' and look at what follows it.
			bc_vec_push(&l->str, &c);
			c = l->buf[++l->i];

			// bc_lex_num() does not handle a sign, so take it here.
			if (c == BC_LEX_NEG_CHAR) {
				bc_vec_push(&l->str, &c);
				c = l->buf[++l->i];
			}

			// The exponent needs at least one integer character.
			if (BC_ERR(!BC_LEX_NUM_CHAR(c, false, true)))
				bc_lex_verr(l, BC_ERR_PARSE_CHAR, c);

			// Lex the exponent as an integer.
			eaten = bc_lex_num(l, 0, true);
			l->i += eaten;
		}
	}
#endif // BC_ENABLE_EXTRA_MATH

	// Terminate the string.
	bc_vec_pushByte(&l->str, '\0');
}
222
/**
 * Lexes a name made of lowercase ASCII letters, digits, and underscores, and
 * stores it in the lexer's string. The name starts at the character just
 * before the current index, because that character has already been consumed.
 * @param l  The lexer.
 */
void bc_lex_name(BcLex *l) {

	size_t i = 0;
	const char *buf = l->buf + l->i - 1;
	char c = buf[i];

	l->t = BC_LEX_NAME;

	// Look for valid characters. The cast is required because passing a
	// negative char value to isdigit() is undefined behavior.
	while ((c >= 'a' && c <= 'z') || isdigit((unsigned char) c) || c == '_')
		c = buf[++i];

	// Set the string to the identifier.
	bc_vec_string(&l->str, i, buf);

	// Increment the index. We minus 1 because it has already been incremented.
	l->i += i - 1;
}
240
/**
 * Initializes a lexer by setting up its string vector, which holds chars and
 * uses no destructor.
 * @param l  The lexer. Must not be NULL.
 */
void bc_lex_init(BcLex *l) {

	BC_SIG_ASSERT_LOCKED;

	assert(l != NULL);

	bc_vec_init(&l->str, sizeof(char), BC_DTOR_NONE);
}
246
/**
 * Frees the string vector owned by a lexer. The lexer struct itself is not
 * freed here.
 * @param l  The lexer. Must not be NULL.
 */
void bc_lex_free(BcLex *l) {

	BC_SIG_ASSERT_LOCKED;

	assert(l != NULL);

	bc_vec_free(&l->str);
}
252
/**
 * Prepares the lexer for a new file: the line counter restarts at 1 and the
 * file name is stored in the global vm.
 * @param l     The lexer. Must not be NULL.
 * @param file  The name of the file. Must not be NULL.
 */
void bc_lex_file(BcLex *l, const char *file) {

	assert(l != NULL);
	assert(file != NULL);

	vm.file = file;
	l->line = 1;
}
258
/**
 * Advances the lexer to the next token that is not whitespace. The previous
 * token is saved in l->last. Calling this when the previous token was already
 * EOF is reported with BC_ERR_PARSE_EOF.
 * @param l  The lexer. Must not be NULL.
 */
void bc_lex_next(BcLex *l) {

	assert(l != NULL);

	l->last = l->t;

	// Count the newline sitting just before the current index, if there is
	// one. Without this, the line number would be off.
	if (l->i != 0 && l->buf[l->i - 1] == '\n') l->line += 1;

	// If the last token was EOF, someone called this one too many times.
	if (BC_ERR(l->last == BC_LEX_EOF)) bc_lex_err(l, BC_ERR_PARSE_EOF);

	// Assume EOF; this stands if the text is already exhausted.
	l->t = BC_LEX_EOF;

	if (l->i != l->len) {

		// Keep lexing until something other than whitespace shows up, so the
		// parser doesn't get inundated with whitespace.
		for (;;) {
			vm.next(l);
			if (l->t != BC_LEX_WHITESPACE) break;
		}
	}
}
282
283 /**
284 * Updates the buffer and len so that they are not invalidated when the stdin
285 * buffer grows.
286 * @param l The lexer.
287 * @param text The text.
288 * @param len The length of the text.
289 */
bc_lex_fixText(BcLex * l,const char * text,size_t len)290 static void bc_lex_fixText(BcLex *l, const char *text, size_t len) {
291 l->buf = text;
292 l->len = len;
293 }
294
/**
 * Asks the vm to read another line and then re-points the lexer at the vm's
 * buffer.
 * @param l  The lexer.
 * @return   Whatever bc_vm_readLine() returned.
 */
bool bc_lex_readLine(BcLex *l) {

	const bool got_data = bc_vm_readLine(false);

	// Refresh the lexer's view of the buffer regardless of the result.
	// NOTE(review): the "- 1" presumably drops a trailing nul byte from the
	// length; confirm against the vm buffer's invariants.
	bc_lex_fixText(l, vm.buffer.v, vm.buffer.len - 1);

	return got_data;
}
303
/**
 * Gives the lexer new text to lex, resets its position and token state, and
 * lexes the first token.
 * @param l         The lexer. Must not be NULL.
 * @param text      The nul-terminated text to lex. Must not be NULL.
 * @param is_stdin  Whether the text comes from stdin.
 */
void bc_lex_text(BcLex *l, const char *text, bool is_stdin) {

	assert(l != NULL);
	assert(text != NULL);

	bc_lex_fixText(l, text, strlen(text));

	// Start from the beginning with no valid current or previous token.
	l->is_stdin = is_stdin;
	l->i = 0;
	l->last = BC_LEX_INVALID;
	l->t = BC_LEX_INVALID;

	// Prime the lexer with the first token.
	bc_lex_next(l);
}
312