• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * This is a really stupid C tokenizer. It doesn't do any include
3  * files or anything complex at all. That's the preprocessor.
4  *
5  * Copyright (C) 2003 Transmeta Corp.
6  *               2003 Linus Torvalds
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a copy
9  * of this software and associated documentation files (the "Software"), to deal
10  * in the Software without restriction, including without limitation the rights
11  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12  * copies of the Software, and to permit persons to whom the Software is
13  * furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice shall be included in
16  * all copies or substantial portions of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24  * THE SOFTWARE.
25  */
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <stdarg.h>
29 #include <stddef.h>
30 #include <string.h>
31 #include <ctype.h>
32 #include <unistd.h>
33 #include <stdint.h>
34 
35 #include "lib.h"
36 #include "allocate.h"
37 #include "token.h"
38 #include "symbol.h"
39 
/* Value the character readers below return once a stream is exhausted. */
#define EOF (-1)

/* Number of streams registered so far; init_stream() hands out this index and then bumps it. */
int input_stream_nr = 0;
/* Array of per-stream records, grown with realloc() in init_stream(). */
struct stream *input_streams;
/* Number of entries input_streams currently has room for. */
static int input_streams_allocated;
/* Tab width used by nextchar_slow() when advancing the column over a '\t'. */
unsigned int tabstop = 8;

/* Size of the read buffer tokenize() places on its stack, and the chunk size passed to read(). */
#define BUFSIZE (8192)
48 
/*
 * Working state of the tokenizer for one input stream.  An instance is
 * created on the stack by tokenize() / tokenize_buffer() and filled in by
 * setup_stream().
 */
typedef struct {
	int fd;				/* descriptor to refill from; negative: never read() */
	int offset;			/* index of the next unread byte in buffer */
	int size;			/* number of valid bytes in buffer */
	int pos;			/* current column */
	int line;			/* current line, starts at 1 */
	int nr;				/* stream number copied into token positions */
	int newline;			/* set at start of line, cleared when a token starts */
	int whitespace;			/* set when whitespace was skipped, cleared likewise */
	struct token **tokenlist;	/* link the next finished token is stored through */
	struct token *token;		/* token currently being built, or NULL */
	unsigned char *buffer;		/* raw input bytes */
} stream_t;
57 
stream_name(int stream)58 const char *stream_name(int stream)
59 {
60 	if (stream < 0 || stream > input_stream_nr)
61 		return "<bad stream>";
62 	return input_streams[stream].name;
63 }
64 
stream_prev(int stream)65 int stream_prev(int stream)
66 {
67 	if (stream < 0 || stream > input_stream_nr)
68 		return -1;
69 	stream = input_streams[stream].pos.stream;
70 	if (stream > input_stream_nr)
71 		return -1;
72 	return stream;
73 }
74 
/*
 * Build a struct position describing where @stream currently is: stream
 * number, line, column and the pending newline/whitespace flags.  The
 * type and noexpand fields start out as 0.
 */
static struct position stream_pos(stream_t *stream)
{
	struct position here;

	/* location */
	here.stream = stream->nr;
	here.line = stream->line;
	here.pos = stream->pos;

	/* layout flags accumulated since the previous token */
	here.newline = stream->newline;
	here.whitespace = stream->whitespace;

	/* fields the caller fills in later */
	here.type = 0;
	here.noexpand = 0;

	return here;
}
87 
show_special(int val)88 const char *show_special(int val)
89 {
90 	static char buffer[4];
91 
92 	buffer[0] = val;
93 	buffer[1] = 0;
94 	if (val >= SPECIAL_BASE)
95 		strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
96 	return buffer;
97 }
98 
show_ident(const struct ident * ident)99 const char *show_ident(const struct ident *ident)
100 {
101 	static char buff[4][256];
102 	static int n;
103 	char *buffer;
104 
105 	if (!ident)
106 		return "<noident>";
107 	buffer = buff[3 & ++n];
108 	sprintf(buffer, "%.*s", ident->len, ident->name);
109 	return buffer;
110 }
111 
/*
 * Append the source-level spelling of character @c at @ptr and return the
 * new end of the output.
 *
 *  - printable characters are copied, preceded by a backslash when they
 *    equal @escape or are a backslash themselves;
 *  - newline and tab become \n and \t;
 *  - everything else becomes an octal escape, padded to three digits when
 *    the following character @next is a digit (so it cannot be mistaken
 *    for part of the escape).
 *
 * Only the octal paths leave a terminating NUL behind (from sprintf).
 */
static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	if (isprint(c)) {
		if (c == escape || c == '\\')
			*ptr++ = '\\';
		*ptr++ = c;
		return ptr;
	}

	*ptr++ = '\\';
	if (c == '\n') {
		*ptr++ = 'n';
		return ptr;
	}
	if (c == '\t') {
		*ptr++ = 't';
		return ptr;
	}

	if (isdigit(next))
		return ptr + sprintf(ptr, "%03o", c);
	return ptr + sprintf(ptr, "%o", c);
}
134 
show_string(const struct string * string)135 const char *show_string(const struct string *string)
136 {
137 	static char buffer[4 * MAX_STRING + 3];
138 	char *ptr;
139 	int i;
140 
141 	if (!string || !string->length)
142 		return "<bad_string>";
143 	ptr = buffer;
144 	*ptr++ = '"';
145 	for (i = 0; i < string->length-1; i++) {
146 		const char *p = string->data + i;
147 		ptr = charstr(ptr, p[0], '"', p[1]);
148 	}
149 	*ptr++ = '"';
150 	*ptr = '\0';
151 	return buffer;
152 }
153 
show_char(const char * s,size_t len,char prefix,char delim)154 static const char *show_char(const char *s, size_t len, char prefix, char delim)
155 {
156 	static char buffer[MAX_STRING + 4];
157 	char *p = buffer;
158 	if (prefix)
159 		*p++ = prefix;
160 	*p++ = delim;
161 	memcpy(p, s, len);
162 	p += len;
163 	*p++ = delim;
164 	*p++ = '\0';
165 	return buffer;
166 }
167 
quote_char(const char * s,size_t len,char prefix,char delim)168 static const char *quote_char(const char *s, size_t len, char prefix, char delim)
169 {
170 	static char buffer[2*MAX_STRING + 6];
171 	size_t i;
172 	char *p = buffer;
173 	if (prefix)
174 		*p++ = prefix;
175 	if (delim == '"')
176 		*p++ = '\\';
177 	*p++ = delim;
178 	for (i = 0; i < len; i++) {
179 		if (s[i] == '"' || s[i] == '\\')
180 			*p++ = '\\';
181 		*p++ = s[i];
182 	}
183 	if (delim == '"')
184 		*p++ = '\\';
185 	*p++ = delim;
186 	*p++ = '\0';
187 	return buffer;
188 }
189 
show_token(const struct token * token)190 const char *show_token(const struct token *token)
191 {
192 	static char buffer[256];
193 
194 	if (!token)
195 		return "<no token>";
196 	switch (token_type(token)) {
197 	case TOKEN_ERROR:
198 		return "syntax error";
199 
200 	case TOKEN_EOF:
201 		return "end-of-input";
202 
203 	case TOKEN_IDENT:
204 		return show_ident(token->ident);
205 
206 	case TOKEN_NUMBER:
207 		return token->number;
208 
209 	case TOKEN_SPECIAL:
210 		return show_special(token->special);
211 
212 	case TOKEN_CHAR:
213 		return show_char(token->string->data,
214 			token->string->length - 1, 0, '\'');
215 	case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
216 		return show_char(token->embedded,
217 			token_type(token) - TOKEN_CHAR, 0, '\'');
218 	case TOKEN_WIDE_CHAR:
219 		return show_char(token->string->data,
220 			token->string->length - 1, 'L', '\'');
221 	case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
222 		return show_char(token->embedded,
223 			token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
224 	case TOKEN_STRING:
225 		return show_char(token->string->data,
226 			token->string->length - 1, 0, '"');
227 	case TOKEN_WIDE_STRING:
228 		return show_char(token->string->data,
229 			token->string->length - 1, 'L', '"');
230 
231 	case TOKEN_STREAMBEGIN:
232 		sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
233 		return buffer;
234 
235 	case TOKEN_STREAMEND:
236 		sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
237 		return buffer;
238 
239 	case TOKEN_UNTAINT:
240 		sprintf(buffer, "<untaint>");
241 		return buffer;
242 
243 	case TOKEN_ARG_COUNT:
244 		sprintf(buffer, "<argcnt>");
245 		return buffer;
246 
247 	default:
248 		sprintf(buffer, "unhandled token type '%d' ", token_type(token));
249 		return buffer;
250 	}
251 }
252 
quote_token(const struct token * token)253 const char *quote_token(const struct token *token)
254 {
255 	static char buffer[256];
256 
257 	switch (token_type(token)) {
258 	case TOKEN_ERROR:
259 		return "syntax error";
260 
261 	case TOKEN_IDENT:
262 		return show_ident(token->ident);
263 
264 	case TOKEN_NUMBER:
265 		return token->number;
266 
267 	case TOKEN_SPECIAL:
268 		return show_special(token->special);
269 
270 	case TOKEN_CHAR:
271 		return quote_char(token->string->data,
272 			token->string->length - 1, 0, '\'');
273 	case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
274 		return quote_char(token->embedded,
275 			token_type(token) - TOKEN_CHAR, 0, '\'');
276 	case TOKEN_WIDE_CHAR:
277 		return quote_char(token->string->data,
278 			token->string->length - 1, 'L', '\'');
279 	case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
280 		return quote_char(token->embedded,
281 			token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
282 	case TOKEN_STRING:
283 		return quote_char(token->string->data,
284 			token->string->length - 1, 0, '"');
285 	case TOKEN_WIDE_STRING:
286 		return quote_char(token->string->data,
287 			token->string->length - 1, 'L', '"');
288 	default:
289 		sprintf(buffer, "unhandled token type '%d' ", token_type(token));
290 		return buffer;
291 	}
292 }
293 
/* Stream names are hashed into a table of 2^HASHED_INPUT_BITS chain heads. */
#define HASHED_INPUT_BITS (6)
#define HASHED_INPUT (1 << HASHED_INPUT_BITS)
#define HASH_PRIME 0x9e370001UL

/* One stream index per bucket; -1 marks a bucket nothing has hashed to yet. */
static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };

/*
 * Return a pointer to the hash bucket that the stream name @name falls
 * into.  The bucket holds the index of the most recently registered
 * stream with that hash (init_stream() chains older ones behind it), or -1.
 */
int *hash_stream(const char *name)
{
	const unsigned char *cursor = (const unsigned char *)name;
	uint32_t acc = 0;

	for (; *cursor != 0; cursor++) {
		unsigned char c = *cursor;

		/* mix in the byte with its nibbles swapped */
		acc = (acc + (c << 4) + (c >> 4)) * 11;
	}

	/* multiplicative hash: keep the top HASHED_INPUT_BITS bits */
	acc *= HASH_PRIME;
	acc >>= 32 - HASHED_INPUT_BITS;
	return &input_stream_hashes[acc];
}
312 
init_stream(const struct position * pos,const char * name,int fd,const char ** next_path)313 int init_stream(const struct position *pos, const char *name, int fd, const char **next_path)
314 {
315 	int stream = input_stream_nr, *hash;
316 	struct stream *current;
317 
318 	if (stream >= input_streams_allocated) {
319 		int newalloc = stream * 4 / 3 + 10;
320 		input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
321 		if (!input_streams)
322 			die("Unable to allocate more streams space");
323 		input_streams_allocated = newalloc;
324 	}
325 	current = input_streams + stream;
326 	memset(current, 0, sizeof(*current));
327 	current->name = name;
328 	current->fd = fd;
329 	current->next_path = next_path;
330 	current->path = NULL;
331 	current->constant = CONSTANT_FILE_MAYBE;
332 	if (pos)
333 		current->pos = *pos;
334 	else
335 		current->pos.stream = -1;
336 	input_stream_nr = stream+1;
337 	hash = hash_stream(name);
338 	current->next_stream = *hash;
339 	*hash = stream;
340 	return stream;
341 }
342 
/* Allocate a token stamped with the current position of @stream. */
static struct token * alloc_token(stream_t *stream)
{
	struct token *fresh;

	fresh = __alloc_token(0);
	fresh->pos = stream_pos(stream);
	return fresh;
}
349 
350 /*
351  *  Argh...  That was surprisingly messy - handling '\r' complicates the
352  *  things a _lot_.
353  */
nextchar_slow(stream_t * stream)354 static int nextchar_slow(stream_t *stream)
355 {
356 	int offset = stream->offset;
357 	int size = stream->size;
358 	int c;
359 	int spliced = 0, had_cr, had_backslash;
360 
361 restart:
362 	had_cr = had_backslash = 0;
363 
364 repeat:
365 	if (offset >= size) {
366 		if (stream->fd < 0)
367 			goto got_eof;
368 		size = read(stream->fd, stream->buffer, BUFSIZE);
369 		if (size <= 0)
370 			goto got_eof;
371 		stream->size = size;
372 		stream->offset = offset = 0;
373 	}
374 
375 	c = stream->buffer[offset++];
376 	if (had_cr)
377 		goto check_lf;
378 
379 	if (c == '\r') {
380 		had_cr = 1;
381 		goto repeat;
382 	}
383 
384 norm:
385 	if (!had_backslash) {
386 		switch (c) {
387 		case '\t':
388 			stream->pos += tabstop - stream->pos % tabstop;
389 			break;
390 		case '\n':
391 			stream->line++;
392 			stream->pos = 0;
393 			stream->newline = 1;
394 			break;
395 		case '\\':
396 			had_backslash = 1;
397 			stream->pos++;
398 			goto repeat;
399 		default:
400 			stream->pos++;
401 		}
402 	} else {
403 		if (c == '\n') {
404 			stream->line++;
405 			stream->pos = 0;
406 			spliced = 1;
407 			goto restart;
408 		}
409 		offset--;
410 		c = '\\';
411 	}
412 out:
413 	stream->offset = offset;
414 
415 	return c;
416 
417 check_lf:
418 	if (c != '\n')
419 		offset--;
420 	c = '\n';
421 	goto norm;
422 
423 got_eof:
424 	if (had_backslash) {
425 		c = '\\';
426 		goto out;
427 	}
428 	if (stream->pos & Wnewline_eof)
429 		warning(stream_pos(stream), "no newline at end of file");
430 	else if (spliced)
431 		warning(stream_pos(stream), "backslash-newline at end of file");
432 	return EOF;
433 }
434 
435 /*
436  *  We want that as light as possible while covering all normal cases.
437  *  Slow path (including the logics with line-splicing and EOF sanity
438  *  checks) is in nextchar_slow().
439  */
nextchar(stream_t * stream)440 static inline int nextchar(stream_t *stream)
441 {
442 	int offset = stream->offset;
443 
444 	if (offset < stream->size) {
445 		int c = stream->buffer[offset++];
446 		static const char special[256] = {
447 			['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
448 		};
449 		if (!special[c]) {
450 			stream->offset = offset;
451 			stream->pos++;
452 			return c;
453 		}
454 	}
455 	return nextchar_slow(stream);
456 }
457 
458 struct token eof_token_entry;
459 
mark_eof(stream_t * stream)460 static struct token *mark_eof(stream_t *stream)
461 {
462 	struct token *end;
463 
464 	end = alloc_token(stream);
465 	eof_token_entry.pos = end->pos;
466 	token_type(end) = TOKEN_STREAMEND;
467 	end->pos.newline = 1;
468 
469 	eof_token_entry.next = &eof_token_entry;
470 	eof_token_entry.pos.newline = 1;
471 
472 	end->next =  &eof_token_entry;
473 	*stream->tokenlist = end;
474 	stream->tokenlist = NULL;
475 	return end;
476 }
477 
/*
 * Append the token under construction to the stream's token list and
 * leave the stream without a current token.
 */
static void add_token(stream_t *stream)
{
	struct token *done = stream->token;

	done->next = NULL;
	*stream->tokenlist = done;
	/* the next token will be linked in behind this one */
	stream->tokenlist = &done->next;
	stream->token = NULL;
}
487 
/*
 * Abandon the token under construction.  Its newline/whitespace flags are
 * folded back into the stream so that the next token still sees them.
 */
static void drop_token(stream_t *stream)
{
	const struct token *dropped = stream->token;

	stream->whitespace |= dropped->pos.whitespace;
	stream->newline |= dropped->pos.newline;
	stream->token = NULL;
}
494 
/* Character-class bits stored in cclass[]. */
enum {
	Letter		= 1 << 0,	/* may appear in an identifier */
	Digit		= 1 << 1,	/* decimal digit */
	Hex		= 1 << 2,	/* hexadecimal digit */
	Exp		= 1 << 3,	/* exponent marker, may be followed by a sign */
	Dot		= 1 << 4,	/* '.' */
	ValidSecond	= 1 << 5,	/* may be the 2nd char of a two-char punctuator */
	Quote		= 1 << 6,	/* opens a character or string constant */
};

/*
 * Class bits for every input character, indexed by (character + 1) so
 * that EOF (-1) lands on entry 0, which has no bits set.
 */
static const char cclass[257] = {
	/* quotes */
	['\'' + 1] = Quote,
	['"' + 1] = Quote,

	/* punctuation */
	['.' + 1] = Dot | ValidSecond,
	['#' + 1] = ValidSecond,
	['&' + 1] = ValidSecond,
	['+' + 1] = ValidSecond,
	['-' + 1] = ValidSecond,
	['<' + 1] = ValidSecond,
	['=' + 1] = ValidSecond,
	['>' + 1] = ValidSecond,
	['|' + 1] = ValidSecond,

	/* digits */
	['0' + 1 ... '9' + 1] = Digit | Hex,

	/* upper case */
	['A' + 1 ... 'D' + 1] = Letter | Hex,
	['E' + 1] = Letter | Hex | Exp,	/* E<exp> */
	['F' + 1] = Letter | Hex,
	['G' + 1 ... 'O' + 1] = Letter,
	['P' + 1] = Letter | Exp,	/* P<exp> */
	['Q' + 1 ... 'Z' + 1] = Letter,

	/* underscore and lower case */
	['_' + 1] = Letter,
	['a' + 1 ... 'd' + 1] = Letter | Hex,
	['e' + 1] = Letter | Hex | Exp,	/* e<exp> */
	['f' + 1] = Letter | Hex,
	['g' + 1 ... 'o' + 1] = Letter,
	['p' + 1] = Letter | Exp,	/* p<exp> */
	['q' + 1 ... 'z' + 1] = Letter,
};
532 
533 /*
534  * pp-number:
535  *	digit
536  *	. digit
537  *	pp-number digit
538  *	pp-number identifier-nodigit
539  *	pp-number e sign
540  *	pp-number E sign
541  *	pp-number p sign
542  *	pp-number P sign
543  *	pp-number .
544  */
/*
 * Collect one pp-number.  @c is its first character and @next the
 * character after it; the token under construction becomes a
 * TOKEN_NUMBER whose text is a fresh copy of the collected characters.
 * Returns the first character that is not part of the number.
 *
 * Over-long numbers are reported and replaced by "1".
 */
static int get_one_number(int c, int next, stream_t *stream)
{
	static char buffer[4095];
	char *const limit = buffer + sizeof (buffer);
	char *out = buffer;
	struct token *token;

	*out++ = c;
	while (cclass[next + 1] & (Dot | Digit | Letter)) {
		/* e/E/p/P may drag a sign along */
		const int may_take_sign = cclass[next + 1] & Exp;

		if (out != limit)
			*out++ = next;
		next = nextchar(stream);

		if (may_take_sign && (next == '-' || next == '+')) {
			if (out != limit)
				*out++ = next;
			next = nextchar(stream);
		}
	}

	if (out == limit) {
		sparse_error(stream_pos(stream), "number token exceeds %td characters",
		      limit - buffer);
		// Pretend we saw just "1".
		buffer[0] = '1';
		out = buffer + 1;
	}
	*out++ = 0;

	token = stream->token;
	token_type(token) = TOKEN_NUMBER;
	token->number = xmemdup(buffer, out - buffer);
	add_token(stream);

	return next;
}
584 
/*
 * Read the body of a character or string constant whose opening
 * delimiter has already been consumed; @next is the first character of
 * the body and @type the base token type (values below TOKEN_STRING are
 * delimited by a single quote, the others by a double quote).
 *
 * The bytes are stored raw, escapes are not decoded here.  Character
 * constants of 1..4 bytes are stored inline in token->embedded, with the
 * length added to the token type; everything else gets an allocated
 * string.  Returns the character after the closing delimiter, or EOF if
 * the input ended inside the constant (no token is added in that case).
 */
static int eat_string(int next, stream_t *stream, enum token_type type)
{
	static char buffer[MAX_STRING];
	struct token *token = stream->token;
	struct string *string;
	char delim = type < TOKEN_STRING ? '\'' : '"';
	int escape = 0;		/* previous character was an unescaped backslash */
	int want_hex = 0;	/* previous two characters were "\x" */
	int len = 0;

	while (escape || next != delim) {
		/* keep counting past the limit so the overflow can be reported */
		if (len < MAX_STRING)
			buffer[len] = next;
		len++;

		if (next == '\n') {
			warning(stream_pos(stream),
				"missing terminating %c character", delim);
			/* assume delimiter is lost */
			break;
		}
		if (next == EOF) {
			warning(stream_pos(stream),
				"End of file in middle of string");
			return next;
		}

		if (escape) {
			escape = 0;
			want_hex = (next == 'x');
		} else {
			if (want_hex && !(cclass[next + 1] & Hex))
				warning(stream_pos(stream),
					"\\x used with no following hex digits");
			want_hex = 0;
			escape = (next == '\\');
		}

		next = nextchar(stream);
	}

	/* "\x" immediately followed by the closing delimiter */
	if (want_hex)
		warning(stream_pos(stream),
			"\\x used with no following hex digits");

	if (len > MAX_STRING) {
		warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
		len = MAX_STRING;
	}

	if (delim == '\'' && len && len <= 4) {
		/* short character constant: payload lives in the token itself */
		token_type(token) = type + len;
		memset(buffer + len, '\0', 4 - len);
		memcpy(token->embedded, buffer, 4);
	} else {
		token_type(token) = type;
		string = __alloc_string(len+1);
		memcpy(string->data, buffer, len);
		string->data[len] = '\0';
		string->length = len+1;
		token->string = string;
	}

	/* Pass it on.. */
	add_token(stream);
	return nextchar(stream);
}
646 
/*
 * Discard the token under construction and the rest of the current line
 * (used for "//" comments).  Returns the first character of the next
 * line, or EOF.
 */
static int drop_stream_eoln(stream_t *stream)
{
	int c;

	drop_token(stream);
	do {
		c = nextchar(stream);
		if (c == EOF)
			return EOF;
	} while (c != '\n');

	return nextchar(stream);
}
659 
/*
 * Discard the token under construction and everything up to and
 * including the closing star-slash of a block comment.  Returns the
 * character after the comment, or EOF (with a warning) when the comment
 * is never closed.
 *
 * Newlines inside the comment do not count as a line start for the next
 * token: the stream's newline flag is restored to what it was before.
 */
static int drop_stream_comment(stream_t *stream)
{
	int saved_newline;
	int prev, cur;

	drop_token(stream);
	saved_newline = stream->newline;

	prev = nextchar(stream);
	for (;;) {
		if (prev == EOF) {
			warning(stream_pos(stream), "End of file in the middle of a comment");
			return prev;
		}
		cur = nextchar(stream);
		if (prev == '*' && cur == '/')
			break;
		prev = cur;
	}

	stream->newline = saved_newline;
	return nextchar(stream);
}
681 
/*
 * Spellings of the multi-character punctuators, at most three characters
 * plus NUL each.  show_special() indexes this with (value - SPECIAL_BASE).
 */
unsigned char combinations[][4] = COMBINATION_STRINGS;

/* NOTE(review): presumably the number of entries in combinations[] - confirm against token.h. */
#define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
685 
/*
 * hash function for two-character punctuators - all give unique values
 *
 * The arguments are parenthesized so that compound expressions hash like
 * their value; note that each argument is still evaluated twice, so it
 * must be free of side effects.
 */
#define special_hash(c0, c1) \
	(((((c0) * 8) + ((c1) * 2)) + ((((c0) * 8) + ((c1) * 2)) >> 5)) & 31)

/*
 * The character pair that owns each hash slot.  A lookup hashes the two
 * input characters and then compares them against the slot's pair.
 *
 * note that we won't get false positives - special_hash(0,0) is 0 and
 * entry 0 is filled (by +=), so all the missing ones are OK.
 */
static const unsigned char hash_results[32][2] = {
#define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
	RES('+', '='), /* 00 */
	RES('/', '='), /* 01 */
	RES('^', '='), /* 05 */
	RES('&', '&'), /* 07 */
	RES('#', '#'), /* 08 */
	RES('<', '<'), /* 0a */
	RES('<', '='), /* 0c */
	RES('!', '='), /* 0e */
	RES('%', '='), /* 0f */
	RES('-', '-'), /* 10 */
	RES('-', '='), /* 11 */
	RES('-', '>'), /* 13 */
	RES('=', '='), /* 15 */
	RES('&', '='), /* 17 */
	RES('*', '='), /* 18 */
	RES('.', '.'), /* 1a */
	RES('+', '+'), /* 1b */
	RES('|', '='), /* 1c */
	RES('>', '='), /* 1d */
	RES('|', '|'), /* 1e */
	RES('>', '>')  /* 1f */
#undef RES
};
/*
 * Token value of each two-character punctuator, stored at the same
 * special_hash() slot as its character pair in hash_results[].
 * get_one_special() reads code[i] only after hash_results[i] has matched
 * the two input characters, so the unfilled slots are never consulted.
 */
static int code[32] = {
#define CODE(c0, c1, value) [special_hash(c0, c1)] = value
	CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
	CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
	CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
	CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
	CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
	CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
	CODE('<', '=', SPECIAL_LTE), /* 0c */
	CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
	CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
	CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
	CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
	CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
	CODE('=', '=', SPECIAL_EQUAL), /* 15 */
	CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
	CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
	CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
	CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
	CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
	CODE('>', '=', SPECIAL_GTE), /* 1d */
	CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
	CODE('>', '>', SPECIAL_RIGHTSHIFT)  /* 1f */
#undef CODE
};
743 
/*
 * Handle a token that starts with the punctuation character @c.
 *
 * Numbers starting with '.', string and character constants and comments
 * are handed to their own readers.  Otherwise @c, possibly combined with
 * the following one or two characters, becomes a TOKEN_SPECIAL.
 * Returns the first character after whatever was consumed.
 */
static int get_one_special(int c, stream_t *stream)
{
	struct token *token;
	int value = c;
	int next = nextchar(stream);

	/*
	 * Check for numbers, strings, character constants, and comments
	 */
	if (c == '.') {
		if (next >= '0' && next <= '9')
			return get_one_number(c, next, stream);
	} else if (c == '"') {
		return eat_string(next, stream, TOKEN_STRING);
	} else if (c == '\'') {
		return eat_string(next, stream, TOKEN_CHAR);
	} else if (c == '/') {
		if (next == '/')
			return drop_stream_eoln(stream);
		if (next == '*')
			return drop_stream_comment(stream);
	}

	/*
	 * Check for combinations
	 */
	if (cclass[next + 1] & ValidSecond) {
		int slot = special_hash(c, next);

		if (hash_results[slot][0] == c && hash_results[slot][1] == next) {
			value = code[slot];
			next = nextchar(stream);
			/*
			 * The two-character codes from SPECIAL_LEFTSHIFT on can
			 * grow a third character, given by the matching entry
			 * of "==."; the three-character code is 3 further on.
			 */
			if (value >= SPECIAL_LEFTSHIFT &&
			    next == "==."[value - SPECIAL_LEFTSHIFT]) {
				value += 3;
				next = nextchar(stream);
			}
		}
	}

	/* Pass it on.. */
	token = stream->token;
	token_type(token) = TOKEN_SPECIAL;
	token->special = value;
	add_token(stream);
	return next;
}
794 
795 #define IDENT_HASH_BITS (13)
796 #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
797 #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
798 
799 #define ident_hash_init(c)		(c)
800 #define ident_hash_add(oldhash,c)	((oldhash)*11 + (c))
801 #define ident_hash_end(hash)		((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
802 
803 static struct ident *hash_table[IDENT_HASH_SIZE];
804 static int ident_hit, ident_miss, idents;
805 
show_identifier_stats(void)806 void show_identifier_stats(void)
807 {
808 	int i;
809 	int distribution[100];
810 
811 	fprintf(stderr, "identifiers: %d hits, %d misses\n",
812 		ident_hit, ident_miss);
813 
814 	for (i = 0; i < 100; i++)
815 		distribution[i] = 0;
816 
817 	for (i = 0; i < IDENT_HASH_SIZE; i++) {
818 		struct ident * ident = hash_table[i];
819 		int count = 0;
820 
821 		while (ident) {
822 			count++;
823 			ident = ident->next;
824 		}
825 		if (count > 99)
826 			count = 99;
827 		distribution[count]++;
828 	}
829 
830 	for (i = 0; i < 100; i++) {
831 		if (distribution[i])
832 			fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
833 	}
834 }
835 
alloc_ident(const char * name,int len)836 static struct ident *alloc_ident(const char *name, int len)
837 {
838 	struct ident *ident = __alloc_ident(len);
839 	ident->symbols = NULL;
840 	ident->len = len;
841 	ident->tainted = 0;
842 	memcpy(ident->name, name, len);
843 	return ident;
844 }
845 
insert_hash(struct ident * ident,unsigned long hash)846 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
847 {
848 	ident->next = hash_table[hash];
849 	hash_table[hash] = ident;
850 	ident_miss++;
851 	return ident;
852 }
853 
create_hashed_ident(const char * name,int len,unsigned long hash)854 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
855 {
856 	struct ident *ident;
857 	struct ident **p;
858 
859 	p = &hash_table[hash];
860 	while ((ident = *p) != NULL) {
861 		if (ident->len == (unsigned char) len) {
862 			if (strncmp(name, ident->name, len) != 0)
863 				goto next;
864 
865 			ident_hit++;
866 			return ident;
867 		}
868 next:
869 		//misses++;
870 		p = &ident->next;
871 	}
872 	ident = alloc_ident(name, len);
873 	*p = ident;
874 	ident->next = NULL;
875 	ident_miss++;
876 	idents++;
877 	return ident;
878 }
879 
/*
 * Hash the @len bytes at @name into a bucket index for hash_table[],
 * using the same init/add/end steps get_one_identifier() applies while
 * reading an identifier.
 *
 * An empty name (@len <= 0, e.g. built_in_ident("")) hashes to bucket 0.
 * The loop is bounded by an index rather than by counting @len down to
 * zero, which would never reach zero when starting from 0.
 */
static unsigned long hash_name(const char *name, int len)
{
	const unsigned char *p = (const unsigned char *)name;
	unsigned long hash;
	int i;

	if (len <= 0)
		return ident_hash_end(0UL);

	hash = ident_hash_init(p[0]);
	for (i = 1; i < len; i++)
		hash = ident_hash_add(hash, p[i]);
	return ident_hash_end(hash);
}
892 
hash_ident(struct ident * ident)893 struct ident *hash_ident(struct ident *ident)
894 {
895 	return insert_hash(ident, hash_name(ident->name, ident->len));
896 }
897 
/*
 * Return the identifier for the NUL-terminated string @name, creating it
 * if it is not in the identifier table yet.
 */
struct ident *built_in_ident(const char *name)
{
	const int len = strlen(name);
	const unsigned long bucket = hash_name(name, len);

	return create_hashed_ident(name, len, bucket);
}
903 
built_in_token(int stream,struct ident * ident)904 struct token *built_in_token(int stream, struct ident *ident)
905 {
906 	struct token *token;
907 
908 	token = __alloc_token(0);
909 	token->pos.stream = stream;
910 	token_type(token) = TOKEN_IDENT;
911 	token->ident = ident;
912 	return token;
913 }
914 
/*
 * Read an identifier starting with @c, or - when the identifier is just
 * "L" and a quote follows - a wide character or string constant.
 * Returns the first character after the token.
 *
 * At most sizeof(buf) characters are collected; a longer run of
 * identifier characters ends the token there and the character that did
 * not fit is returned as the next one.
 * NOTE(review): that allows a length of 256 while create_hashed_ident()
 * compares lengths as unsigned char - confirm struct ident can hold it.
 */
static int get_one_identifier(int c, stream_t *stream)
{
	char buf[256];
	unsigned long hash = ident_hash_init(c);
	struct token *token;
	struct ident *ident;
	int len = 0;
	int next;

	buf[len++] = c;
	while (1) {
		next = nextchar(stream);
		if (!(cclass[next + 1] & (Letter | Digit)))
			break;
		if (len >= sizeof(buf))
			break;
		/* hash on the fly, so the name is only scanned once */
		hash = ident_hash_add(hash, next);
		buf[len++] = next;
	}

	if ((cclass[next + 1] & Quote) && len == 1 && buf[0] == 'L') {
		/* L'x' or L"..." - the quote itself has been consumed */
		enum token_type wide = (next == '\'') ? TOKEN_WIDE_CHAR
						      : TOKEN_WIDE_STRING;

		return eat_string(nextchar(stream), stream, wide);
	}

	ident = create_hashed_ident(buf, len, ident_hash_end(hash));

	/* Pass it on.. */
	token = stream->token;
	token_type(token) = TOKEN_IDENT;
	token->ident = ident;
	add_token(stream);
	return next;
}
956 
/*
 * Read one token starting with the non-whitespace character @c, picking
 * the reader by the class of @c.  Returns the first character after the
 * token.
 */
static int get_one_token(int c, stream_t *stream)
{
	const long kind = cclass[c + 1];

	if (kind & Digit) {
		int next = nextchar(stream);

		return get_one_number(c, next, stream);
	}
	if (kind & Letter)
		return get_one_identifier(c, stream);

	/* everything else: punctuation, quotes, comments */
	return get_one_special(c, stream);
}
966 
/*
 * Initialise @stream for tokenizing and return the TOKEN_STREAMBEGIN
 * token that heads its token list.
 *
 * @idx:      stream number recorded in token positions
 * @fd:       descriptor to read from; negative when only @buf is used
 * @buf:      input buffer
 * @buf_size: number of valid bytes already in @buf
 */
static struct token *setup_stream(stream_t *stream, int idx, int fd,
	unsigned char *buf, unsigned int buf_size)
{
	struct token *head;

	/* input source */
	stream->fd = fd;
	stream->buffer = buf;
	stream->size = buf_size;
	stream->offset = 0;

	/* position: first line, column 0, at the start of a line */
	stream->nr = idx;
	stream->line = 1;
	stream->pos = 0;
	stream->newline = 1;
	stream->whitespace = 0;

	stream->token = NULL;

	/* the begin marker picks up the position set above */
	head = alloc_token(stream);
	token_type(head) = TOKEN_STREAMBEGIN;
	stream->tokenlist = &head->next;
	return head;
}
989 
/*
 * Main loop: turn the whole of @stream into tokens, skipping whitespace,
 * and return the stream-end token appended by mark_eof().
 */
static struct token *tokenize_stream(stream_t *stream)
{
	int c;

	for (c = nextchar(stream); c != EOF; ) {
		if (isspace(c)) {
			stream->whitespace = 1;
			c = nextchar(stream);
			continue;
		}

		/* the new token captures the flags before they are reset */
		stream->token = alloc_token(stream);
		stream->newline = 0;
		stream->whitespace = 0;
		c = get_one_token(c, stream);
	}
	return mark_eof(stream);
}
1007 
tokenize_buffer(void * buffer,unsigned long size,struct token ** endtoken)1008 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
1009 {
1010 	stream_t stream;
1011 	struct token *begin;
1012 
1013 	begin = setup_stream(&stream, 0, -1, buffer, size);
1014 	*endtoken = tokenize_stream(&stream);
1015 	return begin;
1016 }
1017 
tokenize(const struct position * pos,const char * name,int fd,struct token * endtoken,const char ** next_path)1018 struct token * tokenize(const struct position *pos, const char *name, int fd, struct token *endtoken, const char **next_path)
1019 {
1020 	struct token *begin, *end;
1021 	stream_t stream;
1022 	unsigned char buffer[BUFSIZE];
1023 	int idx;
1024 
1025 	idx = init_stream(pos, name, fd, next_path);
1026 	if (idx < 0) {
1027 		// info(endtoken->pos, "File %s is const", name);
1028 		return endtoken;
1029 	}
1030 
1031 	begin = setup_stream(&stream, idx, fd, buffer, 0);
1032 	end = tokenize_stream(&stream);
1033 	if (endtoken)
1034 		end->next = endtoken;
1035 	return begin;
1036 }
1037