1
2 /* Tokenizer implementation */
3
4 #include "Python.h"
5 #include "pgenheaders.h"
6
7 #include <ctype.h>
8 #include <assert.h>
9
10 #include "tokenizer.h"
11 #include "errcode.h"
12
13 #ifndef PGEN
14 #include "unicodeobject.h"
15 #include "stringobject.h"
16 #include "fileobject.h"
17 #include "codecs.h"
18 #include "abstract.h"
19 #include "pydebug.h"
20 #endif /* PGEN */
21
22 extern char *PyOS_Readline(FILE *, FILE *, char *);
23 /* Return malloc'ed string including trailing \n;
24 empty malloc'ed string for EOF;
25 NULL if interrupted */
26
27 /* Don't ever change this -- it would break the portability of Python code */
28 #define TABSIZE 8
29
30 /* Forward */
31 static struct tok_state *tok_new(void);
32 static int tok_nextc(struct tok_state *tok);
33 static void tok_backup(struct tok_state *tok, int c);
34
35 /* Token names */
36
37 char *_PyParser_TokenNames[] = {
38 "ENDMARKER",
39 "NAME",
40 "NUMBER",
41 "STRING",
42 "NEWLINE",
43 "INDENT",
44 "DEDENT",
45 "LPAR",
46 "RPAR",
47 "LSQB",
48 "RSQB",
49 "COLON",
50 "COMMA",
51 "SEMI",
52 "PLUS",
53 "MINUS",
54 "STAR",
55 "SLASH",
56 "VBAR",
57 "AMPER",
58 "LESS",
59 "GREATER",
60 "EQUAL",
61 "DOT",
62 "PERCENT",
63 "BACKQUOTE",
64 "LBRACE",
65 "RBRACE",
66 "EQEQUAL",
67 "NOTEQUAL",
68 "LESSEQUAL",
69 "GREATEREQUAL",
70 "TILDE",
71 "CIRCUMFLEX",
72 "LEFTSHIFT",
73 "RIGHTSHIFT",
74 "DOUBLESTAR",
75 "PLUSEQUAL",
76 "MINEQUAL",
77 "STAREQUAL",
78 "SLASHEQUAL",
79 "PERCENTEQUAL",
80 "AMPEREQUAL",
81 "VBAREQUAL",
82 "CIRCUMFLEXEQUAL",
83 "LEFTSHIFTEQUAL",
84 "RIGHTSHIFTEQUAL",
85 "DOUBLESTAREQUAL",
86 "DOUBLESLASH",
87 "DOUBLESLASHEQUAL",
88 "AT",
89 /* This table must match the #defines in token.h! */
90 "OP",
91 "<ERRORTOKEN>",
92 "<N_TOKENS>"
93 };
94
95 /* Create and initialize a new tok_state structure */
96
97 static struct tok_state *
98 tok_new(void)
99 {
100 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
101 sizeof(struct tok_state));
102 if (tok == NULL)
103 return NULL;
104 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
105 tok->done = E_OK;
106 tok->fp = NULL;
107 tok->input = NULL;
108 tok->tabsize = TABSIZE;
109 tok->indent = 0;
110 tok->indstack[0] = 0;
111 tok->atbol = 1;
112 tok->pendin = 0;
113 tok->prompt = tok->nextprompt = NULL;
114 tok->lineno = 0;
115 tok->level = 0;
116 tok->filename = NULL;
117 tok->altwarning = 0;
118 tok->alterror = 0;
119 tok->alttabsize = 1;
120 tok->altindstack[0] = 0;
121 tok->decoding_state = 0;
122 tok->decoding_erred = 0;
123 tok->read_coding_spec = 0;
124 tok->encoding = NULL;
125 tok->cont_line = 0;
126 #ifndef PGEN
127 tok->decoding_readline = NULL;
128 tok->decoding_buffer = NULL;
129 #endif
130 return tok;
131 }
132
133 static char *
134 new_string(const char *s, Py_ssize_t len)
135 {
136 char* result = (char *)PyMem_MALLOC(len + 1);
137 if (result != NULL) {
138 memcpy(result, s, len);
139 result[len] = '\0';
140 }
141 return result;
142 }
143
144 #ifdef PGEN
145
146 static char *
147 decoding_fgets(char *s, int size, struct tok_state *tok)
148 {
149 return fgets(s, size, tok->fp);
150 }
151
152 static int
153 decoding_feof(struct tok_state *tok)
154 {
155 return feof(tok->fp);
156 }
157
158 static char *
159 decode_str(const char *str, int exec_input, struct tok_state *tok)
160 {
161 return new_string(str, strlen(str));
162 }
163
164 #else /* PGEN */
165
166 static char *
167 error_ret(struct tok_state *tok) /* XXX */
168 {
169 tok->decoding_erred = 1;
170 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
171 PyMem_FREE(tok->buf);
172 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
173 tok->done = E_DECODE;
174 return NULL; /* as if it were EOF */
175 }
176
177
178 static char *
179 get_normal_name(char *s) /* for utf-8 and latin-1 */
180 {
181 char buf[13];
182 int i;
183 for (i = 0; i < 12; i++) {
184 int c = s[i];
185 if (c == '\0')
186 break;
187 else if (c == '_')
188 buf[i] = '-';
189 else
190 buf[i] = tolower(c);
191 }
192 buf[i] = '\0';
193 if (strcmp(buf, "utf-8") == 0 ||
194 strncmp(buf, "utf-8-", 6) == 0)
195 return "utf-8";
196 else if (strcmp(buf, "latin-1") == 0 ||
197 strcmp(buf, "iso-8859-1") == 0 ||
198 strcmp(buf, "iso-latin-1") == 0 ||
199 strncmp(buf, "latin-1-", 8) == 0 ||
200 strncmp(buf, "iso-8859-1-", 11) == 0 ||
201 strncmp(buf, "iso-latin-1-", 12) == 0)
202 return "iso-8859-1";
203 else
204 return s;
205 }
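/* For illustration: "UTF_8" and "utf-8-sig" both normalize to "utf-8",
   while "Latin_1" and "ISO-8859-1" normalize to "iso-8859-1"; any other
   spelling is returned unchanged. */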
206
207 /* Return the coding spec in S, or NULL if none is found. */
208
209 static char *
210 get_coding_spec(const char *s, Py_ssize_t size)
211 {
212 Py_ssize_t i;
213 /* Coding spec must be in a comment, and that comment must be
214 * the only statement on the source code line. */
215 for (i = 0; i < size - 6; i++) {
216 if (s[i] == '#')
217 break;
218 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
219 return NULL;
220 }
221 for (; i < size - 6; i++) { /* XXX inefficient search */
222 const char* t = s + i;
223 if (strncmp(t, "coding", 6) == 0) {
224 const char* begin = NULL;
225 t += 6;
226 if (t[0] != ':' && t[0] != '=')
227 continue;
228 do {
229 t++;
230 } while (t[0] == '\x20' || t[0] == '\t');
231
232 begin = t;
233 while (Py_ISALNUM(t[0]) ||
234 t[0] == '-' || t[0] == '_' || t[0] == '.')
235 t++;
236
237 if (begin < t) {
238 char* r = new_string(begin, t - begin);
239 char* q;
240 if (!r)
241 return NULL;
242 q = get_normal_name(r);
243 if (r != q) {
244 PyMem_FREE(r);
245 r = new_string(q, strlen(q));
246 }
247 return r;
248 }
249 }
250 }
251 return NULL;
252 }
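/* Example (illustrative): for the line "# -*- coding: utf-8 -*-" the loop
   above finds "coding", skips the ':' and any spaces, collects "utf-8" and
   returns it normalized; "# coding=latin-1" works the same way and yields
   "iso-8859-1". */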
253
254 /* Check whether the line contains a coding spec. If it does,
255 invoke the set_readline function for the new encoding.
256 This function receives the tok_state and the new encoding.
257 Return 1 on success, 0 on failure. */
258
259 static int
260 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
261 int set_readline(struct tok_state *, const char *))
262 {
263 char * cs;
264 int r = 1;
265
266 if (tok->cont_line) {
267 /* It's a continuation line, so it can't be a coding spec. */
268 tok->read_coding_spec = 1;
269 return 1;
270 }
271 cs = get_coding_spec(line, size);
272 if (!cs) {
273 Py_ssize_t i;
274 for (i = 0; i < size; i++) {
275 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
276 break;
277 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
278 /* Stop checking coding spec after a line containing
279 * anything except a comment. */
280 tok->read_coding_spec = 1;
281 break;
282 }
283 }
284 } else {
285 tok->read_coding_spec = 1;
286 if (tok->encoding == NULL) {
287 assert(tok->decoding_state == 1); /* raw */
288 if (strcmp(cs, "utf-8") == 0 ||
289 strcmp(cs, "iso-8859-1") == 0) {
290 tok->encoding = cs;
291 } else {
292 #ifdef Py_USING_UNICODE
293 r = set_readline(tok, cs);
294 if (r) {
295 tok->encoding = cs;
296 tok->decoding_state = -1;
297 }
298 else {
299 PyErr_Format(PyExc_SyntaxError,
300 "encoding problem: %s", cs);
301 PyMem_FREE(cs);
302 }
303 #else
304 /* Without Unicode support, we cannot
305 process the coding spec. Since there
306 won't be any Unicode literals, that
307 won't matter. */
308 PyMem_FREE(cs);
309 #endif
310 }
311 } else { /* then, compare cs with BOM */
312 r = (strcmp(tok->encoding, cs) == 0);
313 if (!r)
314 PyErr_Format(PyExc_SyntaxError,
315 "encoding problem: %s with BOM", cs);
316 PyMem_FREE(cs);
317 }
318 }
319 return r;
320 }
321
322 /* See whether the file starts with a BOM. If it does,
323 invoke the set_readline function with the new encoding.
324 Return 1 on success, 0 on failure. */
325
326 static int
327 check_bom(int get_char(struct tok_state *),
328 void unget_char(int, struct tok_state *),
329 int set_readline(struct tok_state *, const char *),
330 struct tok_state *tok)
331 {
332 int ch1, ch2, ch3;
333 ch1 = get_char(tok);
334 tok->decoding_state = 1;
335 if (ch1 == EOF) {
336 return 1;
337 } else if (ch1 == 0xEF) {
338 ch2 = get_char(tok);
339 if (ch2 != 0xBB) {
340 unget_char(ch2, tok);
341 unget_char(ch1, tok);
342 return 1;
343 }
344 ch3 = get_char(tok);
345 if (ch3 != 0xBF) {
346 unget_char(ch3, tok);
347 unget_char(ch2, tok);
348 unget_char(ch1, tok);
349 return 1;
350 }
351 #if 0
352 /* Disable support for UTF-16 BOMs until a decision
353 is made whether this needs to be supported. */
354 } else if (ch1 == 0xFE) {
355 ch2 = get_char(tok);
356 if (ch2 != 0xFF) {
357 unget_char(ch2, tok);
358 unget_char(ch1, tok);
359 return 1;
360 }
361 if (!set_readline(tok, "utf-16-be"))
362 return 0;
363 tok->decoding_state = -1;
364 } else if (ch1 == 0xFF) {
365 ch2 = get_char(tok);
366 if (ch2 != 0xFE) {
367 unget_char(ch2, tok);
368 unget_char(ch1, tok);
369 return 1;
370 }
371 if (!set_readline(tok, "utf-16-le"))
372 return 0;
373 tok->decoding_state = -1;
374 #endif
375 } else {
376 unget_char(ch1, tok);
377 return 1;
378 }
379 if (tok->encoding != NULL)
380 PyMem_FREE(tok->encoding);
381 tok->encoding = new_string("utf-8", 5); /* resulting string is in utf-8 */
382 return 1;
383 }
384
385 /* Read a line of text from TOK into S, using the stream in TOK.
386 Return NULL on failure, else S.
387
388 On entry, tok->decoding_buffer will be one of:
389 1) NULL: need to call tok->decoding_readline to get a new line
390 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
391 stored the result in tok->decoding_buffer
392 3) PyStringObject *: previous call to fp_readl did not have enough room
393 (in the s buffer) to copy entire contents of the line read
394 by tok->decoding_readline. tok->decoding_buffer has the overflow.
395 In this case, fp_readl is called in a loop (with an expanded buffer)
396 until the buffer ends with a '\n' (or until the end of the file is
397 reached): see tok_nextc and its calls to decoding_fgets.
398 */
399
400 static char *
401 fp_readl(char *s, int size, struct tok_state *tok)
402 {
403 #ifndef Py_USING_UNICODE
404 /* In a non-Unicode build, this should never be called. */
405 Py_FatalError("fp_readl should not be called in this build.");
406 return NULL; /* Keep compiler happy (not reachable) */
407 #else
408 PyObject* utf8 = NULL;
409 PyObject* buf = tok->decoding_buffer;
410 char *str;
411 Py_ssize_t utf8len;
412
413 /* Ask for one less byte so we can terminate it */
414 assert(size > 0);
415 size--;
416
417 if (buf == NULL) {
418 buf = PyObject_CallObject(tok->decoding_readline, NULL);
419 if (buf == NULL)
420 return error_ret(tok);
421 if (!PyUnicode_Check(buf)) {
422 Py_DECREF(buf);
423 PyErr_SetString(PyExc_SyntaxError,
424 "codec did not return a unicode object");
425 return error_ret(tok);
426 }
427 } else {
428 tok->decoding_buffer = NULL;
429 if (PyString_CheckExact(buf))
430 utf8 = buf;
431 }
432 if (utf8 == NULL) {
433 utf8 = PyUnicode_AsUTF8String(buf);
434 Py_DECREF(buf);
435 if (utf8 == NULL)
436 return error_ret(tok);
437 }
438 str = PyString_AsString(utf8);
439 utf8len = PyString_GET_SIZE(utf8);
440 if (utf8len > size) {
441 tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
442 if (tok->decoding_buffer == NULL) {
443 Py_DECREF(utf8);
444 return error_ret(tok);
445 }
446 utf8len = size;
447 }
448 memcpy(s, str, utf8len);
449 s[utf8len] = '\0';
450 Py_DECREF(utf8);
451 if (utf8len == 0)
452 return NULL; /* EOF */
453 return s;
454 #endif
455 }
456
457 /* Set the readline function for TOK to a StreamReader's
458 readline function. The StreamReader is named ENC.
459
460 This function is called from check_bom and check_coding_spec.
461
462 ENC is usually identical to the future value of tok->encoding,
463 except for the (currently unsupported) case of UTF-16.
464
465 Return 1 on success, 0 on failure. */
466
467 static int
468 fp_setreadl(struct tok_state *tok, const char* enc)
469 {
470 PyObject *reader, *stream, *readline;
471
472 /* XXX: constify filename argument. */
473 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
474 if (stream == NULL)
475 return 0;
476
477 reader = PyCodec_StreamReader(enc, stream, NULL);
478 Py_DECREF(stream);
479 if (reader == NULL)
480 return 0;
481
482 readline = PyObject_GetAttrString(reader, "readline");
483 Py_DECREF(reader);
484 if (readline == NULL)
485 return 0;
486
487 tok->decoding_readline = readline;
488 return 1;
489 }
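/* Loosely the C counterpart of codecs.getreader(enc)(file).readline in
   Python terms -- an analogy only, not an exact equivalence. */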
490
491 /* Fetch the next byte from TOK. */
492
493 static int fp_getc(struct tok_state *tok) {
494 return getc(tok->fp);
495 }
496
497 /* Unfetch the last byte back into TOK. */
498
499 static void fp_ungetc(int c, struct tok_state *tok) {
500 ungetc(c, tok->fp);
501 }
502
503 /* Read a line of input from TOK. Determine encoding
504 if necessary. */
505
506 static char *
507 decoding_fgets(char *s, int size, struct tok_state *tok)
508 {
509 char *line = NULL;
510 int badchar = 0;
511 for (;;) {
512 if (tok->decoding_state < 0) {
513 /* We already have a codec associated with
514 this input. */
515 line = fp_readl(s, size, tok);
516 break;
517 } else if (tok->decoding_state > 0) {
518 /* We want a 'raw' read. */
519 line = Py_UniversalNewlineFgets(s, size,
520 tok->fp, NULL);
521 break;
522 } else {
523 /* We have not yet determined the encoding.
524 If an encoding is found, use the file-pointer
525 reader functions from now on. */
526 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
527 return error_ret(tok);
528 assert(tok->decoding_state != 0);
529 }
530 }
531 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
532 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
533 return error_ret(tok);
534 }
535 }
536 #ifndef PGEN
537 /* The default encoding is ASCII, so make sure we don't have any
538 non-ASCII bytes in it. */
539 if (line && !tok->encoding) {
540 unsigned char *c;
541 for (c = (unsigned char *)line; *c; c++)
542 if (*c > 127) {
543 badchar = *c;
544 break;
545 }
546 }
547 if (badchar) {
548 char buf[500];
549 /* Need to add 1 to the line number, since this line
550 has not been counted yet. */
551 sprintf(buf,
552 "Non-ASCII character '\\x%.2x' "
553 "in file %.200s on line %i, "
554 "but no encoding declared; "
555 "see http://python.org/dev/peps/pep-0263/ for details",
556 badchar, tok->filename, tok->lineno + 1);
557 PyErr_SetString(PyExc_SyntaxError, buf);
558 return error_ret(tok);
559 }
560 #endif
561 return line;
562 }
563
564 static int
565 decoding_feof(struct tok_state *tok)
566 {
567 if (tok->decoding_state >= 0) {
568 return feof(tok->fp);
569 } else {
570 PyObject* buf = tok->decoding_buffer;
571 if (buf == NULL) {
572 buf = PyObject_CallObject(tok->decoding_readline, NULL);
573 if (buf == NULL) {
574 error_ret(tok);
575 return 1;
576 } else {
577 tok->decoding_buffer = buf;
578 }
579 }
580 return PyObject_Length(buf) == 0;
581 }
582 }
583
584 /* Fetch a byte from TOK, using the string buffer. */
585
586 static int
587 buf_getc(struct tok_state *tok) {
588 return Py_CHARMASK(*tok->str++);
589 }
590
591 /* Unfetch a byte from TOK, using the string buffer. */
592
593 static void
594 buf_ungetc(int c, struct tok_state *tok) {
595 tok->str--;
596 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
597 }
598
599 /* Set the readline function for TOK to ENC. For the string-based
600 tokenizer, this means to just record the encoding. */
601
602 static int
603 buf_setreadl(struct tok_state *tok, const char* enc) {
604 tok->enc = enc;
605 return 1;
606 }
607
608 /* Return a UTF-8 encoded Python string object from the
609 C byte string STR, which is encoded with ENC. */
610
611 #ifdef Py_USING_UNICODE
612 static PyObject *
613 translate_into_utf8(const char* str, const char* enc) {
614 PyObject *utf8;
615 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
616 if (buf == NULL)
617 return NULL;
618 utf8 = PyUnicode_AsUTF8String(buf);
619 Py_DECREF(buf);
620 return utf8;
621 }
622 #endif
623
624
625 static char *
626 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
627 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
628 char *buf, *current;
629 char c = '\0';
630 buf = PyMem_MALLOC(needed_length);
631 if (buf == NULL) {
632 tok->done = E_NOMEM;
633 return NULL;
634 }
635 for (current = buf; *s; s++, current++) {
636 c = *s;
637 if (skip_next_lf) {
638 skip_next_lf = 0;
639 if (c == '\n') {
640 c = *++s;
641 if (!c)
642 break;
643 }
644 }
645 if (c == '\r') {
646 skip_next_lf = 1;
647 c = '\n';
648 }
649 *current = c;
650 }
651 /* If this is exec input, add a newline to the end of the string if
652 there isn't one already. */
653 if (exec_input && c != '\n') {
654 *current = '\n';
655 current++;
656 }
657 *current = '\0';
658 final_length = current - buf + 1;
659 if (final_length < needed_length && final_length)
660 /* should never fail */
661 buf = PyMem_REALLOC(buf, final_length);
662 return buf;
663 }
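/* Example (illustrative): translate_newlines("a = 1\r\nb = 2\r", 1, tok)
   yields "a = 1\nb = 2\n" -- each \r or \r\n becomes \n, and exec input is
   guaranteed to end with a newline. */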
664
665 /* Decode a byte string STR for use as the buffer of TOK.
666 Look for encoding declarations inside STR, and record them
667 inside TOK. */
668
669 static const char *
670 decode_str(const char *input, int single, struct tok_state *tok)
671 {
672 PyObject* utf8 = NULL;
673 const char *str;
674 const char *s;
675 const char *newl[2] = {NULL, NULL};
676 int lineno = 0;
677 tok->input = str = translate_newlines(input, single, tok);
678 if (str == NULL)
679 return NULL;
680 tok->enc = NULL;
681 tok->str = str;
682 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
683 return error_ret(tok);
684 str = tok->str; /* string after BOM if any */
685 assert(str);
686 #ifdef Py_USING_UNICODE
687 if (tok->enc != NULL) {
688 utf8 = translate_into_utf8(str, tok->enc);
689 if (utf8 == NULL)
690 return error_ret(tok);
691 str = PyString_AsString(utf8);
692 }
693 #endif
694 for (s = str;; s++) {
695 if (*s == '\0') break;
696 else if (*s == '\n') {
697 assert(lineno < 2);
698 newl[lineno] = s;
699 lineno++;
700 if (lineno == 2) break;
701 }
702 }
703 tok->enc = NULL;
704 /* need to check line 1 and 2 separately since check_coding_spec
705 assumes a single line as input */
706 if (newl[0]) {
707 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
708 return error_ret(tok);
709 if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
710 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
711 tok, buf_setreadl))
712 return error_ret(tok);
713 }
714 }
715 #ifdef Py_USING_UNICODE
716 if (tok->enc != NULL) {
717 assert(utf8 == NULL);
718 utf8 = translate_into_utf8(str, tok->enc);
719 if (utf8 == NULL)
720 return error_ret(tok);
721 str = PyString_AsString(utf8);
722 }
723 #endif
724 assert(tok->decoding_buffer == NULL);
725 tok->decoding_buffer = utf8; /* CAUTION */
726 return str;
727 }
728
729 #endif /* PGEN */
730
731 /* Set up tokenizer for string */
732
733 struct tok_state *
734 PyTokenizer_FromString(const char *str, int exec_input)
735 {
736 struct tok_state *tok = tok_new();
737 if (tok == NULL)
738 return NULL;
739 str = (char *)decode_str(str, exec_input, tok);
740 if (str == NULL) {
741 PyTokenizer_Free(tok);
742 return NULL;
743 }
744
745 /* XXX: constify members. */
746 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
747 return tok;
748 }
749
750
751 /* Set up tokenizer for file */
752
753 struct tok_state *
754 PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
755 {
756 struct tok_state *tok = tok_new();
757 if (tok == NULL)
758 return NULL;
759 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
760 PyTokenizer_Free(tok);
761 return NULL;
762 }
763 tok->cur = tok->inp = tok->buf;
764 tok->end = tok->buf + BUFSIZ;
765 tok->fp = fp;
766 tok->prompt = ps1;
767 tok->nextprompt = ps2;
768 return tok;
769 }
770
771
772 /* Free a tok_state structure */
773
774 void
775 PyTokenizer_Free(struct tok_state *tok)
776 {
777 if (tok->encoding != NULL)
778 PyMem_FREE(tok->encoding);
779 #ifndef PGEN
780 Py_XDECREF(tok->decoding_readline);
781 Py_XDECREF(tok->decoding_buffer);
782 #endif
783 if (tok->fp != NULL && tok->buf != NULL)
784 PyMem_FREE(tok->buf);
785 if (tok->input)
786 PyMem_FREE((char *)tok->input);
787 PyMem_FREE(tok);
788 }
789
790 #if !defined(PGEN) && defined(Py_USING_UNICODE)
791 static int
792 tok_stdin_decode(struct tok_state *tok, char **inp)
793 {
794 PyObject *enc, *sysstdin, *decoded, *utf8;
795 const char *encoding;
796 char *converted;
797
798 if (PySys_GetFile((char *)"stdin", NULL) != stdin)
799 return 0;
800 sysstdin = PySys_GetObject("stdin");
801 if (sysstdin == NULL || !PyFile_Check(sysstdin))
802 return 0;
803
804 enc = ((PyFileObject *)sysstdin)->f_encoding;
805 if (enc == NULL || !PyString_Check(enc))
806 return 0;
807 Py_INCREF(enc);
808
809 encoding = PyString_AsString(enc);
810 decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
811 if (decoded == NULL)
812 goto error_clear;
813
814 utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
815 Py_DECREF(decoded);
816 if (utf8 == NULL)
817 goto error_clear;
818
819 assert(PyString_Check(utf8));
820 converted = new_string(PyString_AS_STRING(utf8),
821 PyString_GET_SIZE(utf8));
822 Py_DECREF(utf8);
823 if (converted == NULL)
824 goto error_nomem;
825
826 PyMem_FREE(*inp);
827 *inp = converted;
828 if (tok->encoding != NULL)
829 PyMem_FREE(tok->encoding);
830 tok->encoding = new_string(encoding, strlen(encoding));
831 if (tok->encoding == NULL)
832 goto error_nomem;
833
834 Py_DECREF(enc);
835 return 0;
836
837 error_nomem:
838 Py_DECREF(enc);
839 tok->done = E_NOMEM;
840 return -1;
841
842 error_clear:
843 Py_DECREF(enc);
844 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
845 tok->done = E_ERROR;
846 return -1;
847 }
848 /* Fallback to iso-8859-1: for backward compatibility */
849 PyErr_Clear();
850 return 0;
851 }
852 #endif
853
854 /* Get next char, updating state; error code goes into tok->done */
855
856 static int
857 tok_nextc(register struct tok_state *tok)
858 {
859 for (;;) {
860 if (tok->cur != tok->inp) {
861 return Py_CHARMASK(*tok->cur++); /* Fast path */
862 }
863 if (tok->done != E_OK)
864 return EOF;
865 if (tok->fp == NULL) {
866 char *end = strchr(tok->inp, '\n');
867 if (end != NULL)
868 end++;
869 else {
870 end = strchr(tok->inp, '\0');
871 if (end == tok->inp) {
872 tok->done = E_EOF;
873 return EOF;
874 }
875 }
876 if (tok->start == NULL)
877 tok->buf = tok->cur;
878 tok->line_start = tok->cur;
879 tok->lineno++;
880 tok->inp = end;
881 return Py_CHARMASK(*tok->cur++);
882 }
883 if (tok->prompt != NULL) {
884 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
885 if (tok->nextprompt != NULL)
886 tok->prompt = tok->nextprompt;
887 if (newtok == NULL)
888 tok->done = E_INTR;
889 else if (*newtok == '\0') {
890 PyMem_FREE(newtok);
891 tok->done = E_EOF;
892 }
893 #if !defined(PGEN) && defined(Py_USING_UNICODE)
894 else if (tok_stdin_decode(tok, &newtok) != 0)
895 PyMem_FREE(newtok);
896 #endif
897 else if (tok->start != NULL) {
898 size_t start = tok->start - tok->buf;
899 size_t oldlen = tok->cur - tok->buf;
900 size_t newlen = oldlen + strlen(newtok);
901 char *buf = tok->buf;
902 buf = (char *)PyMem_REALLOC(buf, newlen+1);
903 tok->lineno++;
904 if (buf == NULL) {
905 PyMem_FREE(tok->buf);
906 tok->buf = NULL;
907 PyMem_FREE(newtok);
908 tok->done = E_NOMEM;
909 return EOF;
910 }
911 tok->buf = buf;
912 tok->cur = tok->buf + oldlen;
913 tok->line_start = tok->cur;
914 strcpy(tok->buf + oldlen, newtok);
915 PyMem_FREE(newtok);
916 tok->inp = tok->buf + newlen;
917 tok->end = tok->inp + 1;
918 tok->start = tok->buf + start;
919 }
920 else {
921 tok->lineno++;
922 if (tok->buf != NULL)
923 PyMem_FREE(tok->buf);
924 tok->buf = newtok;
925 tok->cur = tok->buf;
926 tok->line_start = tok->buf;
927 tok->inp = strchr(tok->buf, '\0');
928 tok->end = tok->inp + 1;
929 }
930 }
931 else {
932 int done = 0;
933 Py_ssize_t cur = 0;
934 char *pt;
935 if (tok->start == NULL) {
936 if (tok->buf == NULL) {
937 tok->buf = (char *)
938 PyMem_MALLOC(BUFSIZ);
939 if (tok->buf == NULL) {
940 tok->done = E_NOMEM;
941 return EOF;
942 }
943 tok->end = tok->buf + BUFSIZ;
944 }
945 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
946 tok) == NULL) {
947 if (!tok->decoding_erred)
948 tok->done = E_EOF;
949 done = 1;
950 }
951 else {
952 tok->done = E_OK;
953 tok->inp = strchr(tok->buf, '\0');
954 done = tok->inp == tok->buf || tok->inp[-1] == '\n';
955 }
956 }
957 else {
958 cur = tok->cur - tok->buf;
959 if (decoding_feof(tok)) {
960 tok->done = E_EOF;
961 done = 1;
962 }
963 else
964 tok->done = E_OK;
965 }
966 tok->lineno++;
967 /* Read until '\n' or EOF */
968 while (!done) {
969 Py_ssize_t curstart = tok->start == NULL ? -1 :
970 tok->start - tok->buf;
971 Py_ssize_t curvalid = tok->inp - tok->buf;
972 Py_ssize_t newsize = curvalid + BUFSIZ;
973 char *newbuf = tok->buf;
974 newbuf = (char *)PyMem_REALLOC(newbuf,
975 newsize);
976 if (newbuf == NULL) {
977 tok->done = E_NOMEM;
978 tok->cur = tok->inp;
979 return EOF;
980 }
981 tok->buf = newbuf;
982 tok->cur = tok->buf + cur;
983 tok->line_start = tok->cur;
984 tok->inp = tok->buf + curvalid;
985 tok->end = tok->buf + newsize;
986 tok->start = curstart < 0 ? NULL :
987 tok->buf + curstart;
988 if (decoding_fgets(tok->inp,
989 (int)(tok->end - tok->inp),
990 tok) == NULL) {
991 /* Break out early on decoding
992 errors, as tok->buf will be NULL
993 */
994 if (tok->decoding_erred)
995 return EOF;
996 /* Last line does not end in \n,
997 fake one */
998 strcpy(tok->inp, "\n");
999 }
1000 tok->inp = strchr(tok->inp, '\0');
1001 done = tok->inp[-1] == '\n';
1002 }
1003 if (tok->buf != NULL) {
1004 tok->cur = tok->buf + cur;
1005 tok->line_start = tok->cur;
1006 /* replace "\r\n" with "\n" */
1007 /* For Mac leave the \r, giving a syntax error */
1008 pt = tok->inp - 2;
1009 if (pt >= tok->buf && *pt == '\r') {
1010 *pt++ = '\n';
1011 *pt = '\0';
1012 tok->inp = pt;
1013 }
1014 }
1015 }
1016 if (tok->done != E_OK) {
1017 if (tok->prompt != NULL)
1018 PySys_WriteStderr("\n");
1019 tok->cur = tok->inp;
1020 return EOF;
1021 }
1022 }
1023 /*NOTREACHED*/
1024 }
1025
1026
1027 /* Back-up one character */
1028
1029 static void
1030 tok_backup(register struct tok_state *tok, register int c)
1031 {
1032 if (c != EOF) {
1033 if (--tok->cur < tok->buf)
1034 Py_FatalError("tok_backup: beginning of buffer");
1035 if (*tok->cur != c)
1036 *tok->cur = c;
1037 }
1038 }
1039
1040
1041 /* Return the token corresponding to a single character */
1042
1043 int
1044 PyToken_OneChar(int c)
1045 {
1046 switch (c) {
1047 case '(': return LPAR;
1048 case ')': return RPAR;
1049 case '[': return LSQB;
1050 case ']': return RSQB;
1051 case ':': return COLON;
1052 case ',': return COMMA;
1053 case ';': return SEMI;
1054 case '+': return PLUS;
1055 case '-': return MINUS;
1056 case '*': return STAR;
1057 case '/': return SLASH;
1058 case '|': return VBAR;
1059 case '&': return AMPER;
1060 case '<': return LESS;
1061 case '>': return GREATER;
1062 case '=': return EQUAL;
1063 case '.': return DOT;
1064 case '%': return PERCENT;
1065 case '`': return BACKQUOTE;
1066 case '{': return LBRACE;
1067 case '}': return RBRACE;
1068 case '^': return CIRCUMFLEX;
1069 case '~': return TILDE;
1070 case '@': return AT;
1071 default: return OP;
1072 }
1073 }
1074
1075
1076 int
1077 PyToken_TwoChars(int c1, int c2)
1078 {
1079 switch (c1) {
1080 case '=':
1081 switch (c2) {
1082 case '=': return EQEQUAL;
1083 }
1084 break;
1085 case '!':
1086 switch (c2) {
1087 case '=': return NOTEQUAL;
1088 }
1089 break;
1090 case '<':
1091 switch (c2) {
1092 case '>': return NOTEQUAL;
1093 case '=': return LESSEQUAL;
1094 case '<': return LEFTSHIFT;
1095 }
1096 break;
1097 case '>':
1098 switch (c2) {
1099 case '=': return GREATEREQUAL;
1100 case '>': return RIGHTSHIFT;
1101 }
1102 break;
1103 case '+':
1104 switch (c2) {
1105 case '=': return PLUSEQUAL;
1106 }
1107 break;
1108 case '-':
1109 switch (c2) {
1110 case '=': return MINEQUAL;
1111 }
1112 break;
1113 case '*':
1114 switch (c2) {
1115 case '*': return DOUBLESTAR;
1116 case '=': return STAREQUAL;
1117 }
1118 break;
1119 case '/':
1120 switch (c2) {
1121 case '/': return DOUBLESLASH;
1122 case '=': return SLASHEQUAL;
1123 }
1124 break;
1125 case '|':
1126 switch (c2) {
1127 case '=': return VBAREQUAL;
1128 }
1129 break;
1130 case '%':
1131 switch (c2) {
1132 case '=': return PERCENTEQUAL;
1133 }
1134 break;
1135 case '&':
1136 switch (c2) {
1137 case '=': return AMPEREQUAL;
1138 }
1139 break;
1140 case '^':
1141 switch (c2) {
1142 case '=': return CIRCUMFLEXEQUAL;
1143 }
1144 break;
1145 }
1146 return OP;
1147 }
1148
1149 int
1150 PyToken_ThreeChars(int c1, int c2, int c3)
1151 {
1152 switch (c1) {
1153 case '<':
1154 switch (c2) {
1155 case '<':
1156 switch (c3) {
1157 case '=':
1158 return LEFTSHIFTEQUAL;
1159 }
1160 break;
1161 }
1162 break;
1163 case '>':
1164 switch (c2) {
1165 case '>':
1166 switch (c3) {
1167 case '=':
1168 return RIGHTSHIFTEQUAL;
1169 }
1170 break;
1171 }
1172 break;
1173 case '*':
1174 switch (c2) {
1175 case '*':
1176 switch (c3) {
1177 case '=':
1178 return DOUBLESTAREQUAL;
1179 }
1180 break;
1181 }
1182 break;
1183 case '/':
1184 switch (c2) {
1185 case '/':
1186 switch (c3) {
1187 case '=':
1188 return DOUBLESLASHEQUAL;
1189 }
1190 break;
1191 }
1192 break;
1193 }
1194 return OP;
1195 }
1196
1197 static int
1198 indenterror(struct tok_state *tok)
1199 {
1200 if (tok->alterror) {
1201 tok->done = E_TABSPACE;
1202 tok->cur = tok->inp;
1203 return 1;
1204 }
1205 if (tok->altwarning) {
1206 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1207 "in indentation\n", tok->filename);
1208 tok->altwarning = 0;
1209 }
1210 return 0;
1211 }
1212
1213 /* Get next token, after space stripping etc. */
1214
1215 static int
1216 tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1217 {
1218 register int c;
1219 int blankline;
1220
1221 *p_start = *p_end = NULL;
1222 nextline:
1223 tok->start = NULL;
1224 blankline = 0;
1225
1226 /* Get indentation level */
1227 if (tok->atbol) {
1228 register int col = 0;
1229 register int altcol = 0;
1230 tok->atbol = 0;
1231 for (;;) {
1232 c = tok_nextc(tok);
1233 if (c == ' ')
1234 col++, altcol++;
1235 else if (c == '\t') {
1236 col = (col/tok->tabsize + 1) * tok->tabsize;
1237 altcol = (altcol/tok->alttabsize + 1)
1238 * tok->alttabsize;
1239 }
1240 else if (c == '\014') /* Control-L (formfeed) */
1241 col = altcol = 0; /* For Emacs users */
1242 else
1243 break;
1244 }
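/* Worked example (with the default tabsize of 8): a tab seen at column 3
   advances col to 8, i.e. (3/8 + 1) * 8; altcol always advances by 1 for a
   tab (alttabsize is 1), which is what makes tab/space mixes detectable. */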
1245 tok_backup(tok, c);
1246 if (c == '#' || c == '\n') {
1247 /* Lines with only whitespace and/or comments
1248 shouldn't affect the indentation and are
1249 not passed to the parser as NEWLINE tokens,
1250 except *totally* empty lines in interactive
1251 mode, which signal the end of a command group. */
1252 if (col == 0 && c == '\n' && tok->prompt != NULL)
1253 blankline = 0; /* Let it through */
1254 else
1255 blankline = 1; /* Ignore completely */
1256 /* We can't jump back right here since we still
1257 may need to skip to the end of a comment */
1258 }
1259 if (!blankline && tok->level == 0) {
1260 if (col == tok->indstack[tok->indent]) {
1261 /* No change */
1262 if (altcol != tok->altindstack[tok->indent]) {
1263 if (indenterror(tok))
1264 return ERRORTOKEN;
1265 }
1266 }
1267 else if (col > tok->indstack[tok->indent]) {
1268 /* Indent -- always one */
1269 if (tok->indent+1 >= MAXINDENT) {
1270 tok->done = E_TOODEEP;
1271 tok->cur = tok->inp;
1272 return ERRORTOKEN;
1273 }
1274 if (altcol <= tok->altindstack[tok->indent]) {
1275 if (indenterror(tok))
1276 return ERRORTOKEN;
1277 }
1278 tok->pendin++;
1279 tok->indstack[++tok->indent] = col;
1280 tok->altindstack[tok->indent] = altcol;
1281 }
1282 else /* col < tok->indstack[tok->indent] */ {
1283 /* Dedent -- any number, must be consistent */
1284 while (tok->indent > 0 &&
1285 col < tok->indstack[tok->indent]) {
1286 tok->pendin--;
1287 tok->indent--;
1288 }
1289 if (col != tok->indstack[tok->indent]) {
1290 tok->done = E_DEDENT;
1291 tok->cur = tok->inp;
1292 return ERRORTOKEN;
1293 }
1294 if (altcol != tok->altindstack[tok->indent]) {
1295 if (indenterror(tok))
1296 return ERRORTOKEN;
1297 }
1298 }
1299 }
1300 }
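/* Example: with indstack {0, 4, 8}, a new line starting at column 4 pops one
   level, leaving pendin == -1, so a single DEDENT is returned below; a line
   at column 2 would pop down to indstack[0] == 0 != 2 and report E_DEDENT. */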
1301
1302 tok->start = tok->cur;
1303
1304 /* Return pending indents/dedents */
1305 if (tok->pendin != 0) {
1306 if (tok->pendin < 0) {
1307 tok->pendin++;
1308 return DEDENT;
1309 }
1310 else {
1311 tok->pendin--;
1312 return INDENT;
1313 }
1314 }
1315
1316 again:
1317 tok->start = NULL;
1318 /* Skip spaces */
1319 do {
1320 c = tok_nextc(tok);
1321 } while (c == ' ' || c == '\t' || c == '\014');
1322
1323 /* Set start of current token */
1324 tok->start = tok->cur - 1;
1325
1326 /* Skip comment, while looking for tab-setting magic */
1327 if (c == '#') {
1328 static char *tabforms[] = {
1329 "tab-width:", /* Emacs */
1330 ":tabstop=", /* vim, full form */
1331 ":ts=", /* vim, abbreviated form */
1332 "set tabsize=", /* will vi never die? */
1333 /* more templates can be added here to support other editors */
1334 };
1335 char cbuf[80];
1336 char *tp, **cp;
1337 tp = cbuf;
1338 do {
1339 *tp++ = c = tok_nextc(tok);
1340 } while (c != EOF && c != '\n' &&
1341 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1342 *tp = '\0';
1343 for (cp = tabforms;
1344 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1345 cp++) {
1346 if ((tp = strstr(cbuf, *cp))) {
1347 int newsize = atoi(tp + strlen(*cp));
1348
1349 if (newsize >= 1 && newsize <= 40) {
1350 tok->tabsize = newsize;
1351 if (Py_VerboseFlag)
1352 PySys_WriteStderr(
1353 "Tab size set to %d\n",
1354 newsize);
1355 }
1356 }
1357 }
1358 while (c != EOF && c != '\n')
1359 c = tok_nextc(tok);
1360 }
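/* E.g. a comment such as "# vi:set tabsize=4:" or "# -*- tab-width: 4 -*-"
   matches one of the templates above and sets tok->tabsize to 4 for the
   rest of the file. */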
1361
1362 /* Check for EOF and errors now */
1363 if (c == EOF) {
1364 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1365 }
1366
1367 /* Identifier (most frequent token!) */
1368 if (Py_ISALPHA(c) || c == '_') {
1369 /* Process b"", r"", u"", br"" and ur"" */
1370 switch (c) {
1371 case 'b':
1372 case 'B':
1373 c = tok_nextc(tok);
1374 if (c == 'r' || c == 'R')
1375 c = tok_nextc(tok);
1376 if (c == '"' || c == '\'')
1377 goto letter_quote;
1378 break;
1379 case 'r':
1380 case 'R':
1381 c = tok_nextc(tok);
1382 if (c == '"' || c == '\'')
1383 goto letter_quote;
1384 break;
1385 case 'u':
1386 case 'U':
1387 c = tok_nextc(tok);
1388 if (c == 'r' || c == 'R')
1389 c = tok_nextc(tok);
1390 if (c == '"' || c == '\'')
1391 goto letter_quote;
1392 break;
1393 }
1394 while (c != EOF && (Py_ISALNUM(c) || c == '_')) {
1395 c = tok_nextc(tok);
1396 }
1397 tok_backup(tok, c);
1398 *p_start = tok->start;
1399 *p_end = tok->cur;
1400 return NAME;
1401 }
1402
1403 /* Newline */
1404 if (c == '\n') {
1405 tok->atbol = 1;
1406 if (blankline || tok->level > 0)
1407 goto nextline;
1408 *p_start = tok->start;
1409 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1410 tok->cont_line = 0;
1411 return NEWLINE;
1412 }
1413
1414 /* Period or number starting with period? */
1415 if (c == '.') {
1416 c = tok_nextc(tok);
1417 if (isdigit(c)) {
1418 goto fraction;
1419 }
1420 else {
1421 tok_backup(tok, c);
1422 *p_start = tok->start;
1423 *p_end = tok->cur;
1424 return DOT;
1425 }
1426 }
1427
1428 /* Number */
1429 if (isdigit(c)) {
1430 if (c == '0') {
1431 /* Hex, octal or binary -- maybe. */
1432 c = tok_nextc(tok);
1433 if (c == '.')
1434 goto fraction;
1435 #ifndef WITHOUT_COMPLEX
1436 if (c == 'j' || c == 'J')
1437 goto imaginary;
1438 #endif
1439 if (c == 'x' || c == 'X') {
1440
1441 /* Hex */
1442 c = tok_nextc(tok);
1443 if (!isxdigit(c)) {
1444 tok->done = E_TOKEN;
1445 tok_backup(tok, c);
1446 return ERRORTOKEN;
1447 }
1448 do {
1449 c = tok_nextc(tok);
1450 } while (isxdigit(c));
1451 }
1452 else if (c == 'o' || c == 'O') {
1453 /* Octal */
1454 c = tok_nextc(tok);
1455 if (c < '0' || c >= '8') {
1456 tok->done = E_TOKEN;
1457 tok_backup(tok, c);
1458 return ERRORTOKEN;
1459 }
1460 do {
1461 c = tok_nextc(tok);
1462 } while ('0' <= c && c < '8');
1463 }
1464 else if (c == 'b' || c == 'B') {
1465 /* Binary */
1466 c = tok_nextc(tok);
1467 if (c != '0' && c != '1') {
1468 tok->done = E_TOKEN;
1469 tok_backup(tok, c);
1470 return ERRORTOKEN;
1471 }
1472 do {
1473 c = tok_nextc(tok);
1474 } while (c == '0' || c == '1');
1475 }
1476 else {
1477 int found_decimal = 0;
1478 /* Octal; c is first char of it */
1479 /* There's no 'isoctdigit' macro, sigh */
1480 while ('0' <= c && c < '8') {
1481 c = tok_nextc(tok);
1482 }
1483 if (isdigit(c)) {
1484 found_decimal = 1;
1485 do {
1486 c = tok_nextc(tok);
1487 } while (isdigit(c));
1488 }
1489 if (c == '.')
1490 goto fraction;
1491 else if (c == 'e' || c == 'E')
1492 goto exponent;
1493 #ifndef WITHOUT_COMPLEX
1494 else if (c == 'j' || c == 'J')
1495 goto imaginary;
1496 #endif
1497 else if (found_decimal) {
1498 tok->done = E_TOKEN;
1499 tok_backup(tok, c);
1500 return ERRORTOKEN;
1501 }
1502 }
1503 if (c == 'l' || c == 'L')
1504 c = tok_nextc(tok);
1505 }
1506 else {
1507 /* Decimal */
1508 do {
1509 c = tok_nextc(tok);
1510 } while (isdigit(c));
1511 if (c == 'l' || c == 'L')
1512 c = tok_nextc(tok);
1513 else {
1514 /* Accept floating point numbers. */
1515 if (c == '.') {
1516 fraction:
1517 /* Fraction */
1518 do {
1519 c = tok_nextc(tok);
1520 } while (isdigit(c));
1521 }
1522 if (c == 'e' || c == 'E') {
1523 int e;
1524 exponent:
1525 e = c;
1526 /* Exponent part */
1527 c = tok_nextc(tok);
1528 if (c == '+' || c == '-') {
1529 c = tok_nextc(tok);
1530 if (!isdigit(c)) {
1531 tok->done = E_TOKEN;
1532 tok_backup(tok, c);
1533 return ERRORTOKEN;
1534 }
1535 } else if (!isdigit(c)) {
1536 tok_backup(tok, c);
1537 tok_backup(tok, e);
1538 *p_start = tok->start;
1539 *p_end = tok->cur;
1540 return NUMBER;
1541 }
1542 do {
1543 c = tok_nextc(tok);
1544 } while (isdigit(c));
1545 }
1546 #ifndef WITHOUT_COMPLEX
1547 if (c == 'j' || c == 'J')
1548 /* Imaginary part */
1549 imaginary:
1550 c = tok_nextc(tok);
1551 #endif
1552 }
1553 }
1554 tok_backup(tok, c);
1555 *p_start = tok->start;
1556 *p_end = tok->cur;
1557 return NUMBER;
1558 }
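/* In summary, this branch (together with the leading-'.' case above) accepts
   the Python 2 literal forms 0x1f, 0o17, 0b101, 017, 42, 42L, 3.14, .5,
   1e-3 and 3j, including the upper-case variants of the markers. */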
1559
1560 letter_quote:
1561 /* String */
1562 if (c == '\'' || c == '"') {
1563 Py_ssize_t quote2 = tok->cur - tok->start + 1;
1564 int quote = c;
1565 int triple = 0;
1566 int tripcount = 0;
1567 for (;;) {
1568 c = tok_nextc(tok);
1569 if (c == '\n') {
1570 if (!triple) {
1571 tok->done = E_EOLS;
1572 tok_backup(tok, c);
1573 return ERRORTOKEN;
1574 }
1575 tripcount = 0;
1576 tok->cont_line = 1; /* multiline string. */
1577 }
1578 else if (c == EOF) {
1579 if (triple)
1580 tok->done = E_EOFS;
1581 else
1582 tok->done = E_EOLS;
1583 tok->cur = tok->inp;
1584 return ERRORTOKEN;
1585 }
1586 else if (c == quote) {
1587 tripcount++;
1588 if (tok->cur - tok->start == quote2) {
1589 c = tok_nextc(tok);
1590 if (c == quote) {
1591 triple = 1;
1592 tripcount = 0;
1593 continue;
1594 }
1595 tok_backup(tok, c);
1596 }
1597 if (!triple || tripcount == 3)
1598 break;
1599 }
1600 else if (c == '\\') {
1601 tripcount = 0;
1602 c = tok_nextc(tok);
1603 if (c == EOF) {
1604 tok->done = E_EOLS;
1605 tok->cur = tok->inp;
1606 return ERRORTOKEN;
1607 }
1608 }
1609 else
1610 tripcount = 0;
1611 }
1612 *p_start = tok->start;
1613 *p_end = tok->cur;
1614 return STRING;
1615 }
1616
1617 /* Line continuation */
1618 if (c == '\\') {
1619 c = tok_nextc(tok);
1620 if (c != '\n') {
1621 tok->done = E_LINECONT;
1622 tok->cur = tok->inp;
1623 return ERRORTOKEN;
1624 }
1625 tok->cont_line = 1;
1626 goto again; /* Read next line */
1627 }
1628
1629 /* Check for two-character token */
1630 {
1631 int c2 = tok_nextc(tok);
1632 int token = PyToken_TwoChars(c, c2);
1633 #ifndef PGEN
1634 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1635 if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1636 "<> not supported in 3.x; use !=",
1637 tok->filename, tok->lineno,
1638 NULL, NULL)) {
1639 return ERRORTOKEN;
1640 }
1641 }
1642 #endif
1643 if (token != OP) {
1644 int c3 = tok_nextc(tok);
1645 int token3 = PyToken_ThreeChars(c, c2, c3);
1646 if (token3 != OP) {
1647 token = token3;
1648 } else {
1649 tok_backup(tok, c3);
1650 }
1651 *p_start = tok->start;
1652 *p_end = tok->cur;
1653 return token;
1654 }
1655 tok_backup(tok, c2);
1656 }
1657
1658 /* Keep track of parentheses nesting level */
1659 switch (c) {
1660 case '(':
1661 case '[':
1662 case '{':
1663 tok->level++;
1664 break;
1665 case ')':
1666 case ']':
1667 case '}':
1668 tok->level--;
1669 break;
1670 }
1671
1672 /* Punctuation character */
1673 *p_start = tok->start;
1674 *p_end = tok->cur;
1675 return PyToken_OneChar(c);
1676 }
1677
1678 int
1679 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1680 {
1681 int result = tok_get(tok, p_start, p_end);
1682 if (tok->decoding_erred) {
1683 result = ERRORTOKEN;
1684 tok->done = E_DECODE;
1685 }
1686 return result;
1687 }
1688
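/* A minimal driver sketch (illustrative only; parsetok.c is the real
   consumer of this API). The token's text is the half-open range
   [*p_start, *p_end):

       struct tok_state *tok = PyTokenizer_FromString("x = 1\n", 1);
       char *start, *end;
       int type;
       if (tok != NULL) {
           do {
               type = PyTokenizer_Get(tok, &start, &end);
           } while (type != ENDMARKER && type != ERRORTOKEN);
           PyTokenizer_Free(tok);
       }
*/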
1689 /* This function is only called from parsetok. However, it cannot live
1690 there, as it must be empty for PGEN, and we can check for PGEN only
1691 in this file. */
1692
1693 #if defined(PGEN) || !defined(Py_USING_UNICODE)
1694 char*
1695 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
1696 {
1697 return NULL;
1698 }
1699 #else
1700 #ifdef Py_USING_UNICODE
1701 static PyObject *
1702 dec_utf8(const char *enc, const char *text, size_t len) {
1703 PyObject *ret = NULL;
1704 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1705 if (unicode_text) {
1706 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1707 Py_DECREF(unicode_text);
1708 }
1709 if (!ret) {
1710 PyErr_Clear();
1711 }
1712 return ret;
1713 }
1714 char *
1715 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1716 {
1717 char *text = NULL;
1718 if (tok->encoding) {
1719 /* convert source back to its original encoding */
1720 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1721 if (lineobj != NULL) {
1722 int linelen = PyString_Size(lineobj);
1723 const char *line = PyString_AsString(lineobj);
1724 text = PyObject_MALLOC(linelen + 1);
1725 if (text != NULL && line != NULL) {
1726 if (linelen)
1727 strncpy(text, line, linelen);
1728 text[linelen] = '\0';
1729 }
1730 Py_DECREF(lineobj);
1731
1732 /* adjust error offset */
1733 if (*offset > 1) {
1734 PyObject *offsetobj = dec_utf8(tok->encoding,
1735 tok->buf, *offset-1);
1736 if (offsetobj) {
1737 *offset = PyString_Size(offsetobj) + 1;
1738 Py_DECREF(offsetobj);
1739 }
1740 }
1741
1742 }
1743 }
1744 return text;
1745
1746 }
1747 #endif /* defined(Py_USING_UNICODE) */
1748 #endif
1749
1750
1751 #ifdef Py_DEBUG
1752
1753 void
1754 tok_dump(int type, char *start, char *end)
1755 {
1756 printf("%s", _PyParser_TokenNames[type]);
1757 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1758 printf("(%.*s)", (int)(end - start), start);
1759 }
1760
1761 #endif
1762