1
2 /* Tokenizer implementation */
3
4 #include "Python.h"
5 #include "pgenheaders.h"
6
7 #include <ctype.h>
8 #include <assert.h>
9
10 #include "tokenizer.h"
11 #include "errcode.h"
12
13 #ifndef PGEN
14 #include "unicodeobject.h"
15 #include "stringobject.h"
16 #include "fileobject.h"
17 #include "codecs.h"
18 #include "abstract.h"
19 #include "pydebug.h"
20 #endif /* PGEN */
21
22 extern char *PyOS_Readline(FILE *, FILE *, char *);
23 /* Return malloc'ed string including trailing \n;
24 empty malloc'ed string for EOF;
25 NULL if interrupted */
26
27 /* Don't ever change this -- it would break the portability of Python code */
28 #define TABSIZE 8
29
30 /* Forward */
31 static struct tok_state *tok_new(void);
32 static int tok_nextc(struct tok_state *tok);
33 static void tok_backup(struct tok_state *tok, int c);
34
35 /* Token names */
36
/* Name of each token, indexed by its numeric code.  The order must
   match the token #defines in token.h exactly. */
char *_PyParser_TokenNames[] = {
    "ENDMARKER",
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "BACKQUOTE",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    "DOUBLESLASH",
    "DOUBLESLASHEQUAL",
    "AT",
    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
94
95 /* Create and initialize a new tok_state structure */
96
97 static struct tok_state *
tok_new(void)98 tok_new(void)
99 {
100 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
101 sizeof(struct tok_state));
102 if (tok == NULL)
103 return NULL;
104 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
105 tok->done = E_OK;
106 tok->fp = NULL;
107 tok->input = NULL;
108 tok->tabsize = TABSIZE;
109 tok->indent = 0;
110 tok->indstack[0] = 0;
111 tok->atbol = 1;
112 tok->pendin = 0;
113 tok->prompt = tok->nextprompt = NULL;
114 tok->lineno = 0;
115 tok->level = 0;
116 tok->filename = NULL;
117 tok->altwarning = 0;
118 tok->alterror = 0;
119 tok->alttabsize = 1;
120 tok->altindstack[0] = 0;
121 tok->decoding_state = 0;
122 tok->decoding_erred = 0;
123 tok->read_coding_spec = 0;
124 tok->encoding = NULL;
125 tok->cont_line = 0;
126 #ifndef PGEN
127 tok->decoding_readline = NULL;
128 tok->decoding_buffer = NULL;
129 #endif
130 return tok;
131 }
132
133 static char *
new_string(const char * s,Py_ssize_t len)134 new_string(const char *s, Py_ssize_t len)
135 {
136 char* result = (char *)PyMem_MALLOC(len + 1);
137 if (result != NULL) {
138 memcpy(result, s, len);
139 result[len] = '\0';
140 }
141 return result;
142 }
143
144 #ifdef PGEN
145
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    /* pgen build: no decoding layer, so a plain fgets suffices. */
    return fgets(s, size, tok->fp);
}
151
static int
decoding_feof(struct tok_state *tok)
{
    /* pgen build: no decoding layer, so plain feof is correct. */
    return feof(tok->fp);
}
157
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    /* pgen build: no decoding is performed; just duplicate the input
       string.  exec_input and tok are intentionally unused here. */
    return new_string(str, strlen(str));
}
163
164 #else /* PGEN */
165
/* Record that decoding failed on TOK and release the line buffer.
   The buffer is freed here (not in PyTokenizer_Free) only for
   file-based input; see PyTokenizer_Free.  Always returns NULL so
   callers can propagate the failure as if EOF had been reached. */
static char *
error_ret(struct tok_state *tok) /* XXX */
{
    tok->decoding_erred = 1;
    if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
        PyMem_FREE(tok->buf);
    tok->buf = NULL;
    return NULL; /* as if it were EOF */
}
175
176
/* Return the canonical form of a coding-cookie name: "utf-8" for any
   utf-8 spelling, "iso-8859-1" for any latin-1 spelling, otherwise S
   itself.  Matching is done on a lowercased copy with '_' mapped to
   '-'; only the first 12 characters of S are significant. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];               /* 12 significant chars + NUL */
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            /* Cast avoids undefined behavior when char is signed and
               c is negative: tolower's argument must be representable
               as unsigned char (or be EOF). */
            buf[i] = tolower((unsigned char)c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
205
/* Return the coding spec in S (a line of SIZE bytes) as a new malloc'ed
   string, or NULL if no coding spec is found or allocation fails. */
207
208 static char *
get_coding_spec(const char * s,Py_ssize_t size)209 get_coding_spec(const char *s, Py_ssize_t size)
210 {
211 Py_ssize_t i;
212 /* Coding spec must be in a comment, and that comment must be
213 * the only statement on the source code line. */
214 for (i = 0; i < size - 6; i++) {
215 if (s[i] == '#')
216 break;
217 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
218 return NULL;
219 }
220 for (; i < size - 6; i++) { /* XXX inefficient search */
221 const char* t = s + i;
222 if (strncmp(t, "coding", 6) == 0) {
223 const char* begin = NULL;
224 t += 6;
225 if (t[0] != ':' && t[0] != '=')
226 continue;
227 do {
228 t++;
229 } while (t[0] == '\x20' || t[0] == '\t');
230
231 begin = t;
232 while (Py_ISALNUM(t[0]) ||
233 t[0] == '-' || t[0] == '_' || t[0] == '.')
234 t++;
235
236 if (begin < t) {
237 char* r = new_string(begin, t - begin);
238 char* q = get_normal_name(r);
239 if (r != q) {
240 PyMem_FREE(r);
241 r = new_string(q, strlen(q));
242 }
243 return r;
244 }
245 }
246 }
247 return NULL;
248 }
249
250 /* Check whether the line contains a coding spec. If it does,
251 invoke the set_readline function for the new encoding.
252 This function receives the tok_state and the new encoding.
253 Return 1 on success, 0 on failure. */
254
static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
{
    char * cs;
    int r = 1;

    if (tok->cont_line) {
        /* It's a continuation line, so it can't be a coding spec. */
        tok->read_coding_spec = 1;
        return 1;
    }
    cs = get_coding_spec(line, size);
    if (!cs) {
        Py_ssize_t i;
        for (i = 0; i < size; i++) {
            if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
                break;
            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
                /* Stop checking coding spec after a line containing
                 * anything except a comment. */
                tok->read_coding_spec = 1;
                break;
            }
        }
    } else {
        tok->read_coding_spec = 1;
        if (tok->encoding == NULL) {
            assert(tok->decoding_state == 1); /* raw */
            if (strcmp(cs, "utf-8") == 0 ||
                strcmp(cs, "iso-8859-1") == 0) {
                /* These two need no codec: the raw bytes can be
                   tokenized directly.  Ownership of cs moves to tok. */
                tok->encoding = cs;
            } else {
#ifdef Py_USING_UNICODE
                r = set_readline(tok, cs);
                if (r) {
                    tok->encoding = cs;         /* tok now owns cs */
                    tok->decoding_state = -1;   /* codec-based reads */
                }
                else {
                    PyErr_Format(PyExc_SyntaxError,
                                 "encoding problem: %s", cs);
                    PyMem_FREE(cs);
                }
#else
                /* Without Unicode support, we cannot
                   process the coding spec. Since there
                   won't be any Unicode literals, that
                   won't matter. */
                PyMem_FREE(cs);
#endif
            }
        } else {                /* then, compare cs with BOM */
            /* A BOM already fixed the encoding; the cookie must agree. */
            r = (strcmp(tok->encoding, cs) == 0);
            if (!r)
                PyErr_Format(PyExc_SyntaxError,
                             "encoding problem: %s with BOM", cs);
            PyMem_FREE(cs);
        }
    }
    return r;
}
317
318 /* See whether the file starts with a BOM. If it does,
319 invoke the set_readline function with the new encoding.
320 Return 1 on success, 0 on failure. */
321
static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
{
    /* Peek at up to three leading bytes through GET_CHAR/UNGET_CHAR.
       Anything that is not a complete UTF-8 BOM is pushed back so the
       tokenizer sees the input unchanged. */
    int ch1, ch2, ch3;
    ch1 = get_char(tok);
    tok->decoding_state = 1;    /* assume raw until a codec is chosen */
    if (ch1 == EOF) {
        return 1;
    } else if (ch1 == 0xEF) {
        ch2 = get_char(tok);
        if (ch2 != 0xBB) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        ch3 = get_char(tok);
        if (ch3 != 0xBF) {
            unget_char(ch3, tok);
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
#if 0
        /* Disable support for UTF-16 BOMs until a decision
           is made whether this needs to be supported. */
    } else if (ch1 == 0xFE) {
        ch2 = get_char(tok);
        if (ch2 != 0xFF) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        if (!set_readline(tok, "utf-16-be"))
            return 0;
        tok->decoding_state = -1;
    } else if (ch1 == 0xFF) {
        ch2 = get_char(tok);
        if (ch2 != 0xFE) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        if (!set_readline(tok, "utf-16-le"))
            return 0;
        tok->decoding_state = -1;
#endif
    } else {
        unget_char(ch1, tok);
        return 1;
    }
    /* A complete UTF-8 BOM was consumed: record the encoding. */
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string("utf-8", 5);     /* resulting is in utf-8 */
    return 1;
}
380
381 /* Read a line of text from TOK into S, using the stream in TOK.
382 Return NULL on failure, else S.
383
384 On entry, tok->decoding_buffer will be one of:
385 1) NULL: need to call tok->decoding_readline to get a new line
386 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
387 stored the result in tok->decoding_buffer
388 3) PyStringObject *: previous call to fp_readl did not have enough room
389 (in the s buffer) to copy entire contents of the line read
390 by tok->decoding_readline. tok->decoding_buffer has the overflow.
391 In this case, fp_readl is called in a loop (with an expanded buffer)
392 until the buffer ends with a '\n' (or until the end of the file is
393 reached): see tok_nextc and its calls to decoding_fgets.
394 */
395
static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode built, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject* utf8 = NULL;
    PyObject* buf = tok->decoding_buffer;
    char *str;
    Py_ssize_t utf8len;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (buf == NULL) {
        /* Case 1 (see the block comment above): fetch a fresh line. */
        buf = PyObject_CallObject(tok->decoding_readline, NULL);
        if (buf == NULL)
            return error_ret(tok);
        if (!PyUnicode_Check(buf)) {
            Py_DECREF(buf);
            PyErr_SetString(PyExc_SyntaxError,
                            "codec did not return a unicode object");
            return error_ret(tok);
        }
    } else {
        /* Case 2 or 3: consume the cached/overflow buffer. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(buf))
            utf8 = buf;     /* the overflow is already UTF-8 bytes */
    }
    if (utf8 == NULL) {
        utf8 = PyUnicode_AsUTF8String(buf);
        Py_DECREF(buf);
        if (utf8 == NULL)
            return error_ret(tok);
    }
    str = PyString_AsString(utf8);
    utf8len = PyString_GET_SIZE(utf8);
    if (utf8len > size) {
        /* Line longer than S: stash the remainder for the next call
           (case 3 above). */
        tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(utf8);
            return error_ret(tok);
        }
        utf8len = size;
    }
    memcpy(s, str, utf8len);
    s[utf8len] = '\0';
    Py_DECREF(utf8);
    if (utf8len == 0)
        return NULL; /* EOF */
    return s;
#endif
}
452
453 /* Set the readline function for TOK to a StreamReader's
454 readline function. The StreamReader is named ENC.
455
456 This function is called from check_bom and check_coding_spec.
457
458 ENC is usually identical to the future value of tok->encoding,
459 except for the (currently unsupported) case of UTF-16.
460
461 Return 1 on success, 0 on failure. */
462
static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    PyObject *reader, *stream, *readline;

    /* Wrap tok->fp in a file object, obtain a codec StreamReader for
       ENC, and keep a reference to the reader's readline method in
       tok->decoding_readline. */
    /* XXX: constify filename argument. */
    stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
    if (stream == NULL)
        return 0;

    reader = PyCodec_StreamReader(enc, stream, NULL);
    Py_DECREF(stream);
    if (reader == NULL)
        return 0;

    readline = PyObject_GetAttrString(reader, "readline");
    Py_DECREF(reader);
    if (readline == NULL)
        return 0;

    tok->decoding_readline = readline;
    return 1;
}
486
487 /* Fetch the next byte from TOK. */
488
static int fp_getc(struct tok_state *tok) {
    return getc(tok->fp);       /* raw byte from the underlying stream */
}
492
493 /* Unfetch the last byte back into TOK. */
494
static void fp_ungetc(int c, struct tok_state *tok) {
    ungetc(c, tok->fp);         /* one byte of pushback is guaranteed */
}
498
499 /* Read a line of input from TOK. Determine encoding
500 if necessary. */
501
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    char *line = NULL;
    int badchar = 0;
    for (;;) {
        if (tok->decoding_state < 0) {
            /* We already have a codec associated with
               this input. */
            line = fp_readl(s, size, tok);
            break;
        } else if (tok->decoding_state > 0) {
            /* We want a 'raw' read. */
            line = Py_UniversalNewlineFgets(s, size,
                                            tok->fp, NULL);
            break;
        } else {
            /* We have not yet determined the encoding.
               If an encoding is found, use the file-pointer
               reader functions from now on. */
            if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
                return error_ret(tok);
            assert(tok->decoding_state != 0);
        }
    }
    /* A coding cookie is only honored on the first two lines. */
    if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
        if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
            return error_ret(tok);
        }
    }
#ifndef PGEN
    /* The default encoding is ASCII, so make sure we don't have any
       non-ASCII bytes in it. */
    if (line && !tok->encoding) {
        unsigned char *c;
        for (c = (unsigned char *)line; *c; c++)
            if (*c > 127) {
                badchar = *c;
                break;
            }
    }
    if (badchar) {
        char buf[500];
        /* Need to add 1 to the line number, since this line
           has not been counted, yet. */
        sprintf(buf,
                "Non-ASCII character '\\x%.2x' "
                "in file %.200s on line %i, "
                "but no encoding declared; "
                "see http://python.org/dev/peps/pep-0263/ for details",
                badchar, tok->filename, tok->lineno + 1);
        PyErr_SetString(PyExc_SyntaxError, buf);
        return error_ret(tok);
    }
#endif
    return line;
}
559
static int
decoding_feof(struct tok_state *tok)
{
    /* End-of-file test that works for both raw input (plain feof) and
       codec-based input; for the latter a lookahead line is fetched
       and cached in tok->decoding_buffer for the next fp_readl call. */
    if (tok->decoding_state >= 0) {
        return feof(tok->fp);
    } else {
        PyObject* buf = tok->decoding_buffer;
        if (buf == NULL) {
            buf = PyObject_CallObject(tok->decoding_readline, NULL);
            if (buf == NULL) {
                error_ret(tok);
                return 1;
            } else {
                tok->decoding_buffer = buf;
            }
        }
        return PyObject_Length(buf) == 0;
    }
}
579
580 /* Fetch a byte from TOK, using the string buffer. */
581
static int
buf_getc(struct tok_state *tok) {
    /* Py_CHARMASK keeps the result in 0..255 even when char is signed. */
    return Py_CHARMASK(*tok->str++);
}
586
587 /* Unfetch a byte from TOK, using the string buffer. */
588
static void
buf_ungetc(int c, struct tok_state *tok) {
    /* Pushback is just stepping the cursor back; verify (debug builds
       only) that C is really the byte being un-read. */
    tok->str--;
    assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
}
594
595 /* Set the readline function for TOK to ENC. For the string-based
596 tokenizer, this means to just record the encoding. */
597
static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    /* No codec machinery needed for string input; just remember ENC. */
    tok->enc = enc;
    return 1;
}
603
604 /* Return a UTF-8 encoding Python string object from the
605 C byte string STR, which is encoded with ENC. */
606
607 #ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
    /* Decode STR from ENC, then re-encode it as UTF-8.  Returns a new
       string object (caller owns the reference) or NULL with an
       exception set on error. */
    PyObject *utf8;
    PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (buf == NULL)
        return NULL;
    utf8 = PyUnicode_AsUTF8String(buf);
    Py_DECREF(buf);
    return utf8;
}
618 #endif
619
620
621 static char *
translate_newlines(const char * s,int exec_input,struct tok_state * tok)622 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
623 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
624 char *buf, *current;
625 char c = '\0';
626 buf = PyMem_MALLOC(needed_length);
627 if (buf == NULL) {
628 tok->done = E_NOMEM;
629 return NULL;
630 }
631 for (current = buf; *s; s++, current++) {
632 c = *s;
633 if (skip_next_lf) {
634 skip_next_lf = 0;
635 if (c == '\n') {
636 c = *++s;
637 if (!c)
638 break;
639 }
640 }
641 if (c == '\r') {
642 skip_next_lf = 1;
643 c = '\n';
644 }
645 *current = c;
646 }
647 /* If this is exec input, add a newline to the end of the string if
648 there isn't one already. */
649 if (exec_input && c != '\n') {
650 *current = '\n';
651 current++;
652 }
653 *current = '\0';
654 final_length = current - buf + 1;
655 if (final_length < needed_length && final_length)
656 /* should never fail */
657 buf = PyMem_REALLOC(buf, final_length);
658 return buf;
659 }
660
661 /* Decode a byte string STR for use as the buffer of TOK.
662 Look for encoding declarations inside STR, and record them
663 inside TOK. */
664
static const char *
decode_str(const char *input, int single, struct tok_state *tok)
{
    /* Decode INPUT for use as the tokenizer buffer: normalize line
       endings, strip a UTF-8 BOM, honor a coding cookie on the first
       two lines, and recode to UTF-8 when needed.  SINGLE is the
       exec-input flag forwarded to translate_newlines.  Returns the
       decoded string (kept alive via tok->input or
       tok->decoding_buffer) or NULL on error. */
    PyObject* utf8 = NULL;
    const char *str;
    const char *s;
    const char *newl[2] = {NULL, NULL};
    int lineno = 0;
    tok->input = str = translate_newlines(input, single, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return error_ret(tok);
    str = tok->str;             /* string after BOM if any */
    assert(str);
#ifdef Py_USING_UNICODE
    if (tok->enc != NULL) {
        /* The BOM named an encoding: recode the buffer to UTF-8. */
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyString_AsString(utf8);
    }
#endif
    /* Locate the first two line breaks: coding cookies are only valid
       on lines 1 and 2. */
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
            return error_ret(tok);
        if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl))
                return error_ret(tok);
        }
    }
#ifdef Py_USING_UNICODE
    if (tok->enc != NULL) {
        /* A coding cookie named an encoding: recode to UTF-8. */
        assert(utf8 == NULL);
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyString_AsString(utf8);
    }
#endif
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}
724
725 #endif /* PGEN */
726
727 /* Set up tokenizer for string */
728
struct tok_state *
PyTokenizer_FromString(const char *str, int exec_input)
{
    /* Create a tokenizer that reads from the NUL-terminated string
       STR.  EXEC_INPUT requests exec-style input (a trailing newline
       is guaranteed).  Returns NULL on error. */
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    str = (char *)decode_str(str, exec_input, tok);
    if (str == NULL) {
        PyTokenizer_Free(tok);
        return NULL;
    }

    /* XXX: constify members. */
    tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
    return tok;
}
745
746
747 /* Set up tokenizer for file */
748
struct tok_state *
PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
{
    /* Create a tokenizer that reads from FP.  PS1/PS2 are the primary
       and secondary interactive prompts (NULL when non-interactive).
       Returns NULL on error. */
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
        PyTokenizer_Free(tok);
        return NULL;
    }
    tok->cur = tok->inp = tok->buf;
    tok->end = tok->buf + BUFSIZ;
    tok->fp = fp;
    tok->prompt = ps1;
    tok->nextprompt = ps2;
    return tok;
}
766
767
768 /* Free a tok_state structure */
769
void
PyTokenizer_Free(struct tok_state *tok)
{
    /* Release all memory owned by TOK, including TOK itself.  The
       line buffer is owned by the tokenizer only for file-based input;
       for string input the allocation lives in tok->input. */
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
#ifndef PGEN
    Py_XDECREF(tok->decoding_readline);
    Py_XDECREF(tok->decoding_buffer);
#endif
    if (tok->fp != NULL && tok->buf != NULL)
        PyMem_FREE(tok->buf);
    if (tok->input)
        PyMem_FREE((char *)tok->input);
    PyMem_FREE(tok);
}
785
786 #if !defined(PGEN) && defined(Py_USING_UNICODE)
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    /* Re-encode the interactive line *INP from sys.stdin's declared
       encoding to UTF-8, replacing *INP (a PyMem allocation) on
       success and recording the encoding in tok->encoding.  Returns 0
       on success (including "nothing to do"), -1 on fatal error with
       tok->done set. */
    PyObject *enc, *sysstdin, *decoded, *utf8;
    const char *encoding;
    char *converted;

    /* Only translate when we are really reading the process's stdin
       through a genuine sys.stdin file object. */
    if (PySys_GetFile((char *)"stdin", NULL) != stdin)
        return 0;
    sysstdin = PySys_GetObject("stdin");
    if (sysstdin == NULL || !PyFile_Check(sysstdin))
        return 0;

    enc = ((PyFileObject *)sysstdin)->f_encoding;
    if (enc == NULL || !PyString_Check(enc))
        return 0;
    Py_INCREF(enc);     /* keep alive across the conversions below */

    encoding = PyString_AsString(enc);
    decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
    if (decoded == NULL)
        goto error_clear;

    utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
    Py_DECREF(decoded);
    if (utf8 == NULL)
        goto error_clear;

    assert(PyString_Check(utf8));
    converted = new_string(PyString_AS_STRING(utf8),
                           PyString_GET_SIZE(utf8));
    Py_DECREF(utf8);
    if (converted == NULL)
        goto error_nomem;

    PyMem_FREE(*inp);
    *inp = converted;
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string(encoding, strlen(encoding));
    if (tok->encoding == NULL)
        goto error_nomem;

    Py_DECREF(enc);
    return 0;

error_nomem:
    Py_DECREF(enc);
    tok->done = E_NOMEM;
    return -1;

error_clear:
    Py_DECREF(enc);
    if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
        tok->done = E_ERROR;
        return -1;
    }
    /* Fallback to iso-8859-1: for backward compatibility */
    PyErr_Clear();
    return 0;
}
848 #endif
849
850 /* Get next char, updating state; error code goes into tok->done */
851
static int
tok_nextc(register struct tok_state *tok)
{
    /* Loop until a character can be returned; each iteration either
       serves a byte from the buffer or refills the buffer from the
       current input source (string, interactive prompt, or file). */
    for (;;) {
        if (tok->cur != tok->inp) {
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        if (tok->done != E_OK)
            return EOF;
        if (tok->fp == NULL) {
            /* String input: tok->inp is advanced one line at a time
               so that tok->lineno can be maintained. */
            char *end = strchr(tok->inp, '\n');
            if (end != NULL)
                end++;          /* include the newline itself */
            else {
                end = strchr(tok->inp, '\0');
                if (end == tok->inp) {
                    tok->done = E_EOF;
                    return EOF;
                }
            }
            if (tok->start == NULL)
                tok->buf = tok->cur;
            tok->line_start = tok->cur;
            tok->lineno++;
            tok->inp = end;
            return Py_CHARMASK(*tok->cur++);
        }
        if (tok->prompt != NULL) {
            /* Interactive input: read one line via PyOS_Readline. */
            char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
            if (tok->nextprompt != NULL)
                tok->prompt = tok->nextprompt;  /* switch ps1 -> ps2 */
            if (newtok == NULL)
                tok->done = E_INTR;     /* readline was interrupted */
            else if (*newtok == '\0') {
                PyMem_FREE(newtok);
                tok->done = E_EOF;      /* empty string signals EOF */
            }
#if !defined(PGEN) && defined(Py_USING_UNICODE)
            else if (tok_stdin_decode(tok, &newtok) != 0)
                PyMem_FREE(newtok);
#endif
            else if (tok->start != NULL) {
                /* A token is in progress: append the new line to the
                   existing buffer, preserving the token's start
                   offset across the realloc. */
                size_t start = tok->start - tok->buf;
                size_t oldlen = tok->cur - tok->buf;
                size_t newlen = oldlen + strlen(newtok);
                char *buf = tok->buf;
                buf = (char *)PyMem_REALLOC(buf, newlen+1);
                tok->lineno++;
                if (buf == NULL) {
                    PyMem_FREE(tok->buf);
                    tok->buf = NULL;
                    PyMem_FREE(newtok);
                    tok->done = E_NOMEM;
                    return EOF;
                }
                tok->buf = buf;
                tok->cur = tok->buf + oldlen;
                tok->line_start = tok->cur;
                strcpy(tok->buf + oldlen, newtok);
                PyMem_FREE(newtok);
                tok->inp = tok->buf + newlen;
                tok->end = tok->inp + 1;
                tok->start = tok->buf + start;
            }
            else {
                /* No token in progress: the fresh line becomes the
                   whole buffer. */
                tok->lineno++;
                if (tok->buf != NULL)
                    PyMem_FREE(tok->buf);
                tok->buf = newtok;
                tok->line_start = tok->buf;
                tok->cur = tok->buf;
                tok->line_start = tok->buf;
                tok->inp = strchr(tok->buf, '\0');
                tok->end = tok->inp + 1;
            }
        }
        else {
            /* File input: read (and possibly decode) lines, growing
               the buffer until a complete '\n'-terminated line (or
               EOF) is in memory. */
            int done = 0;
            Py_ssize_t cur = 0;
            char *pt;
            if (tok->start == NULL) {
                if (tok->buf == NULL) {
                    tok->buf = (char *)
                        PyMem_MALLOC(BUFSIZ);
                    if (tok->buf == NULL) {
                        tok->done = E_NOMEM;
                        return EOF;
                    }
                    tok->end = tok->buf + BUFSIZ;
                }
                if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
                                   tok) == NULL) {
                    tok->done = E_EOF;
                    done = 1;
                }
                else {
                    tok->done = E_OK;
                    tok->inp = strchr(tok->buf, '\0');
                    done = tok->inp[-1] == '\n';
                }
            }
            else {
                /* A token is in progress; remember the read offset so
                   tok->cur can be restored after reallocations. */
                cur = tok->cur - tok->buf;
                if (decoding_feof(tok)) {
                    tok->done = E_EOF;
                    done = 1;
                }
                else
                    tok->done = E_OK;
            }
            tok->lineno++;
            /* Read until '\n' or EOF */
            while (!done) {
                Py_ssize_t curstart = tok->start == NULL ? -1 :
                                      tok->start - tok->buf;
                Py_ssize_t curvalid = tok->inp - tok->buf;
                Py_ssize_t newsize = curvalid + BUFSIZ;
                char *newbuf = tok->buf;
                newbuf = (char *)PyMem_REALLOC(newbuf,
                                               newsize);
                if (newbuf == NULL) {
                    tok->done = E_NOMEM;
                    tok->cur = tok->inp;
                    return EOF;
                }
                tok->buf = newbuf;
                tok->inp = tok->buf + curvalid;
                tok->end = tok->buf + newsize;
                tok->start = curstart < 0 ? NULL :
                             tok->buf + curstart;
                if (decoding_fgets(tok->inp,
                                   (int)(tok->end - tok->inp),
                                   tok) == NULL) {
                    /* Break out early on decoding
                       errors, as tok->buf will be NULL
                    */
                    if (tok->decoding_erred)
                        return EOF;
                    /* Last line does not end in \n,
                       fake one */
                    strcpy(tok->inp, "\n");
                }
                tok->inp = strchr(tok->inp, '\0');
                done = tok->inp[-1] == '\n';
            }
            if (tok->buf != NULL) {
                tok->cur = tok->buf + cur;
                tok->line_start = tok->cur;
                /* replace "\r\n" with "\n" */
                /* For Mac leave the \r, giving a syntax error */
                pt = tok->inp - 2;
                if (pt >= tok->buf && *pt == '\r') {
                    *pt++ = '\n';
                    *pt = '\0';
                    tok->inp = pt;
                }
            }
        }
        if (tok->done != E_OK) {
            if (tok->prompt != NULL)
                PySys_WriteStderr("\n");
            tok->cur = tok->inp;
            return EOF;
        }
    }
    /*NOTREACHED*/
}
1019
1020
1021 /* Back-up one character */
1022
static void
tok_backup(register struct tok_state *tok, register int c)
{
    /* Push character C back onto the input.  C must be the character
       most recently returned by tok_nextc; EOF is a no-op. */
    if (c != EOF) {
        if (--tok->cur < tok->buf)
            Py_FatalError("tok_backup: beginning of buffer");
        /* Only write when needed: for string input tok->cur may point
           into a read-only segment. */
        if (*tok->cur != c)
            *tok->cur = c;
    }
}
1033
1034
1035 /* Return the token corresponding to a single character */
1036
1037 int
PyToken_OneChar(int c)1038 PyToken_OneChar(int c)
1039 {
1040 switch (c) {
1041 case '(': return LPAR;
1042 case ')': return RPAR;
1043 case '[': return LSQB;
1044 case ']': return RSQB;
1045 case ':': return COLON;
1046 case ',': return COMMA;
1047 case ';': return SEMI;
1048 case '+': return PLUS;
1049 case '-': return MINUS;
1050 case '*': return STAR;
1051 case '/': return SLASH;
1052 case '|': return VBAR;
1053 case '&': return AMPER;
1054 case '<': return LESS;
1055 case '>': return GREATER;
1056 case '=': return EQUAL;
1057 case '.': return DOT;
1058 case '%': return PERCENT;
1059 case '`': return BACKQUOTE;
1060 case '{': return LBRACE;
1061 case '}': return RBRACE;
1062 case '^': return CIRCUMFLEX;
1063 case '~': return TILDE;
1064 case '@': return AT;
1065 default: return OP;
1066 }
1067 }
1068
1069
1070 int
PyToken_TwoChars(int c1,int c2)1071 PyToken_TwoChars(int c1, int c2)
1072 {
1073 switch (c1) {
1074 case '=':
1075 switch (c2) {
1076 case '=': return EQEQUAL;
1077 }
1078 break;
1079 case '!':
1080 switch (c2) {
1081 case '=': return NOTEQUAL;
1082 }
1083 break;
1084 case '<':
1085 switch (c2) {
1086 case '>': return NOTEQUAL;
1087 case '=': return LESSEQUAL;
1088 case '<': return LEFTSHIFT;
1089 }
1090 break;
1091 case '>':
1092 switch (c2) {
1093 case '=': return GREATEREQUAL;
1094 case '>': return RIGHTSHIFT;
1095 }
1096 break;
1097 case '+':
1098 switch (c2) {
1099 case '=': return PLUSEQUAL;
1100 }
1101 break;
1102 case '-':
1103 switch (c2) {
1104 case '=': return MINEQUAL;
1105 }
1106 break;
1107 case '*':
1108 switch (c2) {
1109 case '*': return DOUBLESTAR;
1110 case '=': return STAREQUAL;
1111 }
1112 break;
1113 case '/':
1114 switch (c2) {
1115 case '/': return DOUBLESLASH;
1116 case '=': return SLASHEQUAL;
1117 }
1118 break;
1119 case '|':
1120 switch (c2) {
1121 case '=': return VBAREQUAL;
1122 }
1123 break;
1124 case '%':
1125 switch (c2) {
1126 case '=': return PERCENTEQUAL;
1127 }
1128 break;
1129 case '&':
1130 switch (c2) {
1131 case '=': return AMPEREQUAL;
1132 }
1133 break;
1134 case '^':
1135 switch (c2) {
1136 case '=': return CIRCUMFLEXEQUAL;
1137 }
1138 break;
1139 }
1140 return OP;
1141 }
1142
1143 int
PyToken_ThreeChars(int c1,int c2,int c3)1144 PyToken_ThreeChars(int c1, int c2, int c3)
1145 {
1146 switch (c1) {
1147 case '<':
1148 switch (c2) {
1149 case '<':
1150 switch (c3) {
1151 case '=':
1152 return LEFTSHIFTEQUAL;
1153 }
1154 break;
1155 }
1156 break;
1157 case '>':
1158 switch (c2) {
1159 case '>':
1160 switch (c3) {
1161 case '=':
1162 return RIGHTSHIFTEQUAL;
1163 }
1164 break;
1165 }
1166 break;
1167 case '*':
1168 switch (c2) {
1169 case '*':
1170 switch (c3) {
1171 case '=':
1172 return DOUBLESTAREQUAL;
1173 }
1174 break;
1175 }
1176 break;
1177 case '/':
1178 switch (c2) {
1179 case '/':
1180 switch (c3) {
1181 case '=':
1182 return DOUBLESLASHEQUAL;
1183 }
1184 break;
1185 }
1186 break;
1187 }
1188 return OP;
1189 }
1190
static int
indenterror(struct tok_state *tok)
{
    /* Handle inconsistent use of tabs and spaces in indentation.
       Returns 1 if it is a hard error (tok->done set to E_TABSPACE;
       caller should produce ERRORTOKEN), 0 if it was only a warning
       (printed at most once per file). */
    if (tok->alterror) {
        tok->done = E_TABSPACE;
        tok->cur = tok->inp;
        return 1;
    }
    if (tok->altwarning) {
        PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
                          "in indentation\n", tok->filename);
        tok->altwarning = 0;
    }
    return 0;
}
1206
1207 /* Get next token, after space stripping etc. */
1208
/* Get next token, after space stripping etc.
 *
 * On success, sets *p_start/*p_end to the token's text inside the input
 * buffer and returns the token type.  On error, sets tok->done to an
 * E_* code and returns ERRORTOKEN.  Pending INDENT/DEDENT tokens are
 * accumulated in tok->pendin and emitted one per call. */
static int
tok_get(register struct tok_state *tok, char **p_start, char **p_end)
{
    register int c;
    int blankline;

    *p_start = *p_end = NULL;
  nextline:
    tok->start = NULL;
    blankline = 0;

    /* Get indentation level */
    if (tok->atbol) {
        /* col:    indentation column using the regular tab size.
           altcol: same, using the alternate tab size; a mismatch between
           the two stacks signals ambiguous tab/space indentation. */
        register int col = 0;
        register int altcol = 0;
        tok->atbol = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ')
                col++, altcol++;
            else if (c == '\t') {
                col = (col/tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol/tok->alttabsize + 1)
                    * tok->alttabsize;
            }
            else if (c == '\014') /* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            else
                break;
        }
        tok_backup(tok, c);
        if (c == '#' || c == '\n') {
            /* Lines with only whitespace and/or comments
               shouldn't affect the indentation and are
               not passed to the parser as NEWLINE tokens,
               except *totally* empty lines in interactive
               mode, which signal the end of a command group. */
            if (col == 0 && c == '\n' && tok->prompt != NULL)
                blankline = 0; /* Let it through */
            else
                blankline = 1; /* Ignore completely */
            /* We can't jump back right here since we still
               may need to skip to the end of a comment */
        }
        /* Indentation is only significant outside parentheses/brackets
           (tok->level == 0) and on non-blank lines. */
        if (!blankline && tok->level == 0) {
            if (col == tok->indstack[tok->indent]) {
                /* No change */
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
            else if (col > tok->indstack[tok->indent]) {
                /* Indent -- always one */
                if (tok->indent+1 >= MAXINDENT) {
                    tok->done = E_TOODEEP;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol <= tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
                tok->pendin++;
                tok->indstack[++tok->indent] = col;
                tok->altindstack[tok->indent] = altcol;
            }
            else /* col < tok->indstack[tok->indent] */ {
                /* Dedent -- any number, must be consistent */
                while (tok->indent > 0 &&
                    col < tok->indstack[tok->indent]) {
                    tok->pendin--;
                    tok->indent--;
                }
                /* The new column must match some outer level exactly. */
                if (col != tok->indstack[tok->indent]) {
                    tok->done = E_DEDENT;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
        }
    }

    tok->start = tok->cur;

    /* Return pending indents/dedents */
    if (tok->pendin != 0) {
        if (tok->pendin < 0) {
            tok->pendin++;
            return DEDENT;
        }
        else {
            tok->pendin--;
            return INDENT;
        }
    }

 again:
    tok->start = NULL;
    /* Skip spaces */
    do {
        c = tok_nextc(tok);
    } while (c == ' ' || c == '\t' || c == '\014');

    /* Set start of current token */
    tok->start = tok->cur - 1;

    /* Skip comment, while looking for tab-setting magic */
    if (c == '#') {
        static char *tabforms[] = {
            "tab-width:",               /* Emacs */
            ":tabstop=",                /* vim, full form */
            ":ts=",                     /* vim, abbreviated form */
            "set tabsize=",             /* will vi never die? */
        /* more templates can be added here to support other editors */
        };
        char cbuf[80];
        char *tp, **cp;
        tp = cbuf;
        /* Copy up to sizeof(cbuf)-1 chars of the comment to scan for a
           tab-size directive; the rest of the comment is skipped below. */
        do {
            *tp++ = c = tok_nextc(tok);
        } while (c != EOF && c != '\n' &&
                 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
        *tp = '\0';
        for (cp = tabforms;
             cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
             cp++) {
            if ((tp = strstr(cbuf, *cp))) {
                int newsize = atoi(tp + strlen(*cp));

                if (newsize >= 1 && newsize <= 40) {
                    tok->tabsize = newsize;
                    if (Py_VerboseFlag)
                        PySys_WriteStderr(
                        "Tab size set to %d\n",
                        newsize);
                }
            }
        }
        while (c != EOF && c != '\n')
            c = tok_nextc(tok);
    }

    /* Check for EOF and errors now */
    if (c == EOF) {
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    }

    /* Identifier (most frequent token!) */
    if (Py_ISALPHA(c) || c == '_') {
        /* Process r"", u"" and ur"" */
        switch (c) {
        case 'b':
        case 'B':
            c = tok_nextc(tok);
            if (c == 'r' || c == 'R')
                c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        case 'r':
        case 'R':
            c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        case 'u':
        case 'U':
            c = tok_nextc(tok);
            if (c == 'r' || c == 'R')
                c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        }
        while (c != EOF && (Py_ISALNUM(c) || c == '_')) {
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NAME;
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        /* Newlines inside brackets and on blank lines are not tokens. */
        if (blankline || tok->level > 0)
            goto nextline;
        *p_start = tok->start;
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        return NEWLINE;
    }

    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (isdigit(c)) {
            goto fraction;
        }
        else {
            tok_backup(tok, c);
            *p_start = tok->start;
            *p_end = tok->cur;
            return DOT;
        }
    }

    /* Number */
    if (isdigit(c)) {
        if (c == '0') {
            /* Hex, octal or binary -- maybe. */
            c = tok_nextc(tok);
            if (c == '.')
                goto fraction;
#ifndef WITHOUT_COMPLEX
            if (c == 'j' || c == 'J')
                goto imaginary;
#endif
            if (c == 'x' || c == 'X') {

                /* Hex */
                c = tok_nextc(tok);
                if (!isxdigit(c)) {
                    /* "0x" with no digits is a syntax error. */
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while (isxdigit(c));
            }
            else if (c == 'o' || c == 'O') {
                /* Octal */
                c = tok_nextc(tok);
                if (c < '0' || c >= '8') {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while ('0' <= c && c < '8');
            }
            else if (c == 'b' || c == 'B') {
                /* Binary */
                c = tok_nextc(tok);
                if (c != '0' && c != '1') {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while (c == '0' || c == '1');
            }
            else {
                int found_decimal = 0;
                /* Octal; c is first char of it */
                /* There's no 'isoctdigit' macro, sigh */
                while ('0' <= c && c < '8') {
                    c = tok_nextc(tok);
                }
                if (isdigit(c)) {
                    /* 8 or 9 seen: only legal if this turns out to be a
                       float or imaginary literal, not a plain integer. */
                    found_decimal = 1;
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == '.')
                    goto fraction;
                else if (c == 'e' || c == 'E')
                    goto exponent;
#ifndef WITHOUT_COMPLEX
                else if (c == 'j' || c == 'J')
                    goto imaginary;
#endif
                else if (found_decimal) {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
            }
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
        }
        else {
            /* Decimal */
            do {
                c = tok_nextc(tok);
            } while (isdigit(c));
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
            else {
                /* Accept floating point numbers. */
                if (c == '.') {
        fraction:
                    /* Fraction */
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == 'e' || c == 'E') {
                    int e;
                  exponent:
                    e = c;
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-') {
                        c = tok_nextc(tok);
                        if (!isdigit(c)) {
                            tok->done = E_TOKEN;
                            tok_backup(tok, c);
                            return ERRORTOKEN;
                        }
                    } else if (!isdigit(c)) {
                        /* Not an exponent after all (e.g. "1.e" in
                           "1.else"): push back both chars and return
                           the number seen so far. */
                        tok_backup(tok, c);
                        tok_backup(tok, e);
                        *p_start = tok->start;
                        *p_end = tok->cur;
                        return NUMBER;
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
#ifndef WITHOUT_COMPLEX
                if (c == 'j' || c == 'J')
                    /* Imaginary part */
        imaginary:
                    c = tok_nextc(tok);
#endif
            }
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NUMBER;
    }

  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        /* quote2: offset (from tok->start) just past where a second
           quote character would sit, used to detect triple quotes. */
        Py_ssize_t quote2 = tok->cur - tok->start + 1;
        int quote = c;
        int triple = 0;
        int tripcount = 0;  /* consecutive quote chars seen */
        for (;;) {
            c = tok_nextc(tok);
            if (c == '\n') {
                if (!triple) {
                    tok->done = E_EOLS;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                tripcount = 0;
                tok->cont_line = 1; /* multiline string. */
            }
            else if (c == EOF) {
                if (triple)
                    tok->done = E_EOFS;
                else
                    tok->done = E_EOLS;
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
            else if (c == quote) {
                tripcount++;
                if (tok->cur - tok->start == quote2) {
                    /* Second quote immediately after the opener:
                       either an empty string or the start of a
                       triple-quoted string. */
                    c = tok_nextc(tok);
                    if (c == quote) {
                        triple = 1;
                        tripcount = 0;
                        continue;
                    }
                    tok_backup(tok, c);
                }
                if (!triple || tripcount == 3)
                    break;
            }
            else if (c == '\\') {
                /* Escaped char: consume it; an escape never closes
                   a string. */
                tripcount = 0;
                c = tok_nextc(tok);
                if (c == EOF) {
                    tok->done = E_EOLS;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
            }
            else
                tripcount = 0;
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return STRING;
    }

    /* Line continuation */
    if (c == '\\') {
        c = tok_nextc(tok);
        if (c != '\n') {
            tok->done = E_LINECONT;
            tok->cur = tok->inp;
            return ERRORTOKEN;
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int token = PyToken_TwoChars(c, c2);
#ifndef PGEN
        if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
            if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
                                   "<> not supported in 3.x; use !=",
                                   tok->filename, tok->lineno,
                                   NULL, NULL)) {
                return ERRORTOKEN;
            }
        }
#endif
        if (token != OP) {
            /* A two-char token matched; see if a third char extends it
               (e.g. "**" -> "**="). */
            int c3 = tok_nextc(tok);
            int token3 = PyToken_ThreeChars(c, c2, c3);
            if (token3 != OP) {
                token = token3;
            } else {
                tok_backup(tok, c3);
            }
            *p_start = tok->start;
            *p_end = tok->cur;
            return token;
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    switch (c) {
    case '(':
    case '[':
    case '{':
        tok->level++;
        break;
    case ')':
    case ']':
    case '}':
        tok->level--;
        break;
    }

    /* Punctuation character */
    *p_start = tok->start;
    *p_end = tok->cur;
    return PyToken_OneChar(c);
}
1671
1672 int
PyTokenizer_Get(struct tok_state * tok,char ** p_start,char ** p_end)1673 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1674 {
1675 int result = tok_get(tok, p_start, p_end);
1676 if (tok->decoding_erred) {
1677 result = ERRORTOKEN;
1678 tok->done = E_DECODE;
1679 }
1680 return result;
1681 }
1682
1683 /* This function is only called from parsetok. However, it cannot live
1684 there, as it must be empty for PGEN, and we can check for PGEN only
1685 in this file. */
1686
1687 #if defined(PGEN) || !defined(Py_USING_UNICODE)
1688 char*
PyTokenizer_RestoreEncoding(struct tok_state * tok,int len,int * offset)1689 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
1690 {
1691 return NULL;
1692 }
1693 #else
1694 #ifdef Py_USING_UNICODE
1695 static PyObject *
dec_utf8(const char * enc,const char * text,size_t len)1696 dec_utf8(const char *enc, const char *text, size_t len) {
1697 PyObject *ret = NULL;
1698 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1699 if (unicode_text) {
1700 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1701 Py_DECREF(unicode_text);
1702 }
1703 if (!ret) {
1704 PyErr_Clear();
1705 }
1706 return ret;
1707 }
1708 char *
PyTokenizer_RestoreEncoding(struct tok_state * tok,int len,int * offset)1709 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1710 {
1711 char *text = NULL;
1712 if (tok->encoding) {
1713 /* convert source to original encondig */
1714 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1715 if (lineobj != NULL) {
1716 int linelen = PyString_Size(lineobj);
1717 const char *line = PyString_AsString(lineobj);
1718 text = PyObject_MALLOC(linelen + 1);
1719 if (text != NULL && line != NULL) {
1720 if (linelen)
1721 strncpy(text, line, linelen);
1722 text[linelen] = '\0';
1723 }
1724 Py_DECREF(lineobj);
1725
1726 /* adjust error offset */
1727 if (*offset > 1) {
1728 PyObject *offsetobj = dec_utf8(tok->encoding,
1729 tok->buf, *offset-1);
1730 if (offsetobj) {
1731 *offset = PyString_Size(offsetobj) + 1;
1732 Py_DECREF(offsetobj);
1733 }
1734 }
1735
1736 }
1737 }
1738 return text;
1739
1740 }
1741 #endif /* defined(Py_USING_UNICODE) */
1742 #endif
1743
1744
1745 #ifdef Py_DEBUG
1746
1747 void
tok_dump(int type,char * start,char * end)1748 tok_dump(int type, char *start, char *end)
1749 {
1750 printf("%s", _PyParser_TokenNames[type]);
1751 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1752 printf("(%.*s)", (int)(end - start), start);
1753 }
1754
1755 #endif
1756