
/* Tokenizer implementation */

4 #include "Python.h"
5 #include "pgenheaders.h"
6 
7 #include <ctype.h>
8 #include <assert.h>
9 
10 #include "tokenizer.h"
11 #include "errcode.h"
12 
13 #ifndef PGEN
14 #include "unicodeobject.h"
15 #include "stringobject.h"
16 #include "fileobject.h"
17 #include "codecs.h"
18 #include "abstract.h"
19 #include "pydebug.h"
20 #endif /* PGEN */
21 
22 extern char *PyOS_Readline(FILE *, FILE *, char *);
23 /* Return malloc'ed string including trailing \n;
24    empty malloc'ed string for EOF;
25    NULL if interrupted */
26 
27 /* Don't ever change this -- it would break the portability of Python code */
28 #define TABSIZE 8
29 
30 /* Forward */
31 static struct tok_state *tok_new(void);
32 static int tok_nextc(struct tok_state *tok);
33 static void tok_backup(struct tok_state *tok, int c);
34 
/* Token names */

/* Printable name for each token number, indexed by token number.
   This table must match the #defines in token.h! */
char *_PyParser_TokenNames[] = {
    "ENDMARKER",
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "BACKQUOTE",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    "DOUBLESLASH",
    "DOUBLESLASHEQUAL",
    "AT",
    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};

95 /* Create and initialize a new tok_state structure */
96 
97 static struct tok_state *
tok_new(void)98 tok_new(void)
99 {
100     struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
101                                             sizeof(struct tok_state));
102     if (tok == NULL)
103         return NULL;
104     tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
105     tok->done = E_OK;
106     tok->fp = NULL;
107     tok->input = NULL;
108     tok->tabsize = TABSIZE;
109     tok->indent = 0;
110     tok->indstack[0] = 0;
111     tok->atbol = 1;
112     tok->pendin = 0;
113     tok->prompt = tok->nextprompt = NULL;
114     tok->lineno = 0;
115     tok->level = 0;
116     tok->filename = NULL;
117     tok->altwarning = 0;
118     tok->alterror = 0;
119     tok->alttabsize = 1;
120     tok->altindstack[0] = 0;
121     tok->decoding_state = 0;
122     tok->decoding_erred = 0;
123     tok->read_coding_spec = 0;
124     tok->encoding = NULL;
125     tok->cont_line = 0;
126 #ifndef PGEN
127     tok->decoding_readline = NULL;
128     tok->decoding_buffer = NULL;
129 #endif
130     return tok;
131 }
132 
133 static char *
new_string(const char * s,Py_ssize_t len)134 new_string(const char *s, Py_ssize_t len)
135 {
136     char* result = (char *)PyMem_MALLOC(len + 1);
137     if (result != NULL) {
138         memcpy(result, s, len);
139         result[len] = '\0';
140     }
141     return result;
142 }
143 
#ifdef PGEN

146 static char *
decoding_fgets(char * s,int size,struct tok_state * tok)147 decoding_fgets(char *s, int size, struct tok_state *tok)
148 {
149     return fgets(s, size, tok->fp);
150 }
151 
152 static int
decoding_feof(struct tok_state * tok)153 decoding_feof(struct tok_state *tok)
154 {
155     return feof(tok->fp);
156 }
157 
/* PGEN build: no decoding is performed; return a PyMem-allocated copy
   of STR (NULL if the allocation fails).  EXEC_INPUT and TOK are unused
   here and exist only to match the non-PGEN signature. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    return new_string(str, strlen(str));
}

#else /* PGEN */

166 static char *
error_ret(struct tok_state * tok)167 error_ret(struct tok_state *tok) /* XXX */
168 {
169     tok->decoding_erred = 1;
170     if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
171         PyMem_FREE(tok->buf);
172     tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
173     tok->done = E_DECODE;
174     return NULL;                /* as if it were EOF */
175 }
176 
177 
/* Map common spellings of utf-8 and latin-1 to a canonical name.

   Only the first 12 characters of S are examined; they are lower-cased
   and '_' is replaced by '-'.  Returns the static string "utf-8" or
   "iso-8859-1" when S matches one of the known aliases (optionally
   followed by a "-suffix"), otherwise returns S itself unchanged, so
   callers can detect normalization by comparing pointers. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* tolower() requires a value representable as unsigned char;
           a plain char may be negative for bytes >= 0x80. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}

207 /* Return the coding spec in S, or NULL if none is found.  */
208 
209 static char *
get_coding_spec(const char * s,Py_ssize_t size)210 get_coding_spec(const char *s, Py_ssize_t size)
211 {
212     Py_ssize_t i;
213     /* Coding spec must be in a comment, and that comment must be
214      * the only statement on the source code line. */
215     for (i = 0; i < size - 6; i++) {
216         if (s[i] == '#')
217             break;
218         if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
219             return NULL;
220     }
221     for (; i < size - 6; i++) { /* XXX inefficient search */
222         const char* t = s + i;
223         if (strncmp(t, "coding", 6) == 0) {
224             const char* begin = NULL;
225             t += 6;
226             if (t[0] != ':' && t[0] != '=')
227                 continue;
228             do {
229                 t++;
230             } while (t[0] == '\x20' || t[0] == '\t');
231 
232             begin = t;
233             while (Py_ISALNUM(t[0]) ||
234                    t[0] == '-' || t[0] == '_' || t[0] == '.')
235                 t++;
236 
237             if (begin < t) {
238                 char* r = new_string(begin, t - begin);
239                 char* q;
240                 if (!r)
241                     return NULL;
242                 q = get_normal_name(r);
243                 if (r != q) {
244                     PyMem_FREE(r);
245                     r = new_string(q, strlen(q));
246                 }
247                 return r;
248             }
249         }
250     }
251     return NULL;
252 }
253 
254 /* Check whether the line contains a coding spec. If it does,
255    invoke the set_readline function for the new encoding.
256    This function receives the tok_state and the new encoding.
257    Return 1 on success, 0 on failure.  */
258 
259 static int
check_coding_spec(const char * line,Py_ssize_t size,struct tok_state * tok,int set_readline (struct tok_state *,const char *))260 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
261                   int set_readline(struct tok_state *, const char *))
262 {
263     char * cs;
264     int r = 1;
265 
266     if (tok->cont_line) {
267         /* It's a continuation line, so it can't be a coding spec. */
268         tok->read_coding_spec = 1;
269         return 1;
270     }
271     cs = get_coding_spec(line, size);
272     if (!cs) {
273         Py_ssize_t i;
274         for (i = 0; i < size; i++) {
275             if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
276                 break;
277             if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
278                 /* Stop checking coding spec after a line containing
279                  * anything except a comment. */
280                 tok->read_coding_spec = 1;
281                 break;
282             }
283         }
284     } else {
285         tok->read_coding_spec = 1;
286         if (tok->encoding == NULL) {
287             assert(tok->decoding_state == 1); /* raw */
288             if (strcmp(cs, "utf-8") == 0 ||
289                 strcmp(cs, "iso-8859-1") == 0) {
290                 tok->encoding = cs;
291             } else {
292 #ifdef Py_USING_UNICODE
293                 r = set_readline(tok, cs);
294                 if (r) {
295                     tok->encoding = cs;
296                     tok->decoding_state = -1;
297                 }
298                 else {
299                     PyErr_Format(PyExc_SyntaxError,
300                                  "encoding problem: %s", cs);
301                     PyMem_FREE(cs);
302                 }
303 #else
304                 /* Without Unicode support, we cannot
305                    process the coding spec. Since there
306                    won't be any Unicode literals, that
307                    won't matter. */
308                 PyMem_FREE(cs);
309 #endif
310             }
311         } else {                /* then, compare cs with BOM */
312             r = (strcmp(tok->encoding, cs) == 0);
313             if (!r)
314                 PyErr_Format(PyExc_SyntaxError,
315                              "encoding problem: %s with BOM", cs);
316             PyMem_FREE(cs);
317         }
318     }
319     return r;
320 }
321 
322 /* See whether the file starts with a BOM. If it does,
323    invoke the set_readline function with the new encoding.
324    Return 1 on success, 0 on failure.  */
325 
326 static int
check_bom(int get_char (struct tok_state *),void unget_char (int,struct tok_state *),int set_readline (struct tok_state *,const char *),struct tok_state * tok)327 check_bom(int get_char(struct tok_state *),
328           void unget_char(int, struct tok_state *),
329           int set_readline(struct tok_state *, const char *),
330           struct tok_state *tok)
331 {
332     int ch1, ch2, ch3;
333     ch1 = get_char(tok);
334     tok->decoding_state = 1;
335     if (ch1 == EOF) {
336         return 1;
337     } else if (ch1 == 0xEF) {
338         ch2 = get_char(tok);
339         if (ch2 != 0xBB) {
340             unget_char(ch2, tok);
341             unget_char(ch1, tok);
342             return 1;
343         }
344         ch3 = get_char(tok);
345         if (ch3 != 0xBF) {
346             unget_char(ch3, tok);
347             unget_char(ch2, tok);
348             unget_char(ch1, tok);
349             return 1;
350         }
351 #if 0
352     /* Disable support for UTF-16 BOMs until a decision
353        is made whether this needs to be supported.  */
354     } else if (ch1 == 0xFE) {
355         ch2 = get_char(tok);
356         if (ch2 != 0xFF) {
357             unget_char(ch2, tok);
358             unget_char(ch1, tok);
359             return 1;
360         }
361         if (!set_readline(tok, "utf-16-be"))
362             return 0;
363         tok->decoding_state = -1;
364     } else if (ch1 == 0xFF) {
365         ch2 = get_char(tok);
366         if (ch2 != 0xFE) {
367             unget_char(ch2, tok);
368             unget_char(ch1, tok);
369             return 1;
370         }
371         if (!set_readline(tok, "utf-16-le"))
372             return 0;
373         tok->decoding_state = -1;
374 #endif
375     } else {
376         unget_char(ch1, tok);
377         return 1;
378     }
379     if (tok->encoding != NULL)
380         PyMem_FREE(tok->encoding);
381     tok->encoding = new_string("utf-8", 5);     /* resulting is in utf-8 */
382     return 1;
383 }
384 
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   At most SIZE-1 bytes are stored, followed by a terminating NUL.
   NULL is also returned (without an error) when the line read is empty,
   which signals EOF.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
       stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough room
       (in the s buffer) to copy entire contents of the line read
       by tok->decoding_readline.  tok->decoding_buffer has the overflow.
       In this case, fp_readl is called in a loop (with an expanded buffer)
       until the buffer ends with a '\n' (or until the end of the file is
       reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject* utf8 = NULL;
    PyObject* buf = tok->decoding_buffer;
    char *str;
    Py_ssize_t utf8len;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (buf == NULL) {
        buf = PyObject_CallObject(tok->decoding_readline, NULL);
        if (buf == NULL)
            return error_ret(tok);
        if (!PyUnicode_Check(buf)) {
            Py_DECREF(buf);
            PyErr_SetString(PyExc_SyntaxError,
                            "codec did not return a unicode object");
            return error_ret(tok);
        }
    } else {
        /* Take over the reference held by tok->decoding_buffer. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(buf))
            utf8 = buf;
    }
    if (utf8 == NULL) {
        utf8 = PyUnicode_AsUTF8String(buf);
        Py_DECREF(buf);
        if (utf8 == NULL)
            return error_ret(tok);
    }
    str = PyString_AsString(utf8);
    utf8len = PyString_GET_SIZE(utf8);
    if (utf8len > size) {
        /* Line does not fit: stash the remainder for the next call. */
        tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(utf8);
            return error_ret(tok);
        }
        utf8len = size;
    }
    memcpy(s, str, utf8len);
    s[utf8len] = '\0';
    Py_DECREF(utf8);
    if (utf8len == 0)
        return NULL; /* EOF */
    return s;
#endif
}

457 /* Set the readline function for TOK to a StreamReader's
458    readline function. The StreamReader is named ENC.
459 
460    This function is called from check_bom and check_coding_spec.
461 
462    ENC is usually identical to the future value of tok->encoding,
463    except for the (currently unsupported) case of UTF-16.
464 
465    Return 1 on success, 0 on failure. */
466 
467 static int
fp_setreadl(struct tok_state * tok,const char * enc)468 fp_setreadl(struct tok_state *tok, const char* enc)
469 {
470     PyObject *reader, *stream, *readline;
471 
472     /* XXX: constify filename argument. */
473     stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
474     if (stream == NULL)
475         return 0;
476 
477     reader = PyCodec_StreamReader(enc, stream, NULL);
478     Py_DECREF(stream);
479     if (reader == NULL)
480         return 0;
481 
482     readline = PyObject_GetAttrString(reader, "readline");
483     Py_DECREF(reader);
484     if (readline == NULL)
485         return 0;
486 
487     tok->decoding_readline = readline;
488     return 1;
489 }
490 
491 /* Fetch the next byte from TOK. */
492 
fp_getc(struct tok_state * tok)493 static int fp_getc(struct tok_state *tok) {
494     return getc(tok->fp);
495 }
496 
497 /* Unfetch the last byte back into TOK.  */
498 
fp_ungetc(int c,struct tok_state * tok)499 static void fp_ungetc(int c, struct tok_state *tok) {
500     ungetc(c, tok->fp);
501 }
502 
503 /* Read a line of input from TOK. Determine encoding
504    if necessary.  */
505 
506 static char *
decoding_fgets(char * s,int size,struct tok_state * tok)507 decoding_fgets(char *s, int size, struct tok_state *tok)
508 {
509     char *line = NULL;
510     int badchar = 0;
511     for (;;) {
512         if (tok->decoding_state < 0) {
513             /* We already have a codec associated with
514                this input. */
515             line = fp_readl(s, size, tok);
516             break;
517         } else if (tok->decoding_state > 0) {
518             /* We want a 'raw' read. */
519             line = Py_UniversalNewlineFgets(s, size,
520                                             tok->fp, NULL);
521             break;
522         } else {
523             /* We have not yet determined the encoding.
524                If an encoding is found, use the file-pointer
525                reader functions from now on. */
526             if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
527                 return error_ret(tok);
528             assert(tok->decoding_state != 0);
529         }
530     }
531     if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
532         if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
533             return error_ret(tok);
534         }
535     }
536 #ifndef PGEN
537     /* The default encoding is ASCII, so make sure we don't have any
538        non-ASCII bytes in it. */
539     if (line && !tok->encoding) {
540         unsigned char *c;
541         for (c = (unsigned char *)line; *c; c++)
542             if (*c > 127) {
543                 badchar = *c;
544                 break;
545             }
546     }
547     if (badchar) {
548         char buf[500];
549         /* Need to add 1 to the line number, since this line
550            has not been counted, yet.  */
551         sprintf(buf,
552             "Non-ASCII character '\\x%.2x' "
553             "in file %.200s on line %i, "
554             "but no encoding declared; "
555             "see http://python.org/dev/peps/pep-0263/ for details",
556             badchar, tok->filename, tok->lineno + 1);
557         PyErr_SetString(PyExc_SyntaxError, buf);
558         return error_ret(tok);
559     }
560 #endif
561     return line;
562 }
563 
564 static int
decoding_feof(struct tok_state * tok)565 decoding_feof(struct tok_state *tok)
566 {
567     if (tok->decoding_state >= 0) {
568         return feof(tok->fp);
569     } else {
570         PyObject* buf = tok->decoding_buffer;
571         if (buf == NULL) {
572             buf = PyObject_CallObject(tok->decoding_readline, NULL);
573             if (buf == NULL) {
574                 error_ret(tok);
575                 return 1;
576             } else {
577                 tok->decoding_buffer = buf;
578             }
579         }
580         return PyObject_Length(buf) == 0;
581     }
582 }
583 
584 /* Fetch a byte from TOK, using the string buffer. */
585 
586 static int
buf_getc(struct tok_state * tok)587 buf_getc(struct tok_state *tok) {
588     return Py_CHARMASK(*tok->str++);
589 }
590 
591 /* Unfetch a byte from TOK, using the string buffer. */
592 
593 static void
buf_ungetc(int c,struct tok_state * tok)594 buf_ungetc(int c, struct tok_state *tok) {
595     tok->str--;
596     assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
597 }
598 
599 /* Set the readline function for TOK to ENC. For the string-based
600    tokenizer, this means to just record the encoding. */
601 
602 static int
buf_setreadl(struct tok_state * tok,const char * enc)603 buf_setreadl(struct tok_state *tok, const char* enc) {
604     tok->enc = enc;
605     return 1;
606 }
607 
/* Return a UTF-8 encoding Python string object from the
   C byte string STR, which is encoded with ENC.
   Returns a new reference, or NULL if decoding or re-encoding fails. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
    PyObject *utf8;
    PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (buf == NULL)
        return NULL;
    utf8 = PyUnicode_AsUTF8String(buf);
    Py_DECREF(buf);
    return utf8;
}
#endif


625 static char *
translate_newlines(const char * s,int exec_input,struct tok_state * tok)626 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
627     int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
628     char *buf, *current;
629     char c = '\0';
630     buf = PyMem_MALLOC(needed_length);
631     if (buf == NULL) {
632         tok->done = E_NOMEM;
633         return NULL;
634     }
635     for (current = buf; *s; s++, current++) {
636         c = *s;
637         if (skip_next_lf) {
638             skip_next_lf = 0;
639             if (c == '\n') {
640                 c = *++s;
641                 if (!c)
642                     break;
643             }
644         }
645         if (c == '\r') {
646             skip_next_lf = 1;
647             c = '\n';
648         }
649         *current = c;
650     }
651     /* If this is exec input, add a newline to the end of the string if
652        there isn't one already. */
653     if (exec_input && c != '\n') {
654         *current = '\n';
655         current++;
656     }
657     *current = '\0';
658     final_length = current - buf + 1;
659     if (final_length < needed_length && final_length)
660         /* should never fail */
661         buf = PyMem_REALLOC(buf, final_length);
662     return buf;
663 }
664 
665 /* Decode a byte string STR for use as the buffer of TOK.
666    Look for encoding declarations inside STR, and record them
667    inside TOK.  */
668 
669 static const char *
decode_str(const char * input,int single,struct tok_state * tok)670 decode_str(const char *input, int single, struct tok_state *tok)
671 {
672     PyObject* utf8 = NULL;
673     const char *str;
674     const char *s;
675     const char *newl[2] = {NULL, NULL};
676     int lineno = 0;
677     tok->input = str = translate_newlines(input, single, tok);
678     if (str == NULL)
679         return NULL;
680     tok->enc = NULL;
681     tok->str = str;
682     if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
683         return error_ret(tok);
684     str = tok->str;             /* string after BOM if any */
685     assert(str);
686 #ifdef Py_USING_UNICODE
687     if (tok->enc != NULL) {
688         utf8 = translate_into_utf8(str, tok->enc);
689         if (utf8 == NULL)
690             return error_ret(tok);
691         str = PyString_AsString(utf8);
692     }
693 #endif
694     for (s = str;; s++) {
695         if (*s == '\0') break;
696         else if (*s == '\n') {
697             assert(lineno < 2);
698             newl[lineno] = s;
699             lineno++;
700             if (lineno == 2) break;
701         }
702     }
703     tok->enc = NULL;
704     /* need to check line 1 and 2 separately since check_coding_spec
705        assumes a single line as input */
706     if (newl[0]) {
707         if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
708             return error_ret(tok);
709         if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
710             if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
711                                    tok, buf_setreadl))
712                 return error_ret(tok);
713         }
714     }
715 #ifdef Py_USING_UNICODE
716     if (tok->enc != NULL) {
717         assert(utf8 == NULL);
718         utf8 = translate_into_utf8(str, tok->enc);
719         if (utf8 == NULL)
720             return error_ret(tok);
721         str = PyString_AsString(utf8);
722     }
723 #endif
724     assert(tok->decoding_buffer == NULL);
725     tok->decoding_buffer = utf8; /* CAUTION */
726     return str;
727 }
728 
#endif /* PGEN */

731 /* Set up tokenizer for string */
732 
733 struct tok_state *
PyTokenizer_FromString(const char * str,int exec_input)734 PyTokenizer_FromString(const char *str, int exec_input)
735 {
736     struct tok_state *tok = tok_new();
737     if (tok == NULL)
738         return NULL;
739     str = (char *)decode_str(str, exec_input, tok);
740     if (str == NULL) {
741         PyTokenizer_Free(tok);
742         return NULL;
743     }
744 
745     /* XXX: constify members. */
746     tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
747     return tok;
748 }
749 
750 
751 /* Set up tokenizer for file */
752 
753 struct tok_state *
PyTokenizer_FromFile(FILE * fp,char * ps1,char * ps2)754 PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
755 {
756     struct tok_state *tok = tok_new();
757     if (tok == NULL)
758         return NULL;
759     if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
760         PyTokenizer_Free(tok);
761         return NULL;
762     }
763     tok->cur = tok->inp = tok->buf;
764     tok->end = tok->buf + BUFSIZ;
765     tok->fp = fp;
766     tok->prompt = ps1;
767     tok->nextprompt = ps2;
768     return tok;
769 }
770 
771 
772 /* Free a tok_state structure */
773 
774 void
PyTokenizer_Free(struct tok_state * tok)775 PyTokenizer_Free(struct tok_state *tok)
776 {
777     if (tok->encoding != NULL)
778         PyMem_FREE(tok->encoding);
779 #ifndef PGEN
780     Py_XDECREF(tok->decoding_readline);
781     Py_XDECREF(tok->decoding_buffer);
782 #endif
783     if (tok->fp != NULL && tok->buf != NULL)
784         PyMem_FREE(tok->buf);
785     if (tok->input)
786         PyMem_FREE((char *)tok->input);
787     PyMem_FREE(tok);
788 }
789 
#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Re-encode a line read interactively from stdin to UTF-8.

   *INP is a PyMem-allocated line.  If sys.stdin is the C stdin and has a
   string f_encoding, the line is decoded with that encoding and *INP is
   replaced by a newly allocated UTF-8 copy (the old buffer is freed);
   tok->encoding is set to a copy of the encoding name.

   Returns 0 on success and also when no conversion is done (no usable
   stdin encoding, or a UnicodeDecodeError, which is cleared).  Returns
   -1 with tok->done set to E_NOMEM or E_ERROR otherwise. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *enc, *sysstdin, *decoded, *utf8;
    const char *encoding;
    char *converted;

    if (PySys_GetFile((char *)"stdin", NULL) != stdin)
        return 0;
    sysstdin = PySys_GetObject("stdin");
    if (sysstdin == NULL || !PyFile_Check(sysstdin))
        return 0;

    enc = ((PyFileObject *)sysstdin)->f_encoding;
    if (enc == NULL || !PyString_Check(enc))
        return 0;
    /* Hold our own reference while the name is in use. */
    Py_INCREF(enc);

    encoding = PyString_AsString(enc);
    decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
    if (decoded == NULL)
        goto error_clear;

    utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
    Py_DECREF(decoded);
    if (utf8 == NULL)
        goto error_clear;

    assert(PyString_Check(utf8));
    converted = new_string(PyString_AS_STRING(utf8),
                           PyString_GET_SIZE(utf8));
    Py_DECREF(utf8);
    if (converted == NULL)
        goto error_nomem;

    PyMem_FREE(*inp);
    *inp = converted;
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string(encoding, strlen(encoding));
    if (tok->encoding == NULL)
        goto error_nomem;

    Py_DECREF(enc);
    return 0;

error_nomem:
    Py_DECREF(enc);
    tok->done = E_NOMEM;
    return -1;

error_clear:
    Py_DECREF(enc);
    if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
        tok->done = E_ERROR;
        return -1;
    }
    /* Fallback to iso-8859-1: for backward compatibility */
    PyErr_Clear();
    return 0;
}
#endif

854 /* Get next char, updating state; error code goes into tok->done */
855 
856 static int
tok_nextc(register struct tok_state * tok)857 tok_nextc(register struct tok_state *tok)
858 {
859     for (;;) {
860         if (tok->cur != tok->inp) {
861             return Py_CHARMASK(*tok->cur++); /* Fast path */
862         }
863         if (tok->done != E_OK)
864             return EOF;
865         if (tok->fp == NULL) {
866             char *end = strchr(tok->inp, '\n');
867             if (end != NULL)
868                 end++;
869             else {
870                 end = strchr(tok->inp, '\0');
871                 if (end == tok->inp) {
872                     tok->done = E_EOF;
873                     return EOF;
874                 }
875             }
876             if (tok->start == NULL)
877                 tok->buf = tok->cur;
878             tok->line_start = tok->cur;
879             tok->lineno++;
880             tok->inp = end;
881             return Py_CHARMASK(*tok->cur++);
882         }
883         if (tok->prompt != NULL) {
884             char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
885             if (tok->nextprompt != NULL)
886                 tok->prompt = tok->nextprompt;
887             if (newtok == NULL)
888                 tok->done = E_INTR;
889             else if (*newtok == '\0') {
890                 PyMem_FREE(newtok);
891                 tok->done = E_EOF;
892             }
893 #if !defined(PGEN) && defined(Py_USING_UNICODE)
894             else if (tok_stdin_decode(tok, &newtok) != 0)
895                 PyMem_FREE(newtok);
896 #endif
897             else if (tok->start != NULL) {
898                 size_t start = tok->start - tok->buf;
899                 size_t oldlen = tok->cur - tok->buf;
900                 size_t newlen = oldlen + strlen(newtok);
901                 char *buf = tok->buf;
902                 buf = (char *)PyMem_REALLOC(buf, newlen+1);
903                 tok->lineno++;
904                 if (buf == NULL) {
905                     PyMem_FREE(tok->buf);
906                     tok->buf = NULL;
907                     PyMem_FREE(newtok);
908                     tok->done = E_NOMEM;
909                     return EOF;
910                 }
911                 tok->buf = buf;
912                 tok->cur = tok->buf + oldlen;
913                 tok->line_start = tok->cur;
914                 strcpy(tok->buf + oldlen, newtok);
915                 PyMem_FREE(newtok);
916                 tok->inp = tok->buf + newlen;
917                 tok->end = tok->inp + 1;
918                 tok->start = tok->buf + start;
919             }
920             else {
921                 tok->lineno++;
922                 if (tok->buf != NULL)
923                     PyMem_FREE(tok->buf);
924                 tok->buf = newtok;
925                 tok->cur = tok->buf;
926                 tok->line_start = tok->buf;
927                 tok->inp = strchr(tok->buf, '\0');
928                 tok->end = tok->inp + 1;
929             }
930         }
931         else {
932             int done = 0;
933             Py_ssize_t cur = 0;
934             char *pt;
935             if (tok->start == NULL) {
936                 if (tok->buf == NULL) {
937                     tok->buf = (char *)
938                         PyMem_MALLOC(BUFSIZ);
939                     if (tok->buf == NULL) {
940                         tok->done = E_NOMEM;
941                         return EOF;
942                     }
943                     tok->end = tok->buf + BUFSIZ;
944                 }
945                 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
946                           tok) == NULL) {
947                     if (!tok->decoding_erred)
948                         tok->done = E_EOF;
949                     done = 1;
950                 }
951                 else {
952                     tok->done = E_OK;
953                     tok->inp = strchr(tok->buf, '\0');
954                     done = tok->inp == tok->buf || tok->inp[-1] == '\n';
955                 }
956             }
957             else {
958                 cur = tok->cur - tok->buf;
959                 if (decoding_feof(tok)) {
960                     tok->done = E_EOF;
961                     done = 1;
962                 }
963                 else
964                     tok->done = E_OK;
965             }
966             tok->lineno++;
967             /* Read until '\n' or EOF */
968             while (!done) {
969                 Py_ssize_t curstart = tok->start == NULL ? -1 :
970                           tok->start - tok->buf;
971                 Py_ssize_t curvalid = tok->inp - tok->buf;
972                 Py_ssize_t newsize = curvalid + BUFSIZ;
973                 char *newbuf = tok->buf;
974                 newbuf = (char *)PyMem_REALLOC(newbuf,
975                                                newsize);
976                 if (newbuf == NULL) {
977                     tok->done = E_NOMEM;
978                     tok->cur = tok->inp;
979                     return EOF;
980                 }
981                 tok->buf = newbuf;
982                 tok->cur = tok->buf + cur;
983                 tok->line_start = tok->cur;
984                 tok->inp = tok->buf + curvalid;
985                 tok->end = tok->buf + newsize;
986                 tok->start = curstart < 0 ? NULL :
987                          tok->buf + curstart;
988                 if (decoding_fgets(tok->inp,
989                                (int)(tok->end - tok->inp),
990                                tok) == NULL) {
991                     /* Break out early on decoding
992                        errors, as tok->buf will be NULL
993                      */
994                     if (tok->decoding_erred)
995                         return EOF;
996                     /* Last line does not end in \n,
997                        fake one */
998                     strcpy(tok->inp, "\n");
999                 }
1000                 tok->inp = strchr(tok->inp, '\0');
1001                 done = tok->inp[-1] == '\n';
1002             }
1003             if (tok->buf != NULL) {
1004                 tok->cur = tok->buf + cur;
1005                 tok->line_start = tok->cur;
1006                 /* replace "\r\n" with "\n" */
1007                 /* For Mac leave the \r, giving a syntax error */
1008                 pt = tok->inp - 2;
1009                 if (pt >= tok->buf && *pt == '\r') {
1010                     *pt++ = '\n';
1011                     *pt = '\0';
1012                     tok->inp = pt;
1013                 }
1014             }
1015         }
1016         if (tok->done != E_OK) {
1017             if (tok->prompt != NULL)
1018                 PySys_WriteStderr("\n");
1019             tok->cur = tok->inp;
1020             return EOF;
1021         }
1022     }
1023     /*NOTREACHED*/
1024 }
1025 
1026 
1027 /* Back-up one character */
1028 
1029 static void
tok_backup(register struct tok_state * tok,register int c)1030 tok_backup(register struct tok_state *tok, register int c)
1031 {
1032     if (c != EOF) {
1033         if (--tok->cur < tok->buf)
1034             Py_FatalError("tok_backup: beginning of buffer");
1035         if (*tok->cur != c)
1036             *tok->cur = c;
1037     }
1038 }
1039 
1040 
1041 /* Return the token corresponding to a single character */
1042 
1043 int
PyToken_OneChar(int c)1044 PyToken_OneChar(int c)
1045 {
1046     switch (c) {
1047     case '(':           return LPAR;
1048     case ')':           return RPAR;
1049     case '[':           return LSQB;
1050     case ']':           return RSQB;
1051     case ':':           return COLON;
1052     case ',':           return COMMA;
1053     case ';':           return SEMI;
1054     case '+':           return PLUS;
1055     case '-':           return MINUS;
1056     case '*':           return STAR;
1057     case '/':           return SLASH;
1058     case '|':           return VBAR;
1059     case '&':           return AMPER;
1060     case '<':           return LESS;
1061     case '>':           return GREATER;
1062     case '=':           return EQUAL;
1063     case '.':           return DOT;
1064     case '%':           return PERCENT;
1065     case '`':           return BACKQUOTE;
1066     case '{':           return LBRACE;
1067     case '}':           return RBRACE;
1068     case '^':           return CIRCUMFLEX;
1069     case '~':           return TILDE;
1070     case '@':       return AT;
1071     default:            return OP;
1072     }
1073 }
1074 
1075 
1076 int
PyToken_TwoChars(int c1,int c2)1077 PyToken_TwoChars(int c1, int c2)
1078 {
1079     switch (c1) {
1080     case '=':
1081         switch (c2) {
1082         case '=':               return EQEQUAL;
1083         }
1084         break;
1085     case '!':
1086         switch (c2) {
1087         case '=':               return NOTEQUAL;
1088         }
1089         break;
1090     case '<':
1091         switch (c2) {
1092         case '>':               return NOTEQUAL;
1093         case '=':               return LESSEQUAL;
1094         case '<':               return LEFTSHIFT;
1095         }
1096         break;
1097     case '>':
1098         switch (c2) {
1099         case '=':               return GREATEREQUAL;
1100         case '>':               return RIGHTSHIFT;
1101         }
1102         break;
1103     case '+':
1104         switch (c2) {
1105         case '=':               return PLUSEQUAL;
1106         }
1107         break;
1108     case '-':
1109         switch (c2) {
1110         case '=':               return MINEQUAL;
1111         }
1112         break;
1113     case '*':
1114         switch (c2) {
1115         case '*':               return DOUBLESTAR;
1116         case '=':               return STAREQUAL;
1117         }
1118         break;
1119     case '/':
1120         switch (c2) {
1121         case '/':               return DOUBLESLASH;
1122         case '=':               return SLASHEQUAL;
1123         }
1124         break;
1125     case '|':
1126         switch (c2) {
1127         case '=':               return VBAREQUAL;
1128         }
1129         break;
1130     case '%':
1131         switch (c2) {
1132         case '=':               return PERCENTEQUAL;
1133         }
1134         break;
1135     case '&':
1136         switch (c2) {
1137         case '=':               return AMPEREQUAL;
1138         }
1139         break;
1140     case '^':
1141         switch (c2) {
1142         case '=':               return CIRCUMFLEXEQUAL;
1143         }
1144         break;
1145     }
1146     return OP;
1147 }
1148 
1149 int
PyToken_ThreeChars(int c1,int c2,int c3)1150 PyToken_ThreeChars(int c1, int c2, int c3)
1151 {
1152     switch (c1) {
1153     case '<':
1154         switch (c2) {
1155         case '<':
1156             switch (c3) {
1157             case '=':
1158                 return LEFTSHIFTEQUAL;
1159             }
1160             break;
1161         }
1162         break;
1163     case '>':
1164         switch (c2) {
1165         case '>':
1166             switch (c3) {
1167             case '=':
1168                 return RIGHTSHIFTEQUAL;
1169             }
1170             break;
1171         }
1172         break;
1173     case '*':
1174         switch (c2) {
1175         case '*':
1176             switch (c3) {
1177             case '=':
1178                 return DOUBLESTAREQUAL;
1179             }
1180             break;
1181         }
1182         break;
1183     case '/':
1184         switch (c2) {
1185         case '/':
1186             switch (c3) {
1187             case '=':
1188                 return DOUBLESLASHEQUAL;
1189             }
1190             break;
1191         }
1192         break;
1193     }
1194     return OP;
1195 }
1196 
1197 static int
indenterror(struct tok_state * tok)1198 indenterror(struct tok_state *tok)
1199 {
1200     if (tok->alterror) {
1201         tok->done = E_TABSPACE;
1202         tok->cur = tok->inp;
1203         return 1;
1204     }
1205     if (tok->altwarning) {
1206         PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1207                           "in indentation\n", tok->filename);
1208         tok->altwarning = 0;
1209     }
1210     return 0;
1211 }
1212 
1213 /* Get next token, after space stripping etc. */
1214 
1215 static int
tok_get(register struct tok_state * tok,char ** p_start,char ** p_end)1216 tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1217 {
1218     register int c;
1219     int blankline;
1220 
1221     *p_start = *p_end = NULL;
1222   nextline:
1223     tok->start = NULL;
1224     blankline = 0;
1225 
1226     /* Get indentation level */
1227     if (tok->atbol) {
1228         register int col = 0;
1229         register int altcol = 0;
1230         tok->atbol = 0;
1231         for (;;) {
1232             c = tok_nextc(tok);
1233             if (c == ' ')
1234                 col++, altcol++;
1235             else if (c == '\t') {
1236                 col = (col/tok->tabsize + 1) * tok->tabsize;
1237                 altcol = (altcol/tok->alttabsize + 1)
1238                     * tok->alttabsize;
1239             }
1240             else if (c == '\014') /* Control-L (formfeed) */
1241                 col = altcol = 0; /* For Emacs users */
1242             else
1243                 break;
1244         }
1245         tok_backup(tok, c);
1246         if (c == '#' || c == '\n') {
1247             /* Lines with only whitespace and/or comments
1248                shouldn't affect the indentation and are
1249                not passed to the parser as NEWLINE tokens,
1250                except *totally* empty lines in interactive
1251                mode, which signal the end of a command group. */
1252             if (col == 0 && c == '\n' && tok->prompt != NULL)
1253                 blankline = 0; /* Let it through */
1254             else
1255                 blankline = 1; /* Ignore completely */
1256             /* We can't jump back right here since we still
1257                may need to skip to the end of a comment */
1258         }
1259         if (!blankline && tok->level == 0) {
1260             if (col == tok->indstack[tok->indent]) {
1261                 /* No change */
1262                 if (altcol != tok->altindstack[tok->indent]) {
1263                     if (indenterror(tok))
1264                         return ERRORTOKEN;
1265                 }
1266             }
1267             else if (col > tok->indstack[tok->indent]) {
1268                 /* Indent -- always one */
1269                 if (tok->indent+1 >= MAXINDENT) {
1270                     tok->done = E_TOODEEP;
1271                     tok->cur = tok->inp;
1272                     return ERRORTOKEN;
1273                 }
1274                 if (altcol <= tok->altindstack[tok->indent]) {
1275                     if (indenterror(tok))
1276                         return ERRORTOKEN;
1277                 }
1278                 tok->pendin++;
1279                 tok->indstack[++tok->indent] = col;
1280                 tok->altindstack[tok->indent] = altcol;
1281             }
1282             else /* col < tok->indstack[tok->indent] */ {
1283                 /* Dedent -- any number, must be consistent */
1284                 while (tok->indent > 0 &&
1285                     col < tok->indstack[tok->indent]) {
1286                     tok->pendin--;
1287                     tok->indent--;
1288                 }
1289                 if (col != tok->indstack[tok->indent]) {
1290                     tok->done = E_DEDENT;
1291                     tok->cur = tok->inp;
1292                     return ERRORTOKEN;
1293                 }
1294                 if (altcol != tok->altindstack[tok->indent]) {
1295                     if (indenterror(tok))
1296                         return ERRORTOKEN;
1297                 }
1298             }
1299         }
1300     }
1301 
1302     tok->start = tok->cur;
1303 
1304     /* Return pending indents/dedents */
1305     if (tok->pendin != 0) {
1306         if (tok->pendin < 0) {
1307             tok->pendin++;
1308             return DEDENT;
1309         }
1310         else {
1311             tok->pendin--;
1312             return INDENT;
1313         }
1314     }
1315 
1316  again:
1317     tok->start = NULL;
1318     /* Skip spaces */
1319     do {
1320         c = tok_nextc(tok);
1321     } while (c == ' ' || c == '\t' || c == '\014');
1322 
1323     /* Set start of current token */
1324     tok->start = tok->cur - 1;
1325 
1326     /* Skip comment, while looking for tab-setting magic */
1327     if (c == '#') {
1328         static char *tabforms[] = {
1329             "tab-width:",                       /* Emacs */
1330             ":tabstop=",                        /* vim, full form */
1331             ":ts=",                             /* vim, abbreviated form */
1332             "set tabsize=",                     /* will vi never die? */
1333         /* more templates can be added here to support other editors */
1334         };
1335         char cbuf[80];
1336         char *tp, **cp;
1337         tp = cbuf;
1338         do {
1339             *tp++ = c = tok_nextc(tok);
1340         } while (c != EOF && c != '\n' &&
1341                  (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1342         *tp = '\0';
1343         for (cp = tabforms;
1344              cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1345              cp++) {
1346             if ((tp = strstr(cbuf, *cp))) {
1347                 int newsize = atoi(tp + strlen(*cp));
1348 
1349                 if (newsize >= 1 && newsize <= 40) {
1350                     tok->tabsize = newsize;
1351                     if (Py_VerboseFlag)
1352                         PySys_WriteStderr(
1353                         "Tab size set to %d\n",
1354                         newsize);
1355                 }
1356             }
1357         }
1358         while (c != EOF && c != '\n')
1359             c = tok_nextc(tok);
1360     }
1361 
1362     /* Check for EOF and errors now */
1363     if (c == EOF) {
1364         return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1365     }
1366 
1367     /* Identifier (most frequent token!) */
1368     if (Py_ISALPHA(c) || c == '_') {
1369         /* Process r"", u"" and ur"" */
1370         switch (c) {
1371         case 'b':
1372         case 'B':
1373             c = tok_nextc(tok);
1374             if (c == 'r' || c == 'R')
1375                 c = tok_nextc(tok);
1376             if (c == '"' || c == '\'')
1377                 goto letter_quote;
1378             break;
1379         case 'r':
1380         case 'R':
1381             c = tok_nextc(tok);
1382             if (c == '"' || c == '\'')
1383                 goto letter_quote;
1384             break;
1385         case 'u':
1386         case 'U':
1387             c = tok_nextc(tok);
1388             if (c == 'r' || c == 'R')
1389                 c = tok_nextc(tok);
1390             if (c == '"' || c == '\'')
1391                 goto letter_quote;
1392             break;
1393         }
1394         while (c != EOF && (Py_ISALNUM(c) || c == '_')) {
1395             c = tok_nextc(tok);
1396         }
1397         tok_backup(tok, c);
1398         *p_start = tok->start;
1399         *p_end = tok->cur;
1400         return NAME;
1401     }
1402 
1403     /* Newline */
1404     if (c == '\n') {
1405         tok->atbol = 1;
1406         if (blankline || tok->level > 0)
1407             goto nextline;
1408         *p_start = tok->start;
1409         *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1410         tok->cont_line = 0;
1411         return NEWLINE;
1412     }
1413 
1414     /* Period or number starting with period? */
1415     if (c == '.') {
1416         c = tok_nextc(tok);
1417         if (isdigit(c)) {
1418             goto fraction;
1419         }
1420         else {
1421             tok_backup(tok, c);
1422             *p_start = tok->start;
1423             *p_end = tok->cur;
1424             return DOT;
1425         }
1426     }
1427 
1428     /* Number */
1429     if (isdigit(c)) {
1430         if (c == '0') {
1431             /* Hex, octal or binary -- maybe. */
1432             c = tok_nextc(tok);
1433             if (c == '.')
1434                 goto fraction;
1435 #ifndef WITHOUT_COMPLEX
1436             if (c == 'j' || c == 'J')
1437                 goto imaginary;
1438 #endif
1439             if (c == 'x' || c == 'X') {
1440 
1441                 /* Hex */
1442                 c = tok_nextc(tok);
1443                 if (!isxdigit(c)) {
1444                     tok->done = E_TOKEN;
1445                     tok_backup(tok, c);
1446                     return ERRORTOKEN;
1447                 }
1448                 do {
1449                     c = tok_nextc(tok);
1450                 } while (isxdigit(c));
1451             }
1452             else if (c == 'o' || c == 'O') {
1453                 /* Octal */
1454                 c = tok_nextc(tok);
1455                 if (c < '0' || c >= '8') {
1456                     tok->done = E_TOKEN;
1457                     tok_backup(tok, c);
1458                     return ERRORTOKEN;
1459                 }
1460                 do {
1461                     c = tok_nextc(tok);
1462                 } while ('0' <= c && c < '8');
1463             }
1464             else if (c == 'b' || c == 'B') {
1465                 /* Binary */
1466                 c = tok_nextc(tok);
1467                 if (c != '0' && c != '1') {
1468                     tok->done = E_TOKEN;
1469                     tok_backup(tok, c);
1470                     return ERRORTOKEN;
1471                 }
1472                 do {
1473                     c = tok_nextc(tok);
1474                 } while (c == '0' || c == '1');
1475             }
1476             else {
1477                 int found_decimal = 0;
1478                 /* Octal; c is first char of it */
1479                 /* There's no 'isoctdigit' macro, sigh */
1480                 while ('0' <= c && c < '8') {
1481                     c = tok_nextc(tok);
1482                 }
1483                 if (isdigit(c)) {
1484                     found_decimal = 1;
1485                     do {
1486                         c = tok_nextc(tok);
1487                     } while (isdigit(c));
1488                 }
1489                 if (c == '.')
1490                     goto fraction;
1491                 else if (c == 'e' || c == 'E')
1492                     goto exponent;
1493 #ifndef WITHOUT_COMPLEX
1494                 else if (c == 'j' || c == 'J')
1495                     goto imaginary;
1496 #endif
1497                 else if (found_decimal) {
1498                     tok->done = E_TOKEN;
1499                     tok_backup(tok, c);
1500                     return ERRORTOKEN;
1501                 }
1502             }
1503             if (c == 'l' || c == 'L')
1504                 c = tok_nextc(tok);
1505         }
1506         else {
1507             /* Decimal */
1508             do {
1509                 c = tok_nextc(tok);
1510             } while (isdigit(c));
1511             if (c == 'l' || c == 'L')
1512                 c = tok_nextc(tok);
1513             else {
1514                 /* Accept floating point numbers. */
1515                 if (c == '.') {
1516         fraction:
1517                     /* Fraction */
1518                     do {
1519                         c = tok_nextc(tok);
1520                     } while (isdigit(c));
1521                 }
1522                 if (c == 'e' || c == 'E') {
1523                     int e;
1524                   exponent:
1525                     e = c;
1526                     /* Exponent part */
1527                     c = tok_nextc(tok);
1528                     if (c == '+' || c == '-') {
1529                         c = tok_nextc(tok);
1530                         if (!isdigit(c)) {
1531                             tok->done = E_TOKEN;
1532                             tok_backup(tok, c);
1533                             return ERRORTOKEN;
1534                         }
1535                     } else if (!isdigit(c)) {
1536                         tok_backup(tok, c);
1537                         tok_backup(tok, e);
1538                         *p_start = tok->start;
1539                         *p_end = tok->cur;
1540                         return NUMBER;
1541                     }
1542                     do {
1543                         c = tok_nextc(tok);
1544                     } while (isdigit(c));
1545                 }
1546 #ifndef WITHOUT_COMPLEX
1547                 if (c == 'j' || c == 'J')
1548                     /* Imaginary part */
1549         imaginary:
1550                     c = tok_nextc(tok);
1551 #endif
1552             }
1553         }
1554         tok_backup(tok, c);
1555         *p_start = tok->start;
1556         *p_end = tok->cur;
1557         return NUMBER;
1558     }
1559 
1560   letter_quote:
1561     /* String */
1562     if (c == '\'' || c == '"') {
1563         Py_ssize_t quote2 = tok->cur - tok->start + 1;
1564         int quote = c;
1565         int triple = 0;
1566         int tripcount = 0;
1567         for (;;) {
1568             c = tok_nextc(tok);
1569             if (c == '\n') {
1570                 if (!triple) {
1571                     tok->done = E_EOLS;
1572                     tok_backup(tok, c);
1573                     return ERRORTOKEN;
1574                 }
1575                 tripcount = 0;
1576                 tok->cont_line = 1; /* multiline string. */
1577             }
1578             else if (c == EOF) {
1579                 if (triple)
1580                     tok->done = E_EOFS;
1581                 else
1582                     tok->done = E_EOLS;
1583                 tok->cur = tok->inp;
1584                 return ERRORTOKEN;
1585             }
1586             else if (c == quote) {
1587                 tripcount++;
1588                 if (tok->cur - tok->start == quote2) {
1589                     c = tok_nextc(tok);
1590                     if (c == quote) {
1591                         triple = 1;
1592                         tripcount = 0;
1593                         continue;
1594                     }
1595                     tok_backup(tok, c);
1596                 }
1597                 if (!triple || tripcount == 3)
1598                     break;
1599             }
1600             else if (c == '\\') {
1601                 tripcount = 0;
1602                 c = tok_nextc(tok);
1603                 if (c == EOF) {
1604                     tok->done = E_EOLS;
1605                     tok->cur = tok->inp;
1606                     return ERRORTOKEN;
1607                 }
1608             }
1609             else
1610                 tripcount = 0;
1611         }
1612         *p_start = tok->start;
1613         *p_end = tok->cur;
1614         return STRING;
1615     }
1616 
1617     /* Line continuation */
1618     if (c == '\\') {
1619         c = tok_nextc(tok);
1620         if (c != '\n') {
1621             tok->done = E_LINECONT;
1622             tok->cur = tok->inp;
1623             return ERRORTOKEN;
1624         }
1625         tok->cont_line = 1;
1626         goto again; /* Read next line */
1627     }
1628 
1629     /* Check for two-character token */
1630     {
1631         int c2 = tok_nextc(tok);
1632         int token = PyToken_TwoChars(c, c2);
1633 #ifndef PGEN
1634         if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1635             if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1636                                    "<> not supported in 3.x; use !=",
1637                                    tok->filename, tok->lineno,
1638                                    NULL, NULL)) {
1639                 return ERRORTOKEN;
1640             }
1641         }
1642 #endif
1643         if (token != OP) {
1644             int c3 = tok_nextc(tok);
1645             int token3 = PyToken_ThreeChars(c, c2, c3);
1646             if (token3 != OP) {
1647                 token = token3;
1648             } else {
1649                 tok_backup(tok, c3);
1650             }
1651             *p_start = tok->start;
1652             *p_end = tok->cur;
1653             return token;
1654         }
1655         tok_backup(tok, c2);
1656     }
1657 
1658     /* Keep track of parentheses nesting level */
1659     switch (c) {
1660     case '(':
1661     case '[':
1662     case '{':
1663         tok->level++;
1664         break;
1665     case ')':
1666     case ']':
1667     case '}':
1668         tok->level--;
1669         break;
1670     }
1671 
1672     /* Punctuation character */
1673     *p_start = tok->start;
1674     *p_end = tok->cur;
1675     return PyToken_OneChar(c);
1676 }
1677 
1678 int
PyTokenizer_Get(struct tok_state * tok,char ** p_start,char ** p_end)1679 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1680 {
1681     int result = tok_get(tok, p_start, p_end);
1682     if (tok->decoding_erred) {
1683         result = ERRORTOKEN;
1684         tok->done = E_DECODE;
1685     }
1686     return result;
1687 }
1688 
1689 /* This function is only called from parsetok. However, it cannot live
1690    there, as it must be empty for PGEN, and we can check for PGEN only
1691    in this file. */
1692 
1693 #if defined(PGEN) || !defined(Py_USING_UNICODE)
/* Variant compiled when PGEN is defined or Py_USING_UNICODE is not:
   there is nothing to re-encode, so it always returns NULL and leaves
   *offset untouched. */
char *
PyTokenizer_RestoreEncoding(struct tok_state *tok, int len, int *offset)
{
    char *restored = NULL;

    /* The arguments are deliberately unused in this configuration. */
    (void)tok;
    (void)len;
    (void)offset;

    return restored;
}
1699 #else
1700 #ifdef Py_USING_UNICODE
1701 static PyObject *
dec_utf8(const char * enc,const char * text,size_t len)1702 dec_utf8(const char *enc, const char *text, size_t len) {
1703     PyObject *ret = NULL;
1704     PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1705     if (unicode_text) {
1706         ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1707         Py_DECREF(unicode_text);
1708     }
1709     if (!ret) {
1710         PyErr_Clear();
1711     }
1712     return ret;
1713 }
1714 char *
PyTokenizer_RestoreEncoding(struct tok_state * tok,int len,int * offset)1715 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1716 {
1717     char *text = NULL;
1718     if (tok->encoding) {
1719         /* convert source to original encondig */
1720         PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1721         if (lineobj != NULL) {
1722             int linelen = PyString_Size(lineobj);
1723             const char *line = PyString_AsString(lineobj);
1724             text = PyObject_MALLOC(linelen + 1);
1725             if (text != NULL && line != NULL) {
1726                 if (linelen)
1727                     strncpy(text, line, linelen);
1728                 text[linelen] = '\0';
1729             }
1730             Py_DECREF(lineobj);
1731 
1732             /* adjust error offset */
1733             if (*offset > 1) {
1734                 PyObject *offsetobj = dec_utf8(tok->encoding,
1735                                                tok->buf, *offset-1);
1736                 if (offsetobj) {
1737                     *offset = PyString_Size(offsetobj) + 1;
1738                     Py_DECREF(offsetobj);
1739                 }
1740             }
1741 
1742         }
1743     }
1744     return text;
1745 
1746 }
1747 #endif /* defined(Py_USING_UNICODE) */
1748 #endif
1749 
1750 
1751 #ifdef Py_DEBUG
1752 
1753 void
tok_dump(int type,char * start,char * end)1754 tok_dump(int type, char *start, char *end)
1755 {
1756     printf("%s", _PyParser_TokenNames[type]);
1757     if (type == NAME || type == NUMBER || type == STRING || type == OP)
1758         printf("(%.*s)", (int)(end - start), start);
1759 }
1760 
1761 #endif
1762