1 
2 /* Tokenizer implementation */
3 
4 #include "Python.h"
5 #include "pgenheaders.h"
6 
7 #include <ctype.h>
8 #include <assert.h>
9 
10 #include "tokenizer.h"
11 #include "errcode.h"
12 
13 #ifndef PGEN
14 #include "unicodeobject.h"
15 #include "stringobject.h"
16 #include "fileobject.h"
17 #include "codecs.h"
18 #include "abstract.h"
19 #include "pydebug.h"
20 #endif /* PGEN */
21 
22 extern char *PyOS_Readline(FILE *, FILE *, char *);
23 /* Return malloc'ed string including trailing \n;
24    empty malloc'ed string for EOF;
25    NULL if interrupted */
26 
27 /* Don't ever change this -- it would break the portability of Python code */
28 #define TABSIZE 8
29 
30 /* Forward */
31 static struct tok_state *tok_new(void);
32 static int tok_nextc(struct tok_state *tok);
33 static void tok_backup(struct tok_state *tok, int c);
34 
35 /* Token names */
36 
37 char *_PyParser_TokenNames[] = {
38     "ENDMARKER",
39     "NAME",
40     "NUMBER",
41     "STRING",
42     "NEWLINE",
43     "INDENT",
44     "DEDENT",
45     "LPAR",
46     "RPAR",
47     "LSQB",
48     "RSQB",
49     "COLON",
50     "COMMA",
51     "SEMI",
52     "PLUS",
53     "MINUS",
54     "STAR",
55     "SLASH",
56     "VBAR",
57     "AMPER",
58     "LESS",
59     "GREATER",
60     "EQUAL",
61     "DOT",
62     "PERCENT",
63     "BACKQUOTE",
64     "LBRACE",
65     "RBRACE",
66     "EQEQUAL",
67     "NOTEQUAL",
68     "LESSEQUAL",
69     "GREATEREQUAL",
70     "TILDE",
71     "CIRCUMFLEX",
72     "LEFTSHIFT",
73     "RIGHTSHIFT",
74     "DOUBLESTAR",
75     "PLUSEQUAL",
76     "MINEQUAL",
77     "STAREQUAL",
78     "SLASHEQUAL",
79     "PERCENTEQUAL",
80     "AMPEREQUAL",
81     "VBAREQUAL",
82     "CIRCUMFLEXEQUAL",
83     "LEFTSHIFTEQUAL",
84     "RIGHTSHIFTEQUAL",
85     "DOUBLESTAREQUAL",
86     "DOUBLESLASH",
87     "DOUBLESLASHEQUAL",
88     "AT",
89     /* This table must match the #defines in token.h! */
90     "OP",
91     "<ERRORTOKEN>",
92     "<N_TOKENS>"
93 };
94 
95 /* Create and initialize a new tok_state structure */
96 
97 static struct tok_state *
98 tok_new(void)
99 {
100     struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
101                                             sizeof(struct tok_state));
102     if (tok == NULL)
103         return NULL;
104     tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
105     tok->done = E_OK;
106     tok->fp = NULL;
107     tok->input = NULL;
108     tok->tabsize = TABSIZE;
109     tok->indent = 0;
110     tok->indstack[0] = 0;
111     tok->atbol = 1;
112     tok->pendin = 0;
113     tok->prompt = tok->nextprompt = NULL;
114     tok->lineno = 0;
115     tok->level = 0;
116     tok->filename = NULL;
117     tok->altwarning = 0;
118     tok->alterror = 0;
119     tok->alttabsize = 1;
120     tok->altindstack[0] = 0;
121     tok->decoding_state = 0;
122     tok->decoding_erred = 0;
123     tok->read_coding_spec = 0;
124     tok->encoding = NULL;
125     tok->cont_line = 0;
126 #ifndef PGEN
127     tok->decoding_readline = NULL;
128     tok->decoding_buffer = NULL;
129 #endif
130     return tok;
131 }
132 
133 static char *
134 new_string(const char *s, Py_ssize_t len)
135 {
136     char* result = (char *)PyMem_MALLOC(len + 1);
137     if (result != NULL) {
138         memcpy(result, s, len);
139         result[len] = '\0';
140     }
141     return result;
142 }
143 
144 #ifdef PGEN
145 
146 static char *
147 decoding_fgets(char *s, int size, struct tok_state *tok)
148 {
149     return fgets(s, size, tok->fp);
150 }
151 
152 static int
153 decoding_feof(struct tok_state *tok)
154 {
155     return feof(tok->fp);
156 }
157 
158 static char *
159 decode_str(const char *str, int exec_input, struct tok_state *tok)
160 {
161     return new_string(str, strlen(str));
162 }
163 
164 #else /* PGEN */
165 
166 static char *
167 error_ret(struct tok_state *tok) /* XXX */
168 {
169     tok->decoding_erred = 1;
170     if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
171         PyMem_FREE(tok->buf);
172     tok->buf = NULL;
173     return NULL;                /* as if it were EOF */
174 }
175 
176 
177 static char *
178 get_normal_name(char *s)        /* for utf-8 and latin-1 */
179 {
180     char buf[13];
181     int i;
182     for (i = 0; i < 12; i++) {
183         int c = s[i];
184         if (c == '\0')
185             break;
186         else if (c == '_')
187             buf[i] = '-';
188         else
189             buf[i] = tolower(c);
190     }
191     buf[i] = '\0';
192     if (strcmp(buf, "utf-8") == 0 ||
193         strncmp(buf, "utf-8-", 6) == 0)
194         return "utf-8";
195     else if (strcmp(buf, "latin-1") == 0 ||
196              strcmp(buf, "iso-8859-1") == 0 ||
197              strcmp(buf, "iso-latin-1") == 0 ||
198              strncmp(buf, "latin-1-", 8) == 0 ||
199              strncmp(buf, "iso-8859-1-", 11) == 0 ||
200              strncmp(buf, "iso-latin-1-", 12) == 0)
201         return "iso-8859-1";
202     else
203         return s;
204 }
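/* Illustrative examples (not exhaustive) of what get_normal_name does:
   "UTF_8" and "utf-8-sig" normalize to "utf-8"; "Latin-1" and
   "ISO_8859_1" normalize to "iso-8859-1"; any other spelling is
   returned unchanged for the codec machinery to resolve. */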
205 
206 /* Return the coding spec in S, or NULL if none is found.  */
207 
208 static char *
209 get_coding_spec(const char *s, Py_ssize_t size)
210 {
211     Py_ssize_t i;
212     /* Coding spec must be in a comment, and that comment must be
213      * the only statement on the source code line. */
214     for (i = 0; i < size - 6; i++) {
215         if (s[i] == '#')
216             break;
217         if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
218             return NULL;
219     }
220     for (; i < size - 6; i++) { /* XXX inefficient search */
221         const char* t = s + i;
222         if (strncmp(t, "coding", 6) == 0) {
223             const char* begin = NULL;
224             t += 6;
225             if (t[0] != ':' && t[0] != '=')
226                 continue;
227             do {
228                 t++;
229             } while (t[0] == '\x20' || t[0] == '\t');
230 
231             begin = t;
232             while (Py_ISALNUM(t[0]) ||
233                    t[0] == '-' || t[0] == '_' || t[0] == '.')
234                 t++;
235 
236             if (begin < t) {
237                 char* r = new_string(begin, t - begin);
238                 char* q = get_normal_name(r);
239                 if (r != q) {
240                     PyMem_FREE(r);
241                     r = new_string(q, strlen(q));
242                 }
243                 return r;
244             }
245         }
246     }
247     return NULL;
248 }
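/* Illustrative coding-spec lines (PEP 263 style) that get_coding_spec
   will pick up, assuming the callers pass it line 1 or 2 of the source:

       # -*- coding: utf-8 -*-
       # vim: set fileencoding=iso-8859-1 :

   The scan only requires a comment containing "coding" followed by
   ':' or '=' and then the encoding name. */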
249 
250 /* Check whether the line contains a coding spec. If it does,
251    invoke the set_readline function for the new encoding.
252    This function receives the tok_state and the new encoding.
253    Return 1 on success, 0 on failure.  */
254 
255 static int
256 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
257                   int set_readline(struct tok_state *, const char *))
258 {
259     char * cs;
260     int r = 1;
261 
262     if (tok->cont_line) {
263         /* It's a continuation line, so it can't be a coding spec. */
264         tok->read_coding_spec = 1;
265         return 1;
266     }
267     cs = get_coding_spec(line, size);
268     if (!cs) {
269         Py_ssize_t i;
270         for (i = 0; i < size; i++) {
271             if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
272                 break;
273             if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
274                 /* Stop checking coding spec after a line containing
275                  * anything except a comment. */
276                 tok->read_coding_spec = 1;
277                 break;
278             }
279         }
280     } else {
281         tok->read_coding_spec = 1;
282         if (tok->encoding == NULL) {
283             assert(tok->decoding_state == 1); /* raw */
284             if (strcmp(cs, "utf-8") == 0 ||
285                 strcmp(cs, "iso-8859-1") == 0) {
286                 tok->encoding = cs;
287             } else {
288 #ifdef Py_USING_UNICODE
289                 r = set_readline(tok, cs);
290                 if (r) {
291                     tok->encoding = cs;
292                     tok->decoding_state = -1;
293                 }
294                 else {
295                     PyErr_Format(PyExc_SyntaxError,
296                                  "encoding problem: %s", cs);
297                     PyMem_FREE(cs);
298                 }
299 #else
300                 /* Without Unicode support, we cannot
301                    process the coding spec. Since there
302                    won't be any Unicode literals, that
303                    won't matter. */
304                 PyMem_FREE(cs);
305 #endif
306             }
307         } else {                /* then, compare cs with BOM */
308             r = (strcmp(tok->encoding, cs) == 0);
309             if (!r)
310                 PyErr_Format(PyExc_SyntaxError,
311                              "encoding problem: %s with BOM", cs);
312             PyMem_FREE(cs);
313         }
314     }
315     return r;
316 }
317 
318 /* See whether the file starts with a BOM. If it does,
319    invoke the set_readline function with the new encoding.
320    Return 1 on success, 0 on failure.  */
321 
322 static int
323 check_bom(int get_char(struct tok_state *),
324           void unget_char(int, struct tok_state *),
325           int set_readline(struct tok_state *, const char *),
326           struct tok_state *tok)
327 {
328     int ch1, ch2, ch3;
329     ch1 = get_char(tok);
330     tok->decoding_state = 1;
331     if (ch1 == EOF) {
332         return 1;
333     } else if (ch1 == 0xEF) {
334         ch2 = get_char(tok);
335         if (ch2 != 0xBB) {
336             unget_char(ch2, tok);
337             unget_char(ch1, tok);
338             return 1;
339         }
340         ch3 = get_char(tok);
341         if (ch3 != 0xBF) {
342             unget_char(ch3, tok);
343             unget_char(ch2, tok);
344             unget_char(ch1, tok);
345             return 1;
346         }
347 #if 0
348     /* Disable support for UTF-16 BOMs until a decision
349        is made whether this needs to be supported.  */
350     } else if (ch1 == 0xFE) {
351         ch2 = get_char(tok);
352         if (ch2 != 0xFF) {
353             unget_char(ch2, tok);
354             unget_char(ch1, tok);
355             return 1;
356         }
357         if (!set_readline(tok, "utf-16-be"))
358             return 0;
359         tok->decoding_state = -1;
360     } else if (ch1 == 0xFF) {
361         ch2 = get_char(tok);
362         if (ch2 != 0xFE) {
363             unget_char(ch2, tok);
364             unget_char(ch1, tok);
365             return 1;
366         }
367         if (!set_readline(tok, "utf-16-le"))
368             return 0;
369         tok->decoding_state = -1;
370 #endif
371     } else {
372         unget_char(ch1, tok);
373         return 1;
374     }
375     if (tok->encoding != NULL)
376         PyMem_FREE(tok->encoding);
377     tok->encoding = new_string("utf-8", 5);     /* resulting string is in utf-8 */
378     return 1;
379 }
380 
381 /* Read a line of text from TOK into S, using the stream in TOK.
382    Return NULL on failure, else S.
383 
384    On entry, tok->decoding_buffer will be one of:
385      1) NULL: need to call tok->decoding_readline to get a new line
386      2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
387        stored the result in tok->decoding_buffer
388      3) PyStringObject *: previous call to fp_readl did not have enough room
389        (in the s buffer) to copy entire contents of the line read
390        by tok->decoding_readline.  tok->decoding_buffer has the overflow.
391        In this case, fp_readl is called in a loop (with an expanded buffer)
392        until the buffer ends with a '\n' (or until the end of the file is
393        reached): see tok_nextc and its calls to decoding_fgets.
394 */
395 
396 static char *
397 fp_readl(char *s, int size, struct tok_state *tok)
398 {
399 #ifndef Py_USING_UNICODE
400     /* In a non-Unicode build, this should never be called. */
401     Py_FatalError("fp_readl should not be called in this build.");
402     return NULL; /* Keep compiler happy (not reachable) */
403 #else
404     PyObject* utf8 = NULL;
405     PyObject* buf = tok->decoding_buffer;
406     char *str;
407     Py_ssize_t utf8len;
408 
409     /* Ask for one less byte so we can terminate it */
410     assert(size > 0);
411     size--;
412 
413     if (buf == NULL) {
414         buf = PyObject_CallObject(tok->decoding_readline, NULL);
415         if (buf == NULL)
416             return error_ret(tok);
417         if (!PyUnicode_Check(buf)) {
418             Py_DECREF(buf);
419             PyErr_SetString(PyExc_SyntaxError,
420                             "codec did not return a unicode object");
421             return error_ret(tok);
422         }
423     } else {
424         tok->decoding_buffer = NULL;
425         if (PyString_CheckExact(buf))
426             utf8 = buf;
427     }
428     if (utf8 == NULL) {
429         utf8 = PyUnicode_AsUTF8String(buf);
430         Py_DECREF(buf);
431         if (utf8 == NULL)
432             return error_ret(tok);
433     }
434     str = PyString_AsString(utf8);
435     utf8len = PyString_GET_SIZE(utf8);
436     if (utf8len > size) {
437         tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
438         if (tok->decoding_buffer == NULL) {
439             Py_DECREF(utf8);
440             return error_ret(tok);
441         }
442         utf8len = size;
443     }
444     memcpy(s, str, utf8len);
445     s[utf8len] = '\0';
446     Py_DECREF(utf8);
447     if (utf8len == 0)
448         return NULL; /* EOF */
449     return s;
450 #endif
451 }
452 
453 /* Set the readline function for TOK to a StreamReader's
454    readline function. The StreamReader is named ENC.
455 
456    This function is called from check_bom and check_coding_spec.
457 
458    ENC is usually identical to the future value of tok->encoding,
459    except for the (currently unsupported) case of UTF-16.
460 
461    Return 1 on success, 0 on failure. */
462 
463 static int
464 fp_setreadl(struct tok_state *tok, const char* enc)
465 {
466     PyObject *reader, *stream, *readline;
467 
468     /* XXX: constify filename argument. */
469     stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
470     if (stream == NULL)
471         return 0;
472 
473     reader = PyCodec_StreamReader(enc, stream, NULL);
474     Py_DECREF(stream);
475     if (reader == NULL)
476         return 0;
477 
478     readline = PyObject_GetAttrString(reader, "readline");
479     Py_DECREF(reader);
480     if (readline == NULL)
481         return 0;
482 
483     tok->decoding_readline = readline;
484     return 1;
485 }
486 
487 /* Fetch the next byte from TOK. */
488 
489 static int fp_getc(struct tok_state *tok) {
490     return getc(tok->fp);
491 }
492 
493 /* Unfetch the last byte back into TOK.  */
494 
495 static void fp_ungetc(int c, struct tok_state *tok) {
496     ungetc(c, tok->fp);
497 }
498 
499 /* Read a line of input from TOK. Determine encoding
500    if necessary.  */
501 
502 static char *
503 decoding_fgets(char *s, int size, struct tok_state *tok)
504 {
505     char *line = NULL;
506     int badchar = 0;
507     for (;;) {
508         if (tok->decoding_state < 0) {
509             /* We already have a codec associated with
510                this input. */
511             line = fp_readl(s, size, tok);
512             break;
513         } else if (tok->decoding_state > 0) {
514             /* We want a 'raw' read. */
515             line = Py_UniversalNewlineFgets(s, size,
516                                             tok->fp, NULL);
517             break;
518         } else {
519             /* We have not yet determined the encoding.
520                If an encoding is found, use the file-pointer
521                reader functions from now on. */
522             if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
523                 return error_ret(tok);
524             assert(tok->decoding_state != 0);
525         }
526     }
527     if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
528         if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
529             return error_ret(tok);
530         }
531     }
532 #ifndef PGEN
533     /* The default encoding is ASCII, so make sure we don't have any
534        non-ASCII bytes in it. */
535     if (line && !tok->encoding) {
536         unsigned char *c;
537         for (c = (unsigned char *)line; *c; c++)
538             if (*c > 127) {
539                 badchar = *c;
540                 break;
541             }
542     }
543     if (badchar) {
544         char buf[500];
545         /* Need to add 1 to the line number, since this line
546            has not been counted yet.  */
547         sprintf(buf,
548             "Non-ASCII character '\\x%.2x' "
549             "in file %.200s on line %i, "
550             "but no encoding declared; "
551             "see http://python.org/dev/peps/pep-0263/ for details",
552             badchar, tok->filename, tok->lineno + 1);
553         PyErr_SetString(PyExc_SyntaxError, buf);
554         return error_ret(tok);
555     }
556 #endif
557     return line;
558 }
559 
560 static int
561 decoding_feof(struct tok_state *tok)
562 {
563     if (tok->decoding_state >= 0) {
564         return feof(tok->fp);
565     } else {
566         PyObject* buf = tok->decoding_buffer;
567         if (buf == NULL) {
568             buf = PyObject_CallObject(tok->decoding_readline, NULL);
569             if (buf == NULL) {
570                 error_ret(tok);
571                 return 1;
572             } else {
573                 tok->decoding_buffer = buf;
574             }
575         }
576         return PyObject_Length(buf) == 0;
577     }
578 }
579 
580 /* Fetch a byte from TOK, using the string buffer. */
581 
582 static int
583 buf_getc(struct tok_state *tok) {
584     return Py_CHARMASK(*tok->str++);
585 }
586 
587 /* Unfetch a byte from TOK, using the string buffer. */
588 
589 static void
590 buf_ungetc(int c, struct tok_state *tok) {
591     tok->str--;
592     assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
593 }
594 
595 /* Set the readline function for TOK to ENC. For the string-based
596    tokenizer, this means to just record the encoding. */
597 
598 static int
599 buf_setreadl(struct tok_state *tok, const char* enc) {
600     tok->enc = enc;
601     return 1;
602 }
603 
604 /* Return a UTF-8 encoded Python string object from the
605    C byte string STR, which is encoded with ENC. */
606 
607 #ifdef Py_USING_UNICODE
608 static PyObject *
609 translate_into_utf8(const char* str, const char* enc) {
610     PyObject *utf8;
611     PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
612     if (buf == NULL)
613         return NULL;
614     utf8 = PyUnicode_AsUTF8String(buf);
615     Py_DECREF(buf);
616     return utf8;
617 }
618 #endif
619 
620 
621 static char *
622 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
623     int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
624     char *buf, *current;
625     char c = '\0';
626     buf = PyMem_MALLOC(needed_length);
627     if (buf == NULL) {
628         tok->done = E_NOMEM;
629         return NULL;
630     }
631     for (current = buf; *s; s++, current++) {
632         c = *s;
633         if (skip_next_lf) {
634             skip_next_lf = 0;
635             if (c == '\n') {
636                 c = *++s;
637                 if (!c)
638                     break;
639             }
640         }
641         if (c == '\r') {
642             skip_next_lf = 1;
643             c = '\n';
644         }
645         *current = c;
646     }
647     /* If this is exec input, add a newline to the end of the string if
648        there isn't one already. */
649     if (exec_input && c != '\n') {
650         *current = '\n';
651         current++;
652     }
653     *current = '\0';
654     final_length = current - buf + 1;
655     if (final_length < needed_length && final_length)
656         /* should never fail */
657         buf = PyMem_REALLOC(buf, final_length);
658     return buf;
659 }
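/* Example (illustrative): with exec_input non-zero,
   translate_newlines("x = 1\r\ny = 2\r", ...) yields "x = 1\ny = 2\n":
   \r\n pairs and bare \r collapse to \n, and a final newline is
   appended only if one is not already present. */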
660 
661 /* Decode a byte string STR for use as the buffer of TOK.
662    Look for encoding declarations inside STR, and record them
663    inside TOK.  */
664 
665 static const char *
666 decode_str(const char *input, int single, struct tok_state *tok)
667 {
668     PyObject* utf8 = NULL;
669     const char *str;
670     const char *s;
671     const char *newl[2] = {NULL, NULL};
672     int lineno = 0;
673     tok->input = str = translate_newlines(input, single, tok);
674     if (str == NULL)
675         return NULL;
676     tok->enc = NULL;
677     tok->str = str;
678     if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
679         return error_ret(tok);
680     str = tok->str;             /* string after BOM if any */
681     assert(str);
682 #ifdef Py_USING_UNICODE
683     if (tok->enc != NULL) {
684         utf8 = translate_into_utf8(str, tok->enc);
685         if (utf8 == NULL)
686             return error_ret(tok);
687         str = PyString_AsString(utf8);
688     }
689 #endif
690     for (s = str;; s++) {
691         if (*s == '\0') break;
692         else if (*s == '\n') {
693             assert(lineno < 2);
694             newl[lineno] = s;
695             lineno++;
696             if (lineno == 2) break;
697         }
698     }
699     tok->enc = NULL;
700     /* need to check line 1 and 2 separately since check_coding_spec
701        assumes a single line as input */
702     if (newl[0]) {
703         if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
704             return error_ret(tok);
705         if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
706             if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
707                                    tok, buf_setreadl))
708                 return error_ret(tok);
709         }
710     }
711 #ifdef Py_USING_UNICODE
712     if (tok->enc != NULL) {
713         assert(utf8 == NULL);
714         utf8 = translate_into_utf8(str, tok->enc);
715         if (utf8 == NULL)
716             return error_ret(tok);
717         str = PyString_AsString(utf8);
718     }
719 #endif
720     assert(tok->decoding_buffer == NULL);
721     tok->decoding_buffer = utf8; /* CAUTION */
722     return str;
723 }
724 
725 #endif /* PGEN */
726 
727 /* Set up tokenizer for string */
728 
729 struct tok_state *
730 PyTokenizer_FromString(const char *str, int exec_input)
731 {
732     struct tok_state *tok = tok_new();
733     if (tok == NULL)
734         return NULL;
735     str = (char *)decode_str(str, exec_input, tok);
736     if (str == NULL) {
737         PyTokenizer_Free(tok);
738         return NULL;
739     }
740 
741     /* XXX: constify members. */
742     tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
743     return tok;
744 }
745 
746 
747 /* Set up tokenizer for file */
748 
749 struct tok_state *
750 PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
751 {
752     struct tok_state *tok = tok_new();
753     if (tok == NULL)
754         return NULL;
755     if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
756         PyTokenizer_Free(tok);
757         return NULL;
758     }
759     tok->cur = tok->inp = tok->buf;
760     tok->end = tok->buf + BUFSIZ;
761     tok->fp = fp;
762     tok->prompt = ps1;
763     tok->nextprompt = ps2;
764     return tok;
765 }
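/* Note (illustrative): ps1 and ps2 are the primary and secondary
   interactive prompts (typically the values of sys.ps1 and sys.ps2);
   both are NULL when tokenizing an ordinary file, in which case
   tok_nextc reads via decoding_fgets instead of PyOS_Readline. */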
766 
767 
768 /* Free a tok_state structure */
769 
770 void
771 PyTokenizer_Free(struct tok_state *tok)
772 {
773     if (tok->encoding != NULL)
774         PyMem_FREE(tok->encoding);
775 #ifndef PGEN
776     Py_XDECREF(tok->decoding_readline);
777     Py_XDECREF(tok->decoding_buffer);
778 #endif
779     if (tok->fp != NULL && tok->buf != NULL)
780         PyMem_FREE(tok->buf);
781     if (tok->input)
782         PyMem_FREE((char *)tok->input);
783     PyMem_FREE(tok);
784 }
785 
786 #if !defined(PGEN) && defined(Py_USING_UNICODE)
787 static int
788 tok_stdin_decode(struct tok_state *tok, char **inp)
789 {
790     PyObject *enc, *sysstdin, *decoded, *utf8;
791     const char *encoding;
792     char *converted;
793 
794     if (PySys_GetFile((char *)"stdin", NULL) != stdin)
795         return 0;
796     sysstdin = PySys_GetObject("stdin");
797     if (sysstdin == NULL || !PyFile_Check(sysstdin))
798         return 0;
799 
800     enc = ((PyFileObject *)sysstdin)->f_encoding;
801     if (enc == NULL || !PyString_Check(enc))
802         return 0;
803     Py_INCREF(enc);
804 
805     encoding = PyString_AsString(enc);
806     decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
807     if (decoded == NULL)
808         goto error_clear;
809 
810     utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
811     Py_DECREF(decoded);
812     if (utf8 == NULL)
813         goto error_clear;
814 
815     assert(PyString_Check(utf8));
816     converted = new_string(PyString_AS_STRING(utf8),
817                            PyString_GET_SIZE(utf8));
818     Py_DECREF(utf8);
819     if (converted == NULL)
820         goto error_nomem;
821 
822     PyMem_FREE(*inp);
823     *inp = converted;
824     if (tok->encoding != NULL)
825         PyMem_FREE(tok->encoding);
826     tok->encoding = new_string(encoding, strlen(encoding));
827     if (tok->encoding == NULL)
828         goto error_nomem;
829 
830     Py_DECREF(enc);
831     return 0;
832 
833 error_nomem:
834     Py_DECREF(enc);
835     tok->done = E_NOMEM;
836     return -1;
837 
838 error_clear:
839     Py_DECREF(enc);
840     if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
841         tok->done = E_ERROR;
842         return -1;
843     }
844     /* Fallback to iso-8859-1: for backward compatibility */
845     PyErr_Clear();
846     return 0;
847 }
848 #endif
849 
850 /* Get next char, updating state; error code goes into tok->done */
851 
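/* Three input sources are handled below: tok->fp == NULL means the
   whole source is already in memory (string input); tok->prompt != NULL
   means interactive input read line-by-line with PyOS_Readline; and
   otherwise a file is read (and decoded) with decoding_fgets, growing
   tok->buf until a full line ending in '\n' has been accumulated. */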
852 static int
853 tok_nextc(register struct tok_state *tok)
854 {
855     for (;;) {
856         if (tok->cur != tok->inp) {
857             return Py_CHARMASK(*tok->cur++); /* Fast path */
858         }
859         if (tok->done != E_OK)
860             return EOF;
861         if (tok->fp == NULL) {
862             char *end = strchr(tok->inp, '\n');
863             if (end != NULL)
864                 end++;
865             else {
866                 end = strchr(tok->inp, '\0');
867                 if (end == tok->inp) {
868                     tok->done = E_EOF;
869                     return EOF;
870                 }
871             }
872             if (tok->start == NULL)
873                 tok->buf = tok->cur;
874             tok->line_start = tok->cur;
875             tok->lineno++;
876             tok->inp = end;
877             return Py_CHARMASK(*tok->cur++);
878         }
879         if (tok->prompt != NULL) {
880             char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
881             if (tok->nextprompt != NULL)
882                 tok->prompt = tok->nextprompt;
883             if (newtok == NULL)
884                 tok->done = E_INTR;
885             else if (*newtok == '\0') {
886                 PyMem_FREE(newtok);
887                 tok->done = E_EOF;
888             }
889 #if !defined(PGEN) && defined(Py_USING_UNICODE)
890             else if (tok_stdin_decode(tok, &newtok) != 0)
891                 PyMem_FREE(newtok);
892 #endif
893             else if (tok->start != NULL) {
894                 size_t start = tok->start - tok->buf;
895                 size_t oldlen = tok->cur - tok->buf;
896                 size_t newlen = oldlen + strlen(newtok);
897                 char *buf = tok->buf;
898                 buf = (char *)PyMem_REALLOC(buf, newlen+1);
899                 tok->lineno++;
900                 if (buf == NULL) {
901                     PyMem_FREE(tok->buf);
902                     tok->buf = NULL;
903                     PyMem_FREE(newtok);
904                     tok->done = E_NOMEM;
905                     return EOF;
906                 }
907                 tok->buf = buf;
908                 tok->cur = tok->buf + oldlen;
909                 tok->line_start = tok->cur;
910                 strcpy(tok->buf + oldlen, newtok);
911                 PyMem_FREE(newtok);
912                 tok->inp = tok->buf + newlen;
913                 tok->end = tok->inp + 1;
914                 tok->start = tok->buf + start;
915             }
916             else {
917                 tok->lineno++;
918                 if (tok->buf != NULL)
919                     PyMem_FREE(tok->buf);
920                 tok->buf = newtok;
921                 tok->line_start = tok->buf;
922                 tok->cur = tok->buf;
923                 tok->line_start = tok->buf;
924                 tok->inp = strchr(tok->buf, '\0');
925                 tok->end = tok->inp + 1;
926             }
927         }
928         else {
929             int done = 0;
930             Py_ssize_t cur = 0;
931             char *pt;
932             if (tok->start == NULL) {
933                 if (tok->buf == NULL) {
934                     tok->buf = (char *)
935                         PyMem_MALLOC(BUFSIZ);
936                     if (tok->buf == NULL) {
937                         tok->done = E_NOMEM;
938                         return EOF;
939                     }
940                     tok->end = tok->buf + BUFSIZ;
941                 }
942                 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
943                           tok) == NULL) {
944                     tok->done = E_EOF;
945                     done = 1;
946                 }
947                 else {
948                     tok->done = E_OK;
949                     tok->inp = strchr(tok->buf, '\0');
950                     done = tok->inp[-1] == '\n';
951                 }
952             }
953             else {
954                 cur = tok->cur - tok->buf;
955                 if (decoding_feof(tok)) {
956                     tok->done = E_EOF;
957                     done = 1;
958                 }
959                 else
960                     tok->done = E_OK;
961             }
962             tok->lineno++;
963             /* Read until '\n' or EOF */
964             while (!done) {
965                 Py_ssize_t curstart = tok->start == NULL ? -1 :
966                           tok->start - tok->buf;
967                 Py_ssize_t curvalid = tok->inp - tok->buf;
968                 Py_ssize_t newsize = curvalid + BUFSIZ;
969                 char *newbuf = tok->buf;
970                 newbuf = (char *)PyMem_REALLOC(newbuf,
971                                                newsize);
972                 if (newbuf == NULL) {
973                     tok->done = E_NOMEM;
974                     tok->cur = tok->inp;
975                     return EOF;
976                 }
977                 tok->buf = newbuf;
978                 tok->inp = tok->buf + curvalid;
979                 tok->end = tok->buf + newsize;
980                 tok->start = curstart < 0 ? NULL :
981                          tok->buf + curstart;
982                 if (decoding_fgets(tok->inp,
983                                (int)(tok->end - tok->inp),
984                                tok) == NULL) {
985                     /* Break out early on decoding
986                        errors, as tok->buf will be NULL
987                      */
988                     if (tok->decoding_erred)
989                         return EOF;
990                     /* Last line does not end in \n,
991                        fake one */
992                     strcpy(tok->inp, "\n");
993                 }
994                 tok->inp = strchr(tok->inp, '\0');
995                 done = tok->inp[-1] == '\n';
996             }
997             if (tok->buf != NULL) {
998                 tok->cur = tok->buf + cur;
999                 tok->line_start = tok->cur;
1000                 /* replace "\r\n" with "\n" */
1001                 /* For Mac leave the \r, giving a syntax error */
1002                 pt = tok->inp - 2;
1003                 if (pt >= tok->buf && *pt == '\r') {
1004                     *pt++ = '\n';
1005                     *pt = '\0';
1006                     tok->inp = pt;
1007                 }
1008             }
1009         }
1010         if (tok->done != E_OK) {
1011             if (tok->prompt != NULL)
1012                 PySys_WriteStderr("\n");
1013             tok->cur = tok->inp;
1014             return EOF;
1015         }
1016     }
1017     /*NOTREACHED*/
1018 }
1019 
1020 
1021 /* Back-up one character */
1022 
1023 static void
1024 tok_backup(register struct tok_state *tok, register int c)
1025 {
1026     if (c != EOF) {
1027         if (--tok->cur < tok->buf)
1028             Py_FatalError("tok_backup: beginning of buffer");
1029         if (*tok->cur != c)
1030             *tok->cur = c;
1031     }
1032 }
1033 
1034 
1035 /* Return the token corresponding to a single character */
1036 
1037 int
1038 PyToken_OneChar(int c)
1039 {
1040     switch (c) {
1041     case '(':           return LPAR;
1042     case ')':           return RPAR;
1043     case '[':           return LSQB;
1044     case ']':           return RSQB;
1045     case ':':           return COLON;
1046     case ',':           return COMMA;
1047     case ';':           return SEMI;
1048     case '+':           return PLUS;
1049     case '-':           return MINUS;
1050     case '*':           return STAR;
1051     case '/':           return SLASH;
1052     case '|':           return VBAR;
1053     case '&':           return AMPER;
1054     case '<':           return LESS;
1055     case '>':           return GREATER;
1056     case '=':           return EQUAL;
1057     case '.':           return DOT;
1058     case '%':           return PERCENT;
1059     case '`':           return BACKQUOTE;
1060     case '{':           return LBRACE;
1061     case '}':           return RBRACE;
1062     case '^':           return CIRCUMFLEX;
1063     case '~':           return TILDE;
1064     case '@':           return AT;
1065     default:            return OP;
1066     }
1067 }
1068 
1069 
1070 int
1071 PyToken_TwoChars(int c1, int c2)
1072 {
1073     switch (c1) {
1074     case '=':
1075         switch (c2) {
1076         case '=':               return EQEQUAL;
1077         }
1078         break;
1079     case '!':
1080         switch (c2) {
1081         case '=':               return NOTEQUAL;
1082         }
1083         break;
1084     case '<':
1085         switch (c2) {
1086         case '>':               return NOTEQUAL;
1087         case '=':               return LESSEQUAL;
1088         case '<':               return LEFTSHIFT;
1089         }
1090         break;
1091     case '>':
1092         switch (c2) {
1093         case '=':               return GREATEREQUAL;
1094         case '>':               return RIGHTSHIFT;
1095         }
1096         break;
1097     case '+':
1098         switch (c2) {
1099         case '=':               return PLUSEQUAL;
1100         }
1101         break;
1102     case '-':
1103         switch (c2) {
1104         case '=':               return MINEQUAL;
1105         }
1106         break;
1107     case '*':
1108         switch (c2) {
1109         case '*':               return DOUBLESTAR;
1110         case '=':               return STAREQUAL;
1111         }
1112         break;
1113     case '/':
1114         switch (c2) {
1115         case '/':               return DOUBLESLASH;
1116         case '=':               return SLASHEQUAL;
1117         }
1118         break;
1119     case '|':
1120         switch (c2) {
1121         case '=':               return VBAREQUAL;
1122         }
1123         break;
1124     case '%':
1125         switch (c2) {
1126         case '=':               return PERCENTEQUAL;
1127         }
1128         break;
1129     case '&':
1130         switch (c2) {
1131         case '=':               return AMPEREQUAL;
1132         }
1133         break;
1134     case '^':
1135         switch (c2) {
1136         case '=':               return CIRCUMFLEXEQUAL;
1137         }
1138         break;
1139     }
1140     return OP;
1141 }
1142 
1143 int
1144 PyToken_ThreeChars(int c1, int c2, int c3)
1145 {
1146     switch (c1) {
1147     case '<':
1148         switch (c2) {
1149         case '<':
1150             switch (c3) {
1151             case '=':
1152                 return LEFTSHIFTEQUAL;
1153             }
1154             break;
1155         }
1156         break;
1157     case '>':
1158         switch (c2) {
1159         case '>':
1160             switch (c3) {
1161             case '=':
1162                 return RIGHTSHIFTEQUAL;
1163             }
1164             break;
1165         }
1166         break;
1167     case '*':
1168         switch (c2) {
1169         case '*':
1170             switch (c3) {
1171             case '=':
1172                 return DOUBLESTAREQUAL;
1173             }
1174             break;
1175         }
1176         break;
1177     case '/':
1178         switch (c2) {
1179         case '/':
1180             switch (c3) {
1181             case '=':
1182                 return DOUBLESLASHEQUAL;
1183             }
1184             break;
1185         }
1186         break;
1187     }
1188     return OP;
1189 }
1190 
1191 static int
1192 indenterror(struct tok_state *tok)
1193 {
1194     if (tok->alterror) {
1195         tok->done = E_TABSPACE;
1196         tok->cur = tok->inp;
1197         return 1;
1198     }
1199     if (tok->altwarning) {
1200         PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1201                           "in indentation\n", tok->filename);
1202         tok->altwarning = 0;
1203     }
1204     return 0;
1205 }
1206 
1207 /* Get next token, after space stripping etc. */
1208 
1209 static int
1210 tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1211 {
1212     register int c;
1213     int blankline;
1214 
1215     *p_start = *p_end = NULL;
1216   nextline:
1217     tok->start = NULL;
1218     blankline = 0;
1219 
1220     /* Get indentation level */
1221     if (tok->atbol) {
1222         register int col = 0;
1223         register int altcol = 0;
1224         tok->atbol = 0;
1225         for (;;) {
1226             c = tok_nextc(tok);
1227             if (c == ' ')
1228                 col++, altcol++;
1229             else if (c == '\t') {
1230                 col = (col/tok->tabsize + 1) * tok->tabsize;
1231                 altcol = (altcol/tok->alttabsize + 1)
1232                     * tok->alttabsize;
1233             }
1234             else if (c == '\014') /* Control-L (formfeed) */
1235                 col = altcol = 0; /* For Emacs users */
1236             else
1237                 break;
1238         }
1239         tok_backup(tok, c);
1240         if (c == '#' || c == '\n') {
1241             /* Lines with only whitespace and/or comments
1242                shouldn't affect the indentation and are
1243                not passed to the parser as NEWLINE tokens,
1244                except *totally* empty lines in interactive
1245                mode, which signal the end of a command group. */
1246             if (col == 0 && c == '\n' && tok->prompt != NULL)
1247                 blankline = 0; /* Let it through */
1248             else
1249                 blankline = 1; /* Ignore completely */
1250             /* We can't jump back right here since we still
1251                may need to skip to the end of a comment */
1252         }
1253         if (!blankline && tok->level == 0) {
1254             if (col == tok->indstack[tok->indent]) {
1255                 /* No change */
1256                 if (altcol != tok->altindstack[tok->indent]) {
1257                     if (indenterror(tok))
1258                         return ERRORTOKEN;
1259                 }
1260             }
1261             else if (col > tok->indstack[tok->indent]) {
1262                 /* Indent -- always one */
1263                 if (tok->indent+1 >= MAXINDENT) {
1264                     tok->done = E_TOODEEP;
1265                     tok->cur = tok->inp;
1266                     return ERRORTOKEN;
1267                 }
1268                 if (altcol <= tok->altindstack[tok->indent]) {
1269                     if (indenterror(tok))
1270                         return ERRORTOKEN;
1271                 }
1272                 tok->pendin++;
1273                 tok->indstack[++tok->indent] = col;
1274                 tok->altindstack[tok->indent] = altcol;
1275             }
1276             else /* col < tok->indstack[tok->indent] */ {
1277                 /* Dedent -- any number, must be consistent */
1278                 while (tok->indent > 0 &&
1279                     col < tok->indstack[tok->indent]) {
1280                     tok->pendin--;
1281                     tok->indent--;
1282                 }
1283                 if (col != tok->indstack[tok->indent]) {
1284                     tok->done = E_DEDENT;
1285                     tok->cur = tok->inp;
1286                     return ERRORTOKEN;
1287                 }
1288                 if (altcol != tok->altindstack[tok->indent]) {
1289                     if (indenterror(tok))
1290                         return ERRORTOKEN;
1291                 }
1292             }
1293         }
1294     }
1295 
1296     tok->start = tok->cur;
1297 
1298     /* Return pending indents/dedents */
1299     if (tok->pendin != 0) {
1300         if (tok->pendin < 0) {
1301             tok->pendin++;
1302             return DEDENT;
1303         }
1304         else {
1305             tok->pendin--;
1306             return INDENT;
1307         }
1308     }
1309 
1310  again:
1311     tok->start = NULL;
1312     /* Skip spaces */
1313     do {
1314         c = tok_nextc(tok);
1315     } while (c == ' ' || c == '\t' || c == '\014');
1316 
1317     /* Set start of current token */
1318     tok->start = tok->cur - 1;
1319 
1320     /* Skip comment, while looking for tab-setting magic */
1321     if (c == '#') {
1322         static char *tabforms[] = {
1323             "tab-width:",                       /* Emacs */
1324             ":tabstop=",                        /* vim, full form */
1325             ":ts=",                             /* vim, abbreviated form */
1326             "set tabsize=",                     /* will vi never die? */
1327         /* more templates can be added here to support other editors */
1328         };
1329         char cbuf[80];
1330         char *tp, **cp;
1331         tp = cbuf;
1332         do {
1333             *tp++ = c = tok_nextc(tok);
1334         } while (c != EOF && c != '\n' &&
1335                  (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1336         *tp = '\0';
1337         for (cp = tabforms;
1338              cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1339              cp++) {
1340             if ((tp = strstr(cbuf, *cp))) {
1341                 int newsize = atoi(tp + strlen(*cp));
1342 
1343                 if (newsize >= 1 && newsize <= 40) {
1344                     tok->tabsize = newsize;
1345                     if (Py_VerboseFlag)
1346                         PySys_WriteStderr(
1347                         "Tab size set to %d\n",
1348                         newsize);
1349                 }
1350             }
1351         }
1352         while (c != EOF && c != '\n')
1353             c = tok_nextc(tok);
1354     }
1355 
1356     /* Check for EOF and errors now */
1357     if (c == EOF) {
1358         return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1359     }
1360 
1361     /* Identifier (most frequent token!) */
1362     if (Py_ISALPHA(c) || c == '_') {
1363         /* Process r"", u"" and ur"" */
1364         switch (c) {
1365         case 'b':
1366         case 'B':
1367             c = tok_nextc(tok);
1368             if (c == 'r' || c == 'R')
1369                 c = tok_nextc(tok);
1370             if (c == '"' || c == '\'')
1371                 goto letter_quote;
1372             break;
1373         case 'r':
1374         case 'R':
1375             c = tok_nextc(tok);
1376             if (c == '"' || c == '\'')
1377                 goto letter_quote;
1378             break;
1379         case 'u':
1380         case 'U':
1381             c = tok_nextc(tok);
1382             if (c == 'r' || c == 'R')
1383                 c = tok_nextc(tok);
1384             if (c == '"' || c == '\'')
1385                 goto letter_quote;
1386             break;
1387         }
1388         while (c != EOF && (Py_ISALNUM(c) || c == '_')) {
1389             c = tok_nextc(tok);
1390         }
1391         tok_backup(tok, c);
1392         *p_start = tok->start;
1393         *p_end = tok->cur;
1394         return NAME;
1395     }
1396 
1397     /* Newline */
1398     if (c == '\n') {
1399         tok->atbol = 1;
1400         if (blankline || tok->level > 0)
1401             goto nextline;
1402         *p_start = tok->start;
1403         *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1404         tok->cont_line = 0;
1405         return NEWLINE;
1406     }
1407 
1408     /* Period or number starting with period? */
1409     if (c == '.') {
1410         c = tok_nextc(tok);
1411         if (isdigit(c)) {
1412             goto fraction;
1413         }
1414         else {
1415             tok_backup(tok, c);
1416             *p_start = tok->start;
1417             *p_end = tok->cur;
1418             return DOT;
1419         }
1420     }
1421 
1422     /* Number */
1423     if (isdigit(c)) {
1424         if (c == '0') {
1425             /* Hex, octal or binary -- maybe. */
1426             c = tok_nextc(tok);
1427             if (c == '.')
1428                 goto fraction;
1429 #ifndef WITHOUT_COMPLEX
1430             if (c == 'j' || c == 'J')
1431                 goto imaginary;
1432 #endif
1433             if (c == 'x' || c == 'X') {
1434 
1435                 /* Hex */
1436                 c = tok_nextc(tok);
1437                 if (!isxdigit(c)) {
1438                     tok->done = E_TOKEN;
1439                     tok_backup(tok, c);
1440                     return ERRORTOKEN;
1441                 }
1442                 do {
1443                     c = tok_nextc(tok);
1444                 } while (isxdigit(c));
1445             }
1446             else if (c == 'o' || c == 'O') {
1447                 /* Octal */
1448                 c = tok_nextc(tok);
1449                 if (c < '0' || c >= '8') {
1450                     tok->done = E_TOKEN;
1451                     tok_backup(tok, c);
1452                     return ERRORTOKEN;
1453                 }
1454                 do {
1455                     c = tok_nextc(tok);
1456                 } while ('0' <= c && c < '8');
1457             }
1458             else if (c == 'b' || c == 'B') {
1459                 /* Binary */
1460                 c = tok_nextc(tok);
1461                 if (c != '0' && c != '1') {
1462                     tok->done = E_TOKEN;
1463                     tok_backup(tok, c);
1464                     return ERRORTOKEN;
1465                 }
1466                 do {
1467                     c = tok_nextc(tok);
1468                 } while (c == '0' || c == '1');
1469             }
1470             else {
1471                 int found_decimal = 0;
1472                 /* Octal; c is first char of it */
1473                 /* There's no 'isoctdigit' macro, sigh */
1474                 while ('0' <= c && c < '8') {
1475                     c = tok_nextc(tok);
1476                 }
1477                 if (isdigit(c)) {
1478                     found_decimal = 1;
1479                     do {
1480                         c = tok_nextc(tok);
1481                     } while (isdigit(c));
1482                 }
1483                 if (c == '.')
1484                     goto fraction;
1485                 else if (c == 'e' || c == 'E')
1486                     goto exponent;
1487 #ifndef WITHOUT_COMPLEX
1488                 else if (c == 'j' || c == 'J')
1489                     goto imaginary;
1490 #endif
1491                 else if (found_decimal) {
1492                     tok->done = E_TOKEN;
1493                     tok_backup(tok, c);
1494                     return ERRORTOKEN;
1495                 }
1496             }
1497             if (c == 'l' || c == 'L')
1498                 c = tok_nextc(tok);
1499         }
1500         else {
1501             /* Decimal */
1502             do {
1503                 c = tok_nextc(tok);
1504             } while (isdigit(c));
1505             if (c == 'l' || c == 'L')
1506                 c = tok_nextc(tok);
1507             else {
1508                 /* Accept floating point numbers. */
1509                 if (c == '.') {
1510         fraction:
1511                     /* Fraction */
1512                     do {
1513                         c = tok_nextc(tok);
1514                     } while (isdigit(c));
1515                 }
1516                 if (c == 'e' || c == 'E') {
1517                     int e;
1518                   exponent:
1519                     e = c;
1520                     /* Exponent part */
1521                     c = tok_nextc(tok);
1522                     if (c == '+' || c == '-') {
1523                         c = tok_nextc(tok);
1524                         if (!isdigit(c)) {
1525                             tok->done = E_TOKEN;
1526                             tok_backup(tok, c);
1527                             return ERRORTOKEN;
1528                         }
1529                     } else if (!isdigit(c)) {
1530                         tok_backup(tok, c);
1531                         tok_backup(tok, e);
1532                         *p_start = tok->start;
1533                         *p_end = tok->cur;
1534                         return NUMBER;
1535                     }
1536                     do {
1537                         c = tok_nextc(tok);
1538                     } while (isdigit(c));
1539                 }
1540 #ifndef WITHOUT_COMPLEX
1541                 if (c == 'j' || c == 'J')
1542                     /* Imaginary part */
1543         imaginary:
1544                     c = tok_nextc(tok);
1545 #endif
1546             }
1547         }
1548         tok_backup(tok, c);
1549         *p_start = tok->start;
1550         *p_end = tok->cur;
1551         return NUMBER;
1552     }
1553 
1554   letter_quote:
1555     /* String */
1556     if (c == '\'' || c == '"') {
1557         Py_ssize_t quote2 = tok->cur - tok->start + 1;
1558         int quote = c;
1559         int triple = 0;
1560         int tripcount = 0;
1561         for (;;) {
1562             c = tok_nextc(tok);
1563             if (c == '\n') {
1564                 if (!triple) {
1565                     tok->done = E_EOLS;
1566                     tok_backup(tok, c);
1567                     return ERRORTOKEN;
1568                 }
1569                 tripcount = 0;
1570                 tok->cont_line = 1; /* multiline string. */
1571             }
1572             else if (c == EOF) {
1573                 if (triple)
1574                     tok->done = E_EOFS;
1575                 else
1576                     tok->done = E_EOLS;
1577                 tok->cur = tok->inp;
1578                 return ERRORTOKEN;
1579             }
1580             else if (c == quote) {
1581                 tripcount++;
1582                 if (tok->cur - tok->start == quote2) {
1583                     c = tok_nextc(tok);
1584                     if (c == quote) {
1585                         triple = 1;
1586                         tripcount = 0;
1587                         continue;
1588                     }
1589                     tok_backup(tok, c);
1590                 }
1591                 if (!triple || tripcount == 3)
1592                     break;
1593             }
1594             else if (c == '\\') {
1595                 tripcount = 0;
1596                 c = tok_nextc(tok);
1597                 if (c == EOF) {
1598                     tok->done = E_EOLS;
1599                     tok->cur = tok->inp;
1600                     return ERRORTOKEN;
1601                 }
1602             }
1603             else
1604                 tripcount = 0;
1605         }
1606         *p_start = tok->start;
1607         *p_end = tok->cur;
1608         return STRING;
1609     }
1610 
1611     /* Line continuation */
1612     if (c == '\\') {
1613         c = tok_nextc(tok);
1614         if (c != '\n') {
1615             tok->done = E_LINECONT;
1616             tok->cur = tok->inp;
1617             return ERRORTOKEN;
1618         }
1619         tok->cont_line = 1;
1620         goto again; /* Read next line */
1621     }
1622 
1623     /* Check for two-character token */
1624     {
1625         int c2 = tok_nextc(tok);
1626         int token = PyToken_TwoChars(c, c2);
1627 #ifndef PGEN
1628         if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1629             if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1630                                    "<> not supported in 3.x; use !=",
1631                                    tok->filename, tok->lineno,
1632                                    NULL, NULL)) {
1633                 return ERRORTOKEN;
1634             }
1635         }
1636 #endif
1637         if (token != OP) {
1638             int c3 = tok_nextc(tok);
1639             int token3 = PyToken_ThreeChars(c, c2, c3);
1640             if (token3 != OP) {
1641                 token = token3;
1642             } else {
1643                 tok_backup(tok, c3);
1644             }
1645             *p_start = tok->start;
1646             *p_end = tok->cur;
1647             return token;
1648         }
1649         tok_backup(tok, c2);
1650     }
1651 
1652     /* Keep track of parentheses nesting level */
1653     switch (c) {
1654     case '(':
1655     case '[':
1656     case '{':
1657         tok->level++;
1658         break;
1659     case ')':
1660     case ']':
1661     case '}':
1662         tok->level--;
1663         break;
1664     }
1665 
1666     /* Punctuation character */
1667     *p_start = tok->start;
1668     *p_end = tok->cur;
1669     return PyToken_OneChar(c);
1670 }
1671 
1672 int
1673 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1674 {
1675     int result = tok_get(tok, p_start, p_end);
1676     if (tok->decoding_erred) {
1677         result = ERRORTOKEN;
1678         tok->done = E_DECODE;
1679     }
1680     return result;
1681 }
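/* Usage sketch (illustrative only): a driver such as parsetok pulls
   tokens until ENDMARKER or an error, along these lines:

       struct tok_state *tok = PyTokenizer_FromString(source, 1);
       if (tok != NULL) {
           char *start, *end;
           int type;
           do {
               type = PyTokenizer_Get(tok, &start, &end);
               /* for most tokens, start..end delimit the token text in tok->buf */
           } while (type != ENDMARKER && type != ERRORTOKEN);
           PyTokenizer_Free(tok);
       }
*/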
1682 
1683 /* This function is only called from parsetok. However, it cannot live
1684    there, as it must be empty for PGEN, and we can check for PGEN only
1685    in this file. */
1686 
1687 #if defined(PGEN) || !defined(Py_USING_UNICODE)
1688 char*
1689 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
1690 {
1691     return NULL;
1692 }
1693 #else
1694 #ifdef Py_USING_UNICODE
1695 static PyObject *
1696 dec_utf8(const char *enc, const char *text, size_t len) {
1697     PyObject *ret = NULL;
1698     PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1699     if (unicode_text) {
1700         ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1701         Py_DECREF(unicode_text);
1702     }
1703     if (!ret) {
1704         PyErr_Clear();
1705     }
1706     return ret;
1707 }
1708 char *
1709 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1710 {
1711     char *text = NULL;
1712     if (tok->encoding) {
1713         /* convert source to original encondig */
1714         PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1715         if (lineobj != NULL) {
1716             int linelen = PyString_Size(lineobj);
1717             const char *line = PyString_AsString(lineobj);
1718             text = PyObject_MALLOC(linelen + 1);
1719             if (text != NULL && line != NULL) {
1720                 if (linelen)
1721                     strncpy(text, line, linelen);
1722                 text[linelen] = '\0';
1723             }
1724             Py_DECREF(lineobj);
1725 
1726             /* adjust error offset */
1727             if (*offset > 1) {
1728                 PyObject *offsetobj = dec_utf8(tok->encoding,
1729                                                tok->buf, *offset-1);
1730                 if (offsetobj) {
1731                     *offset = PyString_Size(offsetobj) + 1;
1732                     Py_DECREF(offsetobj);
1733                 }
1734             }
1735 
1736         }
1737     }
1738     return text;
1739 
1740 }
1741 #endif /* defined(Py_USING_UNICODE) */
1742 #endif
1743 
1744 
1745 #ifdef Py_DEBUG
1746 
1747 void
1748 tok_dump(int type, char *start, char *end)
1749 {
1750     printf("%s", _PyParser_TokenNames[type]);
1751     if (type == NAME || type == NUMBER || type == STRING || type == OP)
1752         printf("(%.*s)", (int)(end - start), start);
1753 }
1754 
1755 #endif
1756