#include "Python.h"
#include "errcode.h"
#include "internal/pycore_critical_section.h"   // Py_BEGIN_CRITICAL_SECTION
#include "../Parser/lexer/state.h"
#include "../Parser/lexer/lexer.h"
#include "../Parser/tokenizer/tokenizer.h"
#include "../Parser/pegen.h"                    // _PyPegen_byte_offset_to_character_offset()

static struct PyModuleDef _tokenizemodule;

typedef struct {
    PyTypeObject *TokenizerIter;
} tokenize_state;

static tokenize_state *
get_tokenize_state(PyObject *module) {
    return (tokenize_state *)PyModule_GetState(module);
}

#define _tokenize_get_state_by_type(type) \
    get_tokenize_state(PyType_GetModuleByDef(type, &_tokenizemodule))

#include "pycore_runtime.h"
#include "clinic/Python-tokenize.c.h"

/*[clinic input]
module _tokenizer
class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/

typedef struct
{
    PyObject_HEAD
    struct tok_state *tok;
    int done;

    /* The previous token's line is cached here, along with the running
       byte-to-character column offset difference, so that consecutive
       tokens on the same line do not have to re-decode it. */
    PyObject *last_line;
    Py_ssize_t last_lineno;
    Py_ssize_t last_end_lineno;
    Py_ssize_t byte_col_offset_diff;
} tokenizeriterobject;

/*[clinic input]
@classmethod
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new

    readline: object
    /
    *
    extra_tokens: bool
    encoding: str(c_default="NULL") = 'utf-8'
[clinic start generated code]*/

static PyObject *
tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
                       int extra_tokens, const char *encoding)
/*[clinic end generated code: output=7501a1211683ce16 input=f7dddf8a613ae8bd]*/
{
    tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
    if (self == NULL) {
        return NULL;
    }
    PyObject *filename = PyUnicode_FromString("<string>");
    if (filename == NULL) {
        return NULL;
    }
    self->tok = _PyTokenizer_FromReadline(readline, encoding, 1, 1);
    if (self->tok == NULL) {
        Py_DECREF(filename);
        return NULL;
    }
    self->tok->filename = filename;
    if (extra_tokens) {
        self->tok->tok_extra_tokens = 1;
    }
    self->done = 0;

    self->last_line = NULL;
    self->byte_col_offset_diff = 0;
    self->last_lineno = 0;
    self->last_end_lineno = 0;

    return (PyObject *)self;
}

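/* Translate the tokenizer's error state (tok->done) into a Python
   exception.  Most error codes become a SyntaxError (or the more specific
   IndentationError/TabError) carrying the filename, line number, offset
   and offending line, mirroring what the compiler reports. */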
static int
_tokenizer_error(tokenizeriterobject *it)
{
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
    if (PyErr_Occurred()) {
        return -1;
    }

    const char *msg = NULL;
    PyObject* errtype = PyExc_SyntaxError;
    struct tok_state *tok = it->tok;
    switch (tok->done) {
        case E_TOKEN:
            msg = "invalid token";
            break;
        case E_EOF:
            PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement");
            PyErr_SyntaxLocationObject(tok->filename, tok->lineno,
                                       tok->inp - tok->buf < 0 ? 0 : (int)(tok->inp - tok->buf));
            return -1;
        case E_DEDENT:
            msg = "unindent does not match any outer indentation level";
            errtype = PyExc_IndentationError;
            break;
        case E_INTR:
            if (!PyErr_Occurred()) {
                PyErr_SetNone(PyExc_KeyboardInterrupt);
            }
            return -1;
        case E_NOMEM:
            PyErr_NoMemory();
            return -1;
        case E_TABSPACE:
            errtype = PyExc_TabError;
            msg = "inconsistent use of tabs and spaces in indentation";
            break;
        case E_TOODEEP:
            errtype = PyExc_IndentationError;
            msg = "too many levels of indentation";
            break;
        case E_LINECONT: {
            msg = "unexpected character after line continuation character";
            break;
        }
        default:
            msg = "unknown tokenization error";
    }

    PyObject* errstr = NULL;
    PyObject* error_line = NULL;
    PyObject* tmp = NULL;
    PyObject* value = NULL;
    int result = 0;

    Py_ssize_t size = tok->inp - tok->buf;
    assert(tok->buf[size-1] == '\n');
    size -= 1; // Remove the newline character from the end of the line
    error_line = PyUnicode_DecodeUTF8(tok->buf, size, "replace");
    if (!error_line) {
        result = -1;
        goto exit;
    }

    Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset(error_line, tok->inp - tok->buf);
    if (offset == -1) {
        result = -1;
        goto exit;
    }
    tmp = Py_BuildValue("(OnnOOO)", tok->filename, tok->lineno, offset, error_line, Py_None, Py_None);
    if (!tmp) {
        result = -1;
        goto exit;
    }

    errstr = PyUnicode_FromString(msg);
    if (!errstr) {
        result = -1;
        goto exit;
    }

    value = PyTuple_Pack(2, errstr, tmp);
    if (!value) {
        result = -1;
        goto exit;
    }

    PyErr_SetObject(errtype, value);

exit:
    Py_XDECREF(errstr);
    Py_XDECREF(error_line);
    Py_XDECREF(tmp);
    Py_XDECREF(value);
    return result;
}

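/* Return the line the tokenizer is currently on, decoding it from UTF-8
   only when the line number changed since the previous token; otherwise
   the cached it->last_line is reused and *line_changed is cleared. */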
static PyObject *
_get_current_line(tokenizeriterobject *it, const char *line_start, Py_ssize_t size,
                  int *line_changed)
{
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
    PyObject *line;
    if (it->tok->lineno != it->last_lineno) {
        // Line has changed since last token, so we fetch the new line and cache it
        // in the iter object.
        Py_XDECREF(it->last_line);
        line = PyUnicode_DecodeUTF8(line_start, size, "replace");
        it->last_line = line;
        it->byte_col_offset_diff = 0;
    }
    else {
        line = it->last_line;
        *line_changed = 0;
    }
    return line;
}

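/* Compute the token's (col_offset, end_col_offset) in characters.  The
   tokenizer hands out byte offsets into the UTF-8 buffer, and those differ
   from character offsets whenever the line contains multi-byte characters.
   For example, in the line "x = 'é'" the closing quote sits at byte
   offset 7 but character offset 6, because 'é' occupies two bytes.  The
   running difference is cached in it->byte_col_offset_diff so that only
   newly seen text has to be scanned. */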
static void
_get_col_offsets(tokenizeriterobject *it, struct token token, const char *line_start,
                 PyObject *line, int line_changed, Py_ssize_t lineno, Py_ssize_t end_lineno,
                 Py_ssize_t *col_offset, Py_ssize_t *end_col_offset)
{
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
    Py_ssize_t byte_offset = -1;
    if (token.start != NULL && token.start >= line_start) {
        byte_offset = token.start - line_start;
        if (line_changed) {
            *col_offset = _PyPegen_byte_offset_to_character_offset_line(line, 0, byte_offset);
            it->byte_col_offset_diff = byte_offset - *col_offset;
        }
        else {
            *col_offset = byte_offset - it->byte_col_offset_diff;
        }
    }

    if (token.end != NULL && token.end >= it->tok->line_start) {
        Py_ssize_t end_byte_offset = token.end - it->tok->line_start;
        if (lineno == end_lineno) {
            // If the whole token is on the same line, we can just use the
            // token.start buffer to figure out the new column offset, since
            // scanning the entire line is not performant for very long lines.
            Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(line, byte_offset, end_byte_offset);
            *end_col_offset = *col_offset + token_col_offset;
            it->byte_col_offset_diff += token.end - token.start - token_col_offset;
        }
        else {
            *end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, end_byte_offset);
            it->byte_col_offset_diff += end_byte_offset - *end_col_offset;
        }
    }
    it->last_lineno = lineno;
    it->last_end_lineno = end_lineno;
}

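/* Produce the next token as a (type, string, (start_line, start_col),
   (end_line, end_col), line) tuple, the shape consumed by
   tokenize.TokenInfo.  The whole step runs inside a critical section so
   the cached line state cannot be mutated concurrently. */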
static PyObject *
tokenizeriter_next(tokenizeriterobject *it)
{
    PyObject* result = NULL;

    Py_BEGIN_CRITICAL_SECTION(it);

    struct token token;
    _PyToken_Init(&token);

    int type = _PyTokenizer_Get(it->tok, &token);
    if (type == ERRORTOKEN) {
        if (!PyErr_Occurred()) {
            _tokenizer_error(it);
            assert(PyErr_Occurred());
        }
        goto exit;
    }
    if (it->done || type == ERRORTOKEN) {
        PyErr_SetString(PyExc_StopIteration, "EOF");
        it->done = 1;
        goto exit;
    }
    PyObject *str = NULL;
    if (token.start == NULL || token.end == NULL) {
        str = PyUnicode_FromString("");
    }
    else {
        str = PyUnicode_FromStringAndSize(token.start, token.end - token.start);
    }
    if (str == NULL) {
        goto exit;
    }

    int is_trailing_token = 0;
    if (type == ENDMARKER || (type == DEDENT && it->tok->done == E_EOF)) {
        is_trailing_token = 1;
    }

    const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
    PyObject* line = NULL;
    int line_changed = 1;
    if (it->tok->tok_extra_tokens && is_trailing_token) {
        line = PyUnicode_FromString("");
    } else {
        Py_ssize_t size = it->tok->inp - line_start;
        if (size >= 1 && it->tok->implicit_newline) {
            size -= 1;
        }

        line = _get_current_line(it, line_start, size, &line_changed);
    }
    if (line == NULL) {
        Py_DECREF(str);
        goto exit;
    }

    Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
    Py_ssize_t end_lineno = it->tok->lineno;
    Py_ssize_t col_offset = -1;
    Py_ssize_t end_col_offset = -1;
    _get_col_offsets(it, token, line_start, line, line_changed,
                     lineno, end_lineno, &col_offset, &end_col_offset);

    if (it->tok->tok_extra_tokens) {
        if (is_trailing_token) {
            lineno = end_lineno = lineno + 1;
            col_offset = end_col_offset = 0;
        }
        // Necessary adjustments to match the original Python tokenize
        // implementation
        if (type > DEDENT && type < OP) {
            type = OP;
        }
        else if (type == NEWLINE) {
            Py_DECREF(str);
            if (!it->tok->implicit_newline) {
                if (it->tok->start[0] == '\r') {
                    str = PyUnicode_FromString("\r\n");
                } else {
                    str = PyUnicode_FromString("\n");
                }
            }
            else {
                // The trailing newline was synthesized by the tokenizer;
                // report it with an empty string (and never reuse the
                // decref'd str, which would be a dangling pointer).
                str = PyUnicode_FromString("");
            }
            end_col_offset++;
        }
        else if (type == NL) {
            if (it->tok->implicit_newline) {
                Py_DECREF(str);
                str = PyUnicode_FromString("");
            }
        }

        if (str == NULL) {
            Py_DECREF(line);
            goto exit;
        }
    }

    result = Py_BuildValue("(iN(nn)(nn)O)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
exit:
    _PyToken_Free(&token);
    if (type == ENDMARKER) {
        it->done = 1;
    }

    Py_END_CRITICAL_SECTION();
    return result;
}

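/* This is a heap type, so after freeing the instance the dealloc must
   also drop the reference the instance holds on its type. */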
static void
tokenizeriter_dealloc(tokenizeriterobject *it)
{
    PyTypeObject *tp = Py_TYPE(it);
    Py_XDECREF(it->last_line);
    _PyTokenizer_Free(it->tok);
    tp->tp_free(it);
    Py_DECREF(tp);
}

static PyType_Slot tokenizeriter_slots[] = {
    {Py_tp_new, tokenizeriter_new},
    {Py_tp_dealloc, tokenizeriter_dealloc},
    {Py_tp_getattro, PyObject_GenericGetAttr},
    {Py_tp_iter, PyObject_SelfIter},
    {Py_tp_iternext, tokenizeriter_next},
    {0, NULL},
};

static PyType_Spec tokenizeriter_spec = {
    .name = "_tokenize.TokenizerIter",
    .basicsize = sizeof(tokenizeriterobject),
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
    .slots = tokenizeriter_slots,
};

static int
tokenizemodule_exec(PyObject *m)
{
    tokenize_state *state = get_tokenize_state(m);
    if (state == NULL) {
        return -1;
    }

    state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(m, &tokenizeriter_spec, NULL);
    if (state->TokenizerIter == NULL) {
        return -1;
    }
    if (PyModule_AddType(m, state->TokenizerIter) < 0) {
        return -1;
    }

    return 0;
}

static PyMethodDef tokenize_methods[] = {
    {NULL, NULL, 0, NULL} /* Sentinel */
};

static PyModuleDef_Slot tokenizemodule_slots[] = {
    {Py_mod_exec, tokenizemodule_exec},
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
    {0, NULL}
};

static int
tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
{
    tokenize_state *state = get_tokenize_state(m);
    Py_VISIT(state->TokenizerIter);
    return 0;
}

static int
tokenizemodule_clear(PyObject *m)
{
    tokenize_state *state = get_tokenize_state(m);
    Py_CLEAR(state->TokenizerIter);
    return 0;
}

static void
tokenizemodule_free(void *m)
{
    tokenizemodule_clear((PyObject *)m);
}

static struct PyModuleDef _tokenizemodule = {
    PyModuleDef_HEAD_INIT,
    .m_name = "_tokenize",
    .m_size = sizeof(tokenize_state),
    .m_slots = tokenizemodule_slots,
    .m_methods = tokenize_methods,
    .m_traverse = tokenizemodule_traverse,
    .m_clear = tokenizemodule_clear,
    .m_free = tokenizemodule_free,
};

PyMODINIT_FUNC
PyInit__tokenize(void)
{
    return PyModuleDef_Init(&_tokenizemodule);
}
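
/* A minimal sketch of how this iterator is consumed from Python, loosely
   following Lib/tokenize.py.  The helper below is illustrative, not the
   stdlib's exact wrapper:

       import io
       from _tokenize import TokenizerIter

       def tokens(source: str):
           readline = io.StringIO(source).readline
           for info in TokenizerIter(readline, extra_tokens=True):
               # info is (type, string, (start_line, start_col),
               #          (end_line, end_col), line)
               yield info

       for tok in tokens("x = 1\n"):
           print(tok)
*/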