1 #include "Python.h"
2 #include "errcode.h"
3 #include "internal/pycore_critical_section.h" // Py_BEGIN_CRITICAL_SECTION
4 #include "../Parser/lexer/state.h"
5 #include "../Parser/lexer/lexer.h"
6 #include "../Parser/tokenizer/tokenizer.h"
7 #include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset()
8
9 static struct PyModuleDef _tokenizemodule;
10
/* Per-module state: holds the TokenizerIter heap type created in
   tokenizemodule_exec(). */
typedef struct {
    PyTypeObject *TokenizerIter;
} tokenize_state;
14
/* Fetch this module's per-interpreter state from the module object. */
static tokenize_state *
get_tokenize_state(PyObject *module) {
    return (tokenize_state *)PyModule_GetState(module);
}
19
20 #define _tokenize_get_state_by_type(type) \
21 get_tokenize_state(PyType_GetModuleByDef(type, &_tokenizemodule))
22
23 #include "pycore_runtime.h"
24 #include "clinic/Python-tokenize.c.h"
25
26 /*[clinic input]
27 module _tokenizer
28 class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
29 [clinic start generated code]*/
30 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/
31
/* Instance layout for _tokenize.TokenizerIter. */
typedef struct
{
    PyObject_HEAD struct tok_state *tok;  // underlying C tokenizer state (owned)
    int done;                             // set once ENDMARKER/StopIteration was reached

    /* Needed to cache line for performance */
    PyObject *last_line;                  // decoded unicode of the most recent line (owned)
    Py_ssize_t last_lineno;               // tok->lineno at the time last_line was cached
    Py_ssize_t last_end_lineno;           // end line of the last token returned
    Py_ssize_t byte_col_offset_diff;      // byte-vs-character offset delta within last_line
} tokenizeriterobject;
43
44 /*[clinic input]
45 @classmethod
46 _tokenizer.tokenizeriter.__new__ as tokenizeriter_new
47
48 readline: object
49 /
50 *
51 extra_tokens: bool
52 encoding: str(c_default="NULL") = 'utf-8'
53 [clinic start generated code]*/
54
55 static PyObject *
tokenizeriter_new_impl(PyTypeObject * type,PyObject * readline,int extra_tokens,const char * encoding)56 tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
57 int extra_tokens, const char *encoding)
58 /*[clinic end generated code: output=7501a1211683ce16 input=f7dddf8a613ae8bd]*/
59 {
60 tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
61 if (self == NULL) {
62 return NULL;
63 }
64 PyObject *filename = PyUnicode_FromString("<string>");
65 if (filename == NULL) {
66 return NULL;
67 }
68 self->tok = _PyTokenizer_FromReadline(readline, encoding, 1, 1);
69 if (self->tok == NULL) {
70 Py_DECREF(filename);
71 return NULL;
72 }
73 self->tok->filename = filename;
74 if (extra_tokens) {
75 self->tok->tok_extra_tokens = 1;
76 }
77 self->done = 0;
78
79 self->last_line = NULL;
80 self->byte_col_offset_diff = 0;
81 self->last_lineno = 0;
82 self->last_end_lineno = 0;
83
84 return (PyObject *)self;
85 }
86
/* Translate the tokenizer's error state (it->tok->done) into a pending
 * Python exception.  If an exception is already set it is left untouched.
 * Returns -1 on the early error paths, 0 after PyErr_SetObject() at the
 * end; the caller only checks that *some* exception is pending afterwards.
 * Must be called with the iterator's critical section held. */
static int
_tokenizer_error(tokenizeriterobject *it)
{
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
    if (PyErr_Occurred()) {
        return -1;
    }

    const char *msg = NULL;
    PyObject* errtype = PyExc_SyntaxError;
    struct tok_state *tok = it->tok;
    switch (tok->done) {
        case E_TOKEN:
            msg = "invalid token";
            break;
        case E_EOF:
            // EOF errors get their location attached directly and return early.
            PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement");
            PyErr_SyntaxLocationObject(tok->filename, tok->lineno,
                                       tok->inp - tok->buf < 0 ? 0 : (int)(tok->inp - tok->buf));
            return -1;
        case E_DEDENT:
            msg = "unindent does not match any outer indentation level";
            errtype = PyExc_IndentationError;
            break;
        case E_INTR:
            if (!PyErr_Occurred()) {
                PyErr_SetNone(PyExc_KeyboardInterrupt);
            }
            return -1;
        case E_NOMEM:
            PyErr_NoMemory();
            return -1;
        case E_TABSPACE:
            errtype = PyExc_TabError;
            msg = "inconsistent use of tabs and spaces in indentation";
            break;
        case E_TOODEEP:
            errtype = PyExc_IndentationError;
            msg = "too many levels of indentation";
            break;
        case E_LINECONT: {
            msg = "unexpected character after line continuation character";
            break;
        }
        default:
            msg = "unknown tokenization error";
    }

    PyObject* errstr = NULL;
    PyObject* error_line = NULL;
    PyObject* tmp = NULL;
    PyObject* value = NULL;
    int result = 0;

    // tok->buf..tok->inp holds the current input line; the assert documents
    // that it is always newline-terminated at this point.
    Py_ssize_t size = tok->inp - tok->buf;
    assert(tok->buf[size-1] == '\n');
    size -= 1; // Remove the newline character from the end of the line
    error_line = PyUnicode_DecodeUTF8(tok->buf, size, "replace");
    if (!error_line) {
        result = -1;
        goto exit;
    }

    // Convert the byte offset of the read position into a character offset
    // for the caret position.
    Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset(error_line, tok->inp - tok->buf);
    if (offset == -1) {
        result = -1;
        goto exit;
    }
    // (filename, lineno, offset, text, end_lineno, end_offset) — the shape
    // SyntaxError expects as its second argument.
    tmp = Py_BuildValue("(OnnOOO)", tok->filename, tok->lineno, offset, error_line, Py_None, Py_None);
    if (!tmp) {
        result = -1;
        goto exit;
    }

    errstr = PyUnicode_FromString(msg);
    if (!errstr) {
        result = -1;
        goto exit;
    }

    value = PyTuple_Pack(2, errstr, tmp);
    if (!value) {
        result = -1;
        goto exit;
    }

    PyErr_SetObject(errtype, value);

exit:
    // goto-based cleanup: every owned reference is released exactly once.
    Py_XDECREF(errstr);
    Py_XDECREF(error_line);
    Py_XDECREF(tmp);
    Py_XDECREF(value);
    return result;
}
182
/* Return the unicode object for the line the current token lives on.
 * The decoded line is cached on the iterator and only re-decoded when
 * tok->lineno moved past the cached last_lineno; on a cache hit
 * *line_changed is set to 0 (the caller initializes it to 1).  The
 * returned reference is owned by the cache (it->last_line) and may be
 * NULL if decoding failed.  Must be called with the critical section
 * held. */
static PyObject *
_get_current_line(tokenizeriterobject *it, const char *line_start, Py_ssize_t size,
                  int *line_changed)
{
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
    PyObject *line;
    if (it->tok->lineno != it->last_lineno) {
        // Line has changed since last token, so we fetch the new line and cache it
        // in the iter object.
        Py_XDECREF(it->last_line);
        line = PyUnicode_DecodeUTF8(line_start, size, "replace");
        it->last_line = line;              // cache owns the new reference
        it->byte_col_offset_diff = 0;      // fresh line: reset the byte->char delta
    }
    else {
        line = it->last_line;
        *line_changed = 0;
    }
    return line;
}
203
/* Compute character (not byte) column offsets for `token` within `line`.
 * With multi-byte UTF-8 characters, byte offsets into the buffer differ
 * from character offsets; it->byte_col_offset_diff accumulates that
 * difference for the cached line so later tokens on the same line can be
 * converted with plain arithmetic instead of re-scanning the line.
 * Also records lineno/end_lineno into the iterator as a side effect.
 * Must be called with the critical section held. */
static void
_get_col_offsets(tokenizeriterobject *it, struct token token, const char *line_start,
                 PyObject *line, int line_changed, Py_ssize_t lineno, Py_ssize_t end_lineno,
                 Py_ssize_t *col_offset, Py_ssize_t *end_col_offset)
{
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
    Py_ssize_t byte_offset = -1;
    if (token.start != NULL && token.start >= line_start) {
        byte_offset = token.start - line_start;
        if (line_changed) {
            // First token on a fresh line: full byte->char conversion, then
            // remember the delta for subsequent tokens on this line.
            *col_offset = _PyPegen_byte_offset_to_character_offset_line(line, 0, byte_offset);
            it->byte_col_offset_diff = byte_offset - *col_offset;
        }
        else {
            // Same line as the previous token: reuse the cached delta.
            *col_offset = byte_offset - it->byte_col_offset_diff;
        }
    }

    if (token.end != NULL && token.end >= it->tok->line_start) {
        Py_ssize_t end_byte_offset = token.end - it->tok->line_start;
        if (lineno == end_lineno) {
            // If the whole token is at the same line, we can just use the token.start
            // buffer for figuring out the new column offset, since using line is not
            // performant for very long lines.
            Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(line, byte_offset, end_byte_offset);
            *end_col_offset = *col_offset + token_col_offset;
            it->byte_col_offset_diff += token.end - token.start - token_col_offset;
        }
        else {
            // Multi-line token: convert against the raw buffer of its last line.
            *end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, end_byte_offset);
            it->byte_col_offset_diff += end_byte_offset - *end_col_offset;
        }
    }
    it->last_lineno = lineno;
    it->last_end_lineno = end_lineno;
}
240
241 static PyObject *
tokenizeriter_next(tokenizeriterobject * it)242 tokenizeriter_next(tokenizeriterobject *it)
243 {
244 PyObject* result = NULL;
245
246 Py_BEGIN_CRITICAL_SECTION(it);
247
248 struct token token;
249 _PyToken_Init(&token);
250
251 int type = _PyTokenizer_Get(it->tok, &token);
252 if (type == ERRORTOKEN) {
253 if(!PyErr_Occurred()) {
254 _tokenizer_error(it);
255 assert(PyErr_Occurred());
256 }
257 goto exit;
258 }
259 if (it->done || type == ERRORTOKEN) {
260 PyErr_SetString(PyExc_StopIteration, "EOF");
261 it->done = 1;
262 goto exit;
263 }
264 PyObject *str = NULL;
265 if (token.start == NULL || token.end == NULL) {
266 str = PyUnicode_FromString("");
267 }
268 else {
269 str = PyUnicode_FromStringAndSize(token.start, token.end - token.start);
270 }
271 if (str == NULL) {
272 goto exit;
273 }
274
275 int is_trailing_token = 0;
276 if (type == ENDMARKER || (type == DEDENT && it->tok->done == E_EOF)) {
277 is_trailing_token = 1;
278 }
279
280 const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
281 PyObject* line = NULL;
282 int line_changed = 1;
283 if (it->tok->tok_extra_tokens && is_trailing_token) {
284 line = PyUnicode_FromString("");
285 } else {
286 Py_ssize_t size = it->tok->inp - line_start;
287 if (size >= 1 && it->tok->implicit_newline) {
288 size -= 1;
289 }
290
291 line = _get_current_line(it, line_start, size, &line_changed);
292 }
293 if (line == NULL) {
294 Py_DECREF(str);
295 goto exit;
296 }
297
298 Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
299 Py_ssize_t end_lineno = it->tok->lineno;
300 Py_ssize_t col_offset = -1;
301 Py_ssize_t end_col_offset = -1;
302 _get_col_offsets(it, token, line_start, line, line_changed,
303 lineno, end_lineno, &col_offset, &end_col_offset);
304
305 if (it->tok->tok_extra_tokens) {
306 if (is_trailing_token) {
307 lineno = end_lineno = lineno + 1;
308 col_offset = end_col_offset = 0;
309 }
310 // Necessary adjustments to match the original Python tokenize
311 // implementation
312 if (type > DEDENT && type < OP) {
313 type = OP;
314 }
315 else if (type == NEWLINE) {
316 Py_DECREF(str);
317 if (!it->tok->implicit_newline) {
318 if (it->tok->start[0] == '\r') {
319 str = PyUnicode_FromString("\r\n");
320 } else {
321 str = PyUnicode_FromString("\n");
322 }
323 }
324 end_col_offset++;
325 }
326 else if (type == NL) {
327 if (it->tok->implicit_newline) {
328 Py_DECREF(str);
329 str = PyUnicode_FromString("");
330 }
331 }
332
333 if (str == NULL) {
334 Py_DECREF(line);
335 goto exit;
336 }
337 }
338
339 result = Py_BuildValue("(iN(nn)(nn)O)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
340 exit:
341 _PyToken_Free(&token);
342 if (type == ENDMARKER) {
343 it->done = 1;
344 }
345
346 Py_END_CRITICAL_SECTION();
347 return result;
348 }
349
/* tp_dealloc for TokenizerIter instances: release the cached line and the
 * C tokenizer state, free the instance, then drop the reference the
 * instance holds on its (heap) type. */
static void
tokenizeriter_dealloc(tokenizeriterobject *it)
{
    PyTypeObject *tp = Py_TYPE(it);
    Py_XDECREF(it->last_line);
    _PyTokenizer_Free(it->tok);
    tp->tp_free(it);
    Py_DECREF(tp);  // must come after tp_free: heap types outlive their instances
}
359
/* Type slots: clinic-generated constructor, iterator protocol, generic attrs. */
static PyType_Slot tokenizeriter_slots[] = {
    {Py_tp_new, tokenizeriter_new},
    {Py_tp_dealloc, tokenizeriter_dealloc},
    {Py_tp_getattro, PyObject_GenericGetAttr},
    {Py_tp_iter, PyObject_SelfIter},
    {Py_tp_iternext, tokenizeriter_next},
    {0, NULL},
};
368
/* Spec for the TokenizerIter heap type, instantiated per module in
 * tokenizemodule_exec(). */
static PyType_Spec tokenizeriter_spec = {
    .name = "_tokenize.TokenizerIter",
    .basicsize = sizeof(tokenizeriterobject),
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
    .slots = tokenizeriter_slots,
};
375
376 static int
tokenizemodule_exec(PyObject * m)377 tokenizemodule_exec(PyObject *m)
378 {
379 tokenize_state *state = get_tokenize_state(m);
380 if (state == NULL) {
381 return -1;
382 }
383
384 state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(m, &tokenizeriter_spec, NULL);
385 if (state->TokenizerIter == NULL) {
386 return -1;
387 }
388 if (PyModule_AddType(m, state->TokenizerIter) < 0) {
389 return -1;
390 }
391
392 return 0;
393 }
394
/* No module-level functions: the module only exposes the TokenizerIter type. */
static PyMethodDef tokenize_methods[] = {
    {NULL, NULL, 0, NULL} /* Sentinel */
};
398
/* Multi-phase init slots: exec hook, per-interpreter GIL support, and
 * free-threaded-build compatibility. */
static PyModuleDef_Slot tokenizemodule_slots[] = {
    {Py_mod_exec, tokenizemodule_exec},
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
    {0, NULL}
};
405
/* GC traverse: visit the heap type held in the module state.
 * (Py_VISIT expands using the `visit` and `arg` parameter names.) */
static int
tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
{
    tokenize_state *state = get_tokenize_state(m);
    Py_VISIT(state->TokenizerIter);
    return 0;
}
413
/* GC clear: drop the module state's strong reference to the heap type. */
static int
tokenizemodule_clear(PyObject *m)
{
    tokenize_state *state = get_tokenize_state(m);
    Py_CLEAR(state->TokenizerIter);
    return 0;
}
421
/* m_free hook: deallocation reuses the clear logic (always returns 0). */
static void
tokenizemodule_free(void *m)
{
    tokenizemodule_clear((PyObject *)m);
}
427
/* Module definition; m_size > 0 gives each interpreter its own
 * tokenize_state instance. */
static struct PyModuleDef _tokenizemodule = {
    PyModuleDef_HEAD_INIT,
    .m_name = "_tokenize",
    .m_size = sizeof(tokenize_state),
    .m_slots = tokenizemodule_slots,
    .m_methods = tokenize_methods,
    .m_traverse = tokenizemodule_traverse,
    .m_clear = tokenizemodule_clear,
    .m_free = tokenizemodule_free,
};
438
/* Module entry point: multi-phase initialization via PyModuleDef_Init. */
PyMODINIT_FUNC
PyInit__tokenize(void)
{
    return PyModuleDef_Init(&_tokenizemodule);
}
444