1 #include "Python.h"
2 #include "errcode.h"
3
4 #include "helpers.h"
5 #include "../lexer/state.h"
6
7 static int
tok_underflow_string(struct tok_state * tok)8 tok_underflow_string(struct tok_state *tok) {
9 char *end = strchr(tok->inp, '\n');
10 if (end != NULL) {
11 end++;
12 }
13 else {
14 end = strchr(tok->inp, '\0');
15 if (end == tok->inp) {
16 tok->done = E_EOF;
17 return 0;
18 }
19 }
20 if (tok->start == NULL) {
21 tok->buf = tok->cur;
22 }
23 tok->line_start = tok->cur;
24 ADVANCE_LINENO();
25 tok->inp = end;
26 return 1;
27 }
28
29 /* Fetch a byte from TOK, using the string buffer. */
30 static int
buf_getc(struct tok_state * tok)31 buf_getc(struct tok_state *tok) {
32 return Py_CHARMASK(*tok->str++);
33 }
34
35 /* Unfetch a byte from TOK, using the string buffer. */
36 static void
buf_ungetc(int c,struct tok_state * tok)37 buf_ungetc(int c, struct tok_state *tok) {
38 tok->str--;
39 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
40 }
41
/* Set the readline function for TOK to ENC. For the string-based
   tokenizer, this means to just record the encoding; actual decoding
   is performed later by decode_str(). Always succeeds (returns 1).
   NOTE(review): ENC is stored without copying — presumably it must
   outlive TOK; confirm against callers. */
static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    tok->enc = enc;
    return 1;
}
49
/* Decode a byte string STR for use as the buffer of TOK.
   Look for encoding declarations inside STR, and record them
   inside TOK.

   Steps: (1) translate newlines, (2) strip/handle a BOM, (3) scan the
   first two lines for a PEP 263 coding spec, (4) if an encoding was
   found, re-decode the buffer into UTF-8.

   Returns the (possibly re-decoded) buffer, or NULL on error.  When a
   UTF-8 bytes object is created, ownership is transferred to
   tok->decoding_buffer so it is released with the tokenizer. */
static char *
decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf)
{
    PyObject* utf8 = NULL;
    char *str;
    const char *s;
    /* Start positions of the first two newlines, if any: the coding
       spec may only appear on line 1 or line 2. */
    const char *newl[2] = {NULL, NULL};
    int lineno = 0;
    tok->input = str = _PyTokenizer_translate_newlines(input, single, preserve_crlf, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    /* check_bom may consume a BOM via buf_getc/buf_ungetc and can set
       tok->enc (e.g. for a UTF-8 BOM). */
    if (!_PyTokenizer_check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return _PyTokenizer_error_ret(tok);
    str = tok->str; /* string after BOM if any */
    assert(str);
    if (tok->enc != NULL) {
        utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return _PyTokenizer_error_ret(tok);
        str = PyBytes_AsString(utf8);
    }
    /* Record where the first two lines end. */
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    /* Reset so we can tell whether the coding-spec scan below finds an
       explicit declaration. */
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!_PyTokenizer_check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
            return NULL;
        }
        /* Only look at line 2 if line 1 produced nothing and we are
           still allowed to (decoding_state not yet normal). */
        if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
            if (!_PyTokenizer_check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                                tok, buf_setreadl))
                return NULL;
        }
    }
    if (tok->enc != NULL) {
        /* A coding spec was found; the BOM path above cannot also have
           produced a utf8 object. */
        assert(utf8 == NULL);
        utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return _PyTokenizer_error_ret(tok);
        str = PyBytes_AS_STRING(utf8);
    }
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION: ownership moves to tok */
    return str;
}
109
110 /* Set up tokenizer for string */
111 struct tok_state *
_PyTokenizer_FromString(const char * str,int exec_input,int preserve_crlf)112 _PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
113 {
114 struct tok_state *tok = _PyTokenizer_tok_new();
115 char *decoded;
116
117 if (tok == NULL)
118 return NULL;
119 decoded = decode_str(str, exec_input, tok, preserve_crlf);
120 if (decoded == NULL) {
121 _PyTokenizer_Free(tok);
122 return NULL;
123 }
124
125 tok->buf = tok->cur = tok->inp = decoded;
126 tok->end = decoded;
127 tok->underflow = &tok_underflow_string;
128 return tok;
129 }
130