1 #include <stdbool.h>
2
3 #include <Python.h>
4 #include "pycore_bytesobject.h" // _PyBytes_DecodeEscape()
5 #include "pycore_unicodeobject.h" // _PyUnicode_DecodeUnicodeEscapeInternal()
6
7 #include "lexer/state.h"
8 #include "pegen.h"
9 #include "string_parser.h"
10
11 //// STRING HANDLING FUNCTIONS ////
12
13 static int
warn_invalid_escape_sequence(Parser * p,const char * first_invalid_escape,Token * t)14 warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t)
15 {
16 if (p->call_invalid_rules) {
17 // Do not report warnings if we are in the second pass of the parser
18 // to avoid showing the warning twice.
19 return 0;
20 }
21 unsigned char c = (unsigned char)*first_invalid_escape;
22 if ((t->type == FSTRING_MIDDLE || t->type == FSTRING_END) && (c == '{' || c == '}')) {
23 // in this case the tokenizer has already emitted a warning,
24 // see Parser/tokenizer/helpers.c:warn_invalid_escape_sequence
25 return 0;
26 }
27
28 int octal = ('4' <= c && c <= '7');
29 PyObject *msg =
30 octal
31 ? PyUnicode_FromFormat("invalid octal escape sequence '\\%.3s'",
32 first_invalid_escape)
33 : PyUnicode_FromFormat("invalid escape sequence '\\%c'", c);
34 if (msg == NULL) {
35 return -1;
36 }
37 PyObject *category;
38 if (p->feature_version >= 12) {
39 category = PyExc_SyntaxWarning;
40 }
41 else {
42 category = PyExc_DeprecationWarning;
43 }
44 if (PyErr_WarnExplicitObject(category, msg, p->tok->filename,
45 t->lineno, NULL, NULL) < 0) {
46 if (PyErr_ExceptionMatches(category)) {
47 /* Replace the Syntax/DeprecationWarning exception with a SyntaxError
48 to get a more accurate error report */
49 PyErr_Clear();
50
51 /* This is needed, in order for the SyntaxError to point to the token t,
52 since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
53 error location, if p->known_err_token is not set. */
54 p->known_err_token = t;
55 if (octal) {
56 RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'",
57 first_invalid_escape);
58 }
59 else {
60 RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c);
61 }
62 }
63 Py_DECREF(msg);
64 return -1;
65 }
66 Py_DECREF(msg);
67 return 0;
68 }
69
70 static PyObject *
decode_utf8(const char ** sPtr,const char * end)71 decode_utf8(const char **sPtr, const char *end)
72 {
73 const char *s;
74 const char *t;
75 t = s = *sPtr;
76 while (s < end && (*s & 0x80)) {
77 s++;
78 }
79 *sPtr = s;
80 return PyUnicode_DecodeUTF8(t, s - t, NULL);
81 }
82
83 static PyObject *
decode_unicode_with_escapes(Parser * parser,const char * s,size_t len,Token * t)84 decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
85 {
86 PyObject *v;
87 PyObject *u;
88 char *buf;
89 char *p;
90 const char *end;
91
92 /* check for integer overflow */
93 if (len > (size_t)PY_SSIZE_T_MAX / 6) {
94 return NULL;
95 }
96 /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
97 "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
98 u = PyBytes_FromStringAndSize((char *)NULL, (Py_ssize_t)len * 6);
99 if (u == NULL) {
100 return NULL;
101 }
102 p = buf = PyBytes_AsString(u);
103 if (p == NULL) {
104 return NULL;
105 }
106 end = s + len;
107 while (s < end) {
108 if (*s == '\\') {
109 *p++ = *s++;
110 if (s >= end || *s & 0x80) {
111 strcpy(p, "u005c");
112 p += 5;
113 if (s >= end) {
114 break;
115 }
116 }
117 }
118 if (*s & 0x80) {
119 PyObject *w;
120 int kind;
121 const void *data;
122 Py_ssize_t w_len;
123 Py_ssize_t i;
124 w = decode_utf8(&s, end);
125 if (w == NULL) {
126 Py_DECREF(u);
127 return NULL;
128 }
129 kind = PyUnicode_KIND(w);
130 data = PyUnicode_DATA(w);
131 w_len = PyUnicode_GET_LENGTH(w);
132 for (i = 0; i < w_len; i++) {
133 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
134 sprintf(p, "\\U%08x", chr);
135 p += 10;
136 }
137 /* Should be impossible to overflow */
138 assert(p - buf <= PyBytes_GET_SIZE(u));
139 Py_DECREF(w);
140 }
141 else {
142 *p++ = *s++;
143 }
144 }
145 len = (size_t)(p - buf);
146 s = buf;
147
148 const char *first_invalid_escape;
149 v = _PyUnicode_DecodeUnicodeEscapeInternal(s, (Py_ssize_t)len, NULL, NULL, &first_invalid_escape);
150
151 // HACK: later we can simply pass the line no, since we don't preserve the tokens
152 // when we are decoding the string but we preserve the line numbers.
153 if (v != NULL && first_invalid_escape != NULL && t != NULL) {
154 if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) {
155 /* We have not decref u before because first_invalid_escape points
156 inside u. */
157 Py_XDECREF(u);
158 Py_DECREF(v);
159 return NULL;
160 }
161 }
162 Py_XDECREF(u);
163 return v;
164 }
165
166 static PyObject *
decode_bytes_with_escapes(Parser * p,const char * s,Py_ssize_t len,Token * t)167 decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
168 {
169 const char *first_invalid_escape;
170 PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
171 if (result == NULL) {
172 return NULL;
173 }
174
175 if (first_invalid_escape != NULL) {
176 if (warn_invalid_escape_sequence(p, first_invalid_escape, t) < 0) {
177 Py_DECREF(result);
178 return NULL;
179 }
180 }
181 return result;
182 }
183
184 PyObject *
_PyPegen_decode_string(Parser * p,int raw,const char * s,size_t len,Token * t)185 _PyPegen_decode_string(Parser *p, int raw, const char *s, size_t len, Token *t)
186 {
187 if (raw) {
188 return PyUnicode_DecodeUTF8Stateful(s, (Py_ssize_t)len, NULL, NULL);
189 }
190 return decode_unicode_with_escapes(p, s, len, t);
191 }
192
193 /* s must include the bracketing quote characters, and r, b &/or f prefixes
194 (if any), and embedded escape sequences (if any). (f-strings are handled by the parser)
195 _PyPegen_parse_string parses it, and returns the decoded Python string object. */
196 PyObject *
_PyPegen_parse_string(Parser * p,Token * t)197 _PyPegen_parse_string(Parser *p, Token *t)
198 {
199 const char *s = PyBytes_AsString(t->bytes);
200 if (s == NULL) {
201 return NULL;
202 }
203
204 size_t len;
205 int quote = Py_CHARMASK(*s);
206 int bytesmode = 0;
207 int rawmode = 0;
208
209 if (Py_ISALPHA(quote)) {
210 while (!bytesmode || !rawmode) {
211 if (quote == 'b' || quote == 'B') {
212 quote =(unsigned char)*++s;
213 bytesmode = 1;
214 }
215 else if (quote == 'u' || quote == 'U') {
216 quote = (unsigned char)*++s;
217 }
218 else if (quote == 'r' || quote == 'R') {
219 quote = (unsigned char)*++s;
220 rawmode = 1;
221 }
222 else {
223 break;
224 }
225 }
226 }
227
228 if (quote != '\'' && quote != '\"') {
229 PyErr_BadInternalCall();
230 return NULL;
231 }
232
233 /* Skip the leading quote char. */
234 s++;
235 len = strlen(s);
236 // gh-120155: 's' contains at least the trailing quote,
237 // so the code '--len' below is safe.
238 assert(len >= 1);
239
240 if (len > INT_MAX) {
241 PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
242 return NULL;
243 }
244 if (s[--len] != quote) {
245 /* Last quote char must match the first. */
246 PyErr_BadInternalCall();
247 return NULL;
248 }
249 if (len >= 4 && s[0] == quote && s[1] == quote) {
250 /* A triple quoted string. We've already skipped one quote at
251 the start and one at the end of the string. Now skip the
252 two at the start. */
253 s += 2;
254 len -= 2;
255 /* And check that the last two match. */
256 if (s[--len] != quote || s[--len] != quote) {
257 PyErr_BadInternalCall();
258 return NULL;
259 }
260 }
261
262 /* Avoid invoking escape decoding routines if possible. */
263 rawmode = rawmode || strchr(s, '\\') == NULL;
264 if (bytesmode) {
265 /* Disallow non-ASCII characters. */
266 const char *ch;
267 for (ch = s; *ch; ch++) {
268 if (Py_CHARMASK(*ch) >= 0x80) {
269 RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
270 t,
271 "bytes can only contain ASCII "
272 "literal characters");
273 return NULL;
274 }
275 }
276 if (rawmode) {
277 return PyBytes_FromStringAndSize(s, (Py_ssize_t)len);
278 }
279 return decode_bytes_with_escapes(p, s, (Py_ssize_t)len, t);
280 }
281 return _PyPegen_decode_string(p, rawmode, s, len, t);
282 }
283