• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #include <stdbool.h>
2 
3 #include <Python.h>
4 #include "pycore_bytesobject.h"   // _PyBytes_DecodeEscape()
5 #include "pycore_unicodeobject.h" // _PyUnicode_DecodeUnicodeEscapeInternal()
6 
7 #include "lexer/state.h"
8 #include "pegen.h"
9 #include "string_parser.h"
10 
11 //// STRING HANDLING FUNCTIONS ////
12 
13 static int
warn_invalid_escape_sequence(Parser * p,const char * first_invalid_escape,Token * t)14 warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t)
15 {
16     if (p->call_invalid_rules) {
17         // Do not report warnings if we are in the second pass of the parser
18         // to avoid showing the warning twice.
19         return 0;
20     }
21     unsigned char c = (unsigned char)*first_invalid_escape;
22     if ((t->type == FSTRING_MIDDLE || t->type == FSTRING_END) && (c == '{' || c == '}')) {
23         // in this case the tokenizer has already emitted a warning,
24         // see Parser/tokenizer/helpers.c:warn_invalid_escape_sequence
25         return 0;
26     }
27 
28     int octal = ('4' <= c && c <= '7');
29     PyObject *msg =
30         octal
31         ? PyUnicode_FromFormat("invalid octal escape sequence '\\%.3s'",
32                                first_invalid_escape)
33         : PyUnicode_FromFormat("invalid escape sequence '\\%c'", c);
34     if (msg == NULL) {
35         return -1;
36     }
37     PyObject *category;
38     if (p->feature_version >= 12) {
39         category = PyExc_SyntaxWarning;
40     }
41     else {
42         category = PyExc_DeprecationWarning;
43     }
44     if (PyErr_WarnExplicitObject(category, msg, p->tok->filename,
45                                  t->lineno, NULL, NULL) < 0) {
46         if (PyErr_ExceptionMatches(category)) {
47             /* Replace the Syntax/DeprecationWarning exception with a SyntaxError
48                to get a more accurate error report */
49             PyErr_Clear();
50 
51             /* This is needed, in order for the SyntaxError to point to the token t,
52                since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
53                error location, if p->known_err_token is not set. */
54             p->known_err_token = t;
55             if (octal) {
56                 RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'",
57                                    first_invalid_escape);
58             }
59             else {
60                 RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c);
61             }
62         }
63         Py_DECREF(msg);
64         return -1;
65     }
66     Py_DECREF(msg);
67     return 0;
68 }
69 
70 static PyObject *
decode_utf8(const char ** sPtr,const char * end)71 decode_utf8(const char **sPtr, const char *end)
72 {
73     const char *s;
74     const char *t;
75     t = s = *sPtr;
76     while (s < end && (*s & 0x80)) {
77         s++;
78     }
79     *sPtr = s;
80     return PyUnicode_DecodeUTF8(t, s - t, NULL);
81 }
82 
83 static PyObject *
decode_unicode_with_escapes(Parser * parser,const char * s,size_t len,Token * t)84 decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
85 {
86     PyObject *v;
87     PyObject *u;
88     char *buf;
89     char *p;
90     const char *end;
91 
92     /* check for integer overflow */
93     if (len > (size_t)PY_SSIZE_T_MAX / 6) {
94         return NULL;
95     }
96     /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
97        "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
98     u = PyBytes_FromStringAndSize((char *)NULL, (Py_ssize_t)len * 6);
99     if (u == NULL) {
100         return NULL;
101     }
102     p = buf = PyBytes_AsString(u);
103     if (p == NULL) {
104         return NULL;
105     }
106     end = s + len;
107     while (s < end) {
108         if (*s == '\\') {
109             *p++ = *s++;
110             if (s >= end || *s & 0x80) {
111                 strcpy(p, "u005c");
112                 p += 5;
113                 if (s >= end) {
114                     break;
115                 }
116             }
117         }
118         if (*s & 0x80) {
119             PyObject *w;
120             int kind;
121             const void *data;
122             Py_ssize_t w_len;
123             Py_ssize_t i;
124             w = decode_utf8(&s, end);
125             if (w == NULL) {
126                 Py_DECREF(u);
127                 return NULL;
128             }
129             kind = PyUnicode_KIND(w);
130             data = PyUnicode_DATA(w);
131             w_len = PyUnicode_GET_LENGTH(w);
132             for (i = 0; i < w_len; i++) {
133                 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
134                 sprintf(p, "\\U%08x", chr);
135                 p += 10;
136             }
137             /* Should be impossible to overflow */
138             assert(p - buf <= PyBytes_GET_SIZE(u));
139             Py_DECREF(w);
140         }
141         else {
142             *p++ = *s++;
143         }
144     }
145     len = (size_t)(p - buf);
146     s = buf;
147 
148     const char *first_invalid_escape;
149     v = _PyUnicode_DecodeUnicodeEscapeInternal(s, (Py_ssize_t)len, NULL, NULL, &first_invalid_escape);
150 
151     // HACK: later we can simply pass the line no, since we don't preserve the tokens
152     // when we are decoding the string but we preserve the line numbers.
153     if (v != NULL && first_invalid_escape != NULL && t != NULL) {
154         if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) {
155             /* We have not decref u before because first_invalid_escape points
156                inside u. */
157             Py_XDECREF(u);
158             Py_DECREF(v);
159             return NULL;
160         }
161     }
162     Py_XDECREF(u);
163     return v;
164 }
165 
166 static PyObject *
decode_bytes_with_escapes(Parser * p,const char * s,Py_ssize_t len,Token * t)167 decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
168 {
169     const char *first_invalid_escape;
170     PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
171     if (result == NULL) {
172         return NULL;
173     }
174 
175     if (first_invalid_escape != NULL) {
176         if (warn_invalid_escape_sequence(p, first_invalid_escape, t) < 0) {
177             Py_DECREF(result);
178             return NULL;
179         }
180     }
181     return result;
182 }
183 
184 PyObject *
_PyPegen_decode_string(Parser * p,int raw,const char * s,size_t len,Token * t)185 _PyPegen_decode_string(Parser *p, int raw, const char *s, size_t len, Token *t)
186 {
187     if (raw) {
188         return PyUnicode_DecodeUTF8Stateful(s, (Py_ssize_t)len, NULL, NULL);
189     }
190     return decode_unicode_with_escapes(p, s, len, t);
191 }
192 
193 /* s must include the bracketing quote characters, and r, b &/or f prefixes
194     (if any), and embedded escape sequences (if any). (f-strings are handled by the parser)
195    _PyPegen_parse_string parses it, and returns the decoded Python string object. */
196 PyObject *
_PyPegen_parse_string(Parser * p,Token * t)197 _PyPegen_parse_string(Parser *p, Token *t)
198 {
199     const char *s = PyBytes_AsString(t->bytes);
200     if (s == NULL) {
201         return NULL;
202     }
203 
204     size_t len;
205     int quote = Py_CHARMASK(*s);
206     int bytesmode = 0;
207     int rawmode = 0;
208 
209     if (Py_ISALPHA(quote)) {
210         while (!bytesmode || !rawmode) {
211             if (quote == 'b' || quote == 'B') {
212                 quote =(unsigned char)*++s;
213                 bytesmode = 1;
214             }
215             else if (quote == 'u' || quote == 'U') {
216                 quote = (unsigned char)*++s;
217             }
218             else if (quote == 'r' || quote == 'R') {
219                 quote = (unsigned char)*++s;
220                 rawmode = 1;
221             }
222             else {
223                 break;
224             }
225         }
226     }
227 
228     if (quote != '\'' && quote != '\"') {
229         PyErr_BadInternalCall();
230         return NULL;
231     }
232 
233     /* Skip the leading quote char. */
234     s++;
235     len = strlen(s);
236     // gh-120155: 's' contains at least the trailing quote,
237     // so the code '--len' below is safe.
238     assert(len >= 1);
239 
240     if (len > INT_MAX) {
241         PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
242         return NULL;
243     }
244     if (s[--len] != quote) {
245         /* Last quote char must match the first. */
246         PyErr_BadInternalCall();
247         return NULL;
248     }
249     if (len >= 4 && s[0] == quote && s[1] == quote) {
250         /* A triple quoted string. We've already skipped one quote at
251            the start and one at the end of the string. Now skip the
252            two at the start. */
253         s += 2;
254         len -= 2;
255         /* And check that the last two match. */
256         if (s[--len] != quote || s[--len] != quote) {
257             PyErr_BadInternalCall();
258             return NULL;
259         }
260     }
261 
262     /* Avoid invoking escape decoding routines if possible. */
263     rawmode = rawmode || strchr(s, '\\') == NULL;
264     if (bytesmode) {
265         /* Disallow non-ASCII characters. */
266         const char *ch;
267         for (ch = s; *ch; ch++) {
268             if (Py_CHARMASK(*ch) >= 0x80) {
269                 RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
270                                    t,
271                                    "bytes can only contain ASCII "
272                                    "literal characters");
273                 return NULL;
274             }
275         }
276         if (rawmode) {
277             return PyBytes_FromStringAndSize(s, (Py_ssize_t)len);
278         }
279         return decode_bytes_with_escapes(p, s, (Py_ssize_t)len, t);
280     }
281     return _PyPegen_decode_string(p, rawmode, s, len, t);
282 }
283