• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Secret Labs' Regular Expression Engine
3  *
4  * regular expression matching engine
5  *
6  * partial history:
7  * 1999-10-24 fl   created (based on existing template matcher code)
8  * 2000-03-06 fl   first alpha, sort of
9  * 2000-08-01 fl   fixes for 1.6b1
10  * 2000-08-07 fl   use PyOS_CheckStack() if available
11  * 2000-09-20 fl   added expand method
12  * 2001-03-20 fl   lots of fixes for 2.1b2
13  * 2001-04-15 fl   export copyright as Python attribute, not global
14  * 2001-04-28 fl   added __copy__ methods (work in progress)
15  * 2001-05-14 fl   fixes for 1.5.2 compatibility
16  * 2001-07-01 fl   added BIGCHARSET support (from Martin von Loewis)
17  * 2001-10-18 fl   fixed group reset issue (from Matthew Mueller)
18  * 2001-10-20 fl   added split primitive; re-enable unicode for 1.6/2.0/2.1
19  * 2001-10-21 fl   added sub/subn primitive
20  * 2001-10-24 fl   added finditer primitive (for 2.2 only)
21  * 2001-12-07 fl   fixed memory leak in sub/subn (Guido van Rossum)
22  * 2002-11-09 fl   fixed empty sub/subn return type
23  * 2003-04-18 mvl  fully support 4-byte codes
24  * 2003-10-17 gn   implemented non recursive scheme
25  * 2013-02-04 mrab added fullmatch primitive
26  *
27  * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
28  *
29  * This version of the SRE library can be redistributed under CNRI's
30  * Python 1.6 license.  For any other use, please contact Secret Labs
31  * AB (info@pythonware.com).
32  *
33  * Portions of this engine have been developed in cooperation with
34  * CNRI.  Hewlett-Packard provided funding for 1.6 integration and
35  * other compatibility work.
36  */
37 
38 static const char copyright[] =
39     " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
40 
41 #define PY_SSIZE_T_CLEAN
42 
43 #include "Python.h"
44 #include "pycore_long.h"          // _PyLong_GetZero()
45 #include "pycore_moduleobject.h"  // _PyModule_GetState()
46 #include "structmember.h"         // PyMemberDef
47 
48 #include "sre.h"
49 
50 #define SRE_CODE_BITS (8 * sizeof(SRE_CODE))
51 
52 #include <ctype.h>
53 
54 /* name of this module, minus the leading underscore */
55 #if !defined(SRE_MODULE)
56 #define SRE_MODULE "sre"
57 #endif
58 
59 #define SRE_PY_MODULE "re"
60 
61 /* defining this one enables tracing */
62 #undef VERBOSE
63 
64 /* -------------------------------------------------------------------- */
65 
66 #if defined(_MSC_VER)
67 #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
68 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
69 /* fastest possible local call under MSVC */
70 #define LOCAL(type) static __inline type __fastcall
71 #else
72 #define LOCAL(type) static inline type
73 #endif
74 
75 /* error codes */
76 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
77 #define SRE_ERROR_STATE -2 /* illegal state */
78 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
79 #define SRE_ERROR_MEMORY -9 /* out of memory */
80 #define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
81 
82 #if defined(VERBOSE)
83 #define TRACE(v) printf v
84 #else
85 #define TRACE(v)
86 #endif
87 
88 /* -------------------------------------------------------------------- */
89 /* search engine state */
90 
91 #define SRE_IS_DIGIT(ch)\
92     ((ch) <= '9' && Py_ISDIGIT(ch))
93 #define SRE_IS_SPACE(ch)\
94     ((ch) <= ' ' && Py_ISSPACE(ch))
95 #define SRE_IS_LINEBREAK(ch)\
96     ((ch) == '\n')
97 #define SRE_IS_WORD(ch)\
98     ((ch) <= 'z' && (Py_ISALNUM(ch) || (ch) == '_'))
99 
/* Lower-case `ch` via Py_TOLOWER when it is below 128; every other
   value is handed back unchanged. */
static unsigned int sre_lower_ascii(unsigned int ch)
{
    if (ch < 128) {
        return Py_TOLOWER(ch);
    }
    return ch;
}
104 
105 /* locale-specific character predicates */
106 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
107  * warnings when c's type supports only numbers < N+1 */
108 #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
109 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
110 
/* Lower-case `ch` with the C library's tolower() when it is below 256;
   larger values are returned unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch >= 256) {
        return ch;
    }
    return (unsigned int)tolower(ch);
}
115 
/* Upper-case `ch` with the C library's toupper() when it is below 256;
   larger values are returned unchanged. */
static unsigned int sre_upper_locale(unsigned int ch)
{
    if (ch >= 256) {
        return ch;
    }
    return (unsigned int)toupper(ch);
}
120 
121 /* unicode-specific character predicates */
122 
123 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
124 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE(ch)
125 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
126 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM(ch)
127 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM(ch) || (ch) == '_')
128 
/* Map `ch` through Py_UNICODE_TOLOWER and return the result as an
   unsigned int. */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    unsigned int lowered = (unsigned int) Py_UNICODE_TOLOWER(ch);
    return lowered;
}
133 
/* Map `ch` through Py_UNICODE_TOUPPER and return the result as an
   unsigned int. */
static unsigned int sre_upper_unicode(unsigned int ch)
{
    unsigned int raised = (unsigned int) Py_UNICODE_TOUPPER(ch);
    return raised;
}
138 
139 LOCAL(int)
sre_category(SRE_CODE category,unsigned int ch)140 sre_category(SRE_CODE category, unsigned int ch)
141 {
142     switch (category) {
143 
144     case SRE_CATEGORY_DIGIT:
145         return SRE_IS_DIGIT(ch);
146     case SRE_CATEGORY_NOT_DIGIT:
147         return !SRE_IS_DIGIT(ch);
148     case SRE_CATEGORY_SPACE:
149         return SRE_IS_SPACE(ch);
150     case SRE_CATEGORY_NOT_SPACE:
151         return !SRE_IS_SPACE(ch);
152     case SRE_CATEGORY_WORD:
153         return SRE_IS_WORD(ch);
154     case SRE_CATEGORY_NOT_WORD:
155         return !SRE_IS_WORD(ch);
156     case SRE_CATEGORY_LINEBREAK:
157         return SRE_IS_LINEBREAK(ch);
158     case SRE_CATEGORY_NOT_LINEBREAK:
159         return !SRE_IS_LINEBREAK(ch);
160 
161     case SRE_CATEGORY_LOC_WORD:
162         return SRE_LOC_IS_WORD(ch);
163     case SRE_CATEGORY_LOC_NOT_WORD:
164         return !SRE_LOC_IS_WORD(ch);
165 
166     case SRE_CATEGORY_UNI_DIGIT:
167         return SRE_UNI_IS_DIGIT(ch);
168     case SRE_CATEGORY_UNI_NOT_DIGIT:
169         return !SRE_UNI_IS_DIGIT(ch);
170     case SRE_CATEGORY_UNI_SPACE:
171         return SRE_UNI_IS_SPACE(ch);
172     case SRE_CATEGORY_UNI_NOT_SPACE:
173         return !SRE_UNI_IS_SPACE(ch);
174     case SRE_CATEGORY_UNI_WORD:
175         return SRE_UNI_IS_WORD(ch);
176     case SRE_CATEGORY_UNI_NOT_WORD:
177         return !SRE_UNI_IS_WORD(ch);
178     case SRE_CATEGORY_UNI_LINEBREAK:
179         return SRE_UNI_IS_LINEBREAK(ch);
180     case SRE_CATEGORY_UNI_NOT_LINEBREAK:
181         return !SRE_UNI_IS_LINEBREAK(ch);
182     }
183     return 0;
184 }
185 
186 LOCAL(int)
char_loc_ignore(SRE_CODE pattern,SRE_CODE ch)187 char_loc_ignore(SRE_CODE pattern, SRE_CODE ch)
188 {
189     return ch == pattern
190         || (SRE_CODE) sre_lower_locale(ch) == pattern
191         || (SRE_CODE) sre_upper_locale(ch) == pattern;
192 }
193 
194 
195 /* helpers */
196 
197 static void
data_stack_dealloc(SRE_STATE * state)198 data_stack_dealloc(SRE_STATE* state)
199 {
200     if (state->data_stack) {
201         PyMem_Free(state->data_stack);
202         state->data_stack = NULL;
203     }
204     state->data_stack_size = state->data_stack_base = 0;
205 }
206 
207 static int
data_stack_grow(SRE_STATE * state,Py_ssize_t size)208 data_stack_grow(SRE_STATE* state, Py_ssize_t size)
209 {
210     Py_ssize_t minsize, cursize;
211     minsize = state->data_stack_base+size;
212     cursize = state->data_stack_size;
213     if (cursize < minsize) {
214         void* stack;
215         cursize = minsize+minsize/4+1024;
216         TRACE(("allocate/grow stack %zd\n", cursize));
217         stack = PyMem_Realloc(state->data_stack, cursize);
218         if (!stack) {
219             data_stack_dealloc(state);
220             return SRE_ERROR_MEMORY;
221         }
222         state->data_stack = (char *)stack;
223         state->data_stack_size = cursize;
224     }
225     return 0;
226 }
227 
228 /* generate 8-bit version */
229 
230 #define SRE_CHAR Py_UCS1
231 #define SIZEOF_SRE_CHAR 1
232 #define SRE(F) sre_ucs1_##F
233 #include "sre_lib.h"
234 
235 /* generate 16-bit unicode version */
236 
237 #define SRE_CHAR Py_UCS2
238 #define SIZEOF_SRE_CHAR 2
239 #define SRE(F) sre_ucs2_##F
240 #include "sre_lib.h"
241 
242 /* generate 32-bit unicode version */
243 
244 #define SRE_CHAR Py_UCS4
245 #define SIZEOF_SRE_CHAR 4
246 #define SRE(F) sre_ucs4_##F
247 #include "sre_lib.h"
248 
249 /* -------------------------------------------------------------------- */
250 /* factories and destructors */
251 
252 /* module state */
253 typedef struct {
254     PyTypeObject *Pattern_Type;
255     PyTypeObject *Match_Type;
256     PyTypeObject *Scanner_Type;
257 } _sremodulestate;
258 
259 static _sremodulestate *
get_sre_module_state(PyObject * m)260 get_sre_module_state(PyObject *m)
261 {
262     _sremodulestate *state = (_sremodulestate *)_PyModule_GetState(m);
263     assert(state);
264     return state;
265 }
266 
267 static struct PyModuleDef sremodule;
268 #define get_sre_module_state_by_class(cls) \
269     (get_sre_module_state(PyType_GetModule(cls)))
270 
271 /* see sre.h for object declarations */
272 static PyObject*pattern_new_match(_sremodulestate *, PatternObject*, SRE_STATE*, Py_ssize_t);
273 static PyObject *pattern_scanner(_sremodulestate *, PatternObject *, PyObject *, Py_ssize_t, Py_ssize_t);
274 
275 /*[clinic input]
276 module _sre
277 class _sre.SRE_Pattern "PatternObject *" "get_sre_module_state_by_class(tp)->Pattern_Type"
278 class _sre.SRE_Match "MatchObject *" "get_sre_module_state_by_class(tp)->Match_Type"
279 class _sre.SRE_Scanner "ScannerObject *" "get_sre_module_state_by_class(tp)->Scanner_Type"
280 [clinic start generated code]*/
281 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=fe2966e32b66a231]*/
282 
283 /*[clinic input]
284 _sre.getcodesize -> int
285 [clinic start generated code]*/
286 
287 static int
_sre_getcodesize_impl(PyObject * module)288 _sre_getcodesize_impl(PyObject *module)
289 /*[clinic end generated code: output=e0db7ce34a6dd7b1 input=bd6f6ecf4916bb2b]*/
290 {
291     return sizeof(SRE_CODE);
292 }
293 
294 /*[clinic input]
295 _sre.ascii_iscased -> bool
296 
297     character: int
298     /
299 
300 [clinic start generated code]*/
301 
302 static int
_sre_ascii_iscased_impl(PyObject * module,int character)303 _sre_ascii_iscased_impl(PyObject *module, int character)
304 /*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
305 {
306     unsigned int ch = (unsigned int)character;
307     return ch < 128 && Py_ISALPHA(ch);
308 }
309 
310 /*[clinic input]
311 _sre.unicode_iscased -> bool
312 
313     character: int
314     /
315 
316 [clinic start generated code]*/
317 
318 static int
_sre_unicode_iscased_impl(PyObject * module,int character)319 _sre_unicode_iscased_impl(PyObject *module, int character)
320 /*[clinic end generated code: output=9c5ddee0dc2bc258 input=51e42c3b8dddb78e]*/
321 {
322     unsigned int ch = (unsigned int)character;
323     return ch != sre_lower_unicode(ch) || ch != sre_upper_unicode(ch);
324 }
325 
326 /*[clinic input]
327 _sre.ascii_tolower -> int
328 
329     character: int
330     /
331 
332 [clinic start generated code]*/
333 
334 static int
_sre_ascii_tolower_impl(PyObject * module,int character)335 _sre_ascii_tolower_impl(PyObject *module, int character)
336 /*[clinic end generated code: output=228294ed6ff2a612 input=272c609b5b61f136]*/
337 {
338     return sre_lower_ascii(character);
339 }
340 
341 /*[clinic input]
342 _sre.unicode_tolower -> int
343 
344     character: int
345     /
346 
347 [clinic start generated code]*/
348 
349 static int
_sre_unicode_tolower_impl(PyObject * module,int character)350 _sre_unicode_tolower_impl(PyObject *module, int character)
351 /*[clinic end generated code: output=6422272d7d7fee65 input=91d708c5f3c2045a]*/
352 {
353     return sre_lower_unicode(character);
354 }
355 
356 LOCAL(void)
state_reset(SRE_STATE * state)357 state_reset(SRE_STATE* state)
358 {
359     /* state->mark will be set to 0 in SRE_OP_MARK dynamically. */
360     /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
361 
362     state->lastmark = -1;
363     state->lastindex = -1;
364 
365     state->repeat = NULL;
366 
367     data_stack_dealloc(state);
368 }
369 
370 static const void*
getstring(PyObject * string,Py_ssize_t * p_length,int * p_isbytes,int * p_charsize,Py_buffer * view)371 getstring(PyObject* string, Py_ssize_t* p_length,
372           int* p_isbytes, int* p_charsize,
373           Py_buffer *view)
374 {
375     /* given a python object, return a data pointer, a length (in
376        characters), and a character size.  return NULL if the object
377        is not a string (or not compatible) */
378 
379     /* Unicode objects do not support the buffer API. So, get the data
380        directly instead. */
381     if (PyUnicode_Check(string)) {
382         if (PyUnicode_READY(string) == -1)
383             return NULL;
384         *p_length = PyUnicode_GET_LENGTH(string);
385         *p_charsize = PyUnicode_KIND(string);
386         *p_isbytes = 0;
387         return PyUnicode_DATA(string);
388     }
389 
390     /* get pointer to byte string buffer */
391     if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) {
392         PyErr_SetString(PyExc_TypeError, "expected string or bytes-like object");
393         return NULL;
394     }
395 
396     *p_length = view->len;
397     *p_charsize = 1;
398     *p_isbytes = 1;
399 
400     if (view->buf == NULL) {
401         PyErr_SetString(PyExc_ValueError, "Buffer is NULL");
402         PyBuffer_Release(view);
403         view->buf = NULL;
404         return NULL;
405     }
406     return view->buf;
407 }
408 
409 LOCAL(PyObject*)
state_init(SRE_STATE * state,PatternObject * pattern,PyObject * string,Py_ssize_t start,Py_ssize_t end)410 state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
411            Py_ssize_t start, Py_ssize_t end)
412 {
413     /* prepare state object */
414 
415     Py_ssize_t length;
416     int isbytes, charsize;
417     const void* ptr;
418 
419     memset(state, 0, sizeof(SRE_STATE));
420 
421     state->mark = PyMem_New(const void *, pattern->groups * 2);
422     if (!state->mark) {
423         PyErr_NoMemory();
424         goto err;
425     }
426     state->lastmark = -1;
427     state->lastindex = -1;
428 
429     state->buffer.buf = NULL;
430     ptr = getstring(string, &length, &isbytes, &charsize, &state->buffer);
431     if (!ptr)
432         goto err;
433 
434     if (isbytes && pattern->isbytes == 0) {
435         PyErr_SetString(PyExc_TypeError,
436                         "cannot use a string pattern on a bytes-like object");
437         goto err;
438     }
439     if (!isbytes && pattern->isbytes > 0) {
440         PyErr_SetString(PyExc_TypeError,
441                         "cannot use a bytes pattern on a string-like object");
442         goto err;
443     }
444 
445     /* adjust boundaries */
446     if (start < 0)
447         start = 0;
448     else if (start > length)
449         start = length;
450 
451     if (end < 0)
452         end = 0;
453     else if (end > length)
454         end = length;
455 
456     state->isbytes = isbytes;
457     state->charsize = charsize;
458     state->match_all = 0;
459     state->must_advance = 0;
460 
461     state->beginning = ptr;
462 
463     state->start = (void*) ((char*) ptr + start * state->charsize);
464     state->end = (void*) ((char*) ptr + end * state->charsize);
465 
466     Py_INCREF(string);
467     state->string = string;
468     state->pos = start;
469     state->endpos = end;
470 
471     return string;
472   err:
473     /* We add an explicit cast here because MSVC has a bug when
474        compiling C code where it believes that `const void**` cannot be
475        safely casted to `void*`, see bpo-39943 for details. */
476     PyMem_Free((void*) state->mark);
477     state->mark = NULL;
478     if (state->buffer.buf)
479         PyBuffer_Release(&state->buffer);
480     return NULL;
481 }
482 
483 LOCAL(void)
state_fini(SRE_STATE * state)484 state_fini(SRE_STATE* state)
485 {
486     if (state->buffer.buf)
487         PyBuffer_Release(&state->buffer);
488     Py_XDECREF(state->string);
489     data_stack_dealloc(state);
490     /* See above PyMem_Del for why we explicitly cast here. */
491     PyMem_Free((void*) state->mark);
492     state->mark = NULL;
493 }
494 
495 /* calculate offset from start of string */
496 #define STATE_OFFSET(state, member)\
497     (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
498 
499 LOCAL(PyObject*)
getslice(int isbytes,const void * ptr,PyObject * string,Py_ssize_t start,Py_ssize_t end)500 getslice(int isbytes, const void *ptr,
501          PyObject* string, Py_ssize_t start, Py_ssize_t end)
502 {
503     if (isbytes) {
504         if (PyBytes_CheckExact(string) &&
505             start == 0 && end == PyBytes_GET_SIZE(string)) {
506             Py_INCREF(string);
507             return string;
508         }
509         return PyBytes_FromStringAndSize(
510                 (const char *)ptr + start, end - start);
511     }
512     else {
513         return PyUnicode_Substring(string, start, end);
514     }
515 }
516 
517 LOCAL(PyObject*)
state_getslice(SRE_STATE * state,Py_ssize_t index,PyObject * string,int empty)518 state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
519 {
520     Py_ssize_t i, j;
521 
522     index = (index - 1) * 2;
523 
524     if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
525         if (empty)
526             /* want empty string */
527             i = j = 0;
528         else {
529             Py_RETURN_NONE;
530         }
531     } else {
532         i = STATE_OFFSET(state, state->mark[index]);
533         j = STATE_OFFSET(state, state->mark[index+1]);
534     }
535 
536     return getslice(state->isbytes, state->beginning, string, i, j);
537 }
538 
539 static void
pattern_error(Py_ssize_t status)540 pattern_error(Py_ssize_t status)
541 {
542     switch (status) {
543     case SRE_ERROR_RECURSION_LIMIT:
544         /* This error code seems to be unused. */
545         PyErr_SetString(
546             PyExc_RecursionError,
547             "maximum recursion limit exceeded"
548             );
549         break;
550     case SRE_ERROR_MEMORY:
551         PyErr_NoMemory();
552         break;
553     case SRE_ERROR_INTERRUPTED:
554     /* An exception has already been raised, so let it fly */
555         break;
556     default:
557         /* other error codes indicate compiler/engine bugs */
558         PyErr_SetString(
559             PyExc_RuntimeError,
560             "internal error in regular expression engine"
561             );
562     }
563 }
564 
565 static int
pattern_traverse(PatternObject * self,visitproc visit,void * arg)566 pattern_traverse(PatternObject *self, visitproc visit, void *arg)
567 {
568     Py_VISIT(Py_TYPE(self));
569     Py_VISIT(self->groupindex);
570     Py_VISIT(self->indexgroup);
571     Py_VISIT(self->pattern);
572     return 0;
573 }
574 
575 static int
pattern_clear(PatternObject * self)576 pattern_clear(PatternObject *self)
577 {
578     Py_CLEAR(self->groupindex);
579     Py_CLEAR(self->indexgroup);
580     Py_CLEAR(self->pattern);
581     return 0;
582 }
583 
584 static void
pattern_dealloc(PatternObject * self)585 pattern_dealloc(PatternObject* self)
586 {
587     PyTypeObject *tp = Py_TYPE(self);
588 
589     PyObject_GC_UnTrack(self);
590     if (self->weakreflist != NULL) {
591         PyObject_ClearWeakRefs((PyObject *) self);
592     }
593     (void)pattern_clear(self);
594     tp->tp_free(self);
595     Py_DECREF(tp);
596 }
597 
598 LOCAL(Py_ssize_t)
sre_match(SRE_STATE * state,SRE_CODE * pattern)599 sre_match(SRE_STATE* state, SRE_CODE* pattern)
600 {
601     if (state->charsize == 1)
602         return sre_ucs1_match(state, pattern, 1);
603     if (state->charsize == 2)
604         return sre_ucs2_match(state, pattern, 1);
605     assert(state->charsize == 4);
606     return sre_ucs4_match(state, pattern, 1);
607 }
608 
609 LOCAL(Py_ssize_t)
sre_search(SRE_STATE * state,SRE_CODE * pattern)610 sre_search(SRE_STATE* state, SRE_CODE* pattern)
611 {
612     if (state->charsize == 1)
613         return sre_ucs1_search(state, pattern);
614     if (state->charsize == 2)
615         return sre_ucs2_search(state, pattern);
616     assert(state->charsize == 4);
617     return sre_ucs4_search(state, pattern);
618 }
619 
620 /*[clinic input]
621 _sre.SRE_Pattern.match
622 
623     cls: defining_class
624     /
625     string: object
626     pos: Py_ssize_t = 0
627     endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
628 
629 Matches zero or more characters at the beginning of the string.
630 [clinic start generated code]*/
631 
632 static PyObject *
_sre_SRE_Pattern_match_impl(PatternObject * self,PyTypeObject * cls,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)633 _sre_SRE_Pattern_match_impl(PatternObject *self, PyTypeObject *cls,
634                             PyObject *string, Py_ssize_t pos,
635                             Py_ssize_t endpos)
636 /*[clinic end generated code: output=ec6208ea58a0cca0 input=4bdb9c3e564d13ac]*/
637 {
638     _sremodulestate *module_state = get_sre_module_state_by_class(cls);
639     SRE_STATE state;
640     Py_ssize_t status;
641     PyObject *match;
642 
643     if (!state_init(&state, (PatternObject *)self, string, pos, endpos))
644         return NULL;
645 
646     state.ptr = state.start;
647 
648     TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
649 
650     status = sre_match(&state, PatternObject_GetCode(self));
651 
652     TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
653     if (PyErr_Occurred()) {
654         state_fini(&state);
655         return NULL;
656     }
657 
658     match = pattern_new_match(module_state, self, &state, status);
659     state_fini(&state);
660     return match;
661 }
662 
663 /*[clinic input]
664 _sre.SRE_Pattern.fullmatch
665 
666     cls: defining_class
667     /
668     string: object
669     pos: Py_ssize_t = 0
670     endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
671 
672 Matches against all of the string.
673 [clinic start generated code]*/
674 
675 static PyObject *
_sre_SRE_Pattern_fullmatch_impl(PatternObject * self,PyTypeObject * cls,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)676 _sre_SRE_Pattern_fullmatch_impl(PatternObject *self, PyTypeObject *cls,
677                                 PyObject *string, Py_ssize_t pos,
678                                 Py_ssize_t endpos)
679 /*[clinic end generated code: output=625b75b027ef94da input=50981172ab0fcfdd]*/
680 {
681     _sremodulestate *module_state = get_sre_module_state_by_class(cls);
682     SRE_STATE state;
683     Py_ssize_t status;
684     PyObject *match;
685 
686     if (!state_init(&state, self, string, pos, endpos))
687         return NULL;
688 
689     state.ptr = state.start;
690 
691     TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr));
692 
693     state.match_all = 1;
694     status = sre_match(&state, PatternObject_GetCode(self));
695 
696     TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
697     if (PyErr_Occurred()) {
698         state_fini(&state);
699         return NULL;
700     }
701 
702     match = pattern_new_match(module_state, self, &state, status);
703     state_fini(&state);
704     return match;
705 }
706 
707 /*[clinic input]
708 _sre.SRE_Pattern.search
709 
710     cls: defining_class
711     /
712     string: object
713     pos: Py_ssize_t = 0
714     endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
715 
716 Scan through string looking for a match, and return a corresponding match object instance.
717 
718 Return None if no position in the string matches.
719 [clinic start generated code]*/
720 
721 static PyObject *
_sre_SRE_Pattern_search_impl(PatternObject * self,PyTypeObject * cls,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)722 _sre_SRE_Pattern_search_impl(PatternObject *self, PyTypeObject *cls,
723                              PyObject *string, Py_ssize_t pos,
724                              Py_ssize_t endpos)
725 /*[clinic end generated code: output=bd7f2d9d583e1463 input=afa9afb66a74a4b3]*/
726 {
727     _sremodulestate *module_state = get_sre_module_state_by_class(cls);
728     SRE_STATE state;
729     Py_ssize_t status;
730     PyObject *match;
731 
732     if (!state_init(&state, self, string, pos, endpos))
733         return NULL;
734 
735     TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
736 
737     status = sre_search(&state, PatternObject_GetCode(self));
738 
739     TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
740 
741     if (PyErr_Occurred()) {
742         state_fini(&state);
743         return NULL;
744     }
745 
746     match = pattern_new_match(module_state, self, &state, status);
747     state_fini(&state);
748     return match;
749 }
750 
751 static PyObject*
call(const char * module,const char * function,PyObject * args)752 call(const char* module, const char* function, PyObject* args)
753 {
754     PyObject* name;
755     PyObject* mod;
756     PyObject* func;
757     PyObject* result;
758 
759     if (!args)
760         return NULL;
761     name = PyUnicode_FromString(module);
762     if (!name)
763         return NULL;
764     mod = PyImport_Import(name);
765     Py_DECREF(name);
766     if (!mod)
767         return NULL;
768     func = PyObject_GetAttrString(mod, function);
769     Py_DECREF(mod);
770     if (!func)
771         return NULL;
772     result = PyObject_CallObject(func, args);
773     Py_DECREF(func);
774     Py_DECREF(args);
775     return result;
776 }
777 
778 /*[clinic input]
779 _sre.SRE_Pattern.findall
780 
781     string: object
782     pos: Py_ssize_t = 0
783     endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
784 
785 Return a list of all non-overlapping matches of pattern in string.
786 [clinic start generated code]*/
787 
788 static PyObject *
_sre_SRE_Pattern_findall_impl(PatternObject * self,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)789 _sre_SRE_Pattern_findall_impl(PatternObject *self, PyObject *string,
790                               Py_ssize_t pos, Py_ssize_t endpos)
791 /*[clinic end generated code: output=f4966baceea60aca input=5b6a4ee799741563]*/
792 {
793     SRE_STATE state;
794     PyObject* list;
795     Py_ssize_t status;
796     Py_ssize_t i, b, e;
797 
798     if (!state_init(&state, self, string, pos, endpos))
799         return NULL;
800 
801     list = PyList_New(0);
802     if (!list) {
803         state_fini(&state);
804         return NULL;
805     }
806 
807     while (state.start <= state.end) {
808 
809         PyObject* item;
810 
811         state_reset(&state);
812 
813         state.ptr = state.start;
814 
815         status = sre_search(&state, PatternObject_GetCode(self));
816         if (PyErr_Occurred())
817             goto error;
818 
819         if (status <= 0) {
820             if (status == 0)
821                 break;
822             pattern_error(status);
823             goto error;
824         }
825 
826         /* don't bother to build a match object */
827         switch (self->groups) {
828         case 0:
829             b = STATE_OFFSET(&state, state.start);
830             e = STATE_OFFSET(&state, state.ptr);
831             item = getslice(state.isbytes, state.beginning,
832                             string, b, e);
833             if (!item)
834                 goto error;
835             break;
836         case 1:
837             item = state_getslice(&state, 1, string, 1);
838             if (!item)
839                 goto error;
840             break;
841         default:
842             item = PyTuple_New(self->groups);
843             if (!item)
844                 goto error;
845             for (i = 0; i < self->groups; i++) {
846                 PyObject* o = state_getslice(&state, i+1, string, 1);
847                 if (!o) {
848                     Py_DECREF(item);
849                     goto error;
850                 }
851                 PyTuple_SET_ITEM(item, i, o);
852             }
853             break;
854         }
855 
856         status = PyList_Append(list, item);
857         Py_DECREF(item);
858         if (status < 0)
859             goto error;
860 
861         state.must_advance = (state.ptr == state.start);
862         state.start = state.ptr;
863     }
864 
865     state_fini(&state);
866     return list;
867 
868 error:
869     Py_DECREF(list);
870     state_fini(&state);
871     return NULL;
872 
873 }
874 
875 /*[clinic input]
876 _sre.SRE_Pattern.finditer
877 
878     cls: defining_class
879     /
880     string: object
881     pos: Py_ssize_t = 0
882     endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
883 
884 Return an iterator over all non-overlapping matches for the RE pattern in string.
885 
886 For each match, the iterator returns a match object.
887 [clinic start generated code]*/
888 
889 static PyObject *
_sre_SRE_Pattern_finditer_impl(PatternObject * self,PyTypeObject * cls,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)890 _sre_SRE_Pattern_finditer_impl(PatternObject *self, PyTypeObject *cls,
891                                PyObject *string, Py_ssize_t pos,
892                                Py_ssize_t endpos)
893 /*[clinic end generated code: output=1791dbf3618ade56 input=812e332a4848cbaf]*/
894 {
895     _sremodulestate *module_state = get_sre_module_state_by_class(cls);
896     PyObject* scanner;
897     PyObject* search;
898     PyObject* iterator;
899 
900     scanner = pattern_scanner(module_state, self, string, pos, endpos);
901     if (!scanner)
902         return NULL;
903 
904     search = PyObject_GetAttrString(scanner, "search");
905     Py_DECREF(scanner);
906     if (!search)
907         return NULL;
908 
909     iterator = PyCallIter_New(search, Py_None);
910     Py_DECREF(search);
911 
912     return iterator;
913 }
914 
915 /*[clinic input]
916 _sre.SRE_Pattern.scanner
917 
918     cls: defining_class
919     /
920     string: object
921     pos: Py_ssize_t = 0
922     endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
923 
924 [clinic start generated code]*/
925 
926 static PyObject *
_sre_SRE_Pattern_scanner_impl(PatternObject * self,PyTypeObject * cls,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)927 _sre_SRE_Pattern_scanner_impl(PatternObject *self, PyTypeObject *cls,
928                               PyObject *string, Py_ssize_t pos,
929                               Py_ssize_t endpos)
930 /*[clinic end generated code: output=f70cd506112f1bd9 input=2e487e5151bcee4c]*/
931 {
932     _sremodulestate *module_state = get_sre_module_state_by_class(cls);
933 
934     return pattern_scanner(module_state, self, string, pos, endpos);
935 }
936 
937 /*[clinic input]
938 _sre.SRE_Pattern.split
939 
940     string: object
941     maxsplit: Py_ssize_t = 0
942 
943 Split string by the occurrences of pattern.
944 [clinic start generated code]*/
945 
946 static PyObject *
_sre_SRE_Pattern_split_impl(PatternObject * self,PyObject * string,Py_ssize_t maxsplit)947 _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
948                             Py_ssize_t maxsplit)
949 /*[clinic end generated code: output=7ac66f381c45e0be input=1eeeb10dafc9947a]*/
950 {
951     SRE_STATE state;
952     PyObject* list;
953     PyObject* item;
954     Py_ssize_t status;
955     Py_ssize_t n;
956     Py_ssize_t i;
957     const void* last;
958 
959     assert(self->codesize != 0);
960 
961     if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX))
962         return NULL;
963 
964     list = PyList_New(0);
965     if (!list) {
966         state_fini(&state);
967         return NULL;
968     }
969 
970     n = 0;
971     last = state.start;
972 
973     while (!maxsplit || n < maxsplit) {
974 
975         state_reset(&state);
976 
977         state.ptr = state.start;
978 
979         status = sre_search(&state, PatternObject_GetCode(self));
980         if (PyErr_Occurred())
981             goto error;
982 
983         if (status <= 0) {
984             if (status == 0)
985                 break;
986             pattern_error(status);
987             goto error;
988         }
989 
990         /* get segment before this match */
991         item = getslice(state.isbytes, state.beginning,
992             string, STATE_OFFSET(&state, last),
993             STATE_OFFSET(&state, state.start)
994             );
995         if (!item)
996             goto error;
997         status = PyList_Append(list, item);
998         Py_DECREF(item);
999         if (status < 0)
1000             goto error;
1001 
1002         /* add groups (if any) */
1003         for (i = 0; i < self->groups; i++) {
1004             item = state_getslice(&state, i+1, string, 0);
1005             if (!item)
1006                 goto error;
1007             status = PyList_Append(list, item);
1008             Py_DECREF(item);
1009             if (status < 0)
1010                 goto error;
1011         }
1012 
1013         n = n + 1;
1014         state.must_advance = (state.ptr == state.start);
1015         last = state.start = state.ptr;
1016 
1017     }
1018 
1019     /* get segment following last match (even if empty) */
1020     item = getslice(state.isbytes, state.beginning,
1021         string, STATE_OFFSET(&state, last), state.endpos
1022         );
1023     if (!item)
1024         goto error;
1025     status = PyList_Append(list, item);
1026     Py_DECREF(item);
1027     if (status < 0)
1028         goto error;
1029 
1030     state_fini(&state);
1031     return list;
1032 
1033 error:
1034     Py_DECREF(list);
1035     state_fini(&state);
1036     return NULL;
1037 
1038 }
1039 
1040 static PyObject*
pattern_subx(_sremodulestate * module_state,PatternObject * self,PyObject * ptemplate,PyObject * string,Py_ssize_t count,Py_ssize_t subn)1041 pattern_subx(_sremodulestate* module_state,
1042              PatternObject* self,
1043              PyObject* ptemplate,
1044              PyObject* string,
1045              Py_ssize_t count,
1046              Py_ssize_t subn)
1047 {
1048     SRE_STATE state;
1049     PyObject* list;
1050     PyObject* joiner;
1051     PyObject* item;
1052     PyObject* filter;
1053     PyObject* match;
1054     const void* ptr;
1055     Py_ssize_t status;
1056     Py_ssize_t n;
1057     Py_ssize_t i, b, e;
1058     int isbytes, charsize;
1059     int filter_is_callable;
1060     Py_buffer view;
1061 
1062     if (PyCallable_Check(ptemplate)) {
1063         /* sub/subn takes either a function or a template */
1064         filter = ptemplate;
1065         Py_INCREF(filter);
1066         filter_is_callable = 1;
1067     } else {
1068         /* if not callable, check if it's a literal string */
1069         int literal;
1070         view.buf = NULL;
1071         ptr = getstring(ptemplate, &n, &isbytes, &charsize, &view);
1072         if (ptr) {
1073             if (charsize == 1)
1074                 literal = memchr(ptr, '\\', n) == NULL;
1075             else
1076                 literal = PyUnicode_FindChar(ptemplate, '\\', 0, n, 1) == -1;
1077         } else {
1078             PyErr_Clear();
1079             literal = 0;
1080         }
1081         if (view.buf)
1082             PyBuffer_Release(&view);
1083         if (literal) {
1084             filter = ptemplate;
1085             Py_INCREF(filter);
1086             filter_is_callable = 0;
1087         } else {
1088             /* not a literal; hand it over to the template compiler */
1089             filter = call(
1090                 SRE_PY_MODULE, "_subx",
1091                 PyTuple_Pack(2, self, ptemplate)
1092                 );
1093             if (!filter)
1094                 return NULL;
1095             filter_is_callable = PyCallable_Check(filter);
1096         }
1097     }
1098 
1099     if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX)) {
1100         Py_DECREF(filter);
1101         return NULL;
1102     }
1103 
1104     list = PyList_New(0);
1105     if (!list) {
1106         Py_DECREF(filter);
1107         state_fini(&state);
1108         return NULL;
1109     }
1110 
1111     n = i = 0;
1112 
1113     while (!count || n < count) {
1114 
1115         state_reset(&state);
1116 
1117         state.ptr = state.start;
1118 
1119         status = sre_search(&state, PatternObject_GetCode(self));
1120         if (PyErr_Occurred())
1121             goto error;
1122 
1123         if (status <= 0) {
1124             if (status == 0)
1125                 break;
1126             pattern_error(status);
1127             goto error;
1128         }
1129 
1130         b = STATE_OFFSET(&state, state.start);
1131         e = STATE_OFFSET(&state, state.ptr);
1132 
1133         if (i < b) {
1134             /* get segment before this match */
1135             item = getslice(state.isbytes, state.beginning,
1136                 string, i, b);
1137             if (!item)
1138                 goto error;
1139             status = PyList_Append(list, item);
1140             Py_DECREF(item);
1141             if (status < 0)
1142                 goto error;
1143 
1144         }
1145 
1146         if (filter_is_callable) {
1147             /* pass match object through filter */
1148             match = pattern_new_match(module_state, self, &state, 1);
1149             if (!match)
1150                 goto error;
1151             item = PyObject_CallOneArg(filter, match);
1152             Py_DECREF(match);
1153             if (!item)
1154                 goto error;
1155         } else {
1156             /* filter is literal string */
1157             item = filter;
1158             Py_INCREF(item);
1159         }
1160 
1161         /* add to list */
1162         if (item != Py_None) {
1163             status = PyList_Append(list, item);
1164             Py_DECREF(item);
1165             if (status < 0)
1166                 goto error;
1167         }
1168 
1169         i = e;
1170         n = n + 1;
1171         state.must_advance = (state.ptr == state.start);
1172         state.start = state.ptr;
1173     }
1174 
1175     /* get segment following last match */
1176     if (i < state.endpos) {
1177         item = getslice(state.isbytes, state.beginning,
1178                         string, i, state.endpos);
1179         if (!item)
1180             goto error;
1181         status = PyList_Append(list, item);
1182         Py_DECREF(item);
1183         if (status < 0)
1184             goto error;
1185     }
1186 
1187     state_fini(&state);
1188 
1189     Py_DECREF(filter);
1190 
1191     /* convert list to single string (also removes list) */
1192     joiner = getslice(state.isbytes, state.beginning, string, 0, 0);
1193     if (!joiner) {
1194         Py_DECREF(list);
1195         return NULL;
1196     }
1197     if (PyList_GET_SIZE(list) == 0) {
1198         Py_DECREF(list);
1199         item = joiner;
1200     }
1201     else {
1202         if (state.isbytes)
1203             item = _PyBytes_Join(joiner, list);
1204         else
1205             item = PyUnicode_Join(joiner, list);
1206         Py_DECREF(joiner);
1207         Py_DECREF(list);
1208         if (!item)
1209             return NULL;
1210     }
1211 
1212     if (subn)
1213         return Py_BuildValue("Nn", item, n);
1214 
1215     return item;
1216 
1217 error:
1218     Py_DECREF(list);
1219     state_fini(&state);
1220     Py_DECREF(filter);
1221     return NULL;
1222 
1223 }
1224 
1225 /*[clinic input]
1226 _sre.SRE_Pattern.sub
1227 
1228     cls: defining_class
1229     /
1230     repl: object
1231     string: object
1232     count: Py_ssize_t = 0
1233 
1234 Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl.
1235 [clinic start generated code]*/
1236 
1237 static PyObject *
_sre_SRE_Pattern_sub_impl(PatternObject * self,PyTypeObject * cls,PyObject * repl,PyObject * string,Py_ssize_t count)1238 _sre_SRE_Pattern_sub_impl(PatternObject *self, PyTypeObject *cls,
1239                           PyObject *repl, PyObject *string, Py_ssize_t count)
1240 /*[clinic end generated code: output=4be141ab04bca60d input=d8d1d4ac2311a07c]*/
1241 {
1242     _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1243 
1244     return pattern_subx(module_state, self, repl, string, count, 0);
1245 }
1246 
1247 /*[clinic input]
1248 _sre.SRE_Pattern.subn
1249 
1250     cls: defining_class
1251     /
1252     repl: object
1253     string: object
1254     count: Py_ssize_t = 0
1255 
1256 Return the tuple (new_string, number_of_subs_made) found by replacing the leftmost non-overlapping occurrences of pattern with the replacement repl.
1257 [clinic start generated code]*/
1258 
1259 static PyObject *
_sre_SRE_Pattern_subn_impl(PatternObject * self,PyTypeObject * cls,PyObject * repl,PyObject * string,Py_ssize_t count)1260 _sre_SRE_Pattern_subn_impl(PatternObject *self, PyTypeObject *cls,
1261                            PyObject *repl, PyObject *string,
1262                            Py_ssize_t count)
1263 /*[clinic end generated code: output=da02fd85258b1e1f input=8b78a65b8302e58d]*/
1264 {
1265     _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1266 
1267     return pattern_subx(module_state, self, repl, string, count, 1);
1268 }
1269 
1270 /*[clinic input]
1271 _sre.SRE_Pattern.__copy__
1272 
1273 [clinic start generated code]*/
1274 
1275 static PyObject *
_sre_SRE_Pattern___copy___impl(PatternObject * self)1276 _sre_SRE_Pattern___copy___impl(PatternObject *self)
1277 /*[clinic end generated code: output=85dedc2db1bd8694 input=a730a59d863bc9f5]*/
1278 {
1279     Py_INCREF(self);
1280     return (PyObject *)self;
1281 }
1282 
1283 /*[clinic input]
1284 _sre.SRE_Pattern.__deepcopy__
1285 
1286     memo: object
1287     /
1288 
1289 [clinic start generated code]*/
1290 
1291 static PyObject *
_sre_SRE_Pattern___deepcopy__(PatternObject * self,PyObject * memo)1292 _sre_SRE_Pattern___deepcopy__(PatternObject *self, PyObject *memo)
1293 /*[clinic end generated code: output=2ad25679c1f1204a input=a465b1602f997bed]*/
1294 {
1295     Py_INCREF(self);
1296     return (PyObject *)self;
1297 }
1298 
1299 static PyObject *
pattern_repr(PatternObject * obj)1300 pattern_repr(PatternObject *obj)
1301 {
1302     static const struct {
1303         const char *name;
1304         int value;
1305     } flag_names[] = {
1306         {"re.TEMPLATE", SRE_FLAG_TEMPLATE},
1307         {"re.IGNORECASE", SRE_FLAG_IGNORECASE},
1308         {"re.LOCALE", SRE_FLAG_LOCALE},
1309         {"re.MULTILINE", SRE_FLAG_MULTILINE},
1310         {"re.DOTALL", SRE_FLAG_DOTALL},
1311         {"re.UNICODE", SRE_FLAG_UNICODE},
1312         {"re.VERBOSE", SRE_FLAG_VERBOSE},
1313         {"re.DEBUG", SRE_FLAG_DEBUG},
1314         {"re.ASCII", SRE_FLAG_ASCII},
1315     };
1316     PyObject *result = NULL;
1317     PyObject *flag_items;
1318     size_t i;
1319     int flags = obj->flags;
1320 
1321     /* Omit re.UNICODE for valid string patterns. */
1322     if (obj->isbytes == 0 &&
1323         (flags & (SRE_FLAG_LOCALE|SRE_FLAG_UNICODE|SRE_FLAG_ASCII)) ==
1324          SRE_FLAG_UNICODE)
1325         flags &= ~SRE_FLAG_UNICODE;
1326 
1327     flag_items = PyList_New(0);
1328     if (!flag_items)
1329         return NULL;
1330 
1331     for (i = 0; i < Py_ARRAY_LENGTH(flag_names); i++) {
1332         if (flags & flag_names[i].value) {
1333             PyObject *item = PyUnicode_FromString(flag_names[i].name);
1334             if (!item)
1335                 goto done;
1336 
1337             if (PyList_Append(flag_items, item) < 0) {
1338                 Py_DECREF(item);
1339                 goto done;
1340             }
1341             Py_DECREF(item);
1342             flags &= ~flag_names[i].value;
1343         }
1344     }
1345     if (flags) {
1346         PyObject *item = PyUnicode_FromFormat("0x%x", flags);
1347         if (!item)
1348             goto done;
1349 
1350         if (PyList_Append(flag_items, item) < 0) {
1351             Py_DECREF(item);
1352             goto done;
1353         }
1354         Py_DECREF(item);
1355     }
1356 
1357     if (PyList_Size(flag_items) > 0) {
1358         PyObject *flags_result;
1359         PyObject *sep = PyUnicode_FromString("|");
1360         if (!sep)
1361             goto done;
1362         flags_result = PyUnicode_Join(sep, flag_items);
1363         Py_DECREF(sep);
1364         if (!flags_result)
1365             goto done;
1366         result = PyUnicode_FromFormat("re.compile(%.200R, %S)",
1367                                       obj->pattern, flags_result);
1368         Py_DECREF(flags_result);
1369     }
1370     else {
1371         result = PyUnicode_FromFormat("re.compile(%.200R)", obj->pattern);
1372     }
1373 
1374 done:
1375     Py_DECREF(flag_items);
1376     return result;
1377 }
1378 
1379 PyDoc_STRVAR(pattern_doc, "Compiled regular expression object.");
1380 
1381 /* PatternObject's 'groupindex' method. */
1382 static PyObject *
pattern_groupindex(PatternObject * self,void * Py_UNUSED (ignored))1383 pattern_groupindex(PatternObject *self, void *Py_UNUSED(ignored))
1384 {
1385     if (self->groupindex == NULL)
1386         return PyDict_New();
1387     return PyDictProxy_New(self->groupindex);
1388 }
1389 
1390 static int _validate(PatternObject *self); /* Forward */
1391 
1392 /*[clinic input]
1393 _sre.compile
1394 
1395     pattern: object
1396     flags: int
1397     code: object(subclass_of='&PyList_Type')
1398     groups: Py_ssize_t
1399     groupindex: object(subclass_of='&PyDict_Type')
1400     indexgroup: object(subclass_of='&PyTuple_Type')
1401 
1402 [clinic start generated code]*/
1403 
1404 static PyObject *
_sre_compile_impl(PyObject * module,PyObject * pattern,int flags,PyObject * code,Py_ssize_t groups,PyObject * groupindex,PyObject * indexgroup)1405 _sre_compile_impl(PyObject *module, PyObject *pattern, int flags,
1406                   PyObject *code, Py_ssize_t groups, PyObject *groupindex,
1407                   PyObject *indexgroup)
1408 /*[clinic end generated code: output=ef9c2b3693776404 input=0a68476dbbe5db30]*/
1409 {
1410     /* "compile" pattern descriptor to pattern object */
1411 
1412     _sremodulestate *module_state = get_sre_module_state(module);
1413     PatternObject* self;
1414     Py_ssize_t i, n;
1415 
1416     n = PyList_GET_SIZE(code);
1417     /* coverity[ampersand_in_size] */
1418     self = PyObject_GC_NewVar(PatternObject, module_state->Pattern_Type, n);
1419     if (!self)
1420         return NULL;
1421     self->weakreflist = NULL;
1422     self->pattern = NULL;
1423     self->groupindex = NULL;
1424     self->indexgroup = NULL;
1425 
1426     self->codesize = n;
1427 
1428     for (i = 0; i < n; i++) {
1429         PyObject *o = PyList_GET_ITEM(code, i);
1430         unsigned long value = PyLong_AsUnsignedLong(o);
1431         self->code[i] = (SRE_CODE) value;
1432         if ((unsigned long) self->code[i] != value) {
1433             PyErr_SetString(PyExc_OverflowError,
1434                             "regular expression code size limit exceeded");
1435             break;
1436         }
1437     }
1438     PyObject_GC_Track(self);
1439 
1440     if (PyErr_Occurred()) {
1441         Py_DECREF(self);
1442         return NULL;
1443     }
1444 
1445     if (pattern == Py_None) {
1446         self->isbytes = -1;
1447     }
1448     else {
1449         Py_ssize_t p_length;
1450         int charsize;
1451         Py_buffer view;
1452         view.buf = NULL;
1453         if (!getstring(pattern, &p_length, &self->isbytes,
1454                        &charsize, &view)) {
1455             Py_DECREF(self);
1456             return NULL;
1457         }
1458         if (view.buf)
1459             PyBuffer_Release(&view);
1460     }
1461 
1462     Py_INCREF(pattern);
1463     self->pattern = pattern;
1464 
1465     self->flags = flags;
1466 
1467     self->groups = groups;
1468 
1469     if (PyDict_GET_SIZE(groupindex) > 0) {
1470         Py_INCREF(groupindex);
1471         self->groupindex = groupindex;
1472         if (PyTuple_GET_SIZE(indexgroup) > 0) {
1473             Py_INCREF(indexgroup);
1474             self->indexgroup = indexgroup;
1475         }
1476     }
1477 
1478     if (!_validate(self)) {
1479         Py_DECREF(self);
1480         return NULL;
1481     }
1482 
1483     return (PyObject*) self;
1484 }
1485 
1486 /* -------------------------------------------------------------------- */
1487 /* Code validation */
1488 
1489 /* To learn more about this code, have a look at the _compile() function in
   Lib/sre_compile.py.  The validation functions below check the code array
1491    for conformance with the code patterns generated there.
1492 
1493    The nice thing about the generated code is that it is position-independent:
1494    all jumps are relative jumps forward.  Also, jumps don't cross each other:
1495    the target of a later jump is always earlier than the target of an earlier
1496    jump.  IOW, this is okay:
1497 
1498    J---------J-------T--------T
1499     \         \_____/        /
1500      \______________________/
1501 
1502    but this is not:
1503 
1504    J---------J-------T--------T
1505     \_________\_____/        /
1506                \____________/
1507 
1508    It also helps that SRE_CODE is always an unsigned type.
1509 */
1510 
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator; takes a parenthesized printf argument
   list, e.g. VTRACE(("x=%d\n", x)) */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v) do {} while(0)  /* do nothing */
#endif

/* Report failure: makes the enclosing function return 0 */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)

/* Extract opcode, argument, or skip count from code array.

   These macros operate on variables of the enclosing function: they read
   through and advance `code`, compare it against `end`, and store into
   `op`, `arg` or `skip` respectively.  Each of them FAILs when `code` has
   already reached `end`. */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", code));                         \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)
/* Also FAILs when the skip, reduced by `adj`, would reach beyond `end`
   (measured from the position of the skip word itself).  `adj` is
   parenthesized so that an expression argument is subtracted as a whole. */
#define GET_SKIP_ADJ(adj)                               \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        skip = *code;                                   \
        VTRACE(("%lu (skip to %p)\n",                   \
               (unsigned long)skip, code+skip));        \
        if (skip-(adj) > (uintptr_t)(end - code))       \
            FAIL;                                       \
        code++;                                         \
    } while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
1551 
1552 static int
_validate_charset(SRE_CODE * code,SRE_CODE * end)1553 _validate_charset(SRE_CODE *code, SRE_CODE *end)
1554 {
1555     /* Some variables are manipulated by the macros above */
1556     SRE_CODE op;
1557     SRE_CODE arg;
1558     SRE_CODE offset;
1559     int i;
1560 
1561     while (code < end) {
1562         GET_OP;
1563         switch (op) {
1564 
1565         case SRE_OP_NEGATE:
1566             break;
1567 
1568         case SRE_OP_LITERAL:
1569             GET_ARG;
1570             break;
1571 
1572         case SRE_OP_RANGE:
1573         case SRE_OP_RANGE_UNI_IGNORE:
1574             GET_ARG;
1575             GET_ARG;
1576             break;
1577 
1578         case SRE_OP_CHARSET:
1579             offset = 256/SRE_CODE_BITS; /* 256-bit bitmap */
1580             if (offset > (uintptr_t)(end - code))
1581                 FAIL;
1582             code += offset;
1583             break;
1584 
1585         case SRE_OP_BIGCHARSET:
1586             GET_ARG; /* Number of blocks */
1587             offset = 256/sizeof(SRE_CODE); /* 256-byte table */
1588             if (offset > (uintptr_t)(end - code))
1589                 FAIL;
1590             /* Make sure that each byte points to a valid block */
1591             for (i = 0; i < 256; i++) {
1592                 if (((unsigned char *)code)[i] >= arg)
1593                     FAIL;
1594             }
1595             code += offset;
1596             offset = arg * (256/SRE_CODE_BITS); /* 256-bit bitmap times arg */
1597             if (offset > (uintptr_t)(end - code))
1598                 FAIL;
1599             code += offset;
1600             break;
1601 
1602         case SRE_OP_CATEGORY:
1603             GET_ARG;
1604             switch (arg) {
1605             case SRE_CATEGORY_DIGIT:
1606             case SRE_CATEGORY_NOT_DIGIT:
1607             case SRE_CATEGORY_SPACE:
1608             case SRE_CATEGORY_NOT_SPACE:
1609             case SRE_CATEGORY_WORD:
1610             case SRE_CATEGORY_NOT_WORD:
1611             case SRE_CATEGORY_LINEBREAK:
1612             case SRE_CATEGORY_NOT_LINEBREAK:
1613             case SRE_CATEGORY_LOC_WORD:
1614             case SRE_CATEGORY_LOC_NOT_WORD:
1615             case SRE_CATEGORY_UNI_DIGIT:
1616             case SRE_CATEGORY_UNI_NOT_DIGIT:
1617             case SRE_CATEGORY_UNI_SPACE:
1618             case SRE_CATEGORY_UNI_NOT_SPACE:
1619             case SRE_CATEGORY_UNI_WORD:
1620             case SRE_CATEGORY_UNI_NOT_WORD:
1621             case SRE_CATEGORY_UNI_LINEBREAK:
1622             case SRE_CATEGORY_UNI_NOT_LINEBREAK:
1623                 break;
1624             default:
1625                 FAIL;
1626             }
1627             break;
1628 
1629         default:
1630             FAIL;
1631 
1632         }
1633     }
1634 
1635     return 1;
1636 }
1637 
1638 static int
_validate_inner(SRE_CODE * code,SRE_CODE * end,Py_ssize_t groups)1639 _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1640 {
1641     /* Some variables are manipulated by the macros above */
1642     SRE_CODE op;
1643     SRE_CODE arg;
1644     SRE_CODE skip;
1645 
1646     VTRACE(("code=%p, end=%p\n", code, end));
1647 
1648     if (code > end)
1649         FAIL;
1650 
1651     while (code < end) {
1652         GET_OP;
1653         switch (op) {
1654 
1655         case SRE_OP_MARK:
1656             /* We don't check whether marks are properly nested; the
1657                sre_match() code is robust even if they don't, and the worst
1658                you can get is nonsensical match results. */
1659             GET_ARG;
1660             if (arg > 2 * (size_t)groups + 1) {
1661                 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
1662                 FAIL;
1663             }
1664             break;
1665 
1666         case SRE_OP_LITERAL:
1667         case SRE_OP_NOT_LITERAL:
1668         case SRE_OP_LITERAL_IGNORE:
1669         case SRE_OP_NOT_LITERAL_IGNORE:
1670         case SRE_OP_LITERAL_UNI_IGNORE:
1671         case SRE_OP_NOT_LITERAL_UNI_IGNORE:
1672         case SRE_OP_LITERAL_LOC_IGNORE:
1673         case SRE_OP_NOT_LITERAL_LOC_IGNORE:
1674             GET_ARG;
1675             /* The arg is just a character, nothing to check */
1676             break;
1677 
1678         case SRE_OP_SUCCESS:
1679         case SRE_OP_FAILURE:
1680             /* Nothing to check; these normally end the matching process */
1681             break;
1682 
1683         case SRE_OP_AT:
1684             GET_ARG;
1685             switch (arg) {
1686             case SRE_AT_BEGINNING:
1687             case SRE_AT_BEGINNING_STRING:
1688             case SRE_AT_BEGINNING_LINE:
1689             case SRE_AT_END:
1690             case SRE_AT_END_LINE:
1691             case SRE_AT_END_STRING:
1692             case SRE_AT_BOUNDARY:
1693             case SRE_AT_NON_BOUNDARY:
1694             case SRE_AT_LOC_BOUNDARY:
1695             case SRE_AT_LOC_NON_BOUNDARY:
1696             case SRE_AT_UNI_BOUNDARY:
1697             case SRE_AT_UNI_NON_BOUNDARY:
1698                 break;
1699             default:
1700                 FAIL;
1701             }
1702             break;
1703 
1704         case SRE_OP_ANY:
1705         case SRE_OP_ANY_ALL:
1706             /* These have no operands */
1707             break;
1708 
1709         case SRE_OP_IN:
1710         case SRE_OP_IN_IGNORE:
1711         case SRE_OP_IN_UNI_IGNORE:
1712         case SRE_OP_IN_LOC_IGNORE:
1713             GET_SKIP;
1714             /* Stop 1 before the end; we check the FAILURE below */
1715             if (!_validate_charset(code, code+skip-2))
1716                 FAIL;
1717             if (code[skip-2] != SRE_OP_FAILURE)
1718                 FAIL;
1719             code += skip-1;
1720             break;
1721 
1722         case SRE_OP_INFO:
1723             {
1724                 /* A minimal info field is
1725                    <INFO> <1=skip> <2=flags> <3=min> <4=max>;
1726                    If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
1727                    more follows. */
1728                 SRE_CODE flags, i;
1729                 SRE_CODE *newcode;
1730                 GET_SKIP;
1731                 newcode = code+skip-1;
1732                 GET_ARG; flags = arg;
1733                 GET_ARG;
1734                 GET_ARG;
1735                 /* Check that only valid flags are present */
1736                 if ((flags & ~(SRE_INFO_PREFIX |
1737                                SRE_INFO_LITERAL |
1738                                SRE_INFO_CHARSET)) != 0)
1739                     FAIL;
1740                 /* PREFIX and CHARSET are mutually exclusive */
1741                 if ((flags & SRE_INFO_PREFIX) &&
1742                     (flags & SRE_INFO_CHARSET))
1743                     FAIL;
1744                 /* LITERAL implies PREFIX */
1745                 if ((flags & SRE_INFO_LITERAL) &&
1746                     !(flags & SRE_INFO_PREFIX))
1747                     FAIL;
1748                 /* Validate the prefix */
1749                 if (flags & SRE_INFO_PREFIX) {
1750                     SRE_CODE prefix_len;
1751                     GET_ARG; prefix_len = arg;
1752                     GET_ARG;
1753                     /* Here comes the prefix string */
1754                     if (prefix_len > (uintptr_t)(newcode - code))
1755                         FAIL;
1756                     code += prefix_len;
1757                     /* And here comes the overlap table */
1758                     if (prefix_len > (uintptr_t)(newcode - code))
1759                         FAIL;
1760                     /* Each overlap value should be < prefix_len */
1761                     for (i = 0; i < prefix_len; i++) {
1762                         if (code[i] >= prefix_len)
1763                             FAIL;
1764                     }
1765                     code += prefix_len;
1766                 }
1767                 /* Validate the charset */
1768                 if (flags & SRE_INFO_CHARSET) {
1769                     if (!_validate_charset(code, newcode-1))
1770                         FAIL;
1771                     if (newcode[-1] != SRE_OP_FAILURE)
1772                         FAIL;
1773                     code = newcode;
1774                 }
1775                 else if (code != newcode) {
1776                   VTRACE(("code=%p, newcode=%p\n", code, newcode));
1777                     FAIL;
1778                 }
1779             }
1780             break;
1781 
1782         case SRE_OP_BRANCH:
1783             {
1784                 SRE_CODE *target = NULL;
1785                 for (;;) {
1786                     GET_SKIP;
1787                     if (skip == 0)
1788                         break;
1789                     /* Stop 2 before the end; we check the JUMP below */
1790                     if (!_validate_inner(code, code+skip-3, groups))
1791                         FAIL;
1792                     code += skip-3;
1793                     /* Check that it ends with a JUMP, and that each JUMP
1794                        has the same target */
1795                     GET_OP;
1796                     if (op != SRE_OP_JUMP)
1797                         FAIL;
1798                     GET_SKIP;
1799                     if (target == NULL)
1800                         target = code+skip-1;
1801                     else if (code+skip-1 != target)
1802                         FAIL;
1803                 }
1804             }
1805             break;
1806 
1807         case SRE_OP_REPEAT_ONE:
1808         case SRE_OP_MIN_REPEAT_ONE:
1809             {
1810                 SRE_CODE min, max;
1811                 GET_SKIP;
1812                 GET_ARG; min = arg;
1813                 GET_ARG; max = arg;
1814                 if (min > max)
1815                     FAIL;
1816                 if (max > SRE_MAXREPEAT)
1817                     FAIL;
1818                 if (!_validate_inner(code, code+skip-4, groups))
1819                     FAIL;
1820                 code += skip-4;
1821                 GET_OP;
1822                 if (op != SRE_OP_SUCCESS)
1823                     FAIL;
1824             }
1825             break;
1826 
1827         case SRE_OP_REPEAT:
1828             {
1829                 SRE_CODE min, max;
1830                 GET_SKIP;
1831                 GET_ARG; min = arg;
1832                 GET_ARG; max = arg;
1833                 if (min > max)
1834                     FAIL;
1835                 if (max > SRE_MAXREPEAT)
1836                     FAIL;
1837                 if (!_validate_inner(code, code+skip-3, groups))
1838                     FAIL;
1839                 code += skip-3;
1840                 GET_OP;
1841                 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
1842                     FAIL;
1843             }
1844             break;
1845 
1846         case SRE_OP_GROUPREF:
1847         case SRE_OP_GROUPREF_IGNORE:
1848         case SRE_OP_GROUPREF_UNI_IGNORE:
1849         case SRE_OP_GROUPREF_LOC_IGNORE:
1850             GET_ARG;
1851             if (arg >= (size_t)groups)
1852                 FAIL;
1853             break;
1854 
1855         case SRE_OP_GROUPREF_EXISTS:
1856             /* The regex syntax for this is: '(?(group)then|else)', where
1857                'group' is either an integer group number or a group name,
1858                'then' and 'else' are sub-regexes, and 'else' is optional. */
1859             GET_ARG;
1860             if (arg >= (size_t)groups)
1861                 FAIL;
1862             GET_SKIP_ADJ(1);
1863             code--; /* The skip is relative to the first arg! */
1864             /* There are two possibilities here: if there is both a 'then'
1865                part and an 'else' part, the generated code looks like:
1866 
1867                GROUPREF_EXISTS
1868                <group>
1869                <skipyes>
1870                ...then part...
1871                JUMP
1872                <skipno>
1873                (<skipyes> jumps here)
1874                ...else part...
1875                (<skipno> jumps here)
1876 
1877                If there is only a 'then' part, it looks like:
1878 
1879                GROUPREF_EXISTS
1880                <group>
1881                <skip>
1882                ...then part...
1883                (<skip> jumps here)
1884 
1885                There is no direct way to decide which it is, and we don't want
1886                to allow arbitrary jumps anywhere in the code; so we just look
1887                for a JUMP opcode preceding our skip target.
1888             */
1889             if (skip >= 3 && skip-3 < (uintptr_t)(end - code) &&
1890                 code[skip-3] == SRE_OP_JUMP)
1891             {
1892                 VTRACE(("both then and else parts present\n"));
1893                 if (!_validate_inner(code+1, code+skip-3, groups))
1894                     FAIL;
1895                 code += skip-2; /* Position after JUMP, at <skipno> */
1896                 GET_SKIP;
1897                 if (!_validate_inner(code, code+skip-1, groups))
1898                     FAIL;
1899                 code += skip-1;
1900             }
1901             else {
1902                 VTRACE(("only a then part present\n"));
1903                 if (!_validate_inner(code+1, code+skip-1, groups))
1904                     FAIL;
1905                 code += skip-1;
1906             }
1907             break;
1908 
1909         case SRE_OP_ASSERT:
1910         case SRE_OP_ASSERT_NOT:
1911             GET_SKIP;
1912             GET_ARG; /* 0 for lookahead, width for lookbehind */
1913             code--; /* Back up over arg to simplify math below */
1914             if (arg & 0x80000000)
1915                 FAIL; /* Width too large */
1916             /* Stop 1 before the end; we check the SUCCESS below */
1917             if (!_validate_inner(code+1, code+skip-2, groups))
1918                 FAIL;
1919             code += skip-2;
1920             GET_OP;
1921             if (op != SRE_OP_SUCCESS)
1922                 FAIL;
1923             break;
1924 
1925         default:
1926             FAIL;
1927 
1928         }
1929     }
1930 
1931     VTRACE(("okay\n"));
1932     return 1;
1933 }
1934 
1935 static int
_validate_outer(SRE_CODE * code,SRE_CODE * end,Py_ssize_t groups)1936 _validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1937 {
1938     if (groups < 0 || (size_t)groups > SRE_MAXGROUPS ||
1939         code >= end || end[-1] != SRE_OP_SUCCESS)
1940         FAIL;
1941     return _validate_inner(code, end-1, groups);
1942 }
1943 
1944 static int
_validate(PatternObject * self)1945 _validate(PatternObject *self)
1946 {
1947     if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
1948     {
1949         PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
1950         return 0;
1951     }
1952     else
1953         VTRACE(("Success!\n"));
1954     return 1;
1955 }
1956 
1957 /* -------------------------------------------------------------------- */
1958 /* match methods */
1959 
/* GC traverse function for match objects (installed as Py_tp_traverse in
 * match_slots).  Visits the object's type and the three object references
 * a match holds: string, regs and pattern.  Py_VISIT relies on the
 * parameters being named `visit` and `arg`.
 */
static int
match_traverse(MatchObject *self, visitproc visit, void *arg)
{
    Py_VISIT(Py_TYPE(self));
    Py_VISIT(self->string);
    Py_VISIT(self->regs);
    Py_VISIT(self->pattern);
    return 0;
}
1969 
/* GC clear function for match objects (installed as Py_tp_clear in
 * match_slots, and also called from match_dealloc).  Drops the references
 * to string, regs and pattern; Py_CLEAR leaves each field NULL.
 * Always returns 0.
 */
static int
match_clear(MatchObject *self)
{
    Py_CLEAR(self->string);
    Py_CLEAR(self->regs);
    Py_CLEAR(self->pattern);
    return 0;
}
1978 
/* Deallocator for match objects.
 *
 * The order matters: stop GC tracking first, release the owned references,
 * free the object's memory, and only then drop the reference to the type.
 */
static void
match_dealloc(MatchObject* self)
{
    /* Save the type now; self cannot be dereferenced after tp_free(). */
    PyTypeObject *tp = Py_TYPE(self);

    PyObject_GC_UnTrack(self);
    (void)match_clear(self);
    tp->tp_free(self);
    /* presumably instances of this heap type hold a reference to it */
    Py_DECREF(tp);
}
1989 
1990 static PyObject*
match_getslice_by_index(MatchObject * self,Py_ssize_t index,PyObject * def)1991 match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
1992 {
1993     Py_ssize_t length;
1994     int isbytes, charsize;
1995     Py_buffer view;
1996     PyObject *result;
1997     const void* ptr;
1998     Py_ssize_t i, j;
1999 
2000     assert(0 <= index && index < self->groups);
2001     index *= 2;
2002 
2003     if (self->string == Py_None || self->mark[index] < 0) {
2004         /* return default value if the string or group is undefined */
2005         Py_INCREF(def);
2006         return def;
2007     }
2008 
2009     ptr = getstring(self->string, &length, &isbytes, &charsize, &view);
2010     if (ptr == NULL)
2011         return NULL;
2012 
2013     i = self->mark[index];
2014     j = self->mark[index+1];
2015     i = Py_MIN(i, length);
2016     j = Py_MIN(j, length);
2017     result = getslice(isbytes, ptr, self->string, i, j);
2018     if (isbytes && view.buf != NULL)
2019         PyBuffer_Release(&view);
2020     return result;
2021 }
2022 
2023 static Py_ssize_t
match_getindex(MatchObject * self,PyObject * index)2024 match_getindex(MatchObject* self, PyObject* index)
2025 {
2026     Py_ssize_t i;
2027 
2028     if (index == NULL)
2029         /* Default value */
2030         return 0;
2031 
2032     if (PyIndex_Check(index)) {
2033         i = PyNumber_AsSsize_t(index, NULL);
2034     }
2035     else {
2036         i = -1;
2037 
2038         if (self->pattern->groupindex) {
2039             index = PyDict_GetItemWithError(self->pattern->groupindex, index);
2040             if (index && PyLong_Check(index)) {
2041                 i = PyLong_AsSsize_t(index);
2042             }
2043         }
2044     }
2045     if (i < 0 || i >= self->groups) {
2046         /* raise IndexError if we were given a bad group number */
2047         if (!PyErr_Occurred()) {
2048             PyErr_SetString(PyExc_IndexError, "no such group");
2049         }
2050         return -1;
2051     }
2052 
2053     return i;
2054 }
2055 
2056 static PyObject*
match_getslice(MatchObject * self,PyObject * index,PyObject * def)2057 match_getslice(MatchObject* self, PyObject* index, PyObject* def)
2058 {
2059     Py_ssize_t i = match_getindex(self, index);
2060 
2061     if (i < 0) {
2062         return NULL;
2063     }
2064 
2065     return match_getslice_by_index(self, i, def);
2066 }
2067 
2068 /*[clinic input]
2069 _sre.SRE_Match.expand
2070 
2071     template: object
2072 
2073 Return the string obtained by doing backslash substitution on the string template, as done by the sub() method.
2074 [clinic start generated code]*/
2075 
2076 static PyObject *
_sre_SRE_Match_expand_impl(MatchObject * self,PyObject * template)2077 _sre_SRE_Match_expand_impl(MatchObject *self, PyObject *template)
2078 /*[clinic end generated code: output=931b58ccc323c3a1 input=4bfdb22c2f8b146a]*/
2079 {
2080     /* delegate to Python code */
2081     return call(
2082         SRE_PY_MODULE, "_expand",
2083         PyTuple_Pack(3, self->pattern, self, template)
2084         );
2085 }
2086 
2087 static PyObject*
match_group(MatchObject * self,PyObject * args)2088 match_group(MatchObject* self, PyObject* args)
2089 {
2090     PyObject* result;
2091     Py_ssize_t i, size;
2092 
2093     size = PyTuple_GET_SIZE(args);
2094 
2095     switch (size) {
2096     case 0:
2097         result = match_getslice(self, _PyLong_GetZero(), Py_None);
2098         break;
2099     case 1:
2100         result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2101         break;
2102     default:
2103         /* fetch multiple items */
2104         result = PyTuple_New(size);
2105         if (!result)
2106             return NULL;
2107         for (i = 0; i < size; i++) {
2108             PyObject* item = match_getslice(
2109                 self, PyTuple_GET_ITEM(args, i), Py_None
2110                 );
2111             if (!item) {
2112                 Py_DECREF(result);
2113                 return NULL;
2114             }
2115             PyTuple_SET_ITEM(result, i, item);
2116         }
2117         break;
2118     }
2119     return result;
2120 }
2121 
2122 static PyObject*
match_getitem(MatchObject * self,PyObject * name)2123 match_getitem(MatchObject* self, PyObject* name)
2124 {
2125     return match_getslice(self, name, Py_None);
2126 }
2127 
2128 /*[clinic input]
2129 _sre.SRE_Match.groups
2130 
2131     default: object = None
2132         Is used for groups that did not participate in the match.
2133 
2134 Return a tuple containing all the subgroups of the match, from 1.
2135 [clinic start generated code]*/
2136 
2137 static PyObject *
_sre_SRE_Match_groups_impl(MatchObject * self,PyObject * default_value)2138 _sre_SRE_Match_groups_impl(MatchObject *self, PyObject *default_value)
2139 /*[clinic end generated code: output=daf8e2641537238a input=bb069ef55dabca91]*/
2140 {
2141     PyObject* result;
2142     Py_ssize_t index;
2143 
2144     result = PyTuple_New(self->groups-1);
2145     if (!result)
2146         return NULL;
2147 
2148     for (index = 1; index < self->groups; index++) {
2149         PyObject* item;
2150         item = match_getslice_by_index(self, index, default_value);
2151         if (!item) {
2152             Py_DECREF(result);
2153             return NULL;
2154         }
2155         PyTuple_SET_ITEM(result, index-1, item);
2156     }
2157 
2158     return result;
2159 }
2160 
2161 /*[clinic input]
2162 _sre.SRE_Match.groupdict
2163 
2164     default: object = None
2165         Is used for groups that did not participate in the match.
2166 
2167 Return a dictionary containing all the named subgroups of the match, keyed by the subgroup name.
2168 [clinic start generated code]*/
2169 
2170 static PyObject *
_sre_SRE_Match_groupdict_impl(MatchObject * self,PyObject * default_value)2171 _sre_SRE_Match_groupdict_impl(MatchObject *self, PyObject *default_value)
2172 /*[clinic end generated code: output=29917c9073e41757 input=0ded7960b23780aa]*/
2173 {
2174     PyObject *result;
2175     PyObject *key;
2176     PyObject *value;
2177     Py_ssize_t pos = 0;
2178     Py_hash_t hash;
2179 
2180     result = PyDict_New();
2181     if (!result || !self->pattern->groupindex)
2182         return result;
2183 
2184     while (_PyDict_Next(self->pattern->groupindex, &pos, &key, &value, &hash)) {
2185         int status;
2186         Py_INCREF(key);
2187         value = match_getslice(self, key, default_value);
2188         if (!value) {
2189             Py_DECREF(key);
2190             goto failed;
2191         }
2192         status = _PyDict_SetItem_KnownHash(result, key, value, hash);
2193         Py_DECREF(value);
2194         Py_DECREF(key);
2195         if (status < 0)
2196             goto failed;
2197     }
2198 
2199     return result;
2200 
2201 failed:
2202     Py_DECREF(result);
2203     return NULL;
2204 }
2205 
2206 /*[clinic input]
2207 _sre.SRE_Match.start -> Py_ssize_t
2208 
2209     group: object(c_default="NULL") = 0
2210     /
2211 
2212 Return index of the start of the substring matched by group.
2213 [clinic start generated code]*/
2214 
2215 static Py_ssize_t
_sre_SRE_Match_start_impl(MatchObject * self,PyObject * group)2216 _sre_SRE_Match_start_impl(MatchObject *self, PyObject *group)
2217 /*[clinic end generated code: output=3f6e7f9df2fb5201 input=ced8e4ed4b33ee6c]*/
2218 {
2219     Py_ssize_t index = match_getindex(self, group);
2220 
2221     if (index < 0) {
2222         return -1;
2223     }
2224 
2225     /* mark is -1 if group is undefined */
2226     return self->mark[index*2];
2227 }
2228 
2229 /*[clinic input]
2230 _sre.SRE_Match.end -> Py_ssize_t
2231 
2232     group: object(c_default="NULL") = 0
2233     /
2234 
2235 Return index of the end of the substring matched by group.
2236 [clinic start generated code]*/
2237 
2238 static Py_ssize_t
_sre_SRE_Match_end_impl(MatchObject * self,PyObject * group)2239 _sre_SRE_Match_end_impl(MatchObject *self, PyObject *group)
2240 /*[clinic end generated code: output=f4240b09911f7692 input=1b799560c7f3d7e6]*/
2241 {
2242     Py_ssize_t index = match_getindex(self, group);
2243 
2244     if (index < 0) {
2245         return -1;
2246     }
2247 
2248     /* mark is -1 if group is undefined */
2249     return self->mark[index*2+1];
2250 }
2251 
2252 LOCAL(PyObject*)
_pair(Py_ssize_t i1,Py_ssize_t i2)2253 _pair(Py_ssize_t i1, Py_ssize_t i2)
2254 {
2255     PyObject* pair;
2256     PyObject* item;
2257 
2258     pair = PyTuple_New(2);
2259     if (!pair)
2260         return NULL;
2261 
2262     item = PyLong_FromSsize_t(i1);
2263     if (!item)
2264         goto error;
2265     PyTuple_SET_ITEM(pair, 0, item);
2266 
2267     item = PyLong_FromSsize_t(i2);
2268     if (!item)
2269         goto error;
2270     PyTuple_SET_ITEM(pair, 1, item);
2271 
2272     return pair;
2273 
2274   error:
2275     Py_DECREF(pair);
2276     return NULL;
2277 }
2278 
2279 /*[clinic input]
2280 _sre.SRE_Match.span
2281 
2282     group: object(c_default="NULL") = 0
2283     /
2284 
2285 For match object m, return the 2-tuple (m.start(group), m.end(group)).
2286 [clinic start generated code]*/
2287 
2288 static PyObject *
_sre_SRE_Match_span_impl(MatchObject * self,PyObject * group)2289 _sre_SRE_Match_span_impl(MatchObject *self, PyObject *group)
2290 /*[clinic end generated code: output=f02ae40594d14fe6 input=8fa6014e982d71d4]*/
2291 {
2292     Py_ssize_t index = match_getindex(self, group);
2293 
2294     if (index < 0) {
2295         return NULL;
2296     }
2297 
2298     /* marks are -1 if group is undefined */
2299     return _pair(self->mark[index*2], self->mark[index*2+1]);
2300 }
2301 
2302 static PyObject*
match_regs(MatchObject * self)2303 match_regs(MatchObject* self)
2304 {
2305     PyObject* regs;
2306     PyObject* item;
2307     Py_ssize_t index;
2308 
2309     regs = PyTuple_New(self->groups);
2310     if (!regs)
2311         return NULL;
2312 
2313     for (index = 0; index < self->groups; index++) {
2314         item = _pair(self->mark[index*2], self->mark[index*2+1]);
2315         if (!item) {
2316             Py_DECREF(regs);
2317             return NULL;
2318         }
2319         PyTuple_SET_ITEM(regs, index, item);
2320     }
2321 
2322     Py_INCREF(regs);
2323     self->regs = regs;
2324 
2325     return regs;
2326 }
2327 
2328 /*[clinic input]
2329 _sre.SRE_Match.__copy__
2330 
2331 [clinic start generated code]*/
2332 
2333 static PyObject *
_sre_SRE_Match___copy___impl(MatchObject * self)2334 _sre_SRE_Match___copy___impl(MatchObject *self)
2335 /*[clinic end generated code: output=a779c5fc8b5b4eb4 input=3bb4d30b6baddb5b]*/
2336 {
2337     Py_INCREF(self);
2338     return (PyObject *)self;
2339 }
2340 
2341 /*[clinic input]
2342 _sre.SRE_Match.__deepcopy__
2343 
2344     memo: object
2345     /
2346 
2347 [clinic start generated code]*/
2348 
2349 static PyObject *
_sre_SRE_Match___deepcopy__(MatchObject * self,PyObject * memo)2350 _sre_SRE_Match___deepcopy__(MatchObject *self, PyObject *memo)
2351 /*[clinic end generated code: output=ba7cb46d655e4ee2 input=779d12a31c2c325e]*/
2352 {
2353     Py_INCREF(self);
2354     return (PyObject *)self;
2355 }
2356 
/* Docstring of the Match type (installed as Py_tp_doc in match_slots). */
PyDoc_STRVAR(match_doc,
"The result of re.match() and re.search().\n\
Match objects always have a boolean value of True.");
2360 
/* Docstring of Match.group() (used by the "group" entry of match_methods).
   The leading spaces on the continuation lines are part of the string. */
PyDoc_STRVAR(match_group_doc,
"group([group1, ...]) -> str or tuple.\n\
    Return subgroup(s) of the match by indices or names.\n\
    For 0 returns the entire match.");
2365 
2366 static PyObject *
match_lastindex_get(MatchObject * self,void * Py_UNUSED (ignored))2367 match_lastindex_get(MatchObject *self, void *Py_UNUSED(ignored))
2368 {
2369     if (self->lastindex >= 0)
2370         return PyLong_FromSsize_t(self->lastindex);
2371     Py_RETURN_NONE;
2372 }
2373 
2374 static PyObject *
match_lastgroup_get(MatchObject * self,void * Py_UNUSED (ignored))2375 match_lastgroup_get(MatchObject *self, void *Py_UNUSED(ignored))
2376 {
2377     if (self->pattern->indexgroup &&
2378         self->lastindex >= 0 &&
2379         self->lastindex < PyTuple_GET_SIZE(self->pattern->indexgroup))
2380     {
2381         PyObject *result = PyTuple_GET_ITEM(self->pattern->indexgroup,
2382                                             self->lastindex);
2383         Py_INCREF(result);
2384         return result;
2385     }
2386     Py_RETURN_NONE;
2387 }
2388 
2389 static PyObject *
match_regs_get(MatchObject * self,void * Py_UNUSED (ignored))2390 match_regs_get(MatchObject *self, void *Py_UNUSED(ignored))
2391 {
2392     if (self->regs) {
2393         Py_INCREF(self->regs);
2394         return self->regs;
2395     } else
2396         return match_regs(self);
2397 }
2398 
2399 static PyObject *
match_repr(MatchObject * self)2400 match_repr(MatchObject *self)
2401 {
2402     PyObject *result;
2403     PyObject *group0 = match_getslice_by_index(self, 0, Py_None);
2404     if (group0 == NULL)
2405         return NULL;
2406     result = PyUnicode_FromFormat(
2407             "<%s object; span=(%zd, %zd), match=%.50R>",
2408             Py_TYPE(self)->tp_name,
2409             self->mark[0], self->mark[1], group0);
2410     Py_DECREF(group0);
2411     return result;
2412 }
2413 
2414 
2415 static PyObject*
pattern_new_match(_sremodulestate * module_state,PatternObject * pattern,SRE_STATE * state,Py_ssize_t status)2416 pattern_new_match(_sremodulestate* module_state,
2417                   PatternObject* pattern,
2418                   SRE_STATE* state,
2419                   Py_ssize_t status)
2420 {
2421     /* create match object (from state object) */
2422 
2423     MatchObject* match;
2424     Py_ssize_t i, j;
2425     char* base;
2426     int n;
2427 
2428     if (status > 0) {
2429 
2430         /* create match object (with room for extra group marks) */
2431         /* coverity[ampersand_in_size] */
2432         match = PyObject_GC_NewVar(MatchObject,
2433                                    module_state->Match_Type,
2434                                    2*(pattern->groups+1));
2435         if (!match)
2436             return NULL;
2437 
2438         Py_INCREF(pattern);
2439         match->pattern = pattern;
2440 
2441         Py_INCREF(state->string);
2442         match->string = state->string;
2443 
2444         match->regs = NULL;
2445         match->groups = pattern->groups+1;
2446 
2447         /* fill in group slices */
2448 
2449         base = (char*) state->beginning;
2450         n = state->charsize;
2451 
2452         match->mark[0] = ((char*) state->start - base) / n;
2453         match->mark[1] = ((char*) state->ptr - base) / n;
2454 
2455         for (i = j = 0; i < pattern->groups; i++, j+=2)
2456             if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
2457                 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
2458                 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
2459             } else
2460                 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
2461 
2462         match->pos = state->pos;
2463         match->endpos = state->endpos;
2464 
2465         match->lastindex = state->lastindex;
2466 
2467         PyObject_GC_Track(match);
2468         return (PyObject*) match;
2469 
2470     } else if (status == 0) {
2471 
2472         /* no match */
2473         Py_RETURN_NONE;
2474 
2475     }
2476 
2477     /* internal error */
2478     pattern_error(status);
2479     return NULL;
2480 }
2481 
2482 
2483 /* -------------------------------------------------------------------- */
2484 /* scanner methods (experimental) */
2485 
/* GC traverse function for scanner objects (installed as Py_tp_traverse in
 * scanner_slots).  Visits the object's type and its pattern reference.
 * Py_VISIT relies on the parameters being named `visit` and `arg`.
 */
static int
scanner_traverse(ScannerObject *self, visitproc visit, void *arg)
{
    Py_VISIT(Py_TYPE(self));
    Py_VISIT(self->pattern);
    return 0;
}
2493 
/* GC clear function for scanner objects (installed as Py_tp_clear in
 * scanner_slots, and also called from scanner_dealloc).  Drops the pattern
 * reference and leaves the field NULL.  Always returns 0.
 */
static int
scanner_clear(ScannerObject *self)
{
    Py_CLEAR(self->pattern);
    return 0;
}
2500 
/* Deallocator for scanner objects.
 *
 * The order matters: stop GC tracking, tear down the embedded search
 * state, release the pattern reference, free the object's memory, and only
 * then drop the reference to the type.
 */
static void
scanner_dealloc(ScannerObject* self)
{
    /* Save the type now; self cannot be dereferenced after tp_free(). */
    PyTypeObject *tp = Py_TYPE(self);

    PyObject_GC_UnTrack(self);
    /* presumably releases whatever state_init() acquired -- see state_fini() */
    state_fini(&self->state);
    (void)scanner_clear(self);
    tp->tp_free(self);
    /* presumably instances of this heap type hold a reference to it */
    Py_DECREF(tp);
}
2512 
2513 /*[clinic input]
2514 _sre.SRE_Scanner.match
2515 
2516     cls: defining_class
2517     /
2518 
2519 [clinic start generated code]*/
2520 
2521 static PyObject *
_sre_SRE_Scanner_match_impl(ScannerObject * self,PyTypeObject * cls)2522 _sre_SRE_Scanner_match_impl(ScannerObject *self, PyTypeObject *cls)
2523 /*[clinic end generated code: output=6e22c149dc0f0325 input=b5146e1f30278cb7]*/
2524 {
2525     _sremodulestate *module_state = get_sre_module_state_by_class(cls);
2526     SRE_STATE* state = &self->state;
2527     PyObject* match;
2528     Py_ssize_t status;
2529 
2530     if (state->start == NULL)
2531         Py_RETURN_NONE;
2532 
2533     state_reset(state);
2534 
2535     state->ptr = state->start;
2536 
2537     status = sre_match(state, PatternObject_GetCode(self->pattern));
2538     if (PyErr_Occurred())
2539         return NULL;
2540 
2541     match = pattern_new_match(module_state, (PatternObject*) self->pattern,
2542                               state, status);
2543 
2544     if (status == 0)
2545         state->start = NULL;
2546     else {
2547         state->must_advance = (state->ptr == state->start);
2548         state->start = state->ptr;
2549     }
2550 
2551     return match;
2552 }
2553 
2554 
2555 /*[clinic input]
2556 _sre.SRE_Scanner.search
2557 
2558     cls: defining_class
2559     /
2560 
2561 [clinic start generated code]*/
2562 
2563 static PyObject *
_sre_SRE_Scanner_search_impl(ScannerObject * self,PyTypeObject * cls)2564 _sre_SRE_Scanner_search_impl(ScannerObject *self, PyTypeObject *cls)
2565 /*[clinic end generated code: output=23e8fc78013f9161 input=056c2d37171d0bf2]*/
2566 {
2567     _sremodulestate *module_state = get_sre_module_state_by_class(cls);
2568     SRE_STATE* state = &self->state;
2569     PyObject* match;
2570     Py_ssize_t status;
2571 
2572     if (state->start == NULL)
2573         Py_RETURN_NONE;
2574 
2575     state_reset(state);
2576 
2577     state->ptr = state->start;
2578 
2579     status = sre_search(state, PatternObject_GetCode(self->pattern));
2580     if (PyErr_Occurred())
2581         return NULL;
2582 
2583     match = pattern_new_match(module_state, (PatternObject*) self->pattern,
2584                               state, status);
2585 
2586     if (status == 0)
2587         state->start = NULL;
2588     else {
2589         state->must_advance = (state->ptr == state->start);
2590         state->start = state->ptr;
2591     }
2592 
2593     return match;
2594 }
2595 
2596 static PyObject *
pattern_scanner(_sremodulestate * module_state,PatternObject * self,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)2597 pattern_scanner(_sremodulestate *module_state,
2598                 PatternObject *self,
2599                 PyObject *string,
2600                 Py_ssize_t pos,
2601                 Py_ssize_t endpos)
2602 {
2603     ScannerObject* scanner;
2604 
2605     /* create scanner object */
2606     scanner = PyObject_GC_New(ScannerObject, module_state->Scanner_Type);
2607     if (!scanner)
2608         return NULL;
2609     scanner->pattern = NULL;
2610 
2611     /* create search state object */
2612     if (!state_init(&scanner->state, self, string, pos, endpos)) {
2613         Py_DECREF(scanner);
2614         return NULL;
2615     }
2616 
2617     Py_INCREF(self);
2618     scanner->pattern = (PyObject*) self;
2619 
2620     PyObject_GC_Track(scanner);
2621     return (PyObject*) scanner;
2622 }
2623 
2624 static Py_hash_t
pattern_hash(PatternObject * self)2625 pattern_hash(PatternObject *self)
2626 {
2627     Py_hash_t hash, hash2;
2628 
2629     hash = PyObject_Hash(self->pattern);
2630     if (hash == -1) {
2631         return -1;
2632     }
2633 
2634     hash2 = _Py_HashBytes(self->code, sizeof(self->code[0]) * self->codesize);
2635     hash ^= hash2;
2636 
2637     hash ^= self->flags;
2638     hash ^= self->isbytes;
2639     hash ^= self->codesize;
2640 
2641     if (hash == -1) {
2642         hash = -2;
2643     }
2644     return hash;
2645 }
2646 
2647 static PyObject*
pattern_richcompare(PyObject * lefto,PyObject * righto,int op)2648 pattern_richcompare(PyObject *lefto, PyObject *righto, int op)
2649 {
2650     PyTypeObject *tp = Py_TYPE(lefto);
2651     _sremodulestate *module_state = get_sre_module_state_by_class(tp);
2652     PatternObject *left, *right;
2653     int cmp;
2654 
2655     if (op != Py_EQ && op != Py_NE) {
2656         Py_RETURN_NOTIMPLEMENTED;
2657     }
2658 
2659     if (!Py_IS_TYPE(righto, module_state->Pattern_Type))
2660     {
2661         Py_RETURN_NOTIMPLEMENTED;
2662     }
2663 
2664     if (lefto == righto) {
2665         /* a pattern is equal to itself */
2666         return PyBool_FromLong(op == Py_EQ);
2667     }
2668 
2669     left = (PatternObject *)lefto;
2670     right = (PatternObject *)righto;
2671 
2672     cmp = (left->flags == right->flags
2673            && left->isbytes == right->isbytes
2674            && left->codesize == right->codesize);
2675     if (cmp) {
2676         /* Compare the code and the pattern because the same pattern can
2677            produce different codes depending on the locale used to compile the
2678            pattern when the re.LOCALE flag is used. Don't compare groups,
2679            indexgroup nor groupindex: they are derivated from the pattern. */
2680         cmp = (memcmp(left->code, right->code,
2681                       sizeof(left->code[0]) * left->codesize) == 0);
2682     }
2683     if (cmp) {
2684         cmp = PyObject_RichCompareBool(left->pattern, right->pattern,
2685                                        Py_EQ);
2686         if (cmp < 0) {
2687             return NULL;
2688         }
2689     }
2690     if (op == Py_NE) {
2691         cmp = !cmp;
2692     }
2693     return PyBool_FromLong(cmp);
2694 }
2695 
2696 #include "clinic/_sre.c.h"
2697 
/* Method table of the Pattern type (installed as Py_tp_methods in
   pattern_slots).  The *_METHODDEF macros come from the generated
   "clinic/_sre.c.h" header included just above; each presumably expands to
   one PyMethodDef entry including its trailing comma -- hence no commas
   here. */
static PyMethodDef pattern_methods[] = {
    _SRE_SRE_PATTERN_MATCH_METHODDEF
    _SRE_SRE_PATTERN_FULLMATCH_METHODDEF
    _SRE_SRE_PATTERN_SEARCH_METHODDEF
    _SRE_SRE_PATTERN_SUB_METHODDEF
    _SRE_SRE_PATTERN_SUBN_METHODDEF
    _SRE_SRE_PATTERN_FINDALL_METHODDEF
    _SRE_SRE_PATTERN_SPLIT_METHODDEF
    _SRE_SRE_PATTERN_FINDITER_METHODDEF
    _SRE_SRE_PATTERN_SCANNER_METHODDEF
    _SRE_SRE_PATTERN___COPY___METHODDEF
    _SRE_SRE_PATTERN___DEEPCOPY___METHODDEF
    /* class-level subscription support, per the docstring: PEP 585 */
    {"__class_getitem__", (PyCFunction)Py_GenericAlias, METH_O|METH_CLASS,
     PyDoc_STR("See PEP 585")},
    {NULL, NULL}
};
2714 
/* Computed attributes of the Pattern type (installed as Py_tp_getset in
   pattern_slots).  "groupindex" has a getter but no setter. */
static PyGetSetDef pattern_getset[] = {
    {"groupindex", (getter)pattern_groupindex, (setter)NULL,
      "A dictionary mapping group names to group numbers."},
    {NULL}  /* Sentinel */
};
2720 
/* Byte offset of field x inside PatternObject. */
#define PAT_OFF(x) offsetof(PatternObject, x)
/* Read-only attributes of the Pattern type that map directly onto struct
   fields (installed as Py_tp_members in pattern_slots).  The
   "__weaklistoffset__" entry reports the offset of the weakreflist field. */
static PyMemberDef pattern_members[] = {
    {"pattern",    T_OBJECT,    PAT_OFF(pattern),       READONLY,
     "The pattern string from which the RE object was compiled."},
    {"flags",      T_INT,       PAT_OFF(flags),         READONLY,
     "The regex matching flags."},
    {"groups",     T_PYSSIZET,  PAT_OFF(groups),        READONLY,
     "The number of capturing groups in the pattern."},
    {"__weaklistoffset__", T_PYSSIZET, offsetof(PatternObject, weakreflist), READONLY},
    {NULL}  /* Sentinel */
};
2732 
/* Slot table of the Pattern type (referenced by pattern_spec): lifecycle,
   repr, hashing, comparison, GC hooks and the attribute/method tables. */
static PyType_Slot pattern_slots[] = {
    {Py_tp_dealloc, (destructor)pattern_dealloc},
    {Py_tp_repr, (reprfunc)pattern_repr},
    {Py_tp_hash, (hashfunc)pattern_hash},
    {Py_tp_doc, (void *)pattern_doc},
    {Py_tp_richcompare, pattern_richcompare},
    {Py_tp_methods, pattern_methods},
    {Py_tp_members, pattern_members},
    {Py_tp_getset, pattern_getset},
    {Py_tp_traverse, pattern_traverse},
    {Py_tp_clear, pattern_clear},
    {0, NULL},
};
2746 
/* Type specification for "re.Pattern".  Instances are variable-sized with
   one SRE_CODE per item (presumably the trailing code[] array -- confirm
   against PatternObject).  The flags make the type immutable, GC-aware and
   not instantiable from Python. */
static PyType_Spec pattern_spec = {
    .name = "re.Pattern",
    .basicsize = sizeof(PatternObject),
    .itemsize = sizeof(SRE_CODE),
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
    .slots = pattern_slots,
};
2755 
/* Method table of the Match type (installed as Py_tp_methods in
   match_slots).  "group" is hand-written (METH_VARARGS); the *_METHODDEF
   macros come from the generated "clinic/_sre.c.h" header and presumably
   each expand to one entry including its trailing comma. */
static PyMethodDef match_methods[] = {
    {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
    _SRE_SRE_MATCH_START_METHODDEF
    _SRE_SRE_MATCH_END_METHODDEF
    _SRE_SRE_MATCH_SPAN_METHODDEF
    _SRE_SRE_MATCH_GROUPS_METHODDEF
    _SRE_SRE_MATCH_GROUPDICT_METHODDEF
    _SRE_SRE_MATCH_EXPAND_METHODDEF
    _SRE_SRE_MATCH___COPY___METHODDEF
    _SRE_SRE_MATCH___DEEPCOPY___METHODDEF
    /* class-level subscription support, per the docstring: PEP 585 */
    {"__class_getitem__", (PyCFunction)Py_GenericAlias, METH_O|METH_CLASS,
     PyDoc_STR("See PEP 585")},
    {NULL, NULL}
};
2770 
/* Computed, read-only attributes of the Match type (installed as
   Py_tp_getset in match_slots).  "regs" has no docstring. */
static PyGetSetDef match_getset[] = {
    {"lastindex", (getter)match_lastindex_get, (setter)NULL,
     "The integer index of the last matched capturing group."},
    {"lastgroup", (getter)match_lastgroup_get, (setter)NULL,
     "The name of the last matched capturing group."},
    {"regs",      (getter)match_regs_get,      (setter)NULL},
    {NULL}
};
2779 
/* Byte offset of field x inside MatchObject. */
#define MATCH_OFF(x) offsetof(MatchObject, x)
/* Read-only attributes of the Match type that map directly onto struct
   fields (installed as Py_tp_members in match_slots).  Note that the
   Python-level name "re" exposes the `pattern` field. */
static PyMemberDef match_members[] = {
    {"string",  T_OBJECT,   MATCH_OFF(string),  READONLY,
     "The string passed to match() or search()."},
    {"re",      T_OBJECT,   MATCH_OFF(pattern), READONLY,
     "The regular expression object."},
    {"pos",     T_PYSSIZET, MATCH_OFF(pos),     READONLY,
     "The index into the string at which the RE engine started looking for a match."},
    {"endpos",  T_PYSSIZET, MATCH_OFF(endpos),  READONLY,
     "The index into the string beyond which the RE engine will not go."},
    {NULL}
};
2792 
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
/* Slot table of the Match type (referenced by match_spec): lifecycle,
   repr, GC hooks, the attribute/method tables and subscript support. */
static PyType_Slot match_slots[] = {
    {Py_tp_dealloc, match_dealloc},
    {Py_tp_repr, match_repr},
    {Py_tp_doc, (void *)match_doc},
    {Py_tp_methods, match_methods},
    {Py_tp_members, match_members},
    {Py_tp_getset, match_getset},
    {Py_tp_traverse, match_traverse},
    {Py_tp_clear, match_clear},

    /* As mapping.
     *
     * Match objects do not support length or assignment, but do support
     * __getitem__.
     */
    {Py_mp_subscript, match_getitem},

    {0, NULL},
};
2814 
/* Type specification for "re.Match".  Instances are variable-sized with
   one Py_ssize_t per item: pattern_new_match() allocates 2*(groups+1)
   items and stores the group marks in them.  The flags make the type
   immutable, GC-aware and not instantiable from Python. */
static PyType_Spec match_spec = {
    .name = "re.Match",
    .basicsize = sizeof(MatchObject),
    .itemsize = sizeof(Py_ssize_t),
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
    .slots = match_slots,
};
2823 
/* Method table of the scanner type (installed as Py_tp_methods in
   scanner_slots).  The *_METHODDEF macros come from the generated
   "clinic/_sre.c.h" header and presumably each expand to one entry
   including its trailing comma. */
static PyMethodDef scanner_methods[] = {
    _SRE_SRE_SCANNER_MATCH_METHODDEF
    _SRE_SRE_SCANNER_SEARCH_METHODDEF
    {NULL, NULL}
};
2829 
/* Byte offset of field x inside ScannerObject. */
#define SCAN_OFF(x) offsetof(ScannerObject, x)
/* Read-only attributes of the scanner type (installed as Py_tp_members in
   scanner_slots): only the pattern the scanner was created from. */
static PyMemberDef scanner_members[] = {
    {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
    {NULL}  /* Sentinel */
};
2835 
/* Slot table of the scanner type (referenced by scanner_spec): deallocator,
   GC hooks and the attribute/method tables. */
static PyType_Slot scanner_slots[] = {
    {Py_tp_dealloc, scanner_dealloc},
    {Py_tp_methods, scanner_methods},
    {Py_tp_members, scanner_members},
    {Py_tp_traverse, scanner_traverse},
    {Py_tp_clear, scanner_clear},
    {0, NULL},
};
2844 
/* Type specification for the scanner type.  The name is assembled at
   compile time from "_", SRE_MODULE and ".SRE_Scanner".  No itemsize is
   given, unlike the Pattern and Match specs.  The flags make the type
   immutable, GC-aware and not instantiable from Python. */
static PyType_Spec scanner_spec = {
    .name = "_" SRE_MODULE ".SRE_Scanner",
    .basicsize = sizeof(ScannerObject),
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
    .slots = scanner_slots,
};
2852 
2853 static PyMethodDef _functions[] = {
2854     _SRE_COMPILE_METHODDEF
2855     _SRE_GETCODESIZE_METHODDEF
2856     _SRE_ASCII_ISCASED_METHODDEF
2857     _SRE_UNICODE_ISCASED_METHODDEF
2858     _SRE_ASCII_TOLOWER_METHODDEF
2859     _SRE_UNICODE_TOLOWER_METHODDEF
2860     {NULL, NULL}
2861 };
2862 
2863 static int
sre_traverse(PyObject * module,visitproc visit,void * arg)2864 sre_traverse(PyObject *module, visitproc visit, void *arg)
2865 {
2866     _sremodulestate *state = get_sre_module_state(module);
2867 
2868     Py_VISIT(state->Pattern_Type);
2869     Py_VISIT(state->Match_Type);
2870     Py_VISIT(state->Scanner_Type);
2871 
2872     return 0;
2873 }
2874 
2875 static int
sre_clear(PyObject * module)2876 sre_clear(PyObject *module)
2877 {
2878     _sremodulestate *state = get_sre_module_state(module);
2879 
2880     Py_CLEAR(state->Pattern_Type);
2881     Py_CLEAR(state->Match_Type);
2882     Py_CLEAR(state->Scanner_Type);
2883 
2884     return 0;
2885 }
2886 
2887 static void
sre_free(void * module)2888 sre_free(void *module)
2889 {
2890     sre_clear((PyObject *)module);
2891 }
2892 
/* Create a heap type from `spec`, bound to module `m`, and store it in
 * the lvalue `type`.  On failure, jump to the `error` label, which the
 * enclosing function must provide.
 */
#define CREATE_TYPE(m, type, spec)                                          \
    do {                                                                    \
        (type) = (PyTypeObject *)PyType_FromModuleAndSpec((m), (spec),      \
                                                          NULL);            \
        if ((type) == NULL) {                                               \
            goto error;                                                     \
        }                                                                   \
    } while (0)
2900 
/* Add `value` to `module` under `name` as a Python int built with
 * PyLong_FromUnsignedLong().
 *
 * The temporary int object is released after the add call, whether or
 * not the add succeeded.  On any failure, jump to the `error` label,
 * which the enclosing function must provide.
 */
#define ADD_ULONG_CONSTANT(module, name, value)                              \
    do {                                                                     \
        PyObject *const_obj = PyLong_FromUnsignedLong(value);                \
        if (const_obj == NULL) {                                             \
            goto error;                                                      \
        }                                                                    \
        int add_status = PyModule_AddObjectRef(module, name, const_obj);     \
        Py_DECREF(const_obj);                                                \
        if (add_status < 0) {                                                \
            goto error;                                                      \
        }                                                                    \
    } while (0)
2912 
2913 static int
sre_exec(PyObject * m)2914 sre_exec(PyObject *m)
2915 {
2916     _sremodulestate *state;
2917 
2918     /* Create heap types */
2919     state = get_sre_module_state(m);
2920     CREATE_TYPE(m, state->Pattern_Type, &pattern_spec);
2921     CREATE_TYPE(m, state->Match_Type, &match_spec);
2922     CREATE_TYPE(m, state->Scanner_Type, &scanner_spec);
2923 
2924     if (PyModule_AddIntConstant(m, "MAGIC", SRE_MAGIC) < 0) {
2925         goto error;
2926     }
2927 
2928     if (PyModule_AddIntConstant(m, "CODESIZE", sizeof(SRE_CODE)) < 0) {
2929         goto error;
2930     }
2931 
2932     ADD_ULONG_CONSTANT(m, "MAXREPEAT", SRE_MAXREPEAT);
2933     ADD_ULONG_CONSTANT(m, "MAXGROUPS", SRE_MAXGROUPS);
2934 
2935     if (PyModule_AddStringConstant(m, "copyright", copyright) < 0) {
2936         goto error;
2937     }
2938 
2939     return 0;
2940 
2941 error:
2942     return -1;
2943 }
2944 
2945 static PyModuleDef_Slot sre_slots[] = {
2946     {Py_mod_exec, sre_exec},
2947     {0, NULL},
2948 };
2949 
2950 static struct PyModuleDef sremodule = {
2951     .m_base = PyModuleDef_HEAD_INIT,
2952     .m_name = "_" SRE_MODULE,
2953     .m_size = sizeof(_sremodulestate),
2954     .m_methods = _functions,
2955     .m_slots = sre_slots,
2956     .m_traverse = sre_traverse,
2957     .m_free = sre_free,
2958     .m_clear = sre_clear,
2959 };
2960 
2961 PyMODINIT_FUNC
PyInit__sre(void)2962 PyInit__sre(void)
2963 {
2964     return PyModuleDef_Init(&sremodule);
2965 }
2966 
2967 /* vim:ts=4:sw=4:et
2968 */
2969