1 /*
2 * Secret Labs' Regular Expression Engine
3 *
4 * regular expression matching engine
5 *
6 * partial history:
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-08-01 fl fixes for 1.6b1
10 * 2000-08-07 fl use PyOS_CheckStack() if available
11 * 2000-09-20 fl added expand method
12 * 2001-03-20 fl lots of fixes for 2.1b2
13 * 2001-04-15 fl export copyright as Python attribute, not global
14 * 2001-04-28 fl added __copy__ methods (work in progress)
15 * 2001-05-14 fl fixes for 1.5.2 compatibility
16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 * 2001-10-20 fl added split primitive; re-enable unicode for 1.6/2.0/2.1
19 * 2001-10-21 fl added sub/subn primitive
20 * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 * 2002-11-09 fl fixed empty sub/subn return type
23 * 2003-04-18 mvl fully support 4-byte codes
24 * 2003-10-17 gn implemented non recursive scheme
25 * 2013-02-04 mrab added fullmatch primitive
26 *
27 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
28 *
29 * This version of the SRE library can be redistributed under CNRI's
30 * Python 1.6 license. For any other use, please contact Secret Labs
31 * AB (info@pythonware.com).
32 *
33 * Portions of this engine have been developed in cooperation with
34 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
35 * other compatibility work.
36 */
37
/* Version/copyright banner kept as a file-local string.
   NOTE(review): the history above says it is exported as a Python
   attribute; the export itself is not visible in this part of the file. */
static const char copyright[] =
    " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
40
41 #define PY_SSIZE_T_CLEAN
42
43 #include "Python.h"
44 #include "pycore_long.h" // _PyLong_GetZero()
45 #include "pycore_moduleobject.h" // _PyModule_GetState()
46 #include "structmember.h" // PyMemberDef
47
48 #include "sre.h"
49
50 #define SRE_CODE_BITS (8 * sizeof(SRE_CODE))
51
52 #include <ctype.h>
53
54 /* name of this module, minus the leading underscore */
55 #if !defined(SRE_MODULE)
56 #define SRE_MODULE "sre"
57 #endif
58
59 #define SRE_PY_MODULE "re"
60
61 /* defining this one enables tracing */
62 #undef VERBOSE
63
64 /* -------------------------------------------------------------------- */
65
/* LOCAL(type) introduces a file-local inline function returning `type`.
   MSVC builds additionally select the __fastcall calling convention. */
#if defined(_MSC_VER)
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#else
#define LOCAL(type) static inline type
#endif
74
/* error codes */
/* Negative status values; pattern_error() maps them to Python exceptions. */
#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
#define SRE_ERROR_STATE -2 /* illegal state */
#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
#define SRE_ERROR_MEMORY -9 /* out of memory */
#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */

/* TRACE((fmt, ...)) takes a parenthesised printf() argument list.  It
   forwards to printf when VERBOSE is defined and expands to nothing
   otherwise. */
#if defined(VERBOSE)
#define TRACE(v) printf v
#else
#define TRACE(v)
#endif
87
88 /* -------------------------------------------------------------------- */
89 /* search engine state */
90
/* ASCII-only character predicates.  Each one first compares `ch` against
   an upper bound, so the Py_IS* classifier is only evaluated for code
   points at or below that bound.  `ch` is evaluated more than once; do
   not pass expressions with side effects. */
#define SRE_IS_DIGIT(ch)\
    ((ch) <= '9' && Py_ISDIGIT(ch))
#define SRE_IS_SPACE(ch)\
    ((ch) <= ' ' && Py_ISSPACE(ch))
/* Only '\n' counts as a line break in the ASCII category. */
#define SRE_IS_LINEBREAK(ch)\
    ((ch) == '\n')
/* Word characters are ASCII alphanumerics plus underscore. */
#define SRE_IS_WORD(ch)\
    ((ch) <= 'z' && (Py_ISALNUM(ch) || (ch) == '_'))
99
/* Lower-case an ASCII code point.  Values of 128 and above are returned
   unchanged. */
static unsigned int sre_lower_ascii(unsigned int ch)
{
    if (ch >= 128) {
        return ch;
    }
    return Py_TOLOWER(ch);
}
104
/* locale-specific character predicates */
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
 * warnings when c's type supports only numbers < N+1 */
/* SRE_LOC_IS_ALNUM yields 0 for any value with bits set above the low
   eight; otherwise it defers to the C library's isalnum(). */
#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
/* Locale word characters: locale alphanumerics plus underscore. */
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
110
/* Lower-case `ch` with the C library's tolower().  Only values below 256
   are handed to tolower(); anything larger is returned unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch >= 256) {
        return ch;
    }
    return (unsigned int)tolower(ch);
}
115
/* Upper-case `ch` with the C library's toupper().  Only values below 256
   are handed to toupper(); anything larger is returned unchanged. */
static unsigned int sre_upper_locale(unsigned int ch)
{
    if (ch >= 256) {
        return ch;
    }
    return (unsigned int)toupper(ch);
}
120
/* unicode-specific character predicates */
/* Thin aliases over the Py_UNICODE_* classification macros. */

#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE(ch)
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM(ch)
/* Unicode word characters: Unicode alphanumerics plus underscore. */
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM(ch) || (ch) == '_')
128
/* Lower-case a code point via Py_UNICODE_TOLOWER. */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    unsigned int lowered = (unsigned int) Py_UNICODE_TOLOWER(ch);

    return lowered;
}
133
/* Upper-case a code point via Py_UNICODE_TOUPPER. */
static unsigned int sre_upper_unicode(unsigned int ch)
{
    unsigned int raised = (unsigned int) Py_UNICODE_TOUPPER(ch);

    return raised;
}
138
139 LOCAL(int)
sre_category(SRE_CODE category,unsigned int ch)140 sre_category(SRE_CODE category, unsigned int ch)
141 {
142 switch (category) {
143
144 case SRE_CATEGORY_DIGIT:
145 return SRE_IS_DIGIT(ch);
146 case SRE_CATEGORY_NOT_DIGIT:
147 return !SRE_IS_DIGIT(ch);
148 case SRE_CATEGORY_SPACE:
149 return SRE_IS_SPACE(ch);
150 case SRE_CATEGORY_NOT_SPACE:
151 return !SRE_IS_SPACE(ch);
152 case SRE_CATEGORY_WORD:
153 return SRE_IS_WORD(ch);
154 case SRE_CATEGORY_NOT_WORD:
155 return !SRE_IS_WORD(ch);
156 case SRE_CATEGORY_LINEBREAK:
157 return SRE_IS_LINEBREAK(ch);
158 case SRE_CATEGORY_NOT_LINEBREAK:
159 return !SRE_IS_LINEBREAK(ch);
160
161 case SRE_CATEGORY_LOC_WORD:
162 return SRE_LOC_IS_WORD(ch);
163 case SRE_CATEGORY_LOC_NOT_WORD:
164 return !SRE_LOC_IS_WORD(ch);
165
166 case SRE_CATEGORY_UNI_DIGIT:
167 return SRE_UNI_IS_DIGIT(ch);
168 case SRE_CATEGORY_UNI_NOT_DIGIT:
169 return !SRE_UNI_IS_DIGIT(ch);
170 case SRE_CATEGORY_UNI_SPACE:
171 return SRE_UNI_IS_SPACE(ch);
172 case SRE_CATEGORY_UNI_NOT_SPACE:
173 return !SRE_UNI_IS_SPACE(ch);
174 case SRE_CATEGORY_UNI_WORD:
175 return SRE_UNI_IS_WORD(ch);
176 case SRE_CATEGORY_UNI_NOT_WORD:
177 return !SRE_UNI_IS_WORD(ch);
178 case SRE_CATEGORY_UNI_LINEBREAK:
179 return SRE_UNI_IS_LINEBREAK(ch);
180 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
181 return !SRE_UNI_IS_LINEBREAK(ch);
182 }
183 return 0;
184 }
185
186 LOCAL(int)
char_loc_ignore(SRE_CODE pattern,SRE_CODE ch)187 char_loc_ignore(SRE_CODE pattern, SRE_CODE ch)
188 {
189 return ch == pattern
190 || (SRE_CODE) sre_lower_locale(ch) == pattern
191 || (SRE_CODE) sre_upper_locale(ch) == pattern;
192 }
193
194
195 /* helpers */
196
197 static void
data_stack_dealloc(SRE_STATE * state)198 data_stack_dealloc(SRE_STATE* state)
199 {
200 if (state->data_stack) {
201 PyMem_Free(state->data_stack);
202 state->data_stack = NULL;
203 }
204 state->data_stack_size = state->data_stack_base = 0;
205 }
206
207 static int
data_stack_grow(SRE_STATE * state,Py_ssize_t size)208 data_stack_grow(SRE_STATE* state, Py_ssize_t size)
209 {
210 Py_ssize_t minsize, cursize;
211 minsize = state->data_stack_base+size;
212 cursize = state->data_stack_size;
213 if (cursize < minsize) {
214 void* stack;
215 cursize = minsize+minsize/4+1024;
216 TRACE(("allocate/grow stack %zd\n", cursize));
217 stack = PyMem_Realloc(state->data_stack, cursize);
218 if (!stack) {
219 data_stack_dealloc(state);
220 return SRE_ERROR_MEMORY;
221 }
222 state->data_stack = (char *)stack;
223 state->data_stack_size = cursize;
224 }
225 return 0;
226 }
227
228 /* generate 8-bit version */
229
230 #define SRE_CHAR Py_UCS1
231 #define SIZEOF_SRE_CHAR 1
232 #define SRE(F) sre_ucs1_##F
233 #include "sre_lib.h"
234
235 /* generate 16-bit unicode version */
236
237 #define SRE_CHAR Py_UCS2
238 #define SIZEOF_SRE_CHAR 2
239 #define SRE(F) sre_ucs2_##F
240 #include "sre_lib.h"
241
242 /* generate 32-bit unicode version */
243
244 #define SRE_CHAR Py_UCS4
245 #define SIZEOF_SRE_CHAR 4
246 #define SRE(F) sre_ucs4_##F
247 #include "sre_lib.h"
248
249 /* -------------------------------------------------------------------- */
250 /* factories and destructors */
251
/* module state */
/* Per-module state: the type objects that the clinic class declarations
   further down in this file look up through
   get_sre_module_state_by_class(). */
typedef struct {
    PyTypeObject *Pattern_Type;  /* type used for _sre.SRE_Pattern */
    PyTypeObject *Match_Type;    /* type used for _sre.SRE_Match */
    PyTypeObject *Scanner_Type;  /* type used for _sre.SRE_Scanner */
} _sremodulestate;
258
259 static _sremodulestate *
get_sre_module_state(PyObject * m)260 get_sre_module_state(PyObject *m)
261 {
262 _sremodulestate *state = (_sremodulestate *)_PyModule_GetState(m);
263 assert(state);
264 return state;
265 }
266
/* Declared ahead of use.  NOTE(review): presumably given its initializer
   further down the file -- not visible from here. */
static struct PyModuleDef sremodule;
/* Fetch the module state starting from a type object: PyType_GetModule()
   yields the module, get_sre_module_state() its state. */
#define get_sre_module_state_by_class(cls) \
    (get_sre_module_state(PyType_GetModule(cls)))

/* see sre.h for object declarations */
/* Forward declarations of helpers that are called before their
   definitions appear. */
static PyObject*pattern_new_match(_sremodulestate *, PatternObject*, SRE_STATE*, Py_ssize_t);
static PyObject *pattern_scanner(_sremodulestate *, PatternObject *, PyObject *, Py_ssize_t, Py_ssize_t);
274
275 /*[clinic input]
276 module _sre
277 class _sre.SRE_Pattern "PatternObject *" "get_sre_module_state_by_class(tp)->Pattern_Type"
278 class _sre.SRE_Match "MatchObject *" "get_sre_module_state_by_class(tp)->Match_Type"
279 class _sre.SRE_Scanner "ScannerObject *" "get_sre_module_state_by_class(tp)->Scanner_Type"
280 [clinic start generated code]*/
281 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=fe2966e32b66a231]*/
282
283 /*[clinic input]
284 _sre.getcodesize -> int
285 [clinic start generated code]*/
286
287 static int
_sre_getcodesize_impl(PyObject * module)288 _sre_getcodesize_impl(PyObject *module)
289 /*[clinic end generated code: output=e0db7ce34a6dd7b1 input=bd6f6ecf4916bb2b]*/
290 {
291 return sizeof(SRE_CODE);
292 }
293
294 /*[clinic input]
295 _sre.ascii_iscased -> bool
296
297 character: int
298 /
299
300 [clinic start generated code]*/
301
302 static int
_sre_ascii_iscased_impl(PyObject * module,int character)303 _sre_ascii_iscased_impl(PyObject *module, int character)
304 /*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
305 {
306 unsigned int ch = (unsigned int)character;
307 return ch < 128 && Py_ISALPHA(ch);
308 }
309
310 /*[clinic input]
311 _sre.unicode_iscased -> bool
312
313 character: int
314 /
315
316 [clinic start generated code]*/
317
318 static int
_sre_unicode_iscased_impl(PyObject * module,int character)319 _sre_unicode_iscased_impl(PyObject *module, int character)
320 /*[clinic end generated code: output=9c5ddee0dc2bc258 input=51e42c3b8dddb78e]*/
321 {
322 unsigned int ch = (unsigned int)character;
323 return ch != sre_lower_unicode(ch) || ch != sre_upper_unicode(ch);
324 }
325
326 /*[clinic input]
327 _sre.ascii_tolower -> int
328
329 character: int
330 /
331
332 [clinic start generated code]*/
333
334 static int
_sre_ascii_tolower_impl(PyObject * module,int character)335 _sre_ascii_tolower_impl(PyObject *module, int character)
336 /*[clinic end generated code: output=228294ed6ff2a612 input=272c609b5b61f136]*/
337 {
338 return sre_lower_ascii(character);
339 }
340
341 /*[clinic input]
342 _sre.unicode_tolower -> int
343
344 character: int
345 /
346
347 [clinic start generated code]*/
348
349 static int
_sre_unicode_tolower_impl(PyObject * module,int character)350 _sre_unicode_tolower_impl(PyObject *module, int character)
351 /*[clinic end generated code: output=6422272d7d7fee65 input=91d708c5f3c2045a]*/
352 {
353 return sre_lower_unicode(character);
354 }
355
356 LOCAL(void)
state_reset(SRE_STATE * state)357 state_reset(SRE_STATE* state)
358 {
359 /* state->mark will be set to 0 in SRE_OP_MARK dynamically. */
360 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
361
362 state->lastmark = -1;
363 state->lastindex = -1;
364
365 state->repeat = NULL;
366
367 data_stack_dealloc(state);
368 }
369
370 static const void*
getstring(PyObject * string,Py_ssize_t * p_length,int * p_isbytes,int * p_charsize,Py_buffer * view)371 getstring(PyObject* string, Py_ssize_t* p_length,
372 int* p_isbytes, int* p_charsize,
373 Py_buffer *view)
374 {
375 /* given a python object, return a data pointer, a length (in
376 characters), and a character size. return NULL if the object
377 is not a string (or not compatible) */
378
379 /* Unicode objects do not support the buffer API. So, get the data
380 directly instead. */
381 if (PyUnicode_Check(string)) {
382 if (PyUnicode_READY(string) == -1)
383 return NULL;
384 *p_length = PyUnicode_GET_LENGTH(string);
385 *p_charsize = PyUnicode_KIND(string);
386 *p_isbytes = 0;
387 return PyUnicode_DATA(string);
388 }
389
390 /* get pointer to byte string buffer */
391 if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) {
392 PyErr_SetString(PyExc_TypeError, "expected string or bytes-like object");
393 return NULL;
394 }
395
396 *p_length = view->len;
397 *p_charsize = 1;
398 *p_isbytes = 1;
399
400 if (view->buf == NULL) {
401 PyErr_SetString(PyExc_ValueError, "Buffer is NULL");
402 PyBuffer_Release(view);
403 view->buf = NULL;
404 return NULL;
405 }
406 return view->buf;
407 }
408
409 LOCAL(PyObject*)
state_init(SRE_STATE * state,PatternObject * pattern,PyObject * string,Py_ssize_t start,Py_ssize_t end)410 state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
411 Py_ssize_t start, Py_ssize_t end)
412 {
413 /* prepare state object */
414
415 Py_ssize_t length;
416 int isbytes, charsize;
417 const void* ptr;
418
419 memset(state, 0, sizeof(SRE_STATE));
420
421 state->mark = PyMem_New(const void *, pattern->groups * 2);
422 if (!state->mark) {
423 PyErr_NoMemory();
424 goto err;
425 }
426 state->lastmark = -1;
427 state->lastindex = -1;
428
429 state->buffer.buf = NULL;
430 ptr = getstring(string, &length, &isbytes, &charsize, &state->buffer);
431 if (!ptr)
432 goto err;
433
434 if (isbytes && pattern->isbytes == 0) {
435 PyErr_SetString(PyExc_TypeError,
436 "cannot use a string pattern on a bytes-like object");
437 goto err;
438 }
439 if (!isbytes && pattern->isbytes > 0) {
440 PyErr_SetString(PyExc_TypeError,
441 "cannot use a bytes pattern on a string-like object");
442 goto err;
443 }
444
445 /* adjust boundaries */
446 if (start < 0)
447 start = 0;
448 else if (start > length)
449 start = length;
450
451 if (end < 0)
452 end = 0;
453 else if (end > length)
454 end = length;
455
456 state->isbytes = isbytes;
457 state->charsize = charsize;
458 state->match_all = 0;
459 state->must_advance = 0;
460
461 state->beginning = ptr;
462
463 state->start = (void*) ((char*) ptr + start * state->charsize);
464 state->end = (void*) ((char*) ptr + end * state->charsize);
465
466 Py_INCREF(string);
467 state->string = string;
468 state->pos = start;
469 state->endpos = end;
470
471 return string;
472 err:
473 /* We add an explicit cast here because MSVC has a bug when
474 compiling C code where it believes that `const void**` cannot be
475 safely casted to `void*`, see bpo-39943 for details. */
476 PyMem_Free((void*) state->mark);
477 state->mark = NULL;
478 if (state->buffer.buf)
479 PyBuffer_Release(&state->buffer);
480 return NULL;
481 }
482
483 LOCAL(void)
state_fini(SRE_STATE * state)484 state_fini(SRE_STATE* state)
485 {
486 if (state->buffer.buf)
487 PyBuffer_Release(&state->buffer);
488 Py_XDECREF(state->string);
489 data_stack_dealloc(state);
490 /* See above PyMem_Del for why we explicitly cast here. */
491 PyMem_Free((void*) state->mark);
492 state->mark = NULL;
493 }
494
/* calculate offset from start of string */
/* Byte distance of `member` from state->beginning, divided by the
   character size, i.e. a character index.  `state` is evaluated twice. */
#define STATE_OFFSET(state, member)\
    (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
498
499 LOCAL(PyObject*)
getslice(int isbytes,const void * ptr,PyObject * string,Py_ssize_t start,Py_ssize_t end)500 getslice(int isbytes, const void *ptr,
501 PyObject* string, Py_ssize_t start, Py_ssize_t end)
502 {
503 if (isbytes) {
504 if (PyBytes_CheckExact(string) &&
505 start == 0 && end == PyBytes_GET_SIZE(string)) {
506 Py_INCREF(string);
507 return string;
508 }
509 return PyBytes_FromStringAndSize(
510 (const char *)ptr + start, end - start);
511 }
512 else {
513 return PyUnicode_Substring(string, start, end);
514 }
515 }
516
517 LOCAL(PyObject*)
state_getslice(SRE_STATE * state,Py_ssize_t index,PyObject * string,int empty)518 state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
519 {
520 Py_ssize_t i, j;
521
522 index = (index - 1) * 2;
523
524 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
525 if (empty)
526 /* want empty string */
527 i = j = 0;
528 else {
529 Py_RETURN_NONE;
530 }
531 } else {
532 i = STATE_OFFSET(state, state->mark[index]);
533 j = STATE_OFFSET(state, state->mark[index+1]);
534 }
535
536 return getslice(state->isbytes, state->beginning, string, i, j);
537 }
538
539 static void
pattern_error(Py_ssize_t status)540 pattern_error(Py_ssize_t status)
541 {
542 switch (status) {
543 case SRE_ERROR_RECURSION_LIMIT:
544 /* This error code seems to be unused. */
545 PyErr_SetString(
546 PyExc_RecursionError,
547 "maximum recursion limit exceeded"
548 );
549 break;
550 case SRE_ERROR_MEMORY:
551 PyErr_NoMemory();
552 break;
553 case SRE_ERROR_INTERRUPTED:
554 /* An exception has already been raised, so let it fly */
555 break;
556 default:
557 /* other error codes indicate compiler/engine bugs */
558 PyErr_SetString(
559 PyExc_RuntimeError,
560 "internal error in regular expression engine"
561 );
562 }
563 }
564
/* GC traversal hook: report the object's type and the three object
   references it holds (groupindex, indexgroup, pattern) to `visit`.
   Py_VISIT returns from this function as soon as the visitor returns a
   non-zero value; otherwise 0 is returned. */
static int
pattern_traverse(PatternObject *self, visitproc visit, void *arg)
{
    Py_VISIT(Py_TYPE(self));
    Py_VISIT(self->groupindex);
    Py_VISIT(self->indexgroup);
    Py_VISIT(self->pattern);
    return 0;
}
574
/* GC clear hook: drop the references held in groupindex, indexgroup and
   pattern, leaving each field NULL.  Always returns 0. */
static int
pattern_clear(PatternObject *self)
{
    Py_CLEAR(self->groupindex);
    Py_CLEAR(self->indexgroup);
    Py_CLEAR(self->pattern);
    return 0;
}
583
584 static void
pattern_dealloc(PatternObject * self)585 pattern_dealloc(PatternObject* self)
586 {
587 PyTypeObject *tp = Py_TYPE(self);
588
589 PyObject_GC_UnTrack(self);
590 if (self->weakreflist != NULL) {
591 PyObject_ClearWeakRefs((PyObject *) self);
592 }
593 (void)pattern_clear(self);
594 tp->tp_free(self);
595 Py_DECREF(tp);
596 }
597
598 LOCAL(Py_ssize_t)
sre_match(SRE_STATE * state,SRE_CODE * pattern)599 sre_match(SRE_STATE* state, SRE_CODE* pattern)
600 {
601 if (state->charsize == 1)
602 return sre_ucs1_match(state, pattern, 1);
603 if (state->charsize == 2)
604 return sre_ucs2_match(state, pattern, 1);
605 assert(state->charsize == 4);
606 return sre_ucs4_match(state, pattern, 1);
607 }
608
609 LOCAL(Py_ssize_t)
sre_search(SRE_STATE * state,SRE_CODE * pattern)610 sre_search(SRE_STATE* state, SRE_CODE* pattern)
611 {
612 if (state->charsize == 1)
613 return sre_ucs1_search(state, pattern);
614 if (state->charsize == 2)
615 return sre_ucs2_search(state, pattern);
616 assert(state->charsize == 4);
617 return sre_ucs4_search(state, pattern);
618 }
619
620 /*[clinic input]
621 _sre.SRE_Pattern.match
622
623 cls: defining_class
624 /
625 string: object
626 pos: Py_ssize_t = 0
627 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
628
629 Matches zero or more characters at the beginning of the string.
630 [clinic start generated code]*/
631
632 static PyObject *
_sre_SRE_Pattern_match_impl(PatternObject * self,PyTypeObject * cls,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)633 _sre_SRE_Pattern_match_impl(PatternObject *self, PyTypeObject *cls,
634 PyObject *string, Py_ssize_t pos,
635 Py_ssize_t endpos)
636 /*[clinic end generated code: output=ec6208ea58a0cca0 input=4bdb9c3e564d13ac]*/
637 {
638 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
639 SRE_STATE state;
640 Py_ssize_t status;
641 PyObject *match;
642
643 if (!state_init(&state, (PatternObject *)self, string, pos, endpos))
644 return NULL;
645
646 state.ptr = state.start;
647
648 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
649
650 status = sre_match(&state, PatternObject_GetCode(self));
651
652 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
653 if (PyErr_Occurred()) {
654 state_fini(&state);
655 return NULL;
656 }
657
658 match = pattern_new_match(module_state, self, &state, status);
659 state_fini(&state);
660 return match;
661 }
662
663 /*[clinic input]
664 _sre.SRE_Pattern.fullmatch
665
666 cls: defining_class
667 /
668 string: object
669 pos: Py_ssize_t = 0
670 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
671
672 Matches against all of the string.
673 [clinic start generated code]*/
674
675 static PyObject *
_sre_SRE_Pattern_fullmatch_impl(PatternObject * self,PyTypeObject * cls,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)676 _sre_SRE_Pattern_fullmatch_impl(PatternObject *self, PyTypeObject *cls,
677 PyObject *string, Py_ssize_t pos,
678 Py_ssize_t endpos)
679 /*[clinic end generated code: output=625b75b027ef94da input=50981172ab0fcfdd]*/
680 {
681 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
682 SRE_STATE state;
683 Py_ssize_t status;
684 PyObject *match;
685
686 if (!state_init(&state, self, string, pos, endpos))
687 return NULL;
688
689 state.ptr = state.start;
690
691 TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr));
692
693 state.match_all = 1;
694 status = sre_match(&state, PatternObject_GetCode(self));
695
696 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
697 if (PyErr_Occurred()) {
698 state_fini(&state);
699 return NULL;
700 }
701
702 match = pattern_new_match(module_state, self, &state, status);
703 state_fini(&state);
704 return match;
705 }
706
707 /*[clinic input]
708 _sre.SRE_Pattern.search
709
710 cls: defining_class
711 /
712 string: object
713 pos: Py_ssize_t = 0
714 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
715
716 Scan through string looking for a match, and return a corresponding match object instance.
717
718 Return None if no position in the string matches.
719 [clinic start generated code]*/
720
721 static PyObject *
_sre_SRE_Pattern_search_impl(PatternObject * self,PyTypeObject * cls,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)722 _sre_SRE_Pattern_search_impl(PatternObject *self, PyTypeObject *cls,
723 PyObject *string, Py_ssize_t pos,
724 Py_ssize_t endpos)
725 /*[clinic end generated code: output=bd7f2d9d583e1463 input=afa9afb66a74a4b3]*/
726 {
727 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
728 SRE_STATE state;
729 Py_ssize_t status;
730 PyObject *match;
731
732 if (!state_init(&state, self, string, pos, endpos))
733 return NULL;
734
735 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
736
737 status = sre_search(&state, PatternObject_GetCode(self));
738
739 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
740
741 if (PyErr_Occurred()) {
742 state_fini(&state);
743 return NULL;
744 }
745
746 match = pattern_new_match(module_state, self, &state, status);
747 state_fini(&state);
748 return match;
749 }
750
751 static PyObject*
call(const char * module,const char * function,PyObject * args)752 call(const char* module, const char* function, PyObject* args)
753 {
754 PyObject* name;
755 PyObject* mod;
756 PyObject* func;
757 PyObject* result;
758
759 if (!args)
760 return NULL;
761 name = PyUnicode_FromString(module);
762 if (!name)
763 return NULL;
764 mod = PyImport_Import(name);
765 Py_DECREF(name);
766 if (!mod)
767 return NULL;
768 func = PyObject_GetAttrString(mod, function);
769 Py_DECREF(mod);
770 if (!func)
771 return NULL;
772 result = PyObject_CallObject(func, args);
773 Py_DECREF(func);
774 Py_DECREF(args);
775 return result;
776 }
777
778 /*[clinic input]
779 _sre.SRE_Pattern.findall
780
781 string: object
782 pos: Py_ssize_t = 0
783 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
784
785 Return a list of all non-overlapping matches of pattern in string.
786 [clinic start generated code]*/
787
788 static PyObject *
_sre_SRE_Pattern_findall_impl(PatternObject * self,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)789 _sre_SRE_Pattern_findall_impl(PatternObject *self, PyObject *string,
790 Py_ssize_t pos, Py_ssize_t endpos)
791 /*[clinic end generated code: output=f4966baceea60aca input=5b6a4ee799741563]*/
792 {
793 SRE_STATE state;
794 PyObject* list;
795 Py_ssize_t status;
796 Py_ssize_t i, b, e;
797
798 if (!state_init(&state, self, string, pos, endpos))
799 return NULL;
800
801 list = PyList_New(0);
802 if (!list) {
803 state_fini(&state);
804 return NULL;
805 }
806
807 while (state.start <= state.end) {
808
809 PyObject* item;
810
811 state_reset(&state);
812
813 state.ptr = state.start;
814
815 status = sre_search(&state, PatternObject_GetCode(self));
816 if (PyErr_Occurred())
817 goto error;
818
819 if (status <= 0) {
820 if (status == 0)
821 break;
822 pattern_error(status);
823 goto error;
824 }
825
826 /* don't bother to build a match object */
827 switch (self->groups) {
828 case 0:
829 b = STATE_OFFSET(&state, state.start);
830 e = STATE_OFFSET(&state, state.ptr);
831 item = getslice(state.isbytes, state.beginning,
832 string, b, e);
833 if (!item)
834 goto error;
835 break;
836 case 1:
837 item = state_getslice(&state, 1, string, 1);
838 if (!item)
839 goto error;
840 break;
841 default:
842 item = PyTuple_New(self->groups);
843 if (!item)
844 goto error;
845 for (i = 0; i < self->groups; i++) {
846 PyObject* o = state_getslice(&state, i+1, string, 1);
847 if (!o) {
848 Py_DECREF(item);
849 goto error;
850 }
851 PyTuple_SET_ITEM(item, i, o);
852 }
853 break;
854 }
855
856 status = PyList_Append(list, item);
857 Py_DECREF(item);
858 if (status < 0)
859 goto error;
860
861 state.must_advance = (state.ptr == state.start);
862 state.start = state.ptr;
863 }
864
865 state_fini(&state);
866 return list;
867
868 error:
869 Py_DECREF(list);
870 state_fini(&state);
871 return NULL;
872
873 }
874
875 /*[clinic input]
876 _sre.SRE_Pattern.finditer
877
878 cls: defining_class
879 /
880 string: object
881 pos: Py_ssize_t = 0
882 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
883
884 Return an iterator over all non-overlapping matches for the RE pattern in string.
885
886 For each match, the iterator returns a match object.
887 [clinic start generated code]*/
888
889 static PyObject *
_sre_SRE_Pattern_finditer_impl(PatternObject * self,PyTypeObject * cls,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)890 _sre_SRE_Pattern_finditer_impl(PatternObject *self, PyTypeObject *cls,
891 PyObject *string, Py_ssize_t pos,
892 Py_ssize_t endpos)
893 /*[clinic end generated code: output=1791dbf3618ade56 input=812e332a4848cbaf]*/
894 {
895 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
896 PyObject* scanner;
897 PyObject* search;
898 PyObject* iterator;
899
900 scanner = pattern_scanner(module_state, self, string, pos, endpos);
901 if (!scanner)
902 return NULL;
903
904 search = PyObject_GetAttrString(scanner, "search");
905 Py_DECREF(scanner);
906 if (!search)
907 return NULL;
908
909 iterator = PyCallIter_New(search, Py_None);
910 Py_DECREF(search);
911
912 return iterator;
913 }
914
915 /*[clinic input]
916 _sre.SRE_Pattern.scanner
917
918 cls: defining_class
919 /
920 string: object
921 pos: Py_ssize_t = 0
922 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
923
924 [clinic start generated code]*/
925
926 static PyObject *
_sre_SRE_Pattern_scanner_impl(PatternObject * self,PyTypeObject * cls,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)927 _sre_SRE_Pattern_scanner_impl(PatternObject *self, PyTypeObject *cls,
928 PyObject *string, Py_ssize_t pos,
929 Py_ssize_t endpos)
930 /*[clinic end generated code: output=f70cd506112f1bd9 input=2e487e5151bcee4c]*/
931 {
932 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
933
934 return pattern_scanner(module_state, self, string, pos, endpos);
935 }
936
937 /*[clinic input]
938 _sre.SRE_Pattern.split
939
940 string: object
941 maxsplit: Py_ssize_t = 0
942
943 Split string by the occurrences of pattern.
944 [clinic start generated code]*/
945
946 static PyObject *
_sre_SRE_Pattern_split_impl(PatternObject * self,PyObject * string,Py_ssize_t maxsplit)947 _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
948 Py_ssize_t maxsplit)
949 /*[clinic end generated code: output=7ac66f381c45e0be input=1eeeb10dafc9947a]*/
950 {
951 SRE_STATE state;
952 PyObject* list;
953 PyObject* item;
954 Py_ssize_t status;
955 Py_ssize_t n;
956 Py_ssize_t i;
957 const void* last;
958
959 assert(self->codesize != 0);
960
961 if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX))
962 return NULL;
963
964 list = PyList_New(0);
965 if (!list) {
966 state_fini(&state);
967 return NULL;
968 }
969
970 n = 0;
971 last = state.start;
972
973 while (!maxsplit || n < maxsplit) {
974
975 state_reset(&state);
976
977 state.ptr = state.start;
978
979 status = sre_search(&state, PatternObject_GetCode(self));
980 if (PyErr_Occurred())
981 goto error;
982
983 if (status <= 0) {
984 if (status == 0)
985 break;
986 pattern_error(status);
987 goto error;
988 }
989
990 /* get segment before this match */
991 item = getslice(state.isbytes, state.beginning,
992 string, STATE_OFFSET(&state, last),
993 STATE_OFFSET(&state, state.start)
994 );
995 if (!item)
996 goto error;
997 status = PyList_Append(list, item);
998 Py_DECREF(item);
999 if (status < 0)
1000 goto error;
1001
1002 /* add groups (if any) */
1003 for (i = 0; i < self->groups; i++) {
1004 item = state_getslice(&state, i+1, string, 0);
1005 if (!item)
1006 goto error;
1007 status = PyList_Append(list, item);
1008 Py_DECREF(item);
1009 if (status < 0)
1010 goto error;
1011 }
1012
1013 n = n + 1;
1014 state.must_advance = (state.ptr == state.start);
1015 last = state.start = state.ptr;
1016
1017 }
1018
1019 /* get segment following last match (even if empty) */
1020 item = getslice(state.isbytes, state.beginning,
1021 string, STATE_OFFSET(&state, last), state.endpos
1022 );
1023 if (!item)
1024 goto error;
1025 status = PyList_Append(list, item);
1026 Py_DECREF(item);
1027 if (status < 0)
1028 goto error;
1029
1030 state_fini(&state);
1031 return list;
1032
1033 error:
1034 Py_DECREF(list);
1035 state_fini(&state);
1036 return NULL;
1037
1038 }
1039
1040 static PyObject*
pattern_subx(_sremodulestate * module_state,PatternObject * self,PyObject * ptemplate,PyObject * string,Py_ssize_t count,Py_ssize_t subn)1041 pattern_subx(_sremodulestate* module_state,
1042 PatternObject* self,
1043 PyObject* ptemplate,
1044 PyObject* string,
1045 Py_ssize_t count,
1046 Py_ssize_t subn)
1047 {
1048 SRE_STATE state;
1049 PyObject* list;
1050 PyObject* joiner;
1051 PyObject* item;
1052 PyObject* filter;
1053 PyObject* match;
1054 const void* ptr;
1055 Py_ssize_t status;
1056 Py_ssize_t n;
1057 Py_ssize_t i, b, e;
1058 int isbytes, charsize;
1059 int filter_is_callable;
1060 Py_buffer view;
1061
1062 if (PyCallable_Check(ptemplate)) {
1063 /* sub/subn takes either a function or a template */
1064 filter = ptemplate;
1065 Py_INCREF(filter);
1066 filter_is_callable = 1;
1067 } else {
1068 /* if not callable, check if it's a literal string */
1069 int literal;
1070 view.buf = NULL;
1071 ptr = getstring(ptemplate, &n, &isbytes, &charsize, &view);
1072 if (ptr) {
1073 if (charsize == 1)
1074 literal = memchr(ptr, '\\', n) == NULL;
1075 else
1076 literal = PyUnicode_FindChar(ptemplate, '\\', 0, n, 1) == -1;
1077 } else {
1078 PyErr_Clear();
1079 literal = 0;
1080 }
1081 if (view.buf)
1082 PyBuffer_Release(&view);
1083 if (literal) {
1084 filter = ptemplate;
1085 Py_INCREF(filter);
1086 filter_is_callable = 0;
1087 } else {
1088 /* not a literal; hand it over to the template compiler */
1089 filter = call(
1090 SRE_PY_MODULE, "_subx",
1091 PyTuple_Pack(2, self, ptemplate)
1092 );
1093 if (!filter)
1094 return NULL;
1095 filter_is_callable = PyCallable_Check(filter);
1096 }
1097 }
1098
1099 if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX)) {
1100 Py_DECREF(filter);
1101 return NULL;
1102 }
1103
1104 list = PyList_New(0);
1105 if (!list) {
1106 Py_DECREF(filter);
1107 state_fini(&state);
1108 return NULL;
1109 }
1110
1111 n = i = 0;
1112
1113 while (!count || n < count) {
1114
1115 state_reset(&state);
1116
1117 state.ptr = state.start;
1118
1119 status = sre_search(&state, PatternObject_GetCode(self));
1120 if (PyErr_Occurred())
1121 goto error;
1122
1123 if (status <= 0) {
1124 if (status == 0)
1125 break;
1126 pattern_error(status);
1127 goto error;
1128 }
1129
1130 b = STATE_OFFSET(&state, state.start);
1131 e = STATE_OFFSET(&state, state.ptr);
1132
1133 if (i < b) {
1134 /* get segment before this match */
1135 item = getslice(state.isbytes, state.beginning,
1136 string, i, b);
1137 if (!item)
1138 goto error;
1139 status = PyList_Append(list, item);
1140 Py_DECREF(item);
1141 if (status < 0)
1142 goto error;
1143
1144 }
1145
1146 if (filter_is_callable) {
1147 /* pass match object through filter */
1148 match = pattern_new_match(module_state, self, &state, 1);
1149 if (!match)
1150 goto error;
1151 item = PyObject_CallOneArg(filter, match);
1152 Py_DECREF(match);
1153 if (!item)
1154 goto error;
1155 } else {
1156 /* filter is literal string */
1157 item = filter;
1158 Py_INCREF(item);
1159 }
1160
1161 /* add to list */
1162 if (item != Py_None) {
1163 status = PyList_Append(list, item);
1164 Py_DECREF(item);
1165 if (status < 0)
1166 goto error;
1167 }
1168
1169 i = e;
1170 n = n + 1;
1171 state.must_advance = (state.ptr == state.start);
1172 state.start = state.ptr;
1173 }
1174
1175 /* get segment following last match */
1176 if (i < state.endpos) {
1177 item = getslice(state.isbytes, state.beginning,
1178 string, i, state.endpos);
1179 if (!item)
1180 goto error;
1181 status = PyList_Append(list, item);
1182 Py_DECREF(item);
1183 if (status < 0)
1184 goto error;
1185 }
1186
1187 state_fini(&state);
1188
1189 Py_DECREF(filter);
1190
1191 /* convert list to single string (also removes list) */
1192 joiner = getslice(state.isbytes, state.beginning, string, 0, 0);
1193 if (!joiner) {
1194 Py_DECREF(list);
1195 return NULL;
1196 }
1197 if (PyList_GET_SIZE(list) == 0) {
1198 Py_DECREF(list);
1199 item = joiner;
1200 }
1201 else {
1202 if (state.isbytes)
1203 item = _PyBytes_Join(joiner, list);
1204 else
1205 item = PyUnicode_Join(joiner, list);
1206 Py_DECREF(joiner);
1207 Py_DECREF(list);
1208 if (!item)
1209 return NULL;
1210 }
1211
1212 if (subn)
1213 return Py_BuildValue("Nn", item, n);
1214
1215 return item;
1216
1217 error:
1218 Py_DECREF(list);
1219 state_fini(&state);
1220 Py_DECREF(filter);
1221 return NULL;
1222
1223 }
1224
1225 /*[clinic input]
1226 _sre.SRE_Pattern.sub
1227
1228 cls: defining_class
1229 /
1230 repl: object
1231 string: object
1232 count: Py_ssize_t = 0
1233
1234 Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl.
1235 [clinic start generated code]*/
1236
1237 static PyObject *
_sre_SRE_Pattern_sub_impl(PatternObject * self,PyTypeObject * cls,PyObject * repl,PyObject * string,Py_ssize_t count)1238 _sre_SRE_Pattern_sub_impl(PatternObject *self, PyTypeObject *cls,
1239 PyObject *repl, PyObject *string, Py_ssize_t count)
1240 /*[clinic end generated code: output=4be141ab04bca60d input=d8d1d4ac2311a07c]*/
1241 {
1242 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1243
1244 return pattern_subx(module_state, self, repl, string, count, 0);
1245 }
1246
1247 /*[clinic input]
1248 _sre.SRE_Pattern.subn
1249
1250 cls: defining_class
1251 /
1252 repl: object
1253 string: object
1254 count: Py_ssize_t = 0
1255
1256 Return the tuple (new_string, number_of_subs_made) found by replacing the leftmost non-overlapping occurrences of pattern with the replacement repl.
1257 [clinic start generated code]*/
1258
1259 static PyObject *
_sre_SRE_Pattern_subn_impl(PatternObject * self,PyTypeObject * cls,PyObject * repl,PyObject * string,Py_ssize_t count)1260 _sre_SRE_Pattern_subn_impl(PatternObject *self, PyTypeObject *cls,
1261 PyObject *repl, PyObject *string,
1262 Py_ssize_t count)
1263 /*[clinic end generated code: output=da02fd85258b1e1f input=8b78a65b8302e58d]*/
1264 {
1265 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1266
1267 return pattern_subx(module_state, self, repl, string, count, 1);
1268 }
1269
1270 /*[clinic input]
1271 _sre.SRE_Pattern.__copy__
1272
1273 [clinic start generated code]*/
1274
1275 static PyObject *
_sre_SRE_Pattern___copy___impl(PatternObject * self)1276 _sre_SRE_Pattern___copy___impl(PatternObject *self)
1277 /*[clinic end generated code: output=85dedc2db1bd8694 input=a730a59d863bc9f5]*/
1278 {
1279 Py_INCREF(self);
1280 return (PyObject *)self;
1281 }
1282
1283 /*[clinic input]
1284 _sre.SRE_Pattern.__deepcopy__
1285
1286 memo: object
1287 /
1288
1289 [clinic start generated code]*/
1290
1291 static PyObject *
_sre_SRE_Pattern___deepcopy__(PatternObject * self,PyObject * memo)1292 _sre_SRE_Pattern___deepcopy__(PatternObject *self, PyObject *memo)
1293 /*[clinic end generated code: output=2ad25679c1f1204a input=a465b1602f997bed]*/
1294 {
1295 Py_INCREF(self);
1296 return (PyObject *)self;
1297 }
1298
1299 static PyObject *
pattern_repr(PatternObject * obj)1300 pattern_repr(PatternObject *obj)
1301 {
1302 static const struct {
1303 const char *name;
1304 int value;
1305 } flag_names[] = {
1306 {"re.TEMPLATE", SRE_FLAG_TEMPLATE},
1307 {"re.IGNORECASE", SRE_FLAG_IGNORECASE},
1308 {"re.LOCALE", SRE_FLAG_LOCALE},
1309 {"re.MULTILINE", SRE_FLAG_MULTILINE},
1310 {"re.DOTALL", SRE_FLAG_DOTALL},
1311 {"re.UNICODE", SRE_FLAG_UNICODE},
1312 {"re.VERBOSE", SRE_FLAG_VERBOSE},
1313 {"re.DEBUG", SRE_FLAG_DEBUG},
1314 {"re.ASCII", SRE_FLAG_ASCII},
1315 };
1316 PyObject *result = NULL;
1317 PyObject *flag_items;
1318 size_t i;
1319 int flags = obj->flags;
1320
1321 /* Omit re.UNICODE for valid string patterns. */
1322 if (obj->isbytes == 0 &&
1323 (flags & (SRE_FLAG_LOCALE|SRE_FLAG_UNICODE|SRE_FLAG_ASCII)) ==
1324 SRE_FLAG_UNICODE)
1325 flags &= ~SRE_FLAG_UNICODE;
1326
1327 flag_items = PyList_New(0);
1328 if (!flag_items)
1329 return NULL;
1330
1331 for (i = 0; i < Py_ARRAY_LENGTH(flag_names); i++) {
1332 if (flags & flag_names[i].value) {
1333 PyObject *item = PyUnicode_FromString(flag_names[i].name);
1334 if (!item)
1335 goto done;
1336
1337 if (PyList_Append(flag_items, item) < 0) {
1338 Py_DECREF(item);
1339 goto done;
1340 }
1341 Py_DECREF(item);
1342 flags &= ~flag_names[i].value;
1343 }
1344 }
1345 if (flags) {
1346 PyObject *item = PyUnicode_FromFormat("0x%x", flags);
1347 if (!item)
1348 goto done;
1349
1350 if (PyList_Append(flag_items, item) < 0) {
1351 Py_DECREF(item);
1352 goto done;
1353 }
1354 Py_DECREF(item);
1355 }
1356
1357 if (PyList_Size(flag_items) > 0) {
1358 PyObject *flags_result;
1359 PyObject *sep = PyUnicode_FromString("|");
1360 if (!sep)
1361 goto done;
1362 flags_result = PyUnicode_Join(sep, flag_items);
1363 Py_DECREF(sep);
1364 if (!flags_result)
1365 goto done;
1366 result = PyUnicode_FromFormat("re.compile(%.200R, %S)",
1367 obj->pattern, flags_result);
1368 Py_DECREF(flags_result);
1369 }
1370 else {
1371 result = PyUnicode_FromFormat("re.compile(%.200R)", obj->pattern);
1372 }
1373
1374 done:
1375 Py_DECREF(flag_items);
1376 return result;
1377 }
1378
/* Docstring text describing compiled pattern objects. */
PyDoc_STRVAR(pattern_doc, "Compiled regular expression object.");
1380
1381 /* PatternObject's 'groupindex' method. */
1382 static PyObject *
pattern_groupindex(PatternObject * self,void * Py_UNUSED (ignored))1383 pattern_groupindex(PatternObject *self, void *Py_UNUSED(ignored))
1384 {
1385 if (self->groupindex == NULL)
1386 return PyDict_New();
1387 return PyDictProxy_New(self->groupindex);
1388 }
1389
1390 static int _validate(PatternObject *self); /* Forward */
1391
1392 /*[clinic input]
1393 _sre.compile
1394
1395 pattern: object
1396 flags: int
1397 code: object(subclass_of='&PyList_Type')
1398 groups: Py_ssize_t
1399 groupindex: object(subclass_of='&PyDict_Type')
1400 indexgroup: object(subclass_of='&PyTuple_Type')
1401
1402 [clinic start generated code]*/
1403
1404 static PyObject *
_sre_compile_impl(PyObject * module,PyObject * pattern,int flags,PyObject * code,Py_ssize_t groups,PyObject * groupindex,PyObject * indexgroup)1405 _sre_compile_impl(PyObject *module, PyObject *pattern, int flags,
1406 PyObject *code, Py_ssize_t groups, PyObject *groupindex,
1407 PyObject *indexgroup)
1408 /*[clinic end generated code: output=ef9c2b3693776404 input=0a68476dbbe5db30]*/
1409 {
1410 /* "compile" pattern descriptor to pattern object */
1411
1412 _sremodulestate *module_state = get_sre_module_state(module);
1413 PatternObject* self;
1414 Py_ssize_t i, n;
1415
1416 n = PyList_GET_SIZE(code);
1417 /* coverity[ampersand_in_size] */
1418 self = PyObject_GC_NewVar(PatternObject, module_state->Pattern_Type, n);
1419 if (!self)
1420 return NULL;
1421 self->weakreflist = NULL;
1422 self->pattern = NULL;
1423 self->groupindex = NULL;
1424 self->indexgroup = NULL;
1425
1426 self->codesize = n;
1427
1428 for (i = 0; i < n; i++) {
1429 PyObject *o = PyList_GET_ITEM(code, i);
1430 unsigned long value = PyLong_AsUnsignedLong(o);
1431 self->code[i] = (SRE_CODE) value;
1432 if ((unsigned long) self->code[i] != value) {
1433 PyErr_SetString(PyExc_OverflowError,
1434 "regular expression code size limit exceeded");
1435 break;
1436 }
1437 }
1438 PyObject_GC_Track(self);
1439
1440 if (PyErr_Occurred()) {
1441 Py_DECREF(self);
1442 return NULL;
1443 }
1444
1445 if (pattern == Py_None) {
1446 self->isbytes = -1;
1447 }
1448 else {
1449 Py_ssize_t p_length;
1450 int charsize;
1451 Py_buffer view;
1452 view.buf = NULL;
1453 if (!getstring(pattern, &p_length, &self->isbytes,
1454 &charsize, &view)) {
1455 Py_DECREF(self);
1456 return NULL;
1457 }
1458 if (view.buf)
1459 PyBuffer_Release(&view);
1460 }
1461
1462 Py_INCREF(pattern);
1463 self->pattern = pattern;
1464
1465 self->flags = flags;
1466
1467 self->groups = groups;
1468
1469 if (PyDict_GET_SIZE(groupindex) > 0) {
1470 Py_INCREF(groupindex);
1471 self->groupindex = groupindex;
1472 if (PyTuple_GET_SIZE(indexgroup) > 0) {
1473 Py_INCREF(indexgroup);
1474 self->indexgroup = indexgroup;
1475 }
1476 }
1477
1478 if (!_validate(self)) {
1479 Py_DECREF(self);
1480 return NULL;
1481 }
1482
1483 return (PyObject*) self;
1484 }
1485
1486 /* -------------------------------------------------------------------- */
1487 /* Code validation */
1488
/* To learn more about this code, have a look at the _compile() function in
   Lib/sre_compile.py.  The validation functions below check the code array
   for conformance with the code patterns generated there.

   The nice thing about the generated code is that it is position-independent:
   all jumps are relative jumps forward.  Also, jumps don't cross each other:
   the target of a later jump is always earlier than the target of an earlier
   jump.  IOW, this is okay:

   J---------J-------T--------T
    \         \_____/        /
     \______________________/

   but this is not:

   J---------J-------T--------T
    \_________\_____/        /
               \____________/

   It also helps that SRE_CODE is always an unsigned type.
*/
1510
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator; the argument is a parenthesized printf
   argument list, e.g. VTRACE(("x=%d\n", x)) */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v) do {} while(0)  /* do nothing */
#endif

/* Report failure: makes the enclosing function return 0 */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)

/* Extract opcode, argument, or skip count from code array.
   These macros rely on locals named `code` and `end` in the enclosing
   function and store into `op`, `arg` or `skip` respectively.  Each one
   FAILs when `code` has already reached `end`. */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", code));                         \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)
/* Read a skip count and FAIL if (skip - adj) is larger than the number of
   code words left from the skip word to `end`.  `adj` is parenthesized so
   that compound expressions are subtracted as a whole. */
#define GET_SKIP_ADJ(adj)                               \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        skip = *code;                                   \
        VTRACE(("%lu (skip to %p)\n",                   \
               (unsigned long)skip, code+skip));        \
        if (skip-(adj) > (uintptr_t)(end - code))       \
            FAIL;                                       \
        code++;                                         \
    } while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
1551
1552 static int
_validate_charset(SRE_CODE * code,SRE_CODE * end)1553 _validate_charset(SRE_CODE *code, SRE_CODE *end)
1554 {
1555 /* Some variables are manipulated by the macros above */
1556 SRE_CODE op;
1557 SRE_CODE arg;
1558 SRE_CODE offset;
1559 int i;
1560
1561 while (code < end) {
1562 GET_OP;
1563 switch (op) {
1564
1565 case SRE_OP_NEGATE:
1566 break;
1567
1568 case SRE_OP_LITERAL:
1569 GET_ARG;
1570 break;
1571
1572 case SRE_OP_RANGE:
1573 case SRE_OP_RANGE_UNI_IGNORE:
1574 GET_ARG;
1575 GET_ARG;
1576 break;
1577
1578 case SRE_OP_CHARSET:
1579 offset = 256/SRE_CODE_BITS; /* 256-bit bitmap */
1580 if (offset > (uintptr_t)(end - code))
1581 FAIL;
1582 code += offset;
1583 break;
1584
1585 case SRE_OP_BIGCHARSET:
1586 GET_ARG; /* Number of blocks */
1587 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
1588 if (offset > (uintptr_t)(end - code))
1589 FAIL;
1590 /* Make sure that each byte points to a valid block */
1591 for (i = 0; i < 256; i++) {
1592 if (((unsigned char *)code)[i] >= arg)
1593 FAIL;
1594 }
1595 code += offset;
1596 offset = arg * (256/SRE_CODE_BITS); /* 256-bit bitmap times arg */
1597 if (offset > (uintptr_t)(end - code))
1598 FAIL;
1599 code += offset;
1600 break;
1601
1602 case SRE_OP_CATEGORY:
1603 GET_ARG;
1604 switch (arg) {
1605 case SRE_CATEGORY_DIGIT:
1606 case SRE_CATEGORY_NOT_DIGIT:
1607 case SRE_CATEGORY_SPACE:
1608 case SRE_CATEGORY_NOT_SPACE:
1609 case SRE_CATEGORY_WORD:
1610 case SRE_CATEGORY_NOT_WORD:
1611 case SRE_CATEGORY_LINEBREAK:
1612 case SRE_CATEGORY_NOT_LINEBREAK:
1613 case SRE_CATEGORY_LOC_WORD:
1614 case SRE_CATEGORY_LOC_NOT_WORD:
1615 case SRE_CATEGORY_UNI_DIGIT:
1616 case SRE_CATEGORY_UNI_NOT_DIGIT:
1617 case SRE_CATEGORY_UNI_SPACE:
1618 case SRE_CATEGORY_UNI_NOT_SPACE:
1619 case SRE_CATEGORY_UNI_WORD:
1620 case SRE_CATEGORY_UNI_NOT_WORD:
1621 case SRE_CATEGORY_UNI_LINEBREAK:
1622 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
1623 break;
1624 default:
1625 FAIL;
1626 }
1627 break;
1628
1629 default:
1630 FAIL;
1631
1632 }
1633 }
1634
1635 return 1;
1636 }
1637
/* Validate the opcode sequence in [code, end).
 *
 * Walks the range one opcode at a time, checking each opcode's operands
 * and recursing into the sub-ranges delimited by skip counts.  `groups`
 * bounds the arguments of MARK and the GROUPREF family.
 *
 * Returns 1 if the whole range is acceptable, 0 (through FAIL) otherwise.
 */
static int
_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
{
    /* Some variables are manipulated by the macros above */
    SRE_CODE op;
    SRE_CODE arg;
    SRE_CODE skip;

    VTRACE(("code=%p, end=%p\n", code, end));

    /* callers compute `end` as code+skip-K, which can land before `code` */
    if (code > end)
        FAIL;

    while (code < end) {
        GET_OP;
        switch (op) {

        case SRE_OP_MARK:
            /* We don't check whether marks are properly nested; the
               sre_match() code is robust even if they don't, and the worst
               you can get is nonsensical match results. */
            GET_ARG;
            if (arg > 2 * (size_t)groups + 1) {
                VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
                FAIL;
            }
            break;

        case SRE_OP_LITERAL:
        case SRE_OP_NOT_LITERAL:
        case SRE_OP_LITERAL_IGNORE:
        case SRE_OP_NOT_LITERAL_IGNORE:
        case SRE_OP_LITERAL_UNI_IGNORE:
        case SRE_OP_NOT_LITERAL_UNI_IGNORE:
        case SRE_OP_LITERAL_LOC_IGNORE:
        case SRE_OP_NOT_LITERAL_LOC_IGNORE:
            GET_ARG;
            /* The arg is just a character, nothing to check */
            break;

        case SRE_OP_SUCCESS:
        case SRE_OP_FAILURE:
            /* Nothing to check; these normally end the matching process */
            break;

        case SRE_OP_AT:
            /* the operand must be one of the known position codes */
            GET_ARG;
            switch (arg) {
            case SRE_AT_BEGINNING:
            case SRE_AT_BEGINNING_STRING:
            case SRE_AT_BEGINNING_LINE:
            case SRE_AT_END:
            case SRE_AT_END_LINE:
            case SRE_AT_END_STRING:
            case SRE_AT_BOUNDARY:
            case SRE_AT_NON_BOUNDARY:
            case SRE_AT_LOC_BOUNDARY:
            case SRE_AT_LOC_NON_BOUNDARY:
            case SRE_AT_UNI_BOUNDARY:
            case SRE_AT_UNI_NON_BOUNDARY:
                break;
            default:
                FAIL;
            }
            break;

        case SRE_OP_ANY:
        case SRE_OP_ANY_ALL:
            /* These have no operands */
            break;

        case SRE_OP_IN:
        case SRE_OP_IN_IGNORE:
        case SRE_OP_IN_UNI_IGNORE:
        case SRE_OP_IN_LOC_IGNORE:
            /* layout: <op> <skip> ...charset items... FAILURE */
            GET_SKIP;
            /* Stop 1 before the end; we check the FAILURE below */
            if (!_validate_charset(code, code+skip-2))
                FAIL;
            if (code[skip-2] != SRE_OP_FAILURE)
                FAIL;
            code += skip-1;
            break;

        case SRE_OP_INFO:
            {
                /* A minimal info field is
                   <INFO> <1=skip> <2=flags> <3=min> <4=max>;
                   If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
                   more follows. */
                SRE_CODE flags, i;
                SRE_CODE *newcode;
                GET_SKIP;
                /* newcode: first word after the whole info block */
                newcode = code+skip-1;
                GET_ARG; flags = arg;
                GET_ARG;  /* min: read but not checked */
                GET_ARG;  /* max: read but not checked */
                /* Check that only valid flags are present */
                if ((flags & ~(SRE_INFO_PREFIX |
                               SRE_INFO_LITERAL |
                               SRE_INFO_CHARSET)) != 0)
                    FAIL;
                /* PREFIX and CHARSET are mutually exclusive */
                if ((flags & SRE_INFO_PREFIX) &&
                    (flags & SRE_INFO_CHARSET))
                    FAIL;
                /* LITERAL implies PREFIX */
                if ((flags & SRE_INFO_LITERAL) &&
                    !(flags & SRE_INFO_PREFIX))
                    FAIL;
                /* Validate the prefix */
                if (flags & SRE_INFO_PREFIX) {
                    SRE_CODE prefix_len;
                    GET_ARG; prefix_len = arg;
                    GET_ARG;  /* second prefix word: read but not checked */
                    /* Here comes the prefix string */
                    if (prefix_len > (uintptr_t)(newcode - code))
                        FAIL;
                    code += prefix_len;
                    /* And here comes the overlap table */
                    if (prefix_len > (uintptr_t)(newcode - code))
                        FAIL;
                    /* Each overlap value should be < prefix_len */
                    for (i = 0; i < prefix_len; i++) {
                        if (code[i] >= prefix_len)
                            FAIL;
                    }
                    code += prefix_len;
                }
                /* Validate the charset */
                if (flags & SRE_INFO_CHARSET) {
                    if (!_validate_charset(code, newcode-1))
                        FAIL;
                    if (newcode[-1] != SRE_OP_FAILURE)
                        FAIL;
                    code = newcode;
                }
                else if (code != newcode) {
                    /* without a charset the block must end exactly here */
                    VTRACE(("code=%p, newcode=%p\n", code, newcode));
                    FAIL;
                }
            }
            break;

        case SRE_OP_BRANCH:
            {
                /* target: where the JUMP of the first alternative lands;
                   every later alternative has to jump to the same place */
                SRE_CODE *target = NULL;
                for (;;) {
                    GET_SKIP;
                    /* a zero skip ends the list of alternatives */
                    if (skip == 0)
                        break;
                    /* Stop 2 before the end; we check the JUMP below */
                    if (!_validate_inner(code, code+skip-3, groups))
                        FAIL;
                    code += skip-3;
                    /* Check that it ends with a JUMP, and that each JUMP
                       has the same target */
                    GET_OP;
                    if (op != SRE_OP_JUMP)
                        FAIL;
                    GET_SKIP;
                    if (target == NULL)
                        target = code+skip-1;
                    else if (code+skip-1 != target)
                        FAIL;
                }
            }
            break;

        case SRE_OP_REPEAT_ONE:
        case SRE_OP_MIN_REPEAT_ONE:
            {
                /* layout: <op> <skip> <min> <max> ...body... SUCCESS */
                SRE_CODE min, max;
                GET_SKIP;
                GET_ARG; min = arg;
                GET_ARG; max = arg;
                if (min > max)
                    FAIL;
                if (max > SRE_MAXREPEAT)
                    FAIL;
                if (!_validate_inner(code, code+skip-4, groups))
                    FAIL;
                code += skip-4;
                GET_OP;
                if (op != SRE_OP_SUCCESS)
                    FAIL;
            }
            break;

        case SRE_OP_REPEAT:
            {
                /* layout: <op> <skip> <min> <max> ...body... followed by
                   MAX_UNTIL or MIN_UNTIL */
                SRE_CODE min, max;
                GET_SKIP;
                GET_ARG; min = arg;
                GET_ARG; max = arg;
                if (min > max)
                    FAIL;
                if (max > SRE_MAXREPEAT)
                    FAIL;
                if (!_validate_inner(code, code+skip-3, groups))
                    FAIL;
                code += skip-3;
                GET_OP;
                if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
                    FAIL;
            }
            break;

        case SRE_OP_GROUPREF:
        case SRE_OP_GROUPREF_IGNORE:
        case SRE_OP_GROUPREF_UNI_IGNORE:
        case SRE_OP_GROUPREF_LOC_IGNORE:
            /* the referenced group number must be below `groups` */
            GET_ARG;
            if (arg >= (size_t)groups)
                FAIL;
            break;

        case SRE_OP_GROUPREF_EXISTS:
            /* The regex syntax for this is: '(?(group)then|else)', where
               'group' is either an integer group number or a group name,
               'then' and 'else' are sub-regexes, and 'else' is optional. */
            GET_ARG;
            if (arg >= (size_t)groups)
                FAIL;
            GET_SKIP_ADJ(1);
            code--; /* The skip is relative to the first arg! */
            /* There are two possibilities here: if there is both a 'then'
               part and an 'else' part, the generated code looks like:

               GROUPREF_EXISTS
               <group>
               <skipyes>
               ...then part...
               JUMP
               <skipno>
               (<skipyes> jumps here)
               ...else part...
               (<skipno> jumps here)

               If there is only a 'then' part, it looks like:

               GROUPREF_EXISTS
               <group>
               <skip>
               ...then part...
               (<skip> jumps here)

               There is no direct way to decide which it is, and we don't want
               to allow arbitrary jumps anywhere in the code; so we just look
               for a JUMP opcode preceding our skip target.
            */
            if (skip >= 3 && skip-3 < (uintptr_t)(end - code) &&
                code[skip-3] == SRE_OP_JUMP)
            {
                VTRACE(("both then and else parts present\n"));
                if (!_validate_inner(code+1, code+skip-3, groups))
                    FAIL;
                code += skip-2; /* Position after JUMP, at <skipno> */
                GET_SKIP;
                if (!_validate_inner(code, code+skip-1, groups))
                    FAIL;
                code += skip-1;
            }
            else {
                VTRACE(("only a then part present\n"));
                if (!_validate_inner(code+1, code+skip-1, groups))
                    FAIL;
                code += skip-1;
            }
            break;

        case SRE_OP_ASSERT:
        case SRE_OP_ASSERT_NOT:
            /* layout: <op> <skip> <width> ...body... SUCCESS */
            GET_SKIP;
            GET_ARG; /* 0 for lookahead, width for lookbehind */
            code--; /* Back up over arg to simplify math below */
            if (arg & 0x80000000)
                FAIL; /* Width too large */
            /* Stop 1 before the end; we check the SUCCESS below */
            if (!_validate_inner(code+1, code+skip-2, groups))
                FAIL;
            code += skip-2;
            GET_OP;
            if (op != SRE_OP_SUCCESS)
                FAIL;
            break;

        default:
            /* unknown opcode */
            FAIL;

        }
    }

    VTRACE(("okay\n"));
    return 1;
}
1934
1935 static int
_validate_outer(SRE_CODE * code,SRE_CODE * end,Py_ssize_t groups)1936 _validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1937 {
1938 if (groups < 0 || (size_t)groups > SRE_MAXGROUPS ||
1939 code >= end || end[-1] != SRE_OP_SUCCESS)
1940 FAIL;
1941 return _validate_inner(code, end-1, groups);
1942 }
1943
1944 static int
_validate(PatternObject * self)1945 _validate(PatternObject *self)
1946 {
1947 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
1948 {
1949 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
1950 return 0;
1951 }
1952 else
1953 VTRACE(("Success!\n"));
1954 return 1;
1955 }
1956
1957 /* -------------------------------------------------------------------- */
1958 /* match methods */
1959
1960 static int
match_traverse(MatchObject * self,visitproc visit,void * arg)1961 match_traverse(MatchObject *self, visitproc visit, void *arg)
1962 {
1963 Py_VISIT(Py_TYPE(self));
1964 Py_VISIT(self->string);
1965 Py_VISIT(self->regs);
1966 Py_VISIT(self->pattern);
1967 return 0;
1968 }
1969
1970 static int
match_clear(MatchObject * self)1971 match_clear(MatchObject *self)
1972 {
1973 Py_CLEAR(self->string);
1974 Py_CLEAR(self->regs);
1975 Py_CLEAR(self->pattern);
1976 return 0;
1977 }
1978
1979 static void
match_dealloc(MatchObject * self)1980 match_dealloc(MatchObject* self)
1981 {
1982 PyTypeObject *tp = Py_TYPE(self);
1983
1984 PyObject_GC_UnTrack(self);
1985 (void)match_clear(self);
1986 tp->tp_free(self);
1987 Py_DECREF(tp);
1988 }
1989
1990 static PyObject*
match_getslice_by_index(MatchObject * self,Py_ssize_t index,PyObject * def)1991 match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
1992 {
1993 Py_ssize_t length;
1994 int isbytes, charsize;
1995 Py_buffer view;
1996 PyObject *result;
1997 const void* ptr;
1998 Py_ssize_t i, j;
1999
2000 assert(0 <= index && index < self->groups);
2001 index *= 2;
2002
2003 if (self->string == Py_None || self->mark[index] < 0) {
2004 /* return default value if the string or group is undefined */
2005 Py_INCREF(def);
2006 return def;
2007 }
2008
2009 ptr = getstring(self->string, &length, &isbytes, &charsize, &view);
2010 if (ptr == NULL)
2011 return NULL;
2012
2013 i = self->mark[index];
2014 j = self->mark[index+1];
2015 i = Py_MIN(i, length);
2016 j = Py_MIN(j, length);
2017 result = getslice(isbytes, ptr, self->string, i, j);
2018 if (isbytes && view.buf != NULL)
2019 PyBuffer_Release(&view);
2020 return result;
2021 }
2022
2023 static Py_ssize_t
match_getindex(MatchObject * self,PyObject * index)2024 match_getindex(MatchObject* self, PyObject* index)
2025 {
2026 Py_ssize_t i;
2027
2028 if (index == NULL)
2029 /* Default value */
2030 return 0;
2031
2032 if (PyIndex_Check(index)) {
2033 i = PyNumber_AsSsize_t(index, NULL);
2034 }
2035 else {
2036 i = -1;
2037
2038 if (self->pattern->groupindex) {
2039 index = PyDict_GetItemWithError(self->pattern->groupindex, index);
2040 if (index && PyLong_Check(index)) {
2041 i = PyLong_AsSsize_t(index);
2042 }
2043 }
2044 }
2045 if (i < 0 || i >= self->groups) {
2046 /* raise IndexError if we were given a bad group number */
2047 if (!PyErr_Occurred()) {
2048 PyErr_SetString(PyExc_IndexError, "no such group");
2049 }
2050 return -1;
2051 }
2052
2053 return i;
2054 }
2055
2056 static PyObject*
match_getslice(MatchObject * self,PyObject * index,PyObject * def)2057 match_getslice(MatchObject* self, PyObject* index, PyObject* def)
2058 {
2059 Py_ssize_t i = match_getindex(self, index);
2060
2061 if (i < 0) {
2062 return NULL;
2063 }
2064
2065 return match_getslice_by_index(self, i, def);
2066 }
2067
2068 /*[clinic input]
2069 _sre.SRE_Match.expand
2070
2071 template: object
2072
2073 Return the string obtained by doing backslash substitution on the string template, as done by the sub() method.
2074 [clinic start generated code]*/
2075
2076 static PyObject *
_sre_SRE_Match_expand_impl(MatchObject * self,PyObject * template)2077 _sre_SRE_Match_expand_impl(MatchObject *self, PyObject *template)
2078 /*[clinic end generated code: output=931b58ccc323c3a1 input=4bfdb22c2f8b146a]*/
2079 {
2080 /* delegate to Python code */
2081 return call(
2082 SRE_PY_MODULE, "_expand",
2083 PyTuple_Pack(3, self->pattern, self, template)
2084 );
2085 }
2086
2087 static PyObject*
match_group(MatchObject * self,PyObject * args)2088 match_group(MatchObject* self, PyObject* args)
2089 {
2090 PyObject* result;
2091 Py_ssize_t i, size;
2092
2093 size = PyTuple_GET_SIZE(args);
2094
2095 switch (size) {
2096 case 0:
2097 result = match_getslice(self, _PyLong_GetZero(), Py_None);
2098 break;
2099 case 1:
2100 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2101 break;
2102 default:
2103 /* fetch multiple items */
2104 result = PyTuple_New(size);
2105 if (!result)
2106 return NULL;
2107 for (i = 0; i < size; i++) {
2108 PyObject* item = match_getslice(
2109 self, PyTuple_GET_ITEM(args, i), Py_None
2110 );
2111 if (!item) {
2112 Py_DECREF(result);
2113 return NULL;
2114 }
2115 PyTuple_SET_ITEM(result, i, item);
2116 }
2117 break;
2118 }
2119 return result;
2120 }
2121
2122 static PyObject*
match_getitem(MatchObject * self,PyObject * name)2123 match_getitem(MatchObject* self, PyObject* name)
2124 {
2125 return match_getslice(self, name, Py_None);
2126 }
2127
2128 /*[clinic input]
2129 _sre.SRE_Match.groups
2130
2131 default: object = None
2132 Is used for groups that did not participate in the match.
2133
2134 Return a tuple containing all the subgroups of the match, from 1.
2135 [clinic start generated code]*/
2136
2137 static PyObject *
_sre_SRE_Match_groups_impl(MatchObject * self,PyObject * default_value)2138 _sre_SRE_Match_groups_impl(MatchObject *self, PyObject *default_value)
2139 /*[clinic end generated code: output=daf8e2641537238a input=bb069ef55dabca91]*/
2140 {
2141 PyObject* result;
2142 Py_ssize_t index;
2143
2144 result = PyTuple_New(self->groups-1);
2145 if (!result)
2146 return NULL;
2147
2148 for (index = 1; index < self->groups; index++) {
2149 PyObject* item;
2150 item = match_getslice_by_index(self, index, default_value);
2151 if (!item) {
2152 Py_DECREF(result);
2153 return NULL;
2154 }
2155 PyTuple_SET_ITEM(result, index-1, item);
2156 }
2157
2158 return result;
2159 }
2160
2161 /*[clinic input]
2162 _sre.SRE_Match.groupdict
2163
2164 default: object = None
2165 Is used for groups that did not participate in the match.
2166
2167 Return a dictionary containing all the named subgroups of the match, keyed by the subgroup name.
2168 [clinic start generated code]*/
2169
2170 static PyObject *
_sre_SRE_Match_groupdict_impl(MatchObject * self,PyObject * default_value)2171 _sre_SRE_Match_groupdict_impl(MatchObject *self, PyObject *default_value)
2172 /*[clinic end generated code: output=29917c9073e41757 input=0ded7960b23780aa]*/
2173 {
2174 PyObject *result;
2175 PyObject *key;
2176 PyObject *value;
2177 Py_ssize_t pos = 0;
2178 Py_hash_t hash;
2179
2180 result = PyDict_New();
2181 if (!result || !self->pattern->groupindex)
2182 return result;
2183
2184 while (_PyDict_Next(self->pattern->groupindex, &pos, &key, &value, &hash)) {
2185 int status;
2186 Py_INCREF(key);
2187 value = match_getslice(self, key, default_value);
2188 if (!value) {
2189 Py_DECREF(key);
2190 goto failed;
2191 }
2192 status = _PyDict_SetItem_KnownHash(result, key, value, hash);
2193 Py_DECREF(value);
2194 Py_DECREF(key);
2195 if (status < 0)
2196 goto failed;
2197 }
2198
2199 return result;
2200
2201 failed:
2202 Py_DECREF(result);
2203 return NULL;
2204 }
2205
2206 /*[clinic input]
2207 _sre.SRE_Match.start -> Py_ssize_t
2208
2209 group: object(c_default="NULL") = 0
2210 /
2211
2212 Return index of the start of the substring matched by group.
2213 [clinic start generated code]*/
2214
2215 static Py_ssize_t
_sre_SRE_Match_start_impl(MatchObject * self,PyObject * group)2216 _sre_SRE_Match_start_impl(MatchObject *self, PyObject *group)
2217 /*[clinic end generated code: output=3f6e7f9df2fb5201 input=ced8e4ed4b33ee6c]*/
2218 {
2219 Py_ssize_t index = match_getindex(self, group);
2220
2221 if (index < 0) {
2222 return -1;
2223 }
2224
2225 /* mark is -1 if group is undefined */
2226 return self->mark[index*2];
2227 }
2228
2229 /*[clinic input]
2230 _sre.SRE_Match.end -> Py_ssize_t
2231
2232 group: object(c_default="NULL") = 0
2233 /
2234
2235 Return index of the end of the substring matched by group.
2236 [clinic start generated code]*/
2237
2238 static Py_ssize_t
_sre_SRE_Match_end_impl(MatchObject * self,PyObject * group)2239 _sre_SRE_Match_end_impl(MatchObject *self, PyObject *group)
2240 /*[clinic end generated code: output=f4240b09911f7692 input=1b799560c7f3d7e6]*/
2241 {
2242 Py_ssize_t index = match_getindex(self, group);
2243
2244 if (index < 0) {
2245 return -1;
2246 }
2247
2248 /* mark is -1 if group is undefined */
2249 return self->mark[index*2+1];
2250 }
2251
2252 LOCAL(PyObject*)
_pair(Py_ssize_t i1,Py_ssize_t i2)2253 _pair(Py_ssize_t i1, Py_ssize_t i2)
2254 {
2255 PyObject* pair;
2256 PyObject* item;
2257
2258 pair = PyTuple_New(2);
2259 if (!pair)
2260 return NULL;
2261
2262 item = PyLong_FromSsize_t(i1);
2263 if (!item)
2264 goto error;
2265 PyTuple_SET_ITEM(pair, 0, item);
2266
2267 item = PyLong_FromSsize_t(i2);
2268 if (!item)
2269 goto error;
2270 PyTuple_SET_ITEM(pair, 1, item);
2271
2272 return pair;
2273
2274 error:
2275 Py_DECREF(pair);
2276 return NULL;
2277 }
2278
2279 /*[clinic input]
2280 _sre.SRE_Match.span
2281
2282 group: object(c_default="NULL") = 0
2283 /
2284
2285 For match object m, return the 2-tuple (m.start(group), m.end(group)).
2286 [clinic start generated code]*/
2287
2288 static PyObject *
_sre_SRE_Match_span_impl(MatchObject * self,PyObject * group)2289 _sre_SRE_Match_span_impl(MatchObject *self, PyObject *group)
2290 /*[clinic end generated code: output=f02ae40594d14fe6 input=8fa6014e982d71d4]*/
2291 {
2292 Py_ssize_t index = match_getindex(self, group);
2293
2294 if (index < 0) {
2295 return NULL;
2296 }
2297
2298 /* marks are -1 if group is undefined */
2299 return _pair(self->mark[index*2], self->mark[index*2+1]);
2300 }
2301
2302 static PyObject*
match_regs(MatchObject * self)2303 match_regs(MatchObject* self)
2304 {
2305 PyObject* regs;
2306 PyObject* item;
2307 Py_ssize_t index;
2308
2309 regs = PyTuple_New(self->groups);
2310 if (!regs)
2311 return NULL;
2312
2313 for (index = 0; index < self->groups; index++) {
2314 item = _pair(self->mark[index*2], self->mark[index*2+1]);
2315 if (!item) {
2316 Py_DECREF(regs);
2317 return NULL;
2318 }
2319 PyTuple_SET_ITEM(regs, index, item);
2320 }
2321
2322 Py_INCREF(regs);
2323 self->regs = regs;
2324
2325 return regs;
2326 }
2327
2328 /*[clinic input]
2329 _sre.SRE_Match.__copy__
2330
2331 [clinic start generated code]*/
2332
2333 static PyObject *
_sre_SRE_Match___copy___impl(MatchObject * self)2334 _sre_SRE_Match___copy___impl(MatchObject *self)
2335 /*[clinic end generated code: output=a779c5fc8b5b4eb4 input=3bb4d30b6baddb5b]*/
2336 {
2337 Py_INCREF(self);
2338 return (PyObject *)self;
2339 }
2340
2341 /*[clinic input]
2342 _sre.SRE_Match.__deepcopy__
2343
2344 memo: object
2345 /
2346
2347 [clinic start generated code]*/
2348
2349 static PyObject *
_sre_SRE_Match___deepcopy__(MatchObject * self,PyObject * memo)2350 _sre_SRE_Match___deepcopy__(MatchObject *self, PyObject *memo)
2351 /*[clinic end generated code: output=ba7cb46d655e4ee2 input=779d12a31c2c325e]*/
2352 {
2353 Py_INCREF(self);
2354 return (PyObject *)self;
2355 }
2356
/* Docstring of the match type (installed through Py_tp_doc in match_slots). */
PyDoc_STRVAR(match_doc,
"The result of re.match() and re.search().\n\
Match objects always have a boolean value of True.");

/* Docstring of the hand-written "group" entry in match_methods. */
PyDoc_STRVAR(match_group_doc,
"group([group1, ...]) -> str or tuple.\n\
Return subgroup(s) of the match by indices or names.\n\
For 0 returns the entire match.");
2365
2366 static PyObject *
match_lastindex_get(MatchObject * self,void * Py_UNUSED (ignored))2367 match_lastindex_get(MatchObject *self, void *Py_UNUSED(ignored))
2368 {
2369 if (self->lastindex >= 0)
2370 return PyLong_FromSsize_t(self->lastindex);
2371 Py_RETURN_NONE;
2372 }
2373
2374 static PyObject *
match_lastgroup_get(MatchObject * self,void * Py_UNUSED (ignored))2375 match_lastgroup_get(MatchObject *self, void *Py_UNUSED(ignored))
2376 {
2377 if (self->pattern->indexgroup &&
2378 self->lastindex >= 0 &&
2379 self->lastindex < PyTuple_GET_SIZE(self->pattern->indexgroup))
2380 {
2381 PyObject *result = PyTuple_GET_ITEM(self->pattern->indexgroup,
2382 self->lastindex);
2383 Py_INCREF(result);
2384 return result;
2385 }
2386 Py_RETURN_NONE;
2387 }
2388
2389 static PyObject *
match_regs_get(MatchObject * self,void * Py_UNUSED (ignored))2390 match_regs_get(MatchObject *self, void *Py_UNUSED(ignored))
2391 {
2392 if (self->regs) {
2393 Py_INCREF(self->regs);
2394 return self->regs;
2395 } else
2396 return match_regs(self);
2397 }
2398
2399 static PyObject *
match_repr(MatchObject * self)2400 match_repr(MatchObject *self)
2401 {
2402 PyObject *result;
2403 PyObject *group0 = match_getslice_by_index(self, 0, Py_None);
2404 if (group0 == NULL)
2405 return NULL;
2406 result = PyUnicode_FromFormat(
2407 "<%s object; span=(%zd, %zd), match=%.50R>",
2408 Py_TYPE(self)->tp_name,
2409 self->mark[0], self->mark[1], group0);
2410 Py_DECREF(group0);
2411 return result;
2412 }
2413
2414
2415 static PyObject*
pattern_new_match(_sremodulestate * module_state,PatternObject * pattern,SRE_STATE * state,Py_ssize_t status)2416 pattern_new_match(_sremodulestate* module_state,
2417 PatternObject* pattern,
2418 SRE_STATE* state,
2419 Py_ssize_t status)
2420 {
2421 /* create match object (from state object) */
2422
2423 MatchObject* match;
2424 Py_ssize_t i, j;
2425 char* base;
2426 int n;
2427
2428 if (status > 0) {
2429
2430 /* create match object (with room for extra group marks) */
2431 /* coverity[ampersand_in_size] */
2432 match = PyObject_GC_NewVar(MatchObject,
2433 module_state->Match_Type,
2434 2*(pattern->groups+1));
2435 if (!match)
2436 return NULL;
2437
2438 Py_INCREF(pattern);
2439 match->pattern = pattern;
2440
2441 Py_INCREF(state->string);
2442 match->string = state->string;
2443
2444 match->regs = NULL;
2445 match->groups = pattern->groups+1;
2446
2447 /* fill in group slices */
2448
2449 base = (char*) state->beginning;
2450 n = state->charsize;
2451
2452 match->mark[0] = ((char*) state->start - base) / n;
2453 match->mark[1] = ((char*) state->ptr - base) / n;
2454
2455 for (i = j = 0; i < pattern->groups; i++, j+=2)
2456 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
2457 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
2458 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
2459 } else
2460 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
2461
2462 match->pos = state->pos;
2463 match->endpos = state->endpos;
2464
2465 match->lastindex = state->lastindex;
2466
2467 PyObject_GC_Track(match);
2468 return (PyObject*) match;
2469
2470 } else if (status == 0) {
2471
2472 /* no match */
2473 Py_RETURN_NONE;
2474
2475 }
2476
2477 /* internal error */
2478 pattern_error(status);
2479 return NULL;
2480 }
2481
2482
2483 /* -------------------------------------------------------------------- */
2484 /* scanner methods (experimental) */
2485
/* GC traverse slot for scanner objects: reports the object's type and the
   pattern it references to the visitor.  Py_VISIT() returns early from
   this function when the visitor returns a non-zero value. */
static int
scanner_traverse(ScannerObject *self, visitproc visit, void *arg)
{
    Py_VISIT(Py_TYPE(self));
    Py_VISIT(self->pattern);
    return 0;
}
2493
/* GC clear slot for scanner objects: drops the reference to the pattern
   and resets the field to NULL.  Always returns 0. */
static int
scanner_clear(ScannerObject *self)
{
    Py_CLEAR(self->pattern);
    return 0;
}
2500
/* Deallocator for scanner objects. */
static void
scanner_dealloc(ScannerObject* self)
{
    /* Fetch the type first: it is still used after the object is freed. */
    PyTypeObject *tp = Py_TYPE(self);

    /* Stop GC tracking before the object's fields are torn down. */
    PyObject_GC_UnTrack(self);
    /* Finalize the embedded search state (set up by state_init() in
       pattern_scanner()). */
    state_fini(&self->state);
    (void)scanner_clear(self);
    tp->tp_free(self);
    /* NOTE(review): presumably releases the reference every instance of a
       heap type holds on its type -- confirm against the C API docs. */
    Py_DECREF(tp);
}
2512
2513 /*[clinic input]
2514 _sre.SRE_Scanner.match
2515
2516 cls: defining_class
2517 /
2518
2519 [clinic start generated code]*/
2520
2521 static PyObject *
_sre_SRE_Scanner_match_impl(ScannerObject * self,PyTypeObject * cls)2522 _sre_SRE_Scanner_match_impl(ScannerObject *self, PyTypeObject *cls)
2523 /*[clinic end generated code: output=6e22c149dc0f0325 input=b5146e1f30278cb7]*/
2524 {
2525 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
2526 SRE_STATE* state = &self->state;
2527 PyObject* match;
2528 Py_ssize_t status;
2529
2530 if (state->start == NULL)
2531 Py_RETURN_NONE;
2532
2533 state_reset(state);
2534
2535 state->ptr = state->start;
2536
2537 status = sre_match(state, PatternObject_GetCode(self->pattern));
2538 if (PyErr_Occurred())
2539 return NULL;
2540
2541 match = pattern_new_match(module_state, (PatternObject*) self->pattern,
2542 state, status);
2543
2544 if (status == 0)
2545 state->start = NULL;
2546 else {
2547 state->must_advance = (state->ptr == state->start);
2548 state->start = state->ptr;
2549 }
2550
2551 return match;
2552 }
2553
2554
2555 /*[clinic input]
2556 _sre.SRE_Scanner.search
2557
2558 cls: defining_class
2559 /
2560
2561 [clinic start generated code]*/
2562
2563 static PyObject *
_sre_SRE_Scanner_search_impl(ScannerObject * self,PyTypeObject * cls)2564 _sre_SRE_Scanner_search_impl(ScannerObject *self, PyTypeObject *cls)
2565 /*[clinic end generated code: output=23e8fc78013f9161 input=056c2d37171d0bf2]*/
2566 {
2567 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
2568 SRE_STATE* state = &self->state;
2569 PyObject* match;
2570 Py_ssize_t status;
2571
2572 if (state->start == NULL)
2573 Py_RETURN_NONE;
2574
2575 state_reset(state);
2576
2577 state->ptr = state->start;
2578
2579 status = sre_search(state, PatternObject_GetCode(self->pattern));
2580 if (PyErr_Occurred())
2581 return NULL;
2582
2583 match = pattern_new_match(module_state, (PatternObject*) self->pattern,
2584 state, status);
2585
2586 if (status == 0)
2587 state->start = NULL;
2588 else {
2589 state->must_advance = (state->ptr == state->start);
2590 state->start = state->ptr;
2591 }
2592
2593 return match;
2594 }
2595
2596 static PyObject *
pattern_scanner(_sremodulestate * module_state,PatternObject * self,PyObject * string,Py_ssize_t pos,Py_ssize_t endpos)2597 pattern_scanner(_sremodulestate *module_state,
2598 PatternObject *self,
2599 PyObject *string,
2600 Py_ssize_t pos,
2601 Py_ssize_t endpos)
2602 {
2603 ScannerObject* scanner;
2604
2605 /* create scanner object */
2606 scanner = PyObject_GC_New(ScannerObject, module_state->Scanner_Type);
2607 if (!scanner)
2608 return NULL;
2609 scanner->pattern = NULL;
2610
2611 /* create search state object */
2612 if (!state_init(&scanner->state, self, string, pos, endpos)) {
2613 Py_DECREF(scanner);
2614 return NULL;
2615 }
2616
2617 Py_INCREF(self);
2618 scanner->pattern = (PyObject*) self;
2619
2620 PyObject_GC_Track(scanner);
2621 return (PyObject*) scanner;
2622 }
2623
2624 static Py_hash_t
pattern_hash(PatternObject * self)2625 pattern_hash(PatternObject *self)
2626 {
2627 Py_hash_t hash, hash2;
2628
2629 hash = PyObject_Hash(self->pattern);
2630 if (hash == -1) {
2631 return -1;
2632 }
2633
2634 hash2 = _Py_HashBytes(self->code, sizeof(self->code[0]) * self->codesize);
2635 hash ^= hash2;
2636
2637 hash ^= self->flags;
2638 hash ^= self->isbytes;
2639 hash ^= self->codesize;
2640
2641 if (hash == -1) {
2642 hash = -2;
2643 }
2644 return hash;
2645 }
2646
2647 static PyObject*
pattern_richcompare(PyObject * lefto,PyObject * righto,int op)2648 pattern_richcompare(PyObject *lefto, PyObject *righto, int op)
2649 {
2650 PyTypeObject *tp = Py_TYPE(lefto);
2651 _sremodulestate *module_state = get_sre_module_state_by_class(tp);
2652 PatternObject *left, *right;
2653 int cmp;
2654
2655 if (op != Py_EQ && op != Py_NE) {
2656 Py_RETURN_NOTIMPLEMENTED;
2657 }
2658
2659 if (!Py_IS_TYPE(righto, module_state->Pattern_Type))
2660 {
2661 Py_RETURN_NOTIMPLEMENTED;
2662 }
2663
2664 if (lefto == righto) {
2665 /* a pattern is equal to itself */
2666 return PyBool_FromLong(op == Py_EQ);
2667 }
2668
2669 left = (PatternObject *)lefto;
2670 right = (PatternObject *)righto;
2671
2672 cmp = (left->flags == right->flags
2673 && left->isbytes == right->isbytes
2674 && left->codesize == right->codesize);
2675 if (cmp) {
2676 /* Compare the code and the pattern because the same pattern can
2677 produce different codes depending on the locale used to compile the
2678 pattern when the re.LOCALE flag is used. Don't compare groups,
2679 indexgroup nor groupindex: they are derivated from the pattern. */
2680 cmp = (memcmp(left->code, right->code,
2681 sizeof(left->code[0]) * left->codesize) == 0);
2682 }
2683 if (cmp) {
2684 cmp = PyObject_RichCompareBool(left->pattern, right->pattern,
2685 Py_EQ);
2686 if (cmp < 0) {
2687 return NULL;
2688 }
2689 }
2690 if (op == Py_NE) {
2691 cmp = !cmp;
2692 }
2693 return PyBool_FromLong(cmp);
2694 }
2695
2696 #include "clinic/_sre.c.h"
2697
/* Method table of the pattern type (installed through Py_tp_methods in
   pattern_slots).  The *_METHODDEF entries are macros that expand to
   complete table entries (presumably supplied by the clinic header
   included above); only __class_getitem__ is spelled out by hand. */
static PyMethodDef pattern_methods[] = {
    _SRE_SRE_PATTERN_MATCH_METHODDEF
    _SRE_SRE_PATTERN_FULLMATCH_METHODDEF
    _SRE_SRE_PATTERN_SEARCH_METHODDEF
    _SRE_SRE_PATTERN_SUB_METHODDEF
    _SRE_SRE_PATTERN_SUBN_METHODDEF
    _SRE_SRE_PATTERN_FINDALL_METHODDEF
    _SRE_SRE_PATTERN_SPLIT_METHODDEF
    _SRE_SRE_PATTERN_FINDITER_METHODDEF
    _SRE_SRE_PATTERN_SCANNER_METHODDEF
    _SRE_SRE_PATTERN___COPY___METHODDEF
    _SRE_SRE_PATTERN___DEEPCOPY___METHODDEF
    {"__class_getitem__", (PyCFunction)Py_GenericAlias, METH_O|METH_CLASS,
     PyDoc_STR("See PEP 585")},
    {NULL, NULL}  /* Sentinel */
};
2714
/* Computed attributes of the pattern type; groupindex has a getter only. */
static PyGetSetDef pattern_getset[] = {
    {"groupindex", (getter)pattern_groupindex, (setter)NULL,
      "A dictionary mapping group names to group numbers."},
    {NULL}  /* Sentinel */
};
2720
/* Byte offset of field x inside PatternObject. */
#define PAT_OFF(x) offsetof(PatternObject, x)
/* Read-only attributes of the pattern type that map directly onto
   PatternObject fields.  The "__weaklistoffset__" entry carries the offset
   of the weakreflist field rather than describing a visible attribute. */
static PyMemberDef pattern_members[] = {
    {"pattern",   T_OBJECT,   PAT_OFF(pattern),       READONLY,
     "The pattern string from which the RE object was compiled."},
    {"flags",     T_INT,      PAT_OFF(flags),         READONLY,
     "The regex matching flags."},
    {"groups",    T_PYSSIZET, PAT_OFF(groups),        READONLY,
     "The number of capturing groups in the pattern."},
    {"__weaklistoffset__", T_PYSSIZET, offsetof(PatternObject, weakreflist), READONLY},
    {NULL}  /* Sentinel */
};
2732
/* Slot table of the pattern type, referenced by pattern_spec. */
static PyType_Slot pattern_slots[] = {
    {Py_tp_dealloc, (destructor)pattern_dealloc},
    {Py_tp_repr, (reprfunc)pattern_repr},
    {Py_tp_hash, (hashfunc)pattern_hash},
    {Py_tp_doc, (void *)pattern_doc},
    {Py_tp_richcompare, pattern_richcompare},
    {Py_tp_methods, pattern_methods},
    {Py_tp_members, pattern_members},
    {Py_tp_getset, pattern_getset},
    /* GC support (the spec sets Py_TPFLAGS_HAVE_GC) */
    {Py_tp_traverse, pattern_traverse},
    {Py_tp_clear, pattern_clear},
    {0, NULL},
};
2746
/* Type spec from which sre_exec() creates the "re.Pattern" heap type.
   Instances are variable-sized, with items of sizeof(SRE_CODE) bytes. */
static PyType_Spec pattern_spec = {
    .name = "re.Pattern",
    .basicsize = sizeof(PatternObject),
    .itemsize = sizeof(SRE_CODE),
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
    .slots = pattern_slots,
};
2755
/* Method table of the match type (installed through Py_tp_methods in
   match_slots).  "group" and "__class_getitem__" are written out by hand;
   the *_METHODDEF entries are macros that expand to complete table
   entries (presumably supplied by the clinic header included above). */
static PyMethodDef match_methods[] = {
    {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
    _SRE_SRE_MATCH_START_METHODDEF
    _SRE_SRE_MATCH_END_METHODDEF
    _SRE_SRE_MATCH_SPAN_METHODDEF
    _SRE_SRE_MATCH_GROUPS_METHODDEF
    _SRE_SRE_MATCH_GROUPDICT_METHODDEF
    _SRE_SRE_MATCH_EXPAND_METHODDEF
    _SRE_SRE_MATCH___COPY___METHODDEF
    _SRE_SRE_MATCH___DEEPCOPY___METHODDEF
    {"__class_getitem__", (PyCFunction)Py_GenericAlias, METH_O|METH_CLASS,
     PyDoc_STR("See PEP 585")},
    {NULL, NULL}  /* Sentinel */
};
2770
/* Computed attributes of the match type; all have getters only.
   "regs" has no docstring. */
static PyGetSetDef match_getset[] = {
    {"lastindex", (getter)match_lastindex_get, (setter)NULL,
     "The integer index of the last matched capturing group."},
    {"lastgroup", (getter)match_lastgroup_get, (setter)NULL,
     "The name of the last matched capturing group."},
    {"regs",      (getter)match_regs_get,      (setter)NULL},
    {NULL}  /* Sentinel */
};
2779
/* Byte offset of field x inside MatchObject. */
#define MATCH_OFF(x) offsetof(MatchObject, x)
/* Read-only attributes of the match type that map directly onto
   MatchObject fields.  Note that the "re" attribute exposes the field
   named pattern. */
static PyMemberDef match_members[] = {
    {"string",  T_OBJECT,   MATCH_OFF(string),  READONLY,
     "The string passed to match() or search()."},
    {"re",      T_OBJECT,   MATCH_OFF(pattern), READONLY,
     "The regular expression object."},
    {"pos",     T_PYSSIZET, MATCH_OFF(pos),     READONLY,
     "The index into the string at which the RE engine started looking for a match."},
    {"endpos",  T_PYSSIZET, MATCH_OFF(endpos),  READONLY,
     "The index into the string beyond which the RE engine will not go."},
    {NULL}  /* Sentinel */
};
2792
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */

/* Slot table of the match type, referenced by match_spec. */
static PyType_Slot match_slots[] = {
    {Py_tp_dealloc, match_dealloc},
    {Py_tp_repr, match_repr},
    {Py_tp_doc, (void *)match_doc},
    {Py_tp_methods, match_methods},
    {Py_tp_members, match_members},
    {Py_tp_getset, match_getset},
    /* GC support (the spec sets Py_TPFLAGS_HAVE_GC) */
    {Py_tp_traverse, match_traverse},
    {Py_tp_clear, match_clear},

    /* As mapping.
     *
     * Match objects do not support length or assignment, but do support
     * __getitem__.
     */
    {Py_mp_subscript, match_getitem},

    {0, NULL},
};
2814
/* Type spec from which sre_exec() creates the "re.Match" heap type.
   Instances are variable-sized: pattern_new_match() allocates
   2*(groups+1) Py_ssize_t items, which it fills in as mark offsets. */
static PyType_Spec match_spec = {
    .name = "re.Match",
    .basicsize = sizeof(MatchObject),
    .itemsize = sizeof(Py_ssize_t),
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
    .slots = match_slots,
};
2823
/* Method table of the scanner type: match() and search().  Both entries
   are macros that expand to complete table entries (presumably supplied
   by the clinic header included above). */
static PyMethodDef scanner_methods[] = {
    _SRE_SRE_SCANNER_MATCH_METHODDEF
    _SRE_SRE_SCANNER_SEARCH_METHODDEF
    {NULL, NULL}  /* Sentinel */
};
2829
/* Byte offset of field x inside ScannerObject. */
#define SCAN_OFF(x) offsetof(ScannerObject, x)
/* Read-only attributes of the scanner type: only the pattern object it
   was created from. */
static PyMemberDef scanner_members[] = {
    {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
    {NULL}  /* Sentinel */
};
2835
/* Slot table of the scanner type, referenced by scanner_spec. */
static PyType_Slot scanner_slots[] = {
    {Py_tp_dealloc, scanner_dealloc},
    {Py_tp_methods, scanner_methods},
    {Py_tp_members, scanner_members},
    /* GC support (the spec sets Py_TPFLAGS_HAVE_GC) */
    {Py_tp_traverse, scanner_traverse},
    {Py_tp_clear, scanner_clear},
    {0, NULL},
};
2844
/* Type spec from which sre_exec() creates the scanner heap type.  Its name
   is built from SRE_MODULE; no itemsize is given, unlike the pattern and
   match specs. */
static PyType_Spec scanner_spec = {
    .name = "_" SRE_MODULE ".SRE_Scanner",
    .basicsize = sizeof(ScannerObject),
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
    .slots = scanner_slots,
};
2852
/* Module-level function table (sremodule.m_methods).  Every entry is a
   macro that expands to a complete table entry (presumably supplied by
   the clinic header included above). */
static PyMethodDef _functions[] = {
    _SRE_COMPILE_METHODDEF
    _SRE_GETCODESIZE_METHODDEF
    _SRE_ASCII_ISCASED_METHODDEF
    _SRE_UNICODE_ISCASED_METHODDEF
    _SRE_ASCII_TOLOWER_METHODDEF
    _SRE_UNICODE_TOLOWER_METHODDEF
    {NULL, NULL}  /* Sentinel */
};
2862
/* Module traverse function (sremodule.m_traverse): reports the three type
   objects kept in the module state to the visitor.  Py_VISIT() returns
   early from this function when the visitor returns a non-zero value. */
static int
sre_traverse(PyObject *module, visitproc visit, void *arg)
{
    _sremodulestate *state = get_sre_module_state(module);

    Py_VISIT(state->Pattern_Type);
    Py_VISIT(state->Match_Type);
    Py_VISIT(state->Scanner_Type);

    return 0;
}
2874
/* Module clear function (sremodule.m_clear): drops the references to the
   three type objects kept in the module state and resets the fields to
   NULL.  Always returns 0. */
static int
sre_clear(PyObject *module)
{
    _sremodulestate *state = get_sre_module_state(module);

    Py_CLEAR(state->Pattern_Type);
    Py_CLEAR(state->Match_Type);
    Py_CLEAR(state->Scanner_Type);

    return 0;
}
2886
/* Module free function (sremodule.m_free): releases the module state by
   delegating to sre_clear(); its return value is ignored. */
static void
sre_free(void *module)
{
    sre_clear((PyObject *)module);
}
2892
/* Create a heap type from `spec`, associated with module `m`, and store it
   in `type`.  On failure control jumps to a label named `error`, which the
   enclosing function must provide. */
#define CREATE_TYPE(m, type, spec)                                  \
do {                                                                \
    type = (PyTypeObject *)PyType_FromModuleAndSpec(m, spec, NULL); \
    if (type == NULL) {                                             \
        goto error;                                                 \
    }                                                               \
} while (0)
2900
/* Add `value` to `module` under `name` as an int built from an unsigned
   long.  The temporary object is released whether or not the add succeeds.
   On failure control jumps to a label named `error`, which the enclosing
   function must provide. */
#define ADD_ULONG_CONSTANT(module, name, value)           \
    do {                                                  \
        PyObject *o = PyLong_FromUnsignedLong(value);     \
        if (!o)                                           \
            goto error;                                   \
        int res = PyModule_AddObjectRef(module, name, o); \
        Py_DECREF(o);                                     \
        if (res < 0) {                                    \
            goto error;                                   \
        }                                                 \
} while (0)
2912
2913 static int
sre_exec(PyObject * m)2914 sre_exec(PyObject *m)
2915 {
2916 _sremodulestate *state;
2917
2918 /* Create heap types */
2919 state = get_sre_module_state(m);
2920 CREATE_TYPE(m, state->Pattern_Type, &pattern_spec);
2921 CREATE_TYPE(m, state->Match_Type, &match_spec);
2922 CREATE_TYPE(m, state->Scanner_Type, &scanner_spec);
2923
2924 if (PyModule_AddIntConstant(m, "MAGIC", SRE_MAGIC) < 0) {
2925 goto error;
2926 }
2927
2928 if (PyModule_AddIntConstant(m, "CODESIZE", sizeof(SRE_CODE)) < 0) {
2929 goto error;
2930 }
2931
2932 ADD_ULONG_CONSTANT(m, "MAXREPEAT", SRE_MAXREPEAT);
2933 ADD_ULONG_CONSTANT(m, "MAXGROUPS", SRE_MAXGROUPS);
2934
2935 if (PyModule_AddStringConstant(m, "copyright", copyright) < 0) {
2936 goto error;
2937 }
2938
2939 return 0;
2940
2941 error:
2942 return -1;
2943 }
2944
/* Module slots: the only one registers sre_exec() as the exec function. */
static PyModuleDef_Slot sre_slots[] = {
    {Py_mod_exec, sre_exec},
    {0, NULL},
};
2949
/* Module definition.  m_size reserves room for a per-module
   _sremodulestate, whose contents are managed by the traverse, clear and
   free hooks below. */
static struct PyModuleDef sremodule = {
    .m_base = PyModuleDef_HEAD_INIT,
    .m_name = "_" SRE_MODULE,
    .m_size = sizeof(_sremodulestate),
    .m_methods = _functions,
    .m_slots = sre_slots,
    .m_traverse = sre_traverse,
    .m_free = sre_free,
    .m_clear = sre_clear,
};
2960
/* Module initialization entry point: hands the module definition to
   PyModuleDef_Init() and returns its result. */
PyMODINIT_FUNC
PyInit__sre(void)
{
    return PyModuleDef_Init(&sremodule);
}
2966
2967 /* vim:ts=4:sw=4:et
2968 */
2969