• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Secret Labs' Regular Expression Engine
3  *
4  * regular expression matching engine
5  *
6  * partial history:
7  * 1999-10-24 fl  created (based on existing template matcher code)
8  * 2000-03-06 fl  first alpha, sort of
9  * 2000-08-01 fl  fixes for 1.6b1
10  * 2000-08-07 fl  use PyOS_CheckStack() if available
11  * 2000-09-20 fl  added expand method
12  * 2001-03-20 fl  lots of fixes for 2.1b2
13  * 2001-04-15 fl  export copyright as Python attribute, not global
14  * 2001-04-28 fl  added __copy__ methods (work in progress)
15  * 2001-05-14 fl  fixes for 1.5.2 compatibility
16  * 2001-07-01 fl  added BIGCHARSET support (from Martin von Loewis)
17  * 2001-10-18 fl  fixed group reset issue (from Matthew Mueller)
18  * 2001-10-20 fl  added split primitive; reenable unicode for 1.6/2.0/2.1
19  * 2001-10-21 fl  added sub/subn primitive
20  * 2001-10-24 fl  added finditer primitive (for 2.2 only)
21  * 2001-12-07 fl  fixed memory leak in sub/subn (Guido van Rossum)
22  * 2002-11-09 fl  fixed empty sub/subn return type
23  * 2003-04-18 mvl fully support 4-byte codes
24  * 2003-10-17 gn  implemented non recursive scheme
25  *
26  * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
27  *
28  * This version of the SRE library can be redistributed under CNRI's
29  * Python 1.6 license.  For any other use, please contact Secret Labs
30  * AB (info@pythonware.com).
31  *
32  * Portions of this engine have been developed in cooperation with
33  * CNRI.  Hewlett-Packard provided funding for 1.6 integration and
34  * other compatibility work.
35  */
36 
37 #ifndef SRE_RECURSIVE
38 
39 static char copyright[] =
40     " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
41 
42 #define PY_SSIZE_T_CLEAN
43 
44 #include "Python.h"
45 #include "structmember.h" /* offsetof */
46 
47 #include "sre.h"
48 
49 #include <ctype.h>
50 
51 /* name of this module, minus the leading underscore */
52 #if !defined(SRE_MODULE)
53 #define SRE_MODULE "sre"
54 #endif
55 
56 #define SRE_PY_MODULE "re"
57 
58 /* defining this one enables tracing */
59 #undef VERBOSE
60 
61 #if PY_VERSION_HEX >= 0x01060000
62 #if PY_VERSION_HEX  < 0x02020000 || defined(Py_USING_UNICODE)
63 /* defining this enables unicode support (default under 1.6a1 and later) */
64 #define HAVE_UNICODE
65 #endif
66 #endif
67 
68 /* -------------------------------------------------------------------- */
69 /* optional features */
70 
71 /* enables fast searching */
72 #define USE_FAST_SEARCH
73 
74 /* enables aggressive inlining (always on for Visual C) */
75 #undef USE_INLINE
76 
77 /* enables copy/deepcopy handling (work in progress) */
78 #undef USE_BUILTIN_COPY
79 
80 #if PY_VERSION_HEX < 0x01060000
81 #define PyObject_DEL(op) PyMem_DEL((op))
82 #endif
83 
84 /* -------------------------------------------------------------------- */
85 
86 #if defined(_MSC_VER)
87 #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
88 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
89 /* fastest possible local call under MSVC */
90 #define LOCAL(type) static __inline type __fastcall
91 #elif defined(USE_INLINE)
92 #define LOCAL(type) static inline type
93 #else
94 #define LOCAL(type) static type
95 #endif
96 
97 /* error codes */
98 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
99 #define SRE_ERROR_STATE -2 /* illegal state */
100 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
101 #define SRE_ERROR_MEMORY -9 /* out of memory */
102 #define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
103 
104 #if defined(VERBOSE)
105 #define TRACE(v) printf v
106 #else
107 #define TRACE(v)
108 #endif
109 
110 /* -------------------------------------------------------------------- */
111 /* search engine state */
112 
113 /* default character predicates (run sre_chars.py to regenerate tables) */
114 
115 #define SRE_DIGIT_MASK 1
116 #define SRE_SPACE_MASK 2
117 #define SRE_LINEBREAK_MASK 4
118 #define SRE_ALNUM_MASK 8
119 #define SRE_WORD_MASK 16
120 
121 /* FIXME: this assumes ASCII.  create tables in init_sre() instead */
122 
123 static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
124 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
125 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
126 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
127 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
128 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
129 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
130 
131 static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
132 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
133 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
134 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
135 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
136 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
137 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
138 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
139 120, 121, 122, 123, 124, 125, 126, 127 };
140 
141 #define SRE_IS_DIGIT(ch)\
142     ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
143 #define SRE_IS_SPACE(ch)\
144     ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
145 #define SRE_IS_LINEBREAK(ch)\
146     ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
147 #define SRE_IS_ALNUM(ch)\
148     ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
149 #define SRE_IS_WORD(ch)\
150     ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
151 
sre_lower(unsigned int ch)152 static unsigned int sre_lower(unsigned int ch)
153 {
154     return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
155 }
156 
157 /* locale-specific character predicates */
158 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
159  * warnings when c's type supports only numbers < N+1 */
160 #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
161 #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
162 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
163 #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
164 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
165 
sre_lower_locale(unsigned int ch)166 static unsigned int sre_lower_locale(unsigned int ch)
167 {
168     return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
169 }
170 
171 /* unicode-specific character predicates */
172 
173 #if defined(HAVE_UNICODE)
174 
175 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
176 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
177 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
178 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
179 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
180 
sre_lower_unicode(unsigned int ch)181 static unsigned int sre_lower_unicode(unsigned int ch)
182 {
183     return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
184 }
185 
186 #endif
187 
188 LOCAL(int)
sre_category(SRE_CODE category,unsigned int ch)189 sre_category(SRE_CODE category, unsigned int ch)
190 {
191     switch (category) {
192 
193     case SRE_CATEGORY_DIGIT:
194         return SRE_IS_DIGIT(ch);
195     case SRE_CATEGORY_NOT_DIGIT:
196         return !SRE_IS_DIGIT(ch);
197     case SRE_CATEGORY_SPACE:
198         return SRE_IS_SPACE(ch);
199     case SRE_CATEGORY_NOT_SPACE:
200         return !SRE_IS_SPACE(ch);
201     case SRE_CATEGORY_WORD:
202         return SRE_IS_WORD(ch);
203     case SRE_CATEGORY_NOT_WORD:
204         return !SRE_IS_WORD(ch);
205     case SRE_CATEGORY_LINEBREAK:
206         return SRE_IS_LINEBREAK(ch);
207     case SRE_CATEGORY_NOT_LINEBREAK:
208         return !SRE_IS_LINEBREAK(ch);
209 
210     case SRE_CATEGORY_LOC_WORD:
211         return SRE_LOC_IS_WORD(ch);
212     case SRE_CATEGORY_LOC_NOT_WORD:
213         return !SRE_LOC_IS_WORD(ch);
214 
215 #if defined(HAVE_UNICODE)
216     case SRE_CATEGORY_UNI_DIGIT:
217         return SRE_UNI_IS_DIGIT(ch);
218     case SRE_CATEGORY_UNI_NOT_DIGIT:
219         return !SRE_UNI_IS_DIGIT(ch);
220     case SRE_CATEGORY_UNI_SPACE:
221         return SRE_UNI_IS_SPACE(ch);
222     case SRE_CATEGORY_UNI_NOT_SPACE:
223         return !SRE_UNI_IS_SPACE(ch);
224     case SRE_CATEGORY_UNI_WORD:
225         return SRE_UNI_IS_WORD(ch);
226     case SRE_CATEGORY_UNI_NOT_WORD:
227         return !SRE_UNI_IS_WORD(ch);
228     case SRE_CATEGORY_UNI_LINEBREAK:
229         return SRE_UNI_IS_LINEBREAK(ch);
230     case SRE_CATEGORY_UNI_NOT_LINEBREAK:
231         return !SRE_UNI_IS_LINEBREAK(ch);
232 #else
233     case SRE_CATEGORY_UNI_DIGIT:
234         return SRE_IS_DIGIT(ch);
235     case SRE_CATEGORY_UNI_NOT_DIGIT:
236         return !SRE_IS_DIGIT(ch);
237     case SRE_CATEGORY_UNI_SPACE:
238         return SRE_IS_SPACE(ch);
239     case SRE_CATEGORY_UNI_NOT_SPACE:
240         return !SRE_IS_SPACE(ch);
241     case SRE_CATEGORY_UNI_WORD:
242         return SRE_LOC_IS_WORD(ch);
243     case SRE_CATEGORY_UNI_NOT_WORD:
244         return !SRE_LOC_IS_WORD(ch);
245     case SRE_CATEGORY_UNI_LINEBREAK:
246         return SRE_IS_LINEBREAK(ch);
247     case SRE_CATEGORY_UNI_NOT_LINEBREAK:
248         return !SRE_IS_LINEBREAK(ch);
249 #endif
250     }
251     return 0;
252 }
253 
254 /* helpers */
255 
256 static void
data_stack_dealloc(SRE_STATE * state)257 data_stack_dealloc(SRE_STATE* state)
258 {
259     if (state->data_stack) {
260         PyMem_FREE(state->data_stack);
261         state->data_stack = NULL;
262     }
263     state->data_stack_size = state->data_stack_base = 0;
264 }
265 
266 static int
data_stack_grow(SRE_STATE * state,Py_ssize_t size)267 data_stack_grow(SRE_STATE* state, Py_ssize_t size)
268 {
269     Py_ssize_t minsize, cursize;
270     minsize = state->data_stack_base+size;
271     cursize = state->data_stack_size;
272     if (cursize < minsize) {
273         void* stack;
274         cursize = minsize+minsize/4+1024;
275         TRACE(("allocate/grow stack %" PY_FORMAT_SIZE_T "d\n", cursize));
276         stack = PyMem_REALLOC(state->data_stack, cursize);
277         if (!stack) {
278             data_stack_dealloc(state);
279             return SRE_ERROR_MEMORY;
280         }
281         state->data_stack = (char *)stack;
282         state->data_stack_size = cursize;
283     }
284     return 0;
285 }
286 
287 /* generate 8-bit version */
288 
289 #define SRE_CHAR unsigned char
290 #define SRE_AT sre_at
291 #define SRE_COUNT sre_count
292 #define SRE_CHARSET sre_charset
293 #define SRE_INFO sre_info
294 #define SRE_MATCH sre_match
295 #define SRE_MATCH_CONTEXT sre_match_context
296 #define SRE_SEARCH sre_search
297 #define SRE_LITERAL_TEMPLATE sre_literal_template
298 
299 #if defined(HAVE_UNICODE)
300 
301 #define SRE_RECURSIVE
302 #include "_sre.c"
303 #undef SRE_RECURSIVE
304 
305 #undef SRE_LITERAL_TEMPLATE
306 #undef SRE_SEARCH
307 #undef SRE_MATCH
308 #undef SRE_MATCH_CONTEXT
309 #undef SRE_INFO
310 #undef SRE_CHARSET
311 #undef SRE_COUNT
312 #undef SRE_AT
313 #undef SRE_CHAR
314 
315 /* generate 16-bit unicode version */
316 
317 #define SRE_CHAR Py_UNICODE
318 #define SRE_AT sre_uat
319 #define SRE_COUNT sre_ucount
320 #define SRE_CHARSET sre_ucharset
321 #define SRE_INFO sre_uinfo
322 #define SRE_MATCH sre_umatch
323 #define SRE_MATCH_CONTEXT sre_umatch_context
324 #define SRE_SEARCH sre_usearch
325 #define SRE_LITERAL_TEMPLATE sre_uliteral_template
326 #endif
327 
328 #endif /* SRE_RECURSIVE */
329 
330 /* -------------------------------------------------------------------- */
331 /* String matching engine */
332 
333 /* the following section is compiled twice, with different character
334    settings */
335 
336 LOCAL(int)
SRE_AT(SRE_STATE * state,SRE_CHAR * ptr,SRE_CODE at)337 SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
338 {
339     /* check if pointer is at given position */
340 
341     Py_ssize_t thisp, thatp;
342 
343     switch (at) {
344 
345     case SRE_AT_BEGINNING:
346     case SRE_AT_BEGINNING_STRING:
347         return ((void*) ptr == state->beginning);
348 
349     case SRE_AT_BEGINNING_LINE:
350         return ((void*) ptr == state->beginning ||
351                 SRE_IS_LINEBREAK((int) ptr[-1]));
352 
353     case SRE_AT_END:
354         return (((SRE_CHAR *)state->end - ptr == 1 &&
355                  SRE_IS_LINEBREAK((int) ptr[0])) ||
356                 ((void*) ptr == state->end));
357 
358     case SRE_AT_END_LINE:
359         return ((void*) ptr == state->end ||
360                 SRE_IS_LINEBREAK((int) ptr[0]));
361 
362     case SRE_AT_END_STRING:
363         return ((void*) ptr == state->end);
364 
365     case SRE_AT_BOUNDARY:
366         if (state->beginning == state->end)
367             return 0;
368         thatp = ((void*) ptr > state->beginning) ?
369             SRE_IS_WORD((int) ptr[-1]) : 0;
370         thisp = ((void*) ptr < state->end) ?
371             SRE_IS_WORD((int) ptr[0]) : 0;
372         return thisp != thatp;
373 
374     case SRE_AT_NON_BOUNDARY:
375         if (state->beginning == state->end)
376             return 0;
377         thatp = ((void*) ptr > state->beginning) ?
378             SRE_IS_WORD((int) ptr[-1]) : 0;
379         thisp = ((void*) ptr < state->end) ?
380             SRE_IS_WORD((int) ptr[0]) : 0;
381         return thisp == thatp;
382 
383     case SRE_AT_LOC_BOUNDARY:
384         if (state->beginning == state->end)
385             return 0;
386         thatp = ((void*) ptr > state->beginning) ?
387             SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
388         thisp = ((void*) ptr < state->end) ?
389             SRE_LOC_IS_WORD((int) ptr[0]) : 0;
390         return thisp != thatp;
391 
392     case SRE_AT_LOC_NON_BOUNDARY:
393         if (state->beginning == state->end)
394             return 0;
395         thatp = ((void*) ptr > state->beginning) ?
396             SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
397         thisp = ((void*) ptr < state->end) ?
398             SRE_LOC_IS_WORD((int) ptr[0]) : 0;
399         return thisp == thatp;
400 
401 #if defined(HAVE_UNICODE)
402     case SRE_AT_UNI_BOUNDARY:
403         if (state->beginning == state->end)
404             return 0;
405         thatp = ((void*) ptr > state->beginning) ?
406             SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
407         thisp = ((void*) ptr < state->end) ?
408             SRE_UNI_IS_WORD((int) ptr[0]) : 0;
409         return thisp != thatp;
410 
411     case SRE_AT_UNI_NON_BOUNDARY:
412         if (state->beginning == state->end)
413             return 0;
414         thatp = ((void*) ptr > state->beginning) ?
415             SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
416         thisp = ((void*) ptr < state->end) ?
417             SRE_UNI_IS_WORD((int) ptr[0]) : 0;
418         return thisp == thatp;
419 #endif
420 
421     }
422 
423     return 0;
424 }
425 
426 LOCAL(int)
SRE_CHARSET(SRE_CODE * set,SRE_CODE ch)427 SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
428 {
429     /* check if character is a member of the given set */
430 
431     int ok = 1;
432 
433     for (;;) {
434         switch (*set++) {
435 
436         case SRE_OP_FAILURE:
437             return !ok;
438 
439         case SRE_OP_LITERAL:
440             /* <LITERAL> <code> */
441             if (ch == set[0])
442                 return ok;
443             set++;
444             break;
445 
446         case SRE_OP_CATEGORY:
447             /* <CATEGORY> <code> */
448             if (sre_category(set[0], (int) ch))
449                 return ok;
450             set += 1;
451             break;
452 
453         case SRE_OP_CHARSET:
454             if (sizeof(SRE_CODE) == 2) {
455                 /* <CHARSET> <bitmap> (16 bits per code word) */
456                 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
457                     return ok;
458                 set += 16;
459             }
460             else {
461                 /* <CHARSET> <bitmap> (32 bits per code word) */
462                 if (ch < 256 && (set[ch >> 5] & (1u << (ch & 31))))
463                     return ok;
464                 set += 8;
465             }
466             break;
467 
468         case SRE_OP_RANGE:
469             /* <RANGE> <lower> <upper> */
470             if (set[0] <= ch && ch <= set[1])
471                 return ok;
472             set += 2;
473             break;
474 
475         case SRE_OP_NEGATE:
476             ok = !ok;
477             break;
478 
479         case SRE_OP_BIGCHARSET:
480             /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
481         {
482             Py_ssize_t count, block;
483             count = *(set++);
484 
485             if (sizeof(SRE_CODE) == 2) {
486                 block = ((unsigned char*)set)[ch >> 8];
487                 set += 128;
488                 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
489                     return ok;
490                 set += count*16;
491             }
492             else {
493                 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
494                  * warnings when c's type supports only numbers < N+1 */
495                 if (!(ch & ~65535))
496                     block = ((unsigned char*)set)[ch >> 8];
497                 else
498                     block = -1;
499                 set += 64;
500                 if (block >=0 &&
501                     (set[block*8 + ((ch & 255)>>5)] & (1u << (ch & 31))))
502                     return ok;
503                 set += count*8;
504             }
505             break;
506         }
507 
508         default:
509             /* internal error -- there's not much we can do about it
510                here, so let's just pretend it didn't match... */
511             return 0;
512         }
513     }
514 }
515 
516 LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
517 
518 LOCAL(Py_ssize_t)
SRE_COUNT(SRE_STATE * state,SRE_CODE * pattern,Py_ssize_t maxcount)519 SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
520 {
521     SRE_CODE chr;
522     SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
523     SRE_CHAR* end = (SRE_CHAR *)state->end;
524     Py_ssize_t i;
525 
526     /* adjust end */
527     if (maxcount < end - ptr && maxcount != SRE_MAXREPEAT)
528         end = ptr + maxcount;
529 
530     switch (pattern[0]) {
531 
532     case SRE_OP_IN:
533         /* repeated set */
534         TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
535         while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
536             ptr++;
537         break;
538 
539     case SRE_OP_ANY:
540         /* repeated dot wildcard. */
541         TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
542         while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
543             ptr++;
544         break;
545 
546     case SRE_OP_ANY_ALL:
547         /* repeated dot wildcard.  skip to the end of the target
548            string, and backtrack from there */
549         TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
550         ptr = end;
551         break;
552 
553     case SRE_OP_LITERAL:
554         /* repeated literal */
555         chr = pattern[1];
556         TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
557         while (ptr < end && (SRE_CODE) *ptr == chr)
558             ptr++;
559         break;
560 
561     case SRE_OP_LITERAL_IGNORE:
562         /* repeated literal */
563         chr = pattern[1];
564         TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
565         while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
566             ptr++;
567         break;
568 
569     case SRE_OP_NOT_LITERAL:
570         /* repeated non-literal */
571         chr = pattern[1];
572         TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
573         while (ptr < end && (SRE_CODE) *ptr != chr)
574             ptr++;
575         break;
576 
577     case SRE_OP_NOT_LITERAL_IGNORE:
578         /* repeated non-literal */
579         chr = pattern[1];
580         TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
581         while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
582             ptr++;
583         break;
584 
585     default:
586         /* repeated single character pattern */
587         TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
588         while ((SRE_CHAR*) state->ptr < end) {
589             i = SRE_MATCH(state, pattern);
590             if (i < 0)
591                 return i;
592             if (!i)
593                 break;
594         }
595         TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
596                (SRE_CHAR*) state->ptr - ptr));
597         return (SRE_CHAR*) state->ptr - ptr;
598     }
599 
600     TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
601            ptr - (SRE_CHAR*) state->ptr));
602     return ptr - (SRE_CHAR*) state->ptr;
603 }
604 
605 #if 0 /* not used in this release */
606 LOCAL(int)
607 SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
608 {
609     /* check if an SRE_OP_INFO block matches at the current position.
610        returns the number of SRE_CODE objects to skip if successful, 0
611        if no match */
612 
613     SRE_CHAR* end = state->end;
614     SRE_CHAR* ptr = state->ptr;
615     Py_ssize_t i;
616 
617     /* check minimal length */
618     if (pattern[3] && (end - ptr) < pattern[3])
619         return 0;
620 
621     /* check known prefix */
622     if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
623         /* <length> <skip> <prefix data> <overlap data> */
624         for (i = 0; i < pattern[5]; i++)
625             if ((SRE_CODE) ptr[i] != pattern[7 + i])
626                 return 0;
627         return pattern[0] + 2 * pattern[6];
628     }
629     return pattern[0];
630 }
631 #endif
632 
633 /* The macros below should be used to protect recursive SRE_MATCH()
634  * calls that *failed* and do *not* return immediately (IOW, those
635  * that will backtrack). Explaining:
636  *
637  * - Recursive SRE_MATCH() returned true: that's usually a success
638  *   (besides atypical cases like ASSERT_NOT), therefore there's no
639  *   reason to restore lastmark;
640  *
641  * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
642  *   is returning to the caller: If the current SRE_MATCH() is the
643  *   top function of the recursion, returning false will be a matching
644  *   failure, and it doesn't matter where lastmark is pointing to.
645  *   If it's *not* the top function, it will be a recursive SRE_MATCH()
646  *   failure by itself, and the calling SRE_MATCH() will have to deal
647  *   with the failure by the same rules explained here (it will restore
648  *   lastmark by itself if necessary);
649  *
650  * - Recursive SRE_MATCH() returned false, and will continue the
651  *   outside 'for' loop: must be protected when breaking, since the next
652  *   OP could potentially depend on lastmark;
653  *
654  * - Recursive SRE_MATCH() returned false, and will be called again
655  *   inside a local for/while loop: must be protected between each
656  *   loop iteration, since the recursive SRE_MATCH() could do anything,
657  *   and could potentially depend on lastmark.
658  *
659  * For more information, check the discussion at SF patch #712900.
660  */
661 #define LASTMARK_SAVE()     \
662     do { \
663         ctx->lastmark = state->lastmark; \
664         ctx->lastindex = state->lastindex; \
665     } while (0)
666 #define LASTMARK_RESTORE()  \
667     do { \
668         state->lastmark = ctx->lastmark; \
669         state->lastindex = ctx->lastindex; \
670     } while (0)
671 
672 #define RETURN_ERROR(i) do { return i; } while(0)
673 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
674 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
675 
676 #define RETURN_ON_ERROR(i) \
677     do { if (i < 0) RETURN_ERROR(i); } while (0)
678 #define RETURN_ON_SUCCESS(i) \
679     do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
680 #define RETURN_ON_FAILURE(i) \
681     do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
682 
683 #define SFY(x) #x
684 
685 #define DATA_STACK_ALLOC(state, type, ptr) \
686 do { \
687     alloc_pos = state->data_stack_base; \
688     TRACE(("allocating %s in %" PY_FORMAT_SIZE_T "d " \
689            "(%" PY_FORMAT_SIZE_T "d)\n", \
690            SFY(type), alloc_pos, sizeof(type))); \
691     if (sizeof(type) > state->data_stack_size - alloc_pos) { \
692         int j = data_stack_grow(state, sizeof(type)); \
693         if (j < 0) return j; \
694         if (ctx_pos != -1) \
695             DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
696     } \
697     ptr = (type*)(state->data_stack+alloc_pos); \
698     state->data_stack_base += sizeof(type); \
699 } while (0)
700 
701 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
702 do { \
703     TRACE(("looking up %s at %" PY_FORMAT_SIZE_T "d\n", SFY(type), pos)); \
704     ptr = (type*)(state->data_stack+pos); \
705 } while (0)
706 
707 #define DATA_STACK_PUSH(state, data, size) \
708 do { \
709     TRACE(("copy data in %p to %" PY_FORMAT_SIZE_T "d " \
710            "(%" PY_FORMAT_SIZE_T "d)\n", \
711            data, state->data_stack_base, size)); \
712     if (size > state->data_stack_size - state->data_stack_base) { \
713         int j = data_stack_grow(state, size); \
714         if (j < 0) return j; \
715         if (ctx_pos != -1) \
716             DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
717     } \
718     memcpy(state->data_stack+state->data_stack_base, data, size); \
719     state->data_stack_base += size; \
720 } while (0)
721 
722 #define DATA_STACK_POP(state, data, size, discard) \
723 do { \
724     TRACE(("copy data to %p from %" PY_FORMAT_SIZE_T "d " \
725            "(%" PY_FORMAT_SIZE_T "d)\n", \
726            data, state->data_stack_base-size, size)); \
727     memcpy(data, state->data_stack+state->data_stack_base-size, size); \
728     if (discard) \
729         state->data_stack_base -= size; \
730 } while (0)
731 
732 #define DATA_STACK_POP_DISCARD(state, size) \
733 do { \
734     TRACE(("discard data from %" PY_FORMAT_SIZE_T "d " \
735            "(%" PY_FORMAT_SIZE_T "d)\n", \
736            state->data_stack_base-size, size)); \
737     state->data_stack_base -= size; \
738 } while(0)
739 
740 #define DATA_PUSH(x) \
741     DATA_STACK_PUSH(state, (x), sizeof(*(x)))
742 #define DATA_POP(x) \
743     DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
744 #define DATA_POP_DISCARD(x) \
745     DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
746 #define DATA_ALLOC(t,p) \
747     DATA_STACK_ALLOC(state, t, p)
748 #define DATA_LOOKUP_AT(t,p,pos) \
749     DATA_STACK_LOOKUP_AT(state,t,p,pos)
750 
751 #define MARK_PUSH(lastmark) \
752     do if (lastmark > 0) { \
753         i = lastmark; /* ctx->lastmark may change if reallocated */ \
754         DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
755     } while (0)
756 #define MARK_POP(lastmark) \
757     do if (lastmark > 0) { \
758         DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
759     } while (0)
760 #define MARK_POP_KEEP(lastmark) \
761     do if (lastmark > 0) { \
762         DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
763     } while (0)
764 #define MARK_POP_DISCARD(lastmark) \
765     do if (lastmark > 0) { \
766         DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
767     } while (0)
768 
769 #define JUMP_NONE            0
770 #define JUMP_MAX_UNTIL_1     1
771 #define JUMP_MAX_UNTIL_2     2
772 #define JUMP_MAX_UNTIL_3     3
773 #define JUMP_MIN_UNTIL_1     4
774 #define JUMP_MIN_UNTIL_2     5
775 #define JUMP_MIN_UNTIL_3     6
776 #define JUMP_REPEAT          7
777 #define JUMP_REPEAT_ONE_1    8
778 #define JUMP_REPEAT_ONE_2    9
779 #define JUMP_MIN_REPEAT_ONE  10
780 #define JUMP_BRANCH          11
781 #define JUMP_ASSERT          12
782 #define JUMP_ASSERT_NOT      13
783 
784 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
785     DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
786     nextctx->last_ctx_pos = ctx_pos; \
787     nextctx->jump = jumpvalue; \
788     nextctx->pattern = nextpattern; \
789     ctx_pos = alloc_pos; \
790     ctx = nextctx; \
791     goto entrance; \
792     jumplabel: \
793     while (0) /* gcc doesn't like labels at end of scopes */ \
794 
795 typedef struct {
796     Py_ssize_t last_ctx_pos;
797     Py_ssize_t jump;
798     SRE_CHAR* ptr;
799     SRE_CODE* pattern;
800     Py_ssize_t count;
801     Py_ssize_t lastmark;
802     Py_ssize_t lastindex;
803     union {
804         SRE_CODE chr;
805         SRE_REPEAT* rep;
806     } u;
807 } SRE_MATCH_CONTEXT;
808 
809 /* check if string matches the given pattern.  returns <0 for
810    error, 0 for failure, and 1 for success */
811 LOCAL(Py_ssize_t)
SRE_MATCH(SRE_STATE * state,SRE_CODE * pattern)812 SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
813 {
814     SRE_CHAR* end = (SRE_CHAR *)state->end;
815     Py_ssize_t alloc_pos, ctx_pos = -1;
816     Py_ssize_t i, ret = 0;
817     Py_ssize_t jump;
818     unsigned int sigcount=0;
819 
820     SRE_MATCH_CONTEXT* ctx;
821     SRE_MATCH_CONTEXT* nextctx;
822 
823     TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
824 
825     DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
826     ctx->last_ctx_pos = -1;
827     ctx->jump = JUMP_NONE;
828     ctx->pattern = pattern;
829     ctx_pos = alloc_pos;
830 
831 entrance:
832 
833     ctx->ptr = (SRE_CHAR *)state->ptr;
834 
835     if (ctx->pattern[0] == SRE_OP_INFO) {
836         /* optimization info block */
837         /* <INFO> <1=skip> <2=flags> <3=min> ... */
838         if (ctx->pattern[3] && (Py_uintptr_t)(end - ctx->ptr) < ctx->pattern[3]) {
839             TRACE(("reject (got %" PY_FORMAT_SIZE_T "d chars, "
840                    "need %" PY_FORMAT_SIZE_T "d)\n",
841                    (end - ctx->ptr), (Py_ssize_t) ctx->pattern[3]));
842             RETURN_FAILURE;
843         }
844         ctx->pattern += ctx->pattern[1] + 1;
845     }
846 
847     for (;;) {
848         ++sigcount;
849         if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
850             RETURN_ERROR(SRE_ERROR_INTERRUPTED);
851 
852         switch (*ctx->pattern++) {
853 
854         case SRE_OP_MARK:
855             /* set mark */
856             /* <MARK> <gid> */
857             TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
858                    ctx->ptr, ctx->pattern[0]));
859             i = ctx->pattern[0];
860             if (i & 1)
861                 state->lastindex = i/2 + 1;
862             if (i > state->lastmark) {
863                 /* state->lastmark is the highest valid index in the
864                    state->mark array.  If it is increased by more than 1,
865                    the intervening marks must be set to NULL to signal
866                    that these marks have not been encountered. */
867                 Py_ssize_t j = state->lastmark + 1;
868                 while (j < i)
869                     state->mark[j++] = NULL;
870                 state->lastmark = i;
871             }
872             state->mark[i] = ctx->ptr;
873             ctx->pattern++;
874             break;
875 
876         case SRE_OP_LITERAL:
877             /* match literal string */
878             /* <LITERAL> <code> */
879             TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
880                    ctx->ptr, *ctx->pattern));
881             if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
882                 RETURN_FAILURE;
883             ctx->pattern++;
884             ctx->ptr++;
885             break;
886 
887         case SRE_OP_NOT_LITERAL:
888             /* match anything that is not literal character */
889             /* <NOT_LITERAL> <code> */
890             TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
891                    ctx->ptr, *ctx->pattern));
892             if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
893                 RETURN_FAILURE;
894             ctx->pattern++;
895             ctx->ptr++;
896             break;
897 
898         case SRE_OP_SUCCESS:
899             /* end of pattern */
900             TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
901             state->ptr = ctx->ptr;
902             RETURN_SUCCESS;
903 
904         case SRE_OP_AT:
905             /* match at given position */
906             /* <AT> <code> */
907             TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
908             if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
909                 RETURN_FAILURE;
910             ctx->pattern++;
911             break;
912 
913         case SRE_OP_CATEGORY:
914             /* match at given category */
915             /* <CATEGORY> <code> */
916             TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
917                    ctx->ptr, *ctx->pattern));
918             if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
919                 RETURN_FAILURE;
920             ctx->pattern++;
921             ctx->ptr++;
922             break;
923 
924         case SRE_OP_ANY:
925             /* match anything (except a newline) */
926             /* <ANY> */
927             TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
928             if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
929                 RETURN_FAILURE;
930             ctx->ptr++;
931             break;
932 
933         case SRE_OP_ANY_ALL:
934             /* match anything */
935             /* <ANY_ALL> */
936             TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
937             if (ctx->ptr >= end)
938                 RETURN_FAILURE;
939             ctx->ptr++;
940             break;
941 
942         case SRE_OP_IN:
943             /* match set member (or non_member) */
944             /* <IN> <skip> <set> */
945             TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
946             if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
947                 RETURN_FAILURE;
948             ctx->pattern += ctx->pattern[0];
949             ctx->ptr++;
950             break;
951 
952         case SRE_OP_LITERAL_IGNORE:
953             TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
954                    ctx->pattern, ctx->ptr, ctx->pattern[0]));
955             if (ctx->ptr >= end ||
956                 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
957                 RETURN_FAILURE;
958             ctx->pattern++;
959             ctx->ptr++;
960             break;
961 
962         case SRE_OP_NOT_LITERAL_IGNORE:
963             TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
964                    ctx->pattern, ctx->ptr, *ctx->pattern));
965             if (ctx->ptr >= end ||
966                 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
967                 RETURN_FAILURE;
968             ctx->pattern++;
969             ctx->ptr++;
970             break;
971 
972         case SRE_OP_IN_IGNORE:
973             TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
974             if (ctx->ptr >= end
975                 || !SRE_CHARSET(ctx->pattern+1,
976                                 (SRE_CODE)state->lower(*ctx->ptr)))
977                 RETURN_FAILURE;
978             ctx->pattern += ctx->pattern[0];
979             ctx->ptr++;
980             break;
981 
982         case SRE_OP_JUMP:
983         case SRE_OP_INFO:
984             /* jump forward */
985             /* <JUMP> <offset> */
986             TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
987                    ctx->ptr, ctx->pattern[0]));
988             ctx->pattern += ctx->pattern[0];
989             break;
990 
991         case SRE_OP_BRANCH:
992             /* alternation */
993             /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
994             TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
995             LASTMARK_SAVE();
996             ctx->u.rep = state->repeat;
997             if (ctx->u.rep)
998                 MARK_PUSH(ctx->lastmark);
999             for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
1000                 if (ctx->pattern[1] == SRE_OP_LITERAL &&
1001                     (ctx->ptr >= end ||
1002                      (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
1003                     continue;
1004                 if (ctx->pattern[1] == SRE_OP_IN &&
1005                     (ctx->ptr >= end ||
1006                      !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
1007                     continue;
1008                 state->ptr = ctx->ptr;
1009                 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
1010                 if (ret) {
1011                     if (ctx->u.rep)
1012                         MARK_POP_DISCARD(ctx->lastmark);
1013                     RETURN_ON_ERROR(ret);
1014                     RETURN_SUCCESS;
1015                 }
1016                 if (ctx->u.rep)
1017                     MARK_POP_KEEP(ctx->lastmark);
1018                 LASTMARK_RESTORE();
1019             }
1020             if (ctx->u.rep)
1021                 MARK_POP_DISCARD(ctx->lastmark);
1022             RETURN_FAILURE;
1023 
1024         case SRE_OP_REPEAT_ONE:
1025             /* match repeated sequence (maximizing regexp) */
1026 
1027             /* this operator only works if the repeated item is
1028                exactly one character wide, and we're not already
1029                collecting backtracking points.  for other cases,
1030                use the MAX_REPEAT operator */
1031 
1032             /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1033 
1034             TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1035                    ctx->pattern[1], ctx->pattern[2]));
1036 
1037             if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr)
1038                 RETURN_FAILURE; /* cannot match */
1039 
1040             state->ptr = ctx->ptr;
1041 
1042             ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1043             RETURN_ON_ERROR(ret);
1044             DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1045             ctx->count = ret;
1046             ctx->ptr += ctx->count;
1047 
1048             /* when we arrive here, count contains the number of
1049                matches, and ctx->ptr points to the tail of the target
1050                string.  check if the rest of the pattern matches,
1051                and backtrack if not. */
1052 
1053             if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1054                 RETURN_FAILURE;
1055 
1056             if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1057                 /* tail is empty.  we're finished */
1058                 state->ptr = ctx->ptr;
1059                 RETURN_SUCCESS;
1060             }
1061 
1062             LASTMARK_SAVE();
1063 
1064             if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
1065                 /* tail starts with a literal. skip positions where
1066                    the rest of the pattern cannot possibly match */
1067                 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
1068                 for (;;) {
1069                     while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
1070                            (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1071                         ctx->ptr--;
1072                         ctx->count--;
1073                     }
1074                     if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1075                         break;
1076                     state->ptr = ctx->ptr;
1077                     DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1078                             ctx->pattern+ctx->pattern[0]);
1079                     if (ret) {
1080                         RETURN_ON_ERROR(ret);
1081                         RETURN_SUCCESS;
1082                     }
1083 
1084                     LASTMARK_RESTORE();
1085 
1086                     ctx->ptr--;
1087                     ctx->count--;
1088                 }
1089 
1090             } else {
1091                 /* general case */
1092                 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
1093                     state->ptr = ctx->ptr;
1094                     DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1095                             ctx->pattern+ctx->pattern[0]);
1096                     if (ret) {
1097                         RETURN_ON_ERROR(ret);
1098                         RETURN_SUCCESS;
1099                     }
1100                     ctx->ptr--;
1101                     ctx->count--;
1102                     LASTMARK_RESTORE();
1103                 }
1104             }
1105             RETURN_FAILURE;
1106 
1107         case SRE_OP_MIN_REPEAT_ONE:
1108             /* match repeated sequence (minimizing regexp) */
1109 
1110             /* this operator only works if the repeated item is
1111                exactly one character wide, and we're not already
1112                collecting backtracking points.  for other cases,
1113                use the MIN_REPEAT operator */
1114 
1115             /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1116 
1117             TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1118                    ctx->pattern[1], ctx->pattern[2]));
1119 
1120             if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr)
1121                 RETURN_FAILURE; /* cannot match */
1122 
1123             state->ptr = ctx->ptr;
1124 
1125             if (ctx->pattern[1] == 0)
1126                 ctx->count = 0;
1127             else {
1128                 /* count using pattern min as the maximum */
1129                 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1130                 RETURN_ON_ERROR(ret);
1131                 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1132                 if (ret < (Py_ssize_t) ctx->pattern[1])
1133                     /* didn't match minimum number of times */
1134                     RETURN_FAILURE;
1135                 /* advance past minimum matches of repeat */
1136                 ctx->count = ret;
1137                 ctx->ptr += ctx->count;
1138             }
1139 
1140             if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1141                 /* tail is empty.  we're finished */
1142                 state->ptr = ctx->ptr;
1143                 RETURN_SUCCESS;
1144 
1145             } else {
1146                 /* general case */
1147                 LASTMARK_SAVE();
1148                 while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT
1149                        || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
1150                     state->ptr = ctx->ptr;
1151                     DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1152                             ctx->pattern+ctx->pattern[0]);
1153                     if (ret) {
1154                         RETURN_ON_ERROR(ret);
1155                         RETURN_SUCCESS;
1156                     }
1157                     state->ptr = ctx->ptr;
1158                     ret = SRE_COUNT(state, ctx->pattern+3, 1);
1159                     RETURN_ON_ERROR(ret);
1160                     DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1161                     if (ret == 0)
1162                         break;
1163                     assert(ret == 1);
1164                     ctx->ptr++;
1165                     ctx->count++;
1166                     LASTMARK_RESTORE();
1167                 }
1168             }
1169             RETURN_FAILURE;
1170 
1171         case SRE_OP_REPEAT:
1172             /* create repeat context.  all the hard work is done
1173                by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1174             /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1175             TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1176                    ctx->pattern[1], ctx->pattern[2]));
1177 
1178             /* install new repeat context */
1179             ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
1180             if (!ctx->u.rep) {
1181                 PyErr_NoMemory();
1182                 RETURN_FAILURE;
1183             }
1184             ctx->u.rep->count = -1;
1185             ctx->u.rep->pattern = ctx->pattern;
1186             ctx->u.rep->prev = state->repeat;
1187             ctx->u.rep->last_ptr = NULL;
1188             state->repeat = ctx->u.rep;
1189 
1190             state->ptr = ctx->ptr;
1191             DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
1192             state->repeat = ctx->u.rep->prev;
1193             PyObject_FREE(ctx->u.rep);
1194 
1195             if (ret) {
1196                 RETURN_ON_ERROR(ret);
1197                 RETURN_SUCCESS;
1198             }
1199             RETURN_FAILURE;
1200 
1201         case SRE_OP_MAX_UNTIL:
1202             /* maximizing repeat */
1203             /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1204 
1205             /* FIXME: we probably need to deal with zero-width
1206                matches in here... */
1207 
1208             ctx->u.rep = state->repeat;
1209             if (!ctx->u.rep)
1210                 RETURN_ERROR(SRE_ERROR_STATE);
1211 
1212             state->ptr = ctx->ptr;
1213 
1214             ctx->count = ctx->u.rep->count+1;
1215 
1216             TRACE(("|%p|%p|MAX_UNTIL %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
1217                    ctx->ptr, ctx->count));
1218 
1219             if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
1220                 /* not enough matches */
1221                 ctx->u.rep->count = ctx->count;
1222                 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1223                         ctx->u.rep->pattern+3);
1224                 if (ret) {
1225                     RETURN_ON_ERROR(ret);
1226                     RETURN_SUCCESS;
1227                 }
1228                 ctx->u.rep->count = ctx->count-1;
1229                 state->ptr = ctx->ptr;
1230                 RETURN_FAILURE;
1231             }
1232 
1233             if ((ctx->count < (Py_ssize_t) ctx->u.rep->pattern[2] ||
1234                 ctx->u.rep->pattern[2] == SRE_MAXREPEAT) &&
1235                 state->ptr != ctx->u.rep->last_ptr) {
1236                 /* we may have enough matches, but if we can
1237                    match another item, do so */
1238                 ctx->u.rep->count = ctx->count;
1239                 LASTMARK_SAVE();
1240                 MARK_PUSH(ctx->lastmark);
1241                 /* zero-width match protection */
1242                 DATA_PUSH(&ctx->u.rep->last_ptr);
1243                 ctx->u.rep->last_ptr = state->ptr;
1244                 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1245                         ctx->u.rep->pattern+3);
1246                 DATA_POP(&ctx->u.rep->last_ptr);
1247                 if (ret) {
1248                     MARK_POP_DISCARD(ctx->lastmark);
1249                     RETURN_ON_ERROR(ret);
1250                     RETURN_SUCCESS;
1251                 }
1252                 MARK_POP(ctx->lastmark);
1253                 LASTMARK_RESTORE();
1254                 ctx->u.rep->count = ctx->count-1;
1255                 state->ptr = ctx->ptr;
1256             }
1257 
1258             /* cannot match more repeated items here.  make sure the
1259                tail matches */
1260             state->repeat = ctx->u.rep->prev;
1261             DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
1262             RETURN_ON_SUCCESS(ret);
1263             state->repeat = ctx->u.rep;
1264             state->ptr = ctx->ptr;
1265             RETURN_FAILURE;
1266 
1267         case SRE_OP_MIN_UNTIL:
1268             /* minimizing repeat */
1269             /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1270 
1271             ctx->u.rep = state->repeat;
1272             if (!ctx->u.rep)
1273                 RETURN_ERROR(SRE_ERROR_STATE);
1274 
1275             state->ptr = ctx->ptr;
1276 
1277             ctx->count = ctx->u.rep->count+1;
1278 
1279             TRACE(("|%p|%p|MIN_UNTIL %" PY_FORMAT_SIZE_T "d %p\n", ctx->pattern,
1280                    ctx->ptr, ctx->count, ctx->u.rep->pattern));
1281 
1282             if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
1283                 /* not enough matches */
1284                 ctx->u.rep->count = ctx->count;
1285                 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1286                         ctx->u.rep->pattern+3);
1287                 if (ret) {
1288                     RETURN_ON_ERROR(ret);
1289                     RETURN_SUCCESS;
1290                 }
1291                 ctx->u.rep->count = ctx->count-1;
1292                 state->ptr = ctx->ptr;
1293                 RETURN_FAILURE;
1294             }
1295 
1296             LASTMARK_SAVE();
1297 
1298             /* see if the tail matches */
1299             state->repeat = ctx->u.rep->prev;
1300             DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
1301             if (ret) {
1302                 RETURN_ON_ERROR(ret);
1303                 RETURN_SUCCESS;
1304             }
1305 
1306             state->repeat = ctx->u.rep;
1307             state->ptr = ctx->ptr;
1308 
1309             LASTMARK_RESTORE();
1310 
1311             if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2]
1312                 && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) ||
1313                 state->ptr == ctx->u.rep->last_ptr)
1314                 RETURN_FAILURE;
1315 
1316             ctx->u.rep->count = ctx->count;
1317             /* zero-width match protection */
1318             DATA_PUSH(&ctx->u.rep->last_ptr);
1319             ctx->u.rep->last_ptr = state->ptr;
1320             DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1321                     ctx->u.rep->pattern+3);
1322             DATA_POP(&ctx->u.rep->last_ptr);
1323             if (ret) {
1324                 RETURN_ON_ERROR(ret);
1325                 RETURN_SUCCESS;
1326             }
1327             ctx->u.rep->count = ctx->count-1;
1328             state->ptr = ctx->ptr;
1329             RETURN_FAILURE;
1330 
1331         case SRE_OP_GROUPREF:
1332             /* match backreference */
1333             TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1334                    ctx->ptr, ctx->pattern[0]));
1335             i = ctx->pattern[0];
1336             {
1337                 Py_ssize_t groupref = i+i;
1338                 if (groupref >= state->lastmark) {
1339                     RETURN_FAILURE;
1340                 } else {
1341                     SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1342                     SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1343                     if (!p || !e || e < p)
1344                         RETURN_FAILURE;
1345                     while (p < e) {
1346                         if (ctx->ptr >= end || *ctx->ptr != *p)
1347                             RETURN_FAILURE;
1348                         p++; ctx->ptr++;
1349                     }
1350                 }
1351             }
1352             ctx->pattern++;
1353             break;
1354 
1355         case SRE_OP_GROUPREF_IGNORE:
1356             /* match backreference */
1357             TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1358                    ctx->ptr, ctx->pattern[0]));
1359             i = ctx->pattern[0];
1360             {
1361                 Py_ssize_t groupref = i+i;
1362                 if (groupref >= state->lastmark) {
1363                     RETURN_FAILURE;
1364                 } else {
1365                     SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1366                     SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1367                     if (!p || !e || e < p)
1368                         RETURN_FAILURE;
1369                     while (p < e) {
1370                         if (ctx->ptr >= end ||
1371                             state->lower(*ctx->ptr) != state->lower(*p))
1372                             RETURN_FAILURE;
1373                         p++; ctx->ptr++;
1374                     }
1375                 }
1376             }
1377             ctx->pattern++;
1378             break;
1379 
1380         case SRE_OP_GROUPREF_EXISTS:
1381             TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1382                    ctx->ptr, ctx->pattern[0]));
1383             /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1384             i = ctx->pattern[0];
1385             {
1386                 Py_ssize_t groupref = i+i;
1387                 if (groupref >= state->lastmark) {
1388                     ctx->pattern += ctx->pattern[1];
1389                     break;
1390                 } else {
1391                     SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1392                     SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1393                     if (!p || !e || e < p) {
1394                         ctx->pattern += ctx->pattern[1];
1395                         break;
1396                     }
1397                 }
1398             }
1399             ctx->pattern += 2;
1400             break;
1401 
1402         case SRE_OP_ASSERT:
1403             /* assert subpattern */
1404             /* <ASSERT> <skip> <back> <pattern> */
1405             TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1406                    ctx->ptr, ctx->pattern[1]));
1407             if (ctx->ptr - (SRE_CHAR *)state->beginning < (Py_ssize_t)ctx->pattern[1])
1408                 RETURN_FAILURE;
1409             state->ptr = ctx->ptr - ctx->pattern[1];
1410             DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
1411             RETURN_ON_FAILURE(ret);
1412             ctx->pattern += ctx->pattern[0];
1413             break;
1414 
1415         case SRE_OP_ASSERT_NOT:
1416             /* assert not subpattern */
1417             /* <ASSERT_NOT> <skip> <back> <pattern> */
1418             TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1419                    ctx->ptr, ctx->pattern[1]));
1420             if (ctx->ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)ctx->pattern[1]) {
1421                 state->ptr = ctx->ptr - ctx->pattern[1];
1422                 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
1423                 if (ret) {
1424                     RETURN_ON_ERROR(ret);
1425                     RETURN_FAILURE;
1426                 }
1427             }
1428             ctx->pattern += ctx->pattern[0];
1429             break;
1430 
1431         case SRE_OP_FAILURE:
1432             /* immediate failure */
1433             TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1434             RETURN_FAILURE;
1435 
1436         default:
1437             TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1438                    ctx->pattern[-1]));
1439             RETURN_ERROR(SRE_ERROR_ILLEGAL);
1440         }
1441     }
1442 
1443 exit:
1444     ctx_pos = ctx->last_ctx_pos;
1445     jump = ctx->jump;
1446     DATA_POP_DISCARD(ctx);
1447     if (ctx_pos == -1)
1448         return ret;
1449     DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1450 
1451     switch (jump) {
1452         case JUMP_MAX_UNTIL_2:
1453             TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1454             goto jump_max_until_2;
1455         case JUMP_MAX_UNTIL_3:
1456             TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1457             goto jump_max_until_3;
1458         case JUMP_MIN_UNTIL_2:
1459             TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1460             goto jump_min_until_2;
1461         case JUMP_MIN_UNTIL_3:
1462             TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1463             goto jump_min_until_3;
1464         case JUMP_BRANCH:
1465             TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1466             goto jump_branch;
1467         case JUMP_MAX_UNTIL_1:
1468             TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1469             goto jump_max_until_1;
1470         case JUMP_MIN_UNTIL_1:
1471             TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1472             goto jump_min_until_1;
1473         case JUMP_REPEAT:
1474             TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1475             goto jump_repeat;
1476         case JUMP_REPEAT_ONE_1:
1477             TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1478             goto jump_repeat_one_1;
1479         case JUMP_REPEAT_ONE_2:
1480             TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1481             goto jump_repeat_one_2;
1482         case JUMP_MIN_REPEAT_ONE:
1483             TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1484             goto jump_min_repeat_one;
1485         case JUMP_ASSERT:
1486             TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1487             goto jump_assert;
1488         case JUMP_ASSERT_NOT:
1489             TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1490             goto jump_assert_not;
1491         case JUMP_NONE:
1492             TRACE(("|%p|%p|RETURN %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
1493                    ctx->ptr, ret));
1494             break;
1495     }
1496 
1497     return ret; /* should never get here */
1498 }
1499 
1500 LOCAL(Py_ssize_t)
SRE_SEARCH(SRE_STATE * state,SRE_CODE * pattern)1501 SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1502 {
1503     SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1504     SRE_CHAR* end = (SRE_CHAR *)state->end;
1505     Py_ssize_t status = 0;
1506     Py_ssize_t prefix_len = 0;
1507     Py_ssize_t prefix_skip = 0;
1508     SRE_CODE* prefix = NULL;
1509     SRE_CODE* charset = NULL;
1510     SRE_CODE* overlap = NULL;
1511     int flags = 0;
1512 
1513     if (ptr > end)
1514         return 0;
1515 
1516     if (pattern[0] == SRE_OP_INFO) {
1517         /* optimization info block */
1518         /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info>  */
1519 
1520         flags = pattern[2];
1521 
1522         if (pattern[3] && end - ptr < (Py_ssize_t)pattern[3]) {
1523             TRACE(("reject (got %u chars, need %u)\n",
1524                    (unsigned int)(end - ptr), pattern[3]));
1525             return 0;
1526         }
1527         if (pattern[3] > 1) {
1528             /* adjust end point (but make sure we leave at least one
1529                character in there, so literal search will work) */
1530             end -= pattern[3]-1;
1531             if (end <= ptr)
1532                 end = ptr+1;
1533         }
1534 
1535         if (flags & SRE_INFO_PREFIX) {
1536             /* pattern starts with a known prefix */
1537             /* <length> <skip> <prefix data> <overlap data> */
1538             prefix_len = pattern[5];
1539             prefix_skip = pattern[6];
1540             prefix = pattern + 7;
1541             overlap = prefix + prefix_len - 1;
1542         } else if (flags & SRE_INFO_CHARSET)
1543             /* pattern starts with a character from a known set */
1544             /* <charset> */
1545             charset = pattern + 5;
1546 
1547         pattern += 1 + pattern[1];
1548     }
1549 
1550     TRACE(("prefix = %p %" PY_FORMAT_SIZE_T "d %" PY_FORMAT_SIZE_T "d\n",
1551            prefix, prefix_len, prefix_skip));
1552     TRACE(("charset = %p\n", charset));
1553 
1554 #if defined(USE_FAST_SEARCH)
1555     if (prefix_len > 1) {
1556         /* pattern starts with a known prefix.  use the overlap
1557            table to skip forward as fast as we possibly can */
1558         Py_ssize_t i = 0;
1559         end = (SRE_CHAR *)state->end;
1560         while (ptr < end) {
1561             for (;;) {
1562                 if ((SRE_CODE) ptr[0] != prefix[i]) {
1563                     if (!i)
1564                         break;
1565                     else
1566                         i = overlap[i];
1567                 } else {
1568                     if (++i == prefix_len) {
1569                         /* found a potential match */
1570                         TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1571                         state->start = ptr + 1 - prefix_len;
1572                         state->ptr = ptr + 1 - prefix_len + prefix_skip;
1573                         if (flags & SRE_INFO_LITERAL)
1574                             return 1; /* we got all of it */
1575                         status = SRE_MATCH(state, pattern + 2*prefix_skip);
1576                         if (status != 0)
1577                             return status;
1578                         /* close but no cigar -- try again */
1579                         i = overlap[i];
1580                     }
1581                     break;
1582                 }
1583             }
1584             ptr++;
1585         }
1586         return 0;
1587     }
1588 #endif
1589 
1590     if (pattern[0] == SRE_OP_LITERAL) {
1591         /* pattern starts with a literal character.  this is used
1592            for short prefixes, and if fast search is disabled */
1593         SRE_CODE chr = pattern[1];
1594         end = (SRE_CHAR *)state->end;
1595         for (;;) {
1596             while (ptr < end && (SRE_CODE) ptr[0] != chr)
1597                 ptr++;
1598             if (ptr >= end)
1599                 return 0;
1600             TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1601             state->start = ptr;
1602             state->ptr = ++ptr;
1603             if (flags & SRE_INFO_LITERAL)
1604                 return 1; /* we got all of it */
1605             status = SRE_MATCH(state, pattern + 2);
1606             if (status != 0)
1607                 break;
1608         }
1609     } else if (charset) {
1610         /* pattern starts with a character from a known set */
1611         end = (SRE_CHAR *)state->end;
1612         for (;;) {
1613             while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
1614                 ptr++;
1615             if (ptr >= end)
1616                 return 0;
1617             TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1618             state->start = ptr;
1619             state->ptr = ptr;
1620             status = SRE_MATCH(state, pattern);
1621             if (status != 0)
1622                 break;
1623             ptr++;
1624         }
1625     } else {
1626         /* general case */
1627         assert(ptr <= end);
1628         while (1) {
1629             TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1630             state->start = state->ptr = ptr;
1631             status = SRE_MATCH(state, pattern);
1632             if (status != 0 || ptr >= end)
1633                 break;
1634             ptr++;
1635         }
1636     }
1637 
1638     return status;
1639 }
1640 
1641 LOCAL(int)
SRE_LITERAL_TEMPLATE(SRE_CHAR * ptr,Py_ssize_t len)1642 SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
1643 {
1644     /* check if given string is a literal template (i.e. no escapes) */
1645     while (len-- > 0)
1646         if (*ptr++ == '\\')
1647             return 0;
1648     return 1;
1649 }
1650 
1651 #if !defined(SRE_RECURSIVE)
1652 
1653 /* -------------------------------------------------------------------- */
1654 /* factories and destructors */
1655 
1656 /* see sre.h for object declarations */
1657 static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1658 static PyObject*pattern_scanner(PatternObject*, PyObject*);
1659 
1660 static PyObject *
sre_codesize(PyObject * self,PyObject * unused)1661 sre_codesize(PyObject* self, PyObject *unused)
1662 {
1663     return PyInt_FromSize_t(sizeof(SRE_CODE));
1664 }
1665 
1666 static PyObject *
sre_getlower(PyObject * self,PyObject * args)1667 sre_getlower(PyObject* self, PyObject* args)
1668 {
1669     int character, flags;
1670     if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1671         return NULL;
1672     if (flags & SRE_FLAG_LOCALE)
1673         return Py_BuildValue("i", sre_lower_locale(character));
1674     if (flags & SRE_FLAG_UNICODE)
1675 #if defined(HAVE_UNICODE)
1676         return Py_BuildValue("i", sre_lower_unicode(character));
1677 #else
1678         return Py_BuildValue("i", sre_lower_locale(character));
1679 #endif
1680     return Py_BuildValue("i", sre_lower(character));
1681 }
1682 
1683 LOCAL(void)
state_reset(SRE_STATE * state)1684 state_reset(SRE_STATE* state)
1685 {
1686     /* FIXME: dynamic! */
1687     /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1688 
1689     state->lastmark = -1;
1690     state->lastindex = -1;
1691 
1692     state->repeat = NULL;
1693 
1694     data_stack_dealloc(state);
1695 }
1696 
1697 static void*
getstring(PyObject * string,Py_ssize_t * p_length,int * p_charsize)1698 getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
1699 {
1700     /* given a python object, return a data pointer, a length (in
1701        characters), and a character size.  return NULL if the object
1702        is not a string (or not compatible) */
1703 
1704     PyBufferProcs *buffer;
1705     Py_ssize_t size, bytes;
1706     int charsize;
1707     void* ptr;
1708 
1709 #if defined(HAVE_UNICODE)
1710     if (PyUnicode_Check(string)) {
1711         /* unicode strings doesn't always support the buffer interface */
1712         ptr = (void*) PyUnicode_AS_DATA(string);
1713         /* bytes = PyUnicode_GET_DATA_SIZE(string); */
1714         size = PyUnicode_GET_SIZE(string);
1715         charsize = sizeof(Py_UNICODE);
1716 
1717     } else {
1718 #endif
1719 
1720     /* get pointer to string buffer */
1721     buffer = Py_TYPE(string)->tp_as_buffer;
1722     if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1723         buffer->bf_getsegcount(string, NULL) != 1) {
1724         PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1725         return NULL;
1726     }
1727 
1728     /* determine buffer size */
1729     bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1730     if (bytes < 0) {
1731         PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1732         return NULL;
1733     }
1734 
1735     /* determine character size */
1736 #if PY_VERSION_HEX >= 0x01060000
1737     size = PyObject_Size(string);
1738 #else
1739     size = PyObject_Length(string);
1740 #endif
1741 
1742     if (PyString_Check(string) || bytes == size)
1743         charsize = 1;
1744 #if defined(HAVE_UNICODE)
1745     else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
1746         charsize = sizeof(Py_UNICODE);
1747 #endif
1748     else {
1749         PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1750         return NULL;
1751     }
1752 
1753 #if defined(HAVE_UNICODE)
1754     }
1755 #endif
1756 
1757     *p_length = size;
1758     *p_charsize = charsize;
1759 
1760     return ptr;
1761 }
1762 
1763 LOCAL(PyObject*)
state_init(SRE_STATE * state,PatternObject * pattern,PyObject * string,Py_ssize_t start,Py_ssize_t end)1764 state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1765            Py_ssize_t start, Py_ssize_t end)
1766 {
1767     /* prepare state object */
1768 
1769     Py_ssize_t length;
1770     int charsize;
1771     void* ptr;
1772 
1773     memset(state, 0, sizeof(SRE_STATE));
1774 
1775     state->lastmark = -1;
1776     state->lastindex = -1;
1777 
1778     ptr = getstring(string, &length, &charsize);
1779     if (!ptr)
1780         return NULL;
1781 
1782     /* adjust boundaries */
1783     if (start < 0)
1784         start = 0;
1785     else if (start > length)
1786         start = length;
1787 
1788     if (end < 0)
1789         end = 0;
1790     else if (end > length)
1791         end = length;
1792 
1793     state->charsize = charsize;
1794 
1795     state->beginning = ptr;
1796 
1797     state->start = (void*) ((char*) ptr + start * state->charsize);
1798     state->end = (void*) ((char*) ptr + end * state->charsize);
1799 
1800     Py_INCREF(string);
1801     state->string = string;
1802     state->pos = start;
1803     state->endpos = end;
1804 
1805     if (pattern->flags & SRE_FLAG_LOCALE)
1806         state->lower = sre_lower_locale;
1807     else if (pattern->flags & SRE_FLAG_UNICODE)
1808 #if defined(HAVE_UNICODE)
1809         state->lower = sre_lower_unicode;
1810 #else
1811         state->lower = sre_lower_locale;
1812 #endif
1813     else
1814         state->lower = sre_lower;
1815 
1816     return string;
1817 }
1818 
1819 LOCAL(void)
state_fini(SRE_STATE * state)1820 state_fini(SRE_STATE* state)
1821 {
1822     Py_XDECREF(state->string);
1823     data_stack_dealloc(state);
1824 }
1825 
1826 /* calculate offset from start of string */
1827 #define STATE_OFFSET(state, member)\
1828     (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1829 
1830 LOCAL(PyObject*)
state_getslice(SRE_STATE * state,Py_ssize_t index,PyObject * string,int empty)1831 state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
1832 {
1833     Py_ssize_t i, j;
1834 
1835     index = (index - 1) * 2;
1836 
1837     if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
1838         if (empty)
1839             /* want empty string */
1840             i = j = 0;
1841         else {
1842             Py_INCREF(Py_None);
1843             return Py_None;
1844         }
1845     } else {
1846         i = STATE_OFFSET(state, state->mark[index]);
1847         j = STATE_OFFSET(state, state->mark[index+1]);
1848     }
1849 
1850     return PySequence_GetSlice(string, i, j);
1851 }
1852 
1853 static void
pattern_error(int status)1854 pattern_error(int status)
1855 {
1856     switch (status) {
1857     case SRE_ERROR_RECURSION_LIMIT:
1858         PyErr_SetString(
1859             PyExc_RuntimeError,
1860             "maximum recursion limit exceeded"
1861             );
1862         break;
1863     case SRE_ERROR_MEMORY:
1864         PyErr_NoMemory();
1865         break;
1866     case SRE_ERROR_INTERRUPTED:
1867     /* An exception has already been raised, so let it fly */
1868         break;
1869     default:
1870         /* other error codes indicate compiler/engine bugs */
1871         PyErr_SetString(
1872             PyExc_RuntimeError,
1873             "internal error in regular expression engine"
1874             );
1875     }
1876 }
1877 
1878 static void
pattern_dealloc(PatternObject * self)1879 pattern_dealloc(PatternObject* self)
1880 {
1881     if (self->weakreflist != NULL)
1882         PyObject_ClearWeakRefs((PyObject *) self);
1883     Py_XDECREF(self->pattern);
1884     Py_XDECREF(self->groupindex);
1885     Py_XDECREF(self->indexgroup);
1886     PyObject_DEL(self);
1887 }
1888 
1889 static int
check_args_size(const char * name,PyObject * args,PyObject * kw,int n)1890 check_args_size(const char *name, PyObject* args, PyObject* kw, int n)
1891 {
1892     Py_ssize_t m = PyTuple_GET_SIZE(args) + (kw ? PyDict_Size(kw) : 0);
1893     if (m <= n)
1894         return 1;
1895     PyErr_Format(PyExc_TypeError,
1896                  "%s() takes at most %d positional arguments (%zd given)",
1897                  name, n, m);
1898     return 0;
1899 }
1900 
1901 static PyObject*
fix_string_param(PyObject * string,PyObject * string2,const char * oldname)1902 fix_string_param(PyObject *string, PyObject *string2, const char *oldname)
1903 {
1904     if (string2 != NULL) {
1905         char buf[100];
1906         if (string != NULL) {
1907             PyErr_Format(PyExc_TypeError,
1908                          "Argument given by name ('%s') and position (1)",
1909                          oldname);
1910             return NULL;
1911         }
1912         sprintf(buf, "The '%s' keyword parameter name is deprecated.  "
1913                      "Use 'string' instead.", oldname);
1914         if (PyErr_Warn(PyExc_DeprecationWarning, buf) < 0)
1915             return NULL;
1916         return string2;
1917     }
1918     if (string == NULL) {
1919         PyErr_SetString(PyExc_TypeError,
1920                         "Required argument 'string' (pos 1) not found");
1921         return NULL;
1922     }
1923     return string;
1924 }
1925 
1926 static PyObject*
pattern_match(PatternObject * self,PyObject * args,PyObject * kw)1927 pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
1928 {
1929     SRE_STATE state;
1930     int status;
1931 
1932     PyObject *string = NULL, *string2 = NULL;
1933     Py_ssize_t start = 0;
1934     Py_ssize_t end = PY_SSIZE_T_MAX;
1935     static char* kwlist[] = { "string", "pos", "endpos", "pattern", NULL };
1936     if (!check_args_size("match", args, kw, 3))
1937         return NULL;
1938 
1939     if (!PyArg_ParseTupleAndKeywords(args, kw, "|OnnO:match", kwlist,
1940                                      &string, &start, &end, &string2))
1941         return NULL;
1942 
1943     string = fix_string_param(string, string2, "pattern");
1944     if (!string)
1945         return NULL;
1946 
1947     string = state_init(&state, self, string, start, end);
1948     if (!string)
1949         return NULL;
1950 
1951     state.ptr = state.start;
1952 
1953     TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1954 
1955     if (state.charsize == 1) {
1956         status = sre_match(&state, PatternObject_GetCode(self));
1957     } else {
1958 #if defined(HAVE_UNICODE)
1959         status = sre_umatch(&state, PatternObject_GetCode(self));
1960 #endif
1961     }
1962 
1963     TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1964     if (PyErr_Occurred())
1965         return NULL;
1966 
1967     state_fini(&state);
1968 
1969     return pattern_new_match(self, &state, status);
1970 }
1971 
1972 static PyObject*
pattern_search(PatternObject * self,PyObject * args,PyObject * kw)1973 pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
1974 {
1975     SRE_STATE state;
1976     int status;
1977 
1978     PyObject *string = NULL, *string2 = NULL;
1979     Py_ssize_t start = 0;
1980     Py_ssize_t end = PY_SSIZE_T_MAX;
1981     static char* kwlist[] = { "string", "pos", "endpos", "pattern", NULL };
1982     if (!check_args_size("search", args, kw, 3))
1983         return NULL;
1984 
1985     if (!PyArg_ParseTupleAndKeywords(args, kw, "|OnnO:search", kwlist,
1986                                      &string, &start, &end, &string2))
1987         return NULL;
1988 
1989     string = fix_string_param(string, string2, "pattern");
1990     if (!string)
1991         return NULL;
1992 
1993     string = state_init(&state, self, string, start, end);
1994     if (!string)
1995         return NULL;
1996 
1997     TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1998 
1999     if (state.charsize == 1) {
2000         status = sre_search(&state, PatternObject_GetCode(self));
2001     } else {
2002 #if defined(HAVE_UNICODE)
2003         status = sre_usearch(&state, PatternObject_GetCode(self));
2004 #endif
2005     }
2006 
2007     TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
2008 
2009     state_fini(&state);
2010 
2011     if (PyErr_Occurred())
2012         return NULL;
2013 
2014     return pattern_new_match(self, &state, status);
2015 }
2016 
2017 static PyObject*
call(char * module,char * function,PyObject * args)2018 call(char* module, char* function, PyObject* args)
2019 {
2020     PyObject* name;
2021     PyObject* mod;
2022     PyObject* func;
2023     PyObject* result;
2024 
2025     if (!args)
2026         return NULL;
2027     name = PyString_FromString(module);
2028     if (!name)
2029         return NULL;
2030     mod = PyImport_Import(name);
2031     Py_DECREF(name);
2032     if (!mod)
2033         return NULL;
2034     func = PyObject_GetAttrString(mod, function);
2035     Py_DECREF(mod);
2036     if (!func)
2037         return NULL;
2038     result = PyObject_CallObject(func, args);
2039     Py_DECREF(func);
2040     Py_DECREF(args);
2041     return result;
2042 }
2043 
2044 #ifdef USE_BUILTIN_COPY
2045 static int
deepcopy(PyObject ** object,PyObject * memo)2046 deepcopy(PyObject** object, PyObject* memo)
2047 {
2048     PyObject* copy;
2049 
2050     copy = call(
2051         "copy", "deepcopy",
2052         PyTuple_Pack(2, *object, memo)
2053         );
2054     if (!copy)
2055         return 0;
2056 
2057     Py_SETREF(*object, copy);
2058 
2059     return 1; /* success */
2060 }
2061 #endif
2062 
2063 static PyObject*
join_list(PyObject * list,PyObject * string)2064 join_list(PyObject* list, PyObject* string)
2065 {
2066     /* join list elements */
2067 
2068     PyObject* joiner;
2069 #if PY_VERSION_HEX >= 0x01060000
2070     PyObject* function;
2071     PyObject* args;
2072 #endif
2073     PyObject* result;
2074 
2075     joiner = PySequence_GetSlice(string, 0, 0);
2076     if (!joiner)
2077         return NULL;
2078 
2079     if (PyList_GET_SIZE(list) == 0) {
2080         Py_DECREF(list);
2081         return joiner;
2082     }
2083 
2084 #if PY_VERSION_HEX >= 0x01060000
2085     function = PyObject_GetAttrString(joiner, "join");
2086     if (!function) {
2087         Py_DECREF(joiner);
2088         return NULL;
2089     }
2090     args = PyTuple_New(1);
2091     if (!args) {
2092         Py_DECREF(function);
2093         Py_DECREF(joiner);
2094         return NULL;
2095     }
2096     PyTuple_SET_ITEM(args, 0, list);
2097     result = PyObject_CallObject(function, args);
2098     Py_DECREF(args); /* also removes list */
2099     Py_DECREF(function);
2100 #else
2101     result = call(
2102         "string", "join",
2103         PyTuple_Pack(2, list, joiner)
2104         );
2105 #endif
2106     Py_DECREF(joiner);
2107 
2108     return result;
2109 }
2110 
2111 static PyObject*
pattern_findall(PatternObject * self,PyObject * args,PyObject * kw)2112 pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
2113 {
2114     SRE_STATE state;
2115     PyObject* list;
2116     int status;
2117     Py_ssize_t i, b, e;
2118 
2119     PyObject *string = NULL, *string2 = NULL;
2120     Py_ssize_t start = 0;
2121     Py_ssize_t end = PY_SSIZE_T_MAX;
2122     static char* kwlist[] = { "string", "pos", "endpos", "source", NULL };
2123     if (!check_args_size("findall", args, kw, 3))
2124         return NULL;
2125 
2126     if (!PyArg_ParseTupleAndKeywords(args, kw, "|OnnO:findall", kwlist,
2127                                      &string, &start, &end, &string2))
2128         return NULL;
2129 
2130     string = fix_string_param(string, string2, "source");
2131     if (!string)
2132         return NULL;
2133 
2134     string = state_init(&state, self, string, start, end);
2135     if (!string)
2136         return NULL;
2137 
2138     list = PyList_New(0);
2139     if (!list) {
2140         state_fini(&state);
2141         return NULL;
2142     }
2143 
2144     while (state.start <= state.end) {
2145 
2146         PyObject* item;
2147 
2148         state_reset(&state);
2149 
2150         state.ptr = state.start;
2151 
2152         if (state.charsize == 1) {
2153             status = sre_search(&state, PatternObject_GetCode(self));
2154         } else {
2155 #if defined(HAVE_UNICODE)
2156             status = sre_usearch(&state, PatternObject_GetCode(self));
2157 #endif
2158         }
2159 
2160 	if (PyErr_Occurred())
2161 	    goto error;
2162 
2163         if (status <= 0) {
2164             if (status == 0)
2165                 break;
2166             pattern_error(status);
2167             goto error;
2168         }
2169 
2170         /* don't bother to build a match object */
2171         switch (self->groups) {
2172         case 0:
2173             b = STATE_OFFSET(&state, state.start);
2174             e = STATE_OFFSET(&state, state.ptr);
2175             item = PySequence_GetSlice(string, b, e);
2176             if (!item)
2177                 goto error;
2178             break;
2179         case 1:
2180             item = state_getslice(&state, 1, string, 1);
2181             if (!item)
2182                 goto error;
2183             break;
2184         default:
2185             item = PyTuple_New(self->groups);
2186             if (!item)
2187                 goto error;
2188             for (i = 0; i < self->groups; i++) {
2189                 PyObject* o = state_getslice(&state, i+1, string, 1);
2190                 if (!o) {
2191                     Py_DECREF(item);
2192                     goto error;
2193                 }
2194                 PyTuple_SET_ITEM(item, i, o);
2195             }
2196             break;
2197         }
2198 
2199         status = PyList_Append(list, item);
2200         Py_DECREF(item);
2201         if (status < 0)
2202             goto error;
2203 
2204         if (state.ptr == state.start)
2205             state.start = (void*) ((char*) state.ptr + state.charsize);
2206         else
2207             state.start = state.ptr;
2208 
2209     }
2210 
2211     state_fini(&state);
2212     return list;
2213 
2214 error:
2215     Py_DECREF(list);
2216     state_fini(&state);
2217     return NULL;
2218 
2219 }
2220 
2221 #if PY_VERSION_HEX >= 0x02020000
2222 static PyObject*
pattern_finditer(PatternObject * pattern,PyObject * args)2223 pattern_finditer(PatternObject* pattern, PyObject* args)
2224 {
2225     PyObject* scanner;
2226     PyObject* search;
2227     PyObject* iterator;
2228 
2229     scanner = pattern_scanner(pattern, args);
2230     if (!scanner)
2231         return NULL;
2232 
2233     search = PyObject_GetAttrString(scanner, "search");
2234     Py_DECREF(scanner);
2235     if (!search)
2236         return NULL;
2237 
2238     iterator = PyCallIter_New(search, Py_None);
2239     Py_DECREF(search);
2240 
2241     return iterator;
2242 }
2243 #endif
2244 
2245 static PyObject*
pattern_split(PatternObject * self,PyObject * args,PyObject * kw)2246 pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2247 {
2248     SRE_STATE state;
2249     PyObject* list;
2250     PyObject* item;
2251     int status;
2252     Py_ssize_t n;
2253     Py_ssize_t i;
2254     void* last;
2255 
2256     PyObject *string = NULL, *string2 = NULL;
2257     Py_ssize_t maxsplit = 0;
2258     static char* kwlist[] = { "string", "maxsplit", "source", NULL };
2259     if (!check_args_size("split", args, kw, 2))
2260         return NULL;
2261 
2262     if (!PyArg_ParseTupleAndKeywords(args, kw, "|OnO:split", kwlist,
2263                                      &string, &maxsplit, &string2))
2264         return NULL;
2265 
2266     string = fix_string_param(string, string2, "source");
2267     if (!string)
2268         return NULL;
2269 
2270     if (Py_Py3kWarningFlag &&
2271         (self->code[0] != SRE_OP_INFO || self->code[3] == 0))
2272     {
2273         if (self->code[0] == SRE_OP_INFO && self->code[4] == 0) {
2274             if (PyErr_WarnPy3k("split() requires a non-empty pattern match.",
2275                                1) < 0)
2276                 return NULL;
2277         }
2278         else if (PyErr_WarnEx(PyExc_FutureWarning,
2279                               "split() requires a non-empty pattern match.",
2280                               1) < 0)
2281             return NULL;
2282     }
2283 
2284     string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2285     if (!string)
2286         return NULL;
2287 
2288     list = PyList_New(0);
2289     if (!list) {
2290         state_fini(&state);
2291         return NULL;
2292     }
2293 
2294     n = 0;
2295     last = state.start;
2296 
2297     while (!maxsplit || n < maxsplit) {
2298 
2299         state_reset(&state);
2300 
2301         state.ptr = state.start;
2302 
2303         if (state.charsize == 1) {
2304             status = sre_search(&state, PatternObject_GetCode(self));
2305         } else {
2306 #if defined(HAVE_UNICODE)
2307             status = sre_usearch(&state, PatternObject_GetCode(self));
2308 #endif
2309         }
2310 
2311 	if (PyErr_Occurred())
2312 	    goto error;
2313 
2314         if (status <= 0) {
2315             if (status == 0)
2316                 break;
2317             pattern_error(status);
2318             goto error;
2319         }
2320 
2321         if (state.start == state.ptr) {
2322             if (last == state.end || state.ptr == state.end)
2323                 break;
2324             /* skip one character */
2325             state.start = (void*) ((char*) state.ptr + state.charsize);
2326             continue;
2327         }
2328 
2329         /* get segment before this match */
2330         item = PySequence_GetSlice(
2331             string, STATE_OFFSET(&state, last),
2332             STATE_OFFSET(&state, state.start)
2333             );
2334         if (!item)
2335             goto error;
2336         status = PyList_Append(list, item);
2337         Py_DECREF(item);
2338         if (status < 0)
2339             goto error;
2340 
2341         /* add groups (if any) */
2342         for (i = 0; i < self->groups; i++) {
2343             item = state_getslice(&state, i+1, string, 0);
2344             if (!item)
2345                 goto error;
2346             status = PyList_Append(list, item);
2347             Py_DECREF(item);
2348             if (status < 0)
2349                 goto error;
2350         }
2351 
2352         n = n + 1;
2353 
2354         last = state.start = state.ptr;
2355 
2356     }
2357 
2358     /* get segment following last match (even if empty) */
2359     item = PySequence_GetSlice(
2360         string, STATE_OFFSET(&state, last), state.endpos
2361         );
2362     if (!item)
2363         goto error;
2364     status = PyList_Append(list, item);
2365     Py_DECREF(item);
2366     if (status < 0)
2367         goto error;
2368 
2369     state_fini(&state);
2370     return list;
2371 
2372 error:
2373     Py_DECREF(list);
2374     state_fini(&state);
2375     return NULL;
2376 
2377 }
2378 
2379 static PyObject*
pattern_subx(PatternObject * self,PyObject * ptemplate,PyObject * string,Py_ssize_t count,Py_ssize_t subn)2380 pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
2381              Py_ssize_t count, Py_ssize_t subn)
2382 {
2383     SRE_STATE state;
2384     PyObject* list;
2385     PyObject* item;
2386     PyObject* filter;
2387     PyObject* args;
2388     PyObject* match;
2389     void* ptr;
2390     int status;
2391     Py_ssize_t n;
2392     Py_ssize_t i, b, e;
2393     int bint;
2394     int filter_is_callable;
2395 
2396     if (PyCallable_Check(ptemplate)) {
2397         /* sub/subn takes either a function or a template */
2398         filter = ptemplate;
2399         Py_INCREF(filter);
2400         filter_is_callable = 1;
2401     } else {
2402         /* if not callable, check if it's a literal string */
2403         int literal;
2404         ptr = getstring(ptemplate, &n, &bint);
2405         b = bint;
2406         if (ptr) {
2407             if (b == 1) {
2408 		    literal = sre_literal_template((unsigned char *)ptr, n);
2409             } else {
2410 #if defined(HAVE_UNICODE)
2411 		    literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
2412 #endif
2413             }
2414         } else {
2415             PyErr_Clear();
2416             literal = 0;
2417         }
2418         if (literal) {
2419             filter = ptemplate;
2420             Py_INCREF(filter);
2421             filter_is_callable = 0;
2422         } else {
2423             /* not a literal; hand it over to the template compiler */
2424             filter = call(
2425                 SRE_PY_MODULE, "_subx",
2426                 PyTuple_Pack(2, self, ptemplate)
2427                 );
2428             if (!filter)
2429                 return NULL;
2430             filter_is_callable = PyCallable_Check(filter);
2431         }
2432     }
2433 
2434     string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2435     if (!string) {
2436         Py_DECREF(filter);
2437         return NULL;
2438     }
2439 
2440     list = PyList_New(0);
2441     if (!list) {
2442         Py_DECREF(filter);
2443         state_fini(&state);
2444         return NULL;
2445     }
2446 
2447     n = i = 0;
2448 
2449     while (!count || n < count) {
2450 
2451         state_reset(&state);
2452 
2453         state.ptr = state.start;
2454 
2455         if (state.charsize == 1) {
2456             status = sre_search(&state, PatternObject_GetCode(self));
2457         } else {
2458 #if defined(HAVE_UNICODE)
2459             status = sre_usearch(&state, PatternObject_GetCode(self));
2460 #endif
2461         }
2462 
2463 	if (PyErr_Occurred())
2464 	    goto error;
2465 
2466         if (status <= 0) {
2467             if (status == 0)
2468                 break;
2469             pattern_error(status);
2470             goto error;
2471         }
2472 
2473         b = STATE_OFFSET(&state, state.start);
2474         e = STATE_OFFSET(&state, state.ptr);
2475 
2476         if (i < b) {
2477             /* get segment before this match */
2478             item = PySequence_GetSlice(string, i, b);
2479             if (!item)
2480                 goto error;
2481             status = PyList_Append(list, item);
2482             Py_DECREF(item);
2483             if (status < 0)
2484                 goto error;
2485 
2486         } else if (i == b && i == e && n > 0)
2487             /* ignore empty match on latest position */
2488             goto next;
2489 
2490         if (filter_is_callable) {
2491             /* pass match object through filter */
2492             match = pattern_new_match(self, &state, 1);
2493             if (!match)
2494                 goto error;
2495             args = PyTuple_Pack(1, match);
2496             if (!args) {
2497                 Py_DECREF(match);
2498                 goto error;
2499             }
2500             item = PyObject_CallObject(filter, args);
2501             Py_DECREF(args);
2502             Py_DECREF(match);
2503             if (!item)
2504                 goto error;
2505         } else {
2506             /* filter is literal string */
2507             item = filter;
2508             Py_INCREF(item);
2509         }
2510 
2511         /* add to list */
2512         if (item != Py_None) {
2513             status = PyList_Append(list, item);
2514             Py_DECREF(item);
2515             if (status < 0)
2516                 goto error;
2517         }
2518 
2519         i = e;
2520         n = n + 1;
2521 
2522 next:
2523         /* move on */
2524         if (state.ptr == state.end)
2525             break;
2526         if (state.ptr == state.start)
2527             state.start = (void*) ((char*) state.ptr + state.charsize);
2528         else
2529             state.start = state.ptr;
2530 
2531     }
2532 
2533     /* get segment following last match */
2534     if (i < state.endpos) {
2535         item = PySequence_GetSlice(string, i, state.endpos);
2536         if (!item)
2537             goto error;
2538         status = PyList_Append(list, item);
2539         Py_DECREF(item);
2540         if (status < 0)
2541             goto error;
2542     }
2543 
2544     state_fini(&state);
2545 
2546     Py_DECREF(filter);
2547 
2548     /* convert list to single string (also removes list) */
2549     item = join_list(list, string);
2550 
2551     if (!item)
2552         return NULL;
2553 
2554     if (subn)
2555         return Py_BuildValue("Nn", item, n);
2556 
2557     return item;
2558 
2559 error:
2560     Py_DECREF(list);
2561     state_fini(&state);
2562     Py_DECREF(filter);
2563     return NULL;
2564 
2565 }
2566 
2567 static PyObject*
pattern_sub(PatternObject * self,PyObject * args,PyObject * kw)2568 pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2569 {
2570     PyObject* ptemplate;
2571     PyObject* string;
2572     Py_ssize_t count = 0;
2573     static char* kwlist[] = { "repl", "string", "count", NULL };
2574     if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
2575                                      &ptemplate, &string, &count))
2576         return NULL;
2577 
2578     return pattern_subx(self, ptemplate, string, count, 0);
2579 }
2580 
2581 static PyObject*
pattern_subn(PatternObject * self,PyObject * args,PyObject * kw)2582 pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2583 {
2584     PyObject* ptemplate;
2585     PyObject* string;
2586     Py_ssize_t count = 0;
2587     static char* kwlist[] = { "repl", "string", "count", NULL };
2588     if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
2589                                      &ptemplate, &string, &count))
2590         return NULL;
2591 
2592     return pattern_subx(self, ptemplate, string, count, 1);
2593 }
2594 
2595 static PyObject*
pattern_copy(PatternObject * self,PyObject * unused)2596 pattern_copy(PatternObject* self, PyObject *unused)
2597 {
2598 #ifdef USE_BUILTIN_COPY
2599     PatternObject* copy;
2600     int offset;
2601 
2602     copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2603     if (!copy)
2604         return NULL;
2605 
2606     offset = offsetof(PatternObject, groups);
2607 
2608     Py_XINCREF(self->groupindex);
2609     Py_XINCREF(self->indexgroup);
2610     Py_XINCREF(self->pattern);
2611 
2612     memcpy((char*) copy + offset, (char*) self + offset,
2613            sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2614     copy->weakreflist = NULL;
2615 
2616     return (PyObject*) copy;
2617 #else
2618     PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2619     return NULL;
2620 #endif
2621 }
2622 
2623 static PyObject*
pattern_deepcopy(PatternObject * self,PyObject * memo)2624 pattern_deepcopy(PatternObject* self, PyObject* memo)
2625 {
2626 #ifdef USE_BUILTIN_COPY
2627     PatternObject* copy;
2628 
2629     copy = (PatternObject*) pattern_copy(self);
2630     if (!copy)
2631         return NULL;
2632 
2633     if (!deepcopy(&copy->groupindex, memo) ||
2634         !deepcopy(&copy->indexgroup, memo) ||
2635         !deepcopy(&copy->pattern, memo)) {
2636         Py_DECREF(copy);
2637         return NULL;
2638     }
2639 
2640 #else
2641     PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2642     return NULL;
2643 #endif
2644 }
2645 
2646 PyDoc_STRVAR(pattern_match_doc,
2647 "match(string[, pos[, endpos]]) --> match object or None.\n\
2648     Matches zero or more characters at the beginning of the string");
2649 
2650 PyDoc_STRVAR(pattern_search_doc,
2651 "search(string[, pos[, endpos]]) --> match object or None.\n\
2652     Scan through string looking for a match, and return a corresponding\n\
2653     match object instance. Return None if no position in the string matches.");
2654 
2655 PyDoc_STRVAR(pattern_split_doc,
2656 "split(string[, maxsplit = 0])  --> list.\n\
2657     Split string by the occurrences of pattern.");
2658 
2659 PyDoc_STRVAR(pattern_findall_doc,
2660 "findall(string[, pos[, endpos]]) --> list.\n\
2661    Return a list of all non-overlapping matches of pattern in string.");
2662 
2663 PyDoc_STRVAR(pattern_finditer_doc,
2664 "finditer(string[, pos[, endpos]]) --> iterator.\n\
2665     Return an iterator over all non-overlapping matches for the \n\
2666     RE pattern in string. For each match, the iterator returns a\n\
2667     match object.");
2668 
2669 PyDoc_STRVAR(pattern_sub_doc,
2670 "sub(repl, string[, count = 0]) --> newstring\n\
2671     Return the string obtained by replacing the leftmost non-overlapping\n\
2672     occurrences of pattern in string by the replacement repl.");
2673 
2674 PyDoc_STRVAR(pattern_subn_doc,
2675 "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2676     Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2677     the leftmost non-overlapping occurrences of pattern with the\n\
2678     replacement repl.");
2679 
2680 PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2681 
2682 static PyMethodDef pattern_methods[] = {
2683     {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
2684 	pattern_match_doc},
2685     {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
2686 	pattern_search_doc},
2687     {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2688 	pattern_sub_doc},
2689     {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2690 	pattern_subn_doc},
2691     {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
2692 	pattern_split_doc},
2693     {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
2694 	pattern_findall_doc},
2695 #if PY_VERSION_HEX >= 0x02020000
2696     {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2697 	pattern_finditer_doc},
2698 #endif
2699     {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
2700     {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2701     {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
2702     {NULL, NULL}
2703 };
2704 
2705 #define PAT_OFF(x) offsetof(PatternObject, x)
2706 static PyMemberDef pattern_members[] = {
2707     {"pattern",    T_OBJECT,    PAT_OFF(pattern),       READONLY},
2708     {"flags",      T_INT,       PAT_OFF(flags),         READONLY},
2709     {"groups",     T_PYSSIZET,  PAT_OFF(groups),        READONLY},
2710     {"groupindex", T_OBJECT,    PAT_OFF(groupindex),    READONLY},
2711     {NULL}  /* Sentinel */
2712 };
2713 
2714 statichere PyTypeObject Pattern_Type = {
2715     PyVarObject_HEAD_INIT(NULL, 0)
2716     "_" SRE_MODULE ".SRE_Pattern",
2717     sizeof(PatternObject), sizeof(SRE_CODE),
2718     (destructor)pattern_dealloc, /*tp_dealloc*/
2719     0,                                  /* tp_print */
2720     0,                                  /* tp_getattrn */
2721     0,					/* tp_setattr */
2722     0,					/* tp_compare */
2723     0,					/* tp_repr */
2724     0,					/* tp_as_number */
2725     0,					/* tp_as_sequence */
2726     0,					/* tp_as_mapping */
2727     0,					/* tp_hash */
2728     0,					/* tp_call */
2729     0,					/* tp_str */
2730     0,					/* tp_getattro */
2731     0,					/* tp_setattro */
2732     0,					/* tp_as_buffer */
2733     Py_TPFLAGS_DEFAULT,		        /* tp_flags */
2734     pattern_doc,			/* tp_doc */
2735     0,					/* tp_traverse */
2736     0,					/* tp_clear */
2737     0,					/* tp_richcompare */
2738     offsetof(PatternObject, weakreflist),	/* tp_weaklistoffset */
2739     0,					/* tp_iter */
2740     0,					/* tp_iternext */
2741     pattern_methods,			/* tp_methods */
2742     pattern_members,			/* tp_members */
2743 };
2744 
2745 static int _validate(PatternObject *self); /* Forward */
2746 
2747 static PyObject *
_compile(PyObject * self_,PyObject * args)2748 _compile(PyObject* self_, PyObject* args)
2749 {
2750     /* "compile" pattern descriptor to pattern object */
2751 
2752     PatternObject* self;
2753     Py_ssize_t i, n;
2754 
2755     PyObject* pattern;
2756     int flags = 0;
2757     PyObject* code;
2758     Py_ssize_t groups = 0;
2759     PyObject* groupindex = NULL;
2760     PyObject* indexgroup = NULL;
2761     if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
2762                           &PyList_Type, &code, &groups,
2763                           &groupindex, &indexgroup))
2764         return NULL;
2765 
2766     n = PyList_GET_SIZE(code);
2767     /* coverity[ampersand_in_size] */
2768     self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2769     if (!self)
2770         return NULL;
2771     self->weakreflist = NULL;
2772     self->pattern = NULL;
2773     self->groupindex = NULL;
2774     self->indexgroup = NULL;
2775 
2776     self->codesize = n;
2777 
2778     for (i = 0; i < n; i++) {
2779         PyObject *o = PyList_GET_ITEM(code, i);
2780         unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o)
2781                                               : PyLong_AsUnsignedLong(o);
2782         if (value == (unsigned long)-1 && PyErr_Occurred()) {
2783             if (PyErr_ExceptionMatches(PyExc_OverflowError)) {
2784                 PyErr_SetString(PyExc_OverflowError,
2785                                 "regular expression code size limit exceeded");
2786             }
2787             break;
2788         }
2789         self->code[i] = (SRE_CODE) value;
2790         if ((unsigned long) self->code[i] != value) {
2791             PyErr_SetString(PyExc_OverflowError,
2792                             "regular expression code size limit exceeded");
2793             break;
2794         }
2795     }
2796 
2797     if (PyErr_Occurred()) {
2798         Py_DECREF(self);
2799         return NULL;
2800     }
2801 
2802     Py_INCREF(pattern);
2803     self->pattern = pattern;
2804 
2805     self->flags = flags;
2806 
2807     self->groups = groups;
2808 
2809     Py_XINCREF(groupindex);
2810     self->groupindex = groupindex;
2811 
2812     Py_XINCREF(indexgroup);
2813     self->indexgroup = indexgroup;
2814 
2815     self->weakreflist = NULL;
2816 
2817     if (!_validate(self)) {
2818         Py_DECREF(self);
2819         return NULL;
2820     }
2821 
2822     return (PyObject*) self;
2823 }
2824 
2825 /* -------------------------------------------------------------------- */
2826 /* Code validation */
2827 
2828 /* To learn more about this code, have a look at the _compile() function in
2829    Lib/sre_compile.py.  The validation functions below checks the code array
2830    for conformance with the code patterns generated there.
2831 
2832    The nice thing about the generated code is that it is position-independent:
2833    all jumps are relative jumps forward.  Also, jumps don't cross each other:
2834    the target of a later jump is always earlier than the target of an earlier
2835    jump.  IOW, this is okay:
2836 
2837    J---------J-------T--------T
2838     \         \_____/        /
2839      \______________________/
2840 
2841    but this is not:
2842 
2843    J---------J-------T--------T
2844     \_________\_____/        /
2845                \____________/
2846 
2847    It also helps that SRE_CODE is always an unsigned type.
2848 */
2849 
2850 /* Defining this one enables tracing of the validator */
2851 #undef VVERBOSE
2852 
2853 /* Trace macro for the validator */
2854 #if defined(VVERBOSE)
2855 #define VTRACE(v) printf v
2856 #else
2857 #define VTRACE(v) do {} while(0)  /* do nothing */
2858 #endif
2859 
2860 /* Report failure */
2861 #define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2862 
2863 /* Extract opcode, argument, or skip count from code array */
2864 #define GET_OP                                          \
2865     do {                                                \
2866         VTRACE(("%p: ", code));                         \
2867         if (code >= end) FAIL;                          \
2868         op = *code++;                                   \
2869         VTRACE(("%lu (op)\n", (unsigned long)op));      \
2870     } while (0)
2871 #define GET_ARG                                         \
2872     do {                                                \
2873         VTRACE(("%p= ", code));                         \
2874         if (code >= end) FAIL;                          \
2875         arg = *code++;                                  \
2876         VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
2877     } while (0)
2878 #define GET_SKIP_ADJ(adj)                               \
2879     do {                                                \
2880         VTRACE(("%p= ", code));                         \
2881         if (code >= end) FAIL;                          \
2882         skip = *code;                                   \
2883         VTRACE(("%lu (skip to %p)\n",                   \
2884                (unsigned long)skip, code+skip));        \
2885         if (skip-adj > (Py_uintptr_t)(end - code))      \
2886             FAIL;                                       \
2887         code++;                                         \
2888     } while (0)
2889 #define GET_SKIP GET_SKIP_ADJ(0)
2890 
2891 static int
_validate_charset(SRE_CODE * code,SRE_CODE * end)2892 _validate_charset(SRE_CODE *code, SRE_CODE *end)
2893 {
2894     /* Some variables are manipulated by the macros above */
2895     SRE_CODE op;
2896     SRE_CODE arg;
2897     SRE_CODE offset;
2898     int i;
2899 
2900     while (code < end) {
2901         GET_OP;
2902         switch (op) {
2903 
2904         case SRE_OP_NEGATE:
2905             break;
2906 
2907         case SRE_OP_LITERAL:
2908             GET_ARG;
2909             break;
2910 
2911         case SRE_OP_RANGE:
2912             GET_ARG;
2913             GET_ARG;
2914             break;
2915 
2916         case SRE_OP_CHARSET:
2917             offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
2918             if (offset > (Py_uintptr_t)(end - code))
2919                 FAIL;
2920             code += offset;
2921             break;
2922 
2923         case SRE_OP_BIGCHARSET:
2924             GET_ARG; /* Number of blocks */
2925             offset = 256/sizeof(SRE_CODE); /* 256-byte table */
2926             if (offset > (Py_uintptr_t)(end - code))
2927                 FAIL;
2928             /* Make sure that each byte points to a valid block */
2929             for (i = 0; i < 256; i++) {
2930                 if (((unsigned char *)code)[i] >= arg)
2931                     FAIL;
2932             }
2933             code += offset;
2934             offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
2935             if (offset > (Py_uintptr_t)(end - code))
2936                 FAIL;
2937             code += offset;
2938             break;
2939 
2940         case SRE_OP_CATEGORY:
2941             GET_ARG;
2942             switch (arg) {
2943             case SRE_CATEGORY_DIGIT:
2944             case SRE_CATEGORY_NOT_DIGIT:
2945             case SRE_CATEGORY_SPACE:
2946             case SRE_CATEGORY_NOT_SPACE:
2947             case SRE_CATEGORY_WORD:
2948             case SRE_CATEGORY_NOT_WORD:
2949             case SRE_CATEGORY_LINEBREAK:
2950             case SRE_CATEGORY_NOT_LINEBREAK:
2951             case SRE_CATEGORY_LOC_WORD:
2952             case SRE_CATEGORY_LOC_NOT_WORD:
2953             case SRE_CATEGORY_UNI_DIGIT:
2954             case SRE_CATEGORY_UNI_NOT_DIGIT:
2955             case SRE_CATEGORY_UNI_SPACE:
2956             case SRE_CATEGORY_UNI_NOT_SPACE:
2957             case SRE_CATEGORY_UNI_WORD:
2958             case SRE_CATEGORY_UNI_NOT_WORD:
2959             case SRE_CATEGORY_UNI_LINEBREAK:
2960             case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2961                 break;
2962             default:
2963                 FAIL;
2964             }
2965             break;
2966 
2967         default:
2968             FAIL;
2969 
2970         }
2971     }
2972 
2973     return 1;
2974 }
2975 
2976 static int
_validate_inner(SRE_CODE * code,SRE_CODE * end,Py_ssize_t groups)2977 _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2978 {
2979     /* Some variables are manipulated by the macros above */
2980     SRE_CODE op;
2981     SRE_CODE arg;
2982     SRE_CODE skip;
2983 
2984     VTRACE(("code=%p, end=%p\n", code, end));
2985 
2986     if (code > end)
2987         FAIL;
2988 
2989     while (code < end) {
2990         GET_OP;
2991         switch (op) {
2992 
2993         case SRE_OP_MARK:
2994             /* We don't check whether marks are properly nested; the
2995                sre_match() code is robust even if they don't, and the worst
2996                you can get is nonsensical match results. */
2997             GET_ARG;
2998             if (arg > 2 * (size_t)groups + 1) {
2999                 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
3000                 FAIL;
3001             }
3002             break;
3003 
3004         case SRE_OP_LITERAL:
3005         case SRE_OP_NOT_LITERAL:
3006         case SRE_OP_LITERAL_IGNORE:
3007         case SRE_OP_NOT_LITERAL_IGNORE:
3008             GET_ARG;
3009             /* The arg is just a character, nothing to check */
3010             break;
3011 
3012         case SRE_OP_SUCCESS:
3013         case SRE_OP_FAILURE:
3014             /* Nothing to check; these normally end the matching process */
3015             break;
3016 
3017         case SRE_OP_AT:
3018             GET_ARG;
3019             switch (arg) {
3020             case SRE_AT_BEGINNING:
3021             case SRE_AT_BEGINNING_STRING:
3022             case SRE_AT_BEGINNING_LINE:
3023             case SRE_AT_END:
3024             case SRE_AT_END_LINE:
3025             case SRE_AT_END_STRING:
3026             case SRE_AT_BOUNDARY:
3027             case SRE_AT_NON_BOUNDARY:
3028             case SRE_AT_LOC_BOUNDARY:
3029             case SRE_AT_LOC_NON_BOUNDARY:
3030             case SRE_AT_UNI_BOUNDARY:
3031             case SRE_AT_UNI_NON_BOUNDARY:
3032                 break;
3033             default:
3034                 FAIL;
3035             }
3036             break;
3037 
3038         case SRE_OP_ANY:
3039         case SRE_OP_ANY_ALL:
3040             /* These have no operands */
3041             break;
3042 
3043         case SRE_OP_IN:
3044         case SRE_OP_IN_IGNORE:
3045             GET_SKIP;
3046             /* Stop 1 before the end; we check the FAILURE below */
3047             if (!_validate_charset(code, code+skip-2))
3048                 FAIL;
3049             if (code[skip-2] != SRE_OP_FAILURE)
3050                 FAIL;
3051             code += skip-1;
3052             break;
3053 
3054         case SRE_OP_INFO:
3055             {
3056                 /* A minimal info field is
3057                    <INFO> <1=skip> <2=flags> <3=min> <4=max>;
3058                    If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
3059                    more follows. */
3060                 SRE_CODE flags, i;
3061                 SRE_CODE *newcode;
3062                 GET_SKIP;
3063                 newcode = code+skip-1;
3064                 GET_ARG; flags = arg;
3065                 GET_ARG; /* min */
3066                 GET_ARG; /* max */
3067                 /* Check that only valid flags are present */
3068                 if ((flags & ~(SRE_INFO_PREFIX |
3069                                SRE_INFO_LITERAL |
3070                                SRE_INFO_CHARSET)) != 0)
3071                     FAIL;
3072                 /* PREFIX and CHARSET are mutually exclusive */
3073                 if ((flags & SRE_INFO_PREFIX) &&
3074                     (flags & SRE_INFO_CHARSET))
3075                     FAIL;
3076                 /* LITERAL implies PREFIX */
3077                 if ((flags & SRE_INFO_LITERAL) &&
3078                     !(flags & SRE_INFO_PREFIX))
3079                     FAIL;
3080                 /* Validate the prefix */
3081                 if (flags & SRE_INFO_PREFIX) {
3082                     SRE_CODE prefix_len;
3083                     GET_ARG; prefix_len = arg;
3084                     GET_ARG; /* prefix skip */
3085                     /* Here comes the prefix string */
3086                     if (prefix_len > (Py_uintptr_t)(newcode - code))
3087                         FAIL;
3088                     code += prefix_len;
3089                     /* And here comes the overlap table */
3090                     if (prefix_len > (Py_uintptr_t)(newcode - code))
3091                         FAIL;
3092                     /* Each overlap value should be < prefix_len */
3093                     for (i = 0; i < prefix_len; i++) {
3094                         if (code[i] >= prefix_len)
3095                             FAIL;
3096                     }
3097                     code += prefix_len;
3098                 }
3099                 /* Validate the charset */
3100                 if (flags & SRE_INFO_CHARSET) {
3101                     if (!_validate_charset(code, newcode-1))
3102                         FAIL;
3103                     if (newcode[-1] != SRE_OP_FAILURE)
3104                         FAIL;
3105                     code = newcode;
3106                 }
3107                 else if (code != newcode) {
3108                   VTRACE(("code=%p, newcode=%p\n", code, newcode));
3109                     FAIL;
3110                 }
3111             }
3112             break;
3113 
3114         case SRE_OP_BRANCH:
3115             {
3116                 SRE_CODE *target = NULL;
3117                 for (;;) {
3118                     GET_SKIP;
3119                     if (skip == 0)
3120                         break;
3121                     /* Stop 2 before the end; we check the JUMP below */
3122                     if (!_validate_inner(code, code+skip-3, groups))
3123                         FAIL;
3124                     code += skip-3;
3125                     /* Check that it ends with a JUMP, and that each JUMP
3126                        has the same target */
3127                     GET_OP;
3128                     if (op != SRE_OP_JUMP)
3129                         FAIL;
3130                     GET_SKIP;
3131                     if (target == NULL)
3132                         target = code+skip-1;
3133                     else if (code+skip-1 != target)
3134                         FAIL;
3135                 }
3136             }
3137             break;
3138 
3139         case SRE_OP_REPEAT_ONE:
3140         case SRE_OP_MIN_REPEAT_ONE:
3141             {
3142                 SRE_CODE min, max;
3143                 GET_SKIP;
3144                 GET_ARG; min = arg;
3145                 GET_ARG; max = arg;
3146                 if (min > max)
3147                     FAIL;
3148                 if (max > SRE_MAXREPEAT)
3149                     FAIL;
3150                 if (!_validate_inner(code, code+skip-4, groups))
3151                     FAIL;
3152                 code += skip-4;
3153                 GET_OP;
3154                 if (op != SRE_OP_SUCCESS)
3155                     FAIL;
3156             }
3157             break;
3158 
3159         case SRE_OP_REPEAT:
3160             {
3161                 SRE_CODE min, max;
3162                 GET_SKIP;
3163                 GET_ARG; min = arg;
3164                 GET_ARG; max = arg;
3165                 if (min > max)
3166                     FAIL;
3167                 if (max > SRE_MAXREPEAT)
3168                     FAIL;
3169                 if (!_validate_inner(code, code+skip-3, groups))
3170                     FAIL;
3171                 code += skip-3;
3172                 GET_OP;
3173                 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3174                     FAIL;
3175             }
3176             break;
3177 
3178         case SRE_OP_GROUPREF:
3179         case SRE_OP_GROUPREF_IGNORE:
3180             GET_ARG;
3181             if (arg >= (size_t)groups)
3182                 FAIL;
3183             break;
3184 
3185         case SRE_OP_GROUPREF_EXISTS:
3186             /* The regex syntax for this is: '(?(group)then|else)', where
3187                'group' is either an integer group number or a group name,
3188                'then' and 'else' are sub-regexes, and 'else' is optional. */
3189             GET_ARG;
3190             if (arg >= (size_t)groups)
3191                 FAIL;
3192             GET_SKIP_ADJ(1);
3193             code--; /* The skip is relative to the first arg! */
3194             /* There are two possibilities here: if there is both a 'then'
3195                part and an 'else' part, the generated code looks like:
3196 
3197                GROUPREF_EXISTS
3198                <group>
3199                <skipyes>
3200                ...then part...
3201                JUMP
3202                <skipno>
3203                (<skipyes> jumps here)
3204                ...else part...
3205                (<skipno> jumps here)
3206 
3207                If there is only a 'then' part, it looks like:
3208 
3209                GROUPREF_EXISTS
3210                <group>
3211                <skip>
3212                ...then part...
3213                (<skip> jumps here)
3214 
3215                There is no direct way to decide which it is, and we don't want
3216                to allow arbitrary jumps anywhere in the code; so we just look
3217                for a JUMP opcode preceding our skip target.
3218             */
3219             if (skip >= 3 && skip-3 < (Py_uintptr_t)(end - code) &&
3220                 code[skip-3] == SRE_OP_JUMP)
3221             {
3222                 VTRACE(("both then and else parts present\n"));
3223                 if (!_validate_inner(code+1, code+skip-3, groups))
3224                     FAIL;
3225                 code += skip-2; /* Position after JUMP, at <skipno> */
3226                 GET_SKIP;
3227                 if (!_validate_inner(code, code+skip-1, groups))
3228                     FAIL;
3229                 code += skip-1;
3230             }
3231             else {
3232                 VTRACE(("only a then part present\n"));
3233                 if (!_validate_inner(code+1, code+skip-1, groups))
3234                     FAIL;
3235                 code += skip-1;
3236             }
3237             break;
3238 
3239         case SRE_OP_ASSERT:
3240         case SRE_OP_ASSERT_NOT:
3241             GET_SKIP;
3242             GET_ARG; /* 0 for lookahead, width for lookbehind */
3243             code--; /* Back up over arg to simplify math below */
3244             if (arg & 0x80000000)
3245                 FAIL; /* Width too large */
3246             /* Stop 1 before the end; we check the SUCCESS below */
3247             if (!_validate_inner(code+1, code+skip-2, groups))
3248                 FAIL;
3249             code += skip-2;
3250             GET_OP;
3251             if (op != SRE_OP_SUCCESS)
3252                 FAIL;
3253             break;
3254 
3255         default:
3256             FAIL;
3257 
3258         }
3259     }
3260 
3261     VTRACE(("okay\n"));
3262     return 1;
3263 }
3264 
3265 static int
_validate_outer(SRE_CODE * code,SRE_CODE * end,Py_ssize_t groups)3266 _validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3267 {
3268     if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3269         FAIL;
3270     if (groups == 0)  /* fix for simplejson */
3271         groups = 100; /* 100 groups should always be safe */
3272     return _validate_inner(code, end-1, groups);
3273 }
3274 
3275 static int
_validate(PatternObject * self)3276 _validate(PatternObject *self)
3277 {
3278     if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3279     {
3280         PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3281         return 0;
3282     }
3283     else
3284         VTRACE(("Success!\n"));
3285     return 1;
3286 }
3287 
3288 /* -------------------------------------------------------------------- */
3289 /* match methods */
3290 
3291 static void
match_dealloc(MatchObject * self)3292 match_dealloc(MatchObject* self)
3293 {
3294     Py_XDECREF(self->regs);
3295     Py_XDECREF(self->string);
3296     Py_DECREF(self->pattern);
3297     PyObject_DEL(self);
3298 }
3299 
3300 static PyObject*
match_getslice_by_index(MatchObject * self,Py_ssize_t index,PyObject * def)3301 match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
3302 {
3303     if (index < 0 || index >= self->groups) {
3304         /* raise IndexError if we were given a bad group number */
3305         PyErr_SetString(
3306             PyExc_IndexError,
3307             "no such group"
3308             );
3309         return NULL;
3310     }
3311 
3312     index *= 2;
3313 
3314     if (self->string == Py_None || self->mark[index] < 0) {
3315         /* return default value if the string or group is undefined */
3316         Py_INCREF(def);
3317         return def;
3318     }
3319 
3320     return PySequence_GetSlice(
3321         self->string, self->mark[index], self->mark[index+1]
3322         );
3323 }
3324 
3325 static Py_ssize_t
match_getindex(MatchObject * self,PyObject * index)3326 match_getindex(MatchObject* self, PyObject* index)
3327 {
3328     Py_ssize_t i;
3329 
3330     if (_PyAnyInt_Check(index))
3331         return PyInt_AsSsize_t(index);
3332 
3333     i = -1;
3334 
3335     if (self->pattern->groupindex) {
3336         index = PyObject_GetItem(self->pattern->groupindex, index);
3337         if (index) {
3338             if (_PyAnyInt_Check(index))
3339                 i = PyInt_AsSsize_t(index);
3340             Py_DECREF(index);
3341         } else
3342             PyErr_Clear();
3343     }
3344 
3345     return i;
3346 }
3347 
3348 static PyObject*
match_getslice(MatchObject * self,PyObject * index,PyObject * def)3349 match_getslice(MatchObject* self, PyObject* index, PyObject* def)
3350 {
3351     return match_getslice_by_index(self, match_getindex(self, index), def);
3352 }
3353 
3354 static PyObject*
match_expand(MatchObject * self,PyObject * ptemplate)3355 match_expand(MatchObject* self, PyObject* ptemplate)
3356 {
3357     /* delegate to Python code */
3358     return call(
3359         SRE_PY_MODULE, "_expand",
3360         PyTuple_Pack(3, self->pattern, self, ptemplate)
3361         );
3362 }
3363 
3364 static PyObject*
match_group(MatchObject * self,PyObject * args)3365 match_group(MatchObject* self, PyObject* args)
3366 {
3367     PyObject* result;
3368     Py_ssize_t i, size;
3369 
3370     size = PyTuple_GET_SIZE(args);
3371 
3372     switch (size) {
3373     case 0:
3374         result = match_getslice(self, Py_False, Py_None);
3375         break;
3376     case 1:
3377         result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3378         break;
3379     default:
3380         /* fetch multiple items */
3381         result = PyTuple_New(size);
3382         if (!result)
3383             return NULL;
3384         for (i = 0; i < size; i++) {
3385             PyObject* item = match_getslice(
3386                 self, PyTuple_GET_ITEM(args, i), Py_None
3387                 );
3388             if (!item) {
3389                 Py_DECREF(result);
3390                 return NULL;
3391             }
3392             PyTuple_SET_ITEM(result, i, item);
3393         }
3394         break;
3395     }
3396     return result;
3397 }
3398 
3399 static PyObject*
match_groups(MatchObject * self,PyObject * args,PyObject * kw)3400 match_groups(MatchObject* self, PyObject* args, PyObject* kw)
3401 {
3402     PyObject* result;
3403     Py_ssize_t index;
3404 
3405     PyObject* def = Py_None;
3406     static char* kwlist[] = { "default", NULL };
3407     if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
3408         return NULL;
3409 
3410     result = PyTuple_New(self->groups-1);
3411     if (!result)
3412         return NULL;
3413 
3414     for (index = 1; index < self->groups; index++) {
3415         PyObject* item;
3416         item = match_getslice_by_index(self, index, def);
3417         if (!item) {
3418             Py_DECREF(result);
3419             return NULL;
3420         }
3421         PyTuple_SET_ITEM(result, index-1, item);
3422     }
3423 
3424     return result;
3425 }
3426 
3427 static PyObject*
match_groupdict(MatchObject * self,PyObject * args,PyObject * kw)3428 match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
3429 {
3430     PyObject* result;
3431     PyObject* keys;
3432     Py_ssize_t index;
3433 
3434     PyObject* def = Py_None;
3435     static char* kwlist[] = { "default", NULL };
3436     if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
3437         return NULL;
3438 
3439     result = PyDict_New();
3440     if (!result || !self->pattern->groupindex)
3441         return result;
3442 
3443     keys = PyMapping_Keys(self->pattern->groupindex);
3444     if (!keys)
3445         goto failed;
3446 
3447     for (index = 0; index < PyList_GET_SIZE(keys); index++) {
3448         int status;
3449         PyObject* key;
3450         PyObject* value;
3451         key = PyList_GET_ITEM(keys, index);
3452         if (!key)
3453             goto failed;
3454         value = match_getslice(self, key, def);
3455         if (!value)
3456             goto failed;
3457         status = PyDict_SetItem(result, key, value);
3458         Py_DECREF(value);
3459         if (status < 0)
3460             goto failed;
3461     }
3462 
3463     Py_DECREF(keys);
3464 
3465     return result;
3466 
3467 failed:
3468     Py_XDECREF(keys);
3469     Py_DECREF(result);
3470     return NULL;
3471 }
3472 
3473 static PyObject*
match_start(MatchObject * self,PyObject * args)3474 match_start(MatchObject* self, PyObject* args)
3475 {
3476     Py_ssize_t index;
3477 
3478     PyObject* index_ = Py_False; /* zero */
3479     if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
3480         return NULL;
3481 
3482     index = match_getindex(self, index_);
3483 
3484     if (index < 0 || index >= self->groups) {
3485         PyErr_SetString(
3486             PyExc_IndexError,
3487             "no such group"
3488             );
3489         return NULL;
3490     }
3491 
3492     /* mark is -1 if group is undefined */
3493     return PyInt_FromSsize_t(self->mark[index*2]);
3494 }
3495 
3496 static PyObject*
match_end(MatchObject * self,PyObject * args)3497 match_end(MatchObject* self, PyObject* args)
3498 {
3499     Py_ssize_t index;
3500 
3501     PyObject* index_ = Py_False; /* zero */
3502     if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
3503         return NULL;
3504 
3505     index = match_getindex(self, index_);
3506 
3507     if (index < 0 || index >= self->groups) {
3508         PyErr_SetString(
3509             PyExc_IndexError,
3510             "no such group"
3511             );
3512         return NULL;
3513     }
3514 
3515     /* mark is -1 if group is undefined */
3516     return PyInt_FromSsize_t(self->mark[index*2+1]);
3517 }
3518 
3519 LOCAL(PyObject*)
_pair(Py_ssize_t i1,Py_ssize_t i2)3520 _pair(Py_ssize_t i1, Py_ssize_t i2)
3521 {
3522     PyObject* pair;
3523     PyObject* item;
3524 
3525     pair = PyTuple_New(2);
3526     if (!pair)
3527         return NULL;
3528 
3529     item = PyInt_FromSsize_t(i1);
3530     if (!item)
3531         goto error;
3532     PyTuple_SET_ITEM(pair, 0, item);
3533 
3534     item = PyInt_FromSsize_t(i2);
3535     if (!item)
3536         goto error;
3537     PyTuple_SET_ITEM(pair, 1, item);
3538 
3539     return pair;
3540 
3541   error:
3542     Py_DECREF(pair);
3543     return NULL;
3544 }
3545 
3546 static PyObject*
match_span(MatchObject * self,PyObject * args)3547 match_span(MatchObject* self, PyObject* args)
3548 {
3549     Py_ssize_t index;
3550 
3551     PyObject* index_ = Py_False; /* zero */
3552     if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
3553         return NULL;
3554 
3555     index = match_getindex(self, index_);
3556 
3557     if (index < 0 || index >= self->groups) {
3558         PyErr_SetString(
3559             PyExc_IndexError,
3560             "no such group"
3561             );
3562         return NULL;
3563     }
3564 
3565     /* marks are -1 if group is undefined */
3566     return _pair(self->mark[index*2], self->mark[index*2+1]);
3567 }
3568 
3569 static PyObject*
match_regs(MatchObject * self)3570 match_regs(MatchObject* self)
3571 {
3572     PyObject* regs;
3573     PyObject* item;
3574     Py_ssize_t index;
3575 
3576     regs = PyTuple_New(self->groups);
3577     if (!regs)
3578         return NULL;
3579 
3580     for (index = 0; index < self->groups; index++) {
3581         item = _pair(self->mark[index*2], self->mark[index*2+1]);
3582         if (!item) {
3583             Py_DECREF(regs);
3584             return NULL;
3585         }
3586         PyTuple_SET_ITEM(regs, index, item);
3587     }
3588 
3589     Py_INCREF(regs);
3590     self->regs = regs;
3591 
3592     return regs;
3593 }
3594 
3595 static PyObject*
match_copy(MatchObject * self,PyObject * unused)3596 match_copy(MatchObject* self, PyObject *unused)
3597 {
3598 #ifdef USE_BUILTIN_COPY
3599     MatchObject* copy;
3600     Py_ssize_t slots, offset;
3601 
3602     slots = 2 * (self->pattern->groups+1);
3603 
3604     copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3605     if (!copy)
3606         return NULL;
3607 
3608     /* this value a constant, but any compiler should be able to
3609        figure that out all by itself */
3610     offset = offsetof(MatchObject, string);
3611 
3612     Py_XINCREF(self->pattern);
3613     Py_XINCREF(self->string);
3614     Py_XINCREF(self->regs);
3615 
3616     memcpy((char*) copy + offset, (char*) self + offset,
3617            sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
3618 
3619     return (PyObject*) copy;
3620 #else
3621     PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
3622     return NULL;
3623 #endif
3624 }
3625 
3626 static PyObject*
match_deepcopy(MatchObject * self,PyObject * memo)3627 match_deepcopy(MatchObject* self, PyObject* memo)
3628 {
3629 #ifdef USE_BUILTIN_COPY
3630     MatchObject* copy;
3631 
3632     copy = (MatchObject*) match_copy(self);
3633     if (!copy)
3634         return NULL;
3635 
3636     if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3637         !deepcopy(&copy->string, memo) ||
3638         !deepcopy(&copy->regs, memo)) {
3639         Py_DECREF(copy);
3640         return NULL;
3641     }
3642 
3643 #else
3644     PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3645     return NULL;
3646 #endif
3647 }
3648 
3649 PyDoc_STRVAR(match_doc,
3650 "The result of re.match() and re.search().\n\
3651 Match objects always have a boolean value of True.");
3652 
3653 PyDoc_STRVAR(match_group_doc,
3654 "group([group1, ...]) -> str or tuple.\n\
3655     Return subgroup(s) of the match by indices or names.\n\
3656     For 0 returns the entire match.");
3657 
3658 PyDoc_STRVAR(match_start_doc,
3659 "start([group=0]) -> int.\n\
3660     Return index of the start of the substring matched by group.");
3661 
3662 PyDoc_STRVAR(match_end_doc,
3663 "end([group=0]) -> int.\n\
3664     Return index of the end of the substring matched by group.");
3665 
3666 PyDoc_STRVAR(match_span_doc,
3667 "span([group]) -> tuple.\n\
3668     For MatchObject m, return the 2-tuple (m.start(group), m.end(group)).");
3669 
3670 PyDoc_STRVAR(match_groups_doc,
3671 "groups([default=None]) -> tuple.\n\
3672     Return a tuple containing all the subgroups of the match, from 1.\n\
3673     The default argument is used for groups\n\
3674     that did not participate in the match");
3675 
3676 PyDoc_STRVAR(match_groupdict_doc,
3677 "groupdict([default=None]) -> dict.\n\
3678     Return a dictionary containing all the named subgroups of the match,\n\
3679     keyed by the subgroup name. The default argument is used for groups\n\
3680     that did not participate in the match");
3681 
3682 PyDoc_STRVAR(match_expand_doc,
3683 "expand(template) -> str.\n\
3684     Return the string obtained by doing backslash substitution\n\
3685     on the string template, as done by the sub() method.");
3686 
3687 static PyMethodDef match_methods[] = {
3688     {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
3689     {"start", (PyCFunction) match_start, METH_VARARGS, match_start_doc},
3690     {"end", (PyCFunction) match_end, METH_VARARGS, match_end_doc},
3691     {"span", (PyCFunction) match_span, METH_VARARGS, match_span_doc},
3692     {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS,
3693         match_groups_doc},
3694     {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS,
3695         match_groupdict_doc},
3696     {"expand", (PyCFunction) match_expand, METH_O, match_expand_doc},
3697     {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3698     {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
3699     {NULL, NULL}
3700 };
3701 
3702 static PyObject *
match_lastindex_get(MatchObject * self)3703 match_lastindex_get(MatchObject *self)
3704 {
3705     if (self->lastindex >= 0)
3706         return PyInt_FromSsize_t(self->lastindex);
3707     Py_INCREF(Py_None);
3708     return Py_None;
3709 }
3710 
3711 static PyObject *
match_lastgroup_get(MatchObject * self)3712 match_lastgroup_get(MatchObject *self)
3713 {
3714     if (self->pattern->indexgroup && self->lastindex >= 0) {
3715         PyObject* result = PySequence_GetItem(
3716             self->pattern->indexgroup, self->lastindex
3717             );
3718         if (result)
3719             return result;
3720         PyErr_Clear();
3721     }
3722     Py_INCREF(Py_None);
3723     return Py_None;
3724 }
3725 
3726 static PyObject *
match_regs_get(MatchObject * self)3727 match_regs_get(MatchObject *self)
3728 {
3729     if (self->regs) {
3730         Py_INCREF(self->regs);
3731         return self->regs;
3732     } else
3733         return match_regs(self);
3734 }
3735 
3736 static PyGetSetDef match_getset[] = {
3737     {"lastindex", (getter)match_lastindex_get, (setter)NULL},
3738     {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3739     {"regs",      (getter)match_regs_get,      (setter)NULL},
3740     {NULL}
3741 };
3742 
3743 #define MATCH_OFF(x) offsetof(MatchObject, x)
3744 static PyMemberDef match_members[] = {
3745     {"string",  T_OBJECT,   MATCH_OFF(string),  READONLY},
3746     {"re",      T_OBJECT,   MATCH_OFF(pattern), READONLY},
3747     {"pos",     T_PYSSIZET, MATCH_OFF(pos),     READONLY},
3748     {"endpos",  T_PYSSIZET, MATCH_OFF(endpos),  READONLY},
3749     {NULL}
3750 };
3751 
3752 
3753 /* FIXME: implement setattr("string", None) as a special case (to
3754    detach the associated string, if any */
3755 
3756 static PyTypeObject Match_Type = {
3757     PyVarObject_HEAD_INIT(NULL, 0)
3758     "_" SRE_MODULE ".SRE_Match",
3759     sizeof(MatchObject), sizeof(Py_ssize_t),
3760     (destructor)match_dealloc,  /* tp_dealloc */
3761     0,                          /* tp_print */
3762     0,                          /* tp_getattr */
3763     0,                          /* tp_setattr */
3764     0,                          /* tp_compare */
3765     0,                          /* tp_repr */
3766     0,                          /* tp_as_number */
3767     0,                          /* tp_as_sequence */
3768     0,                          /* tp_as_mapping */
3769     0,                          /* tp_hash */
3770     0,                          /* tp_call */
3771     0,                          /* tp_str */
3772     0,                          /* tp_getattro */
3773     0,                          /* tp_setattro */
3774     0,                          /* tp_as_buffer */
3775     Py_TPFLAGS_DEFAULT,
3776     match_doc,                  /* tp_doc */
3777     0,                          /* tp_traverse */
3778     0,                          /* tp_clear */
3779     0,                          /* tp_richcompare */
3780     0,                          /* tp_weaklistoffset */
3781     0,                          /* tp_iter */
3782     0,                          /* tp_iternext */
3783     match_methods,		/* tp_methods */
3784     match_members,		/* tp_members */
3785     match_getset,  	        /* tp_getset */
3786 };
3787 
3788 static PyObject*
pattern_new_match(PatternObject * pattern,SRE_STATE * state,int status)3789 pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3790 {
3791     /* create match object (from state object) */
3792 
3793     MatchObject* match;
3794     Py_ssize_t i, j;
3795     char* base;
3796     int n;
3797 
3798     if (status > 0) {
3799 
3800         /* create match object (with room for extra group marks) */
3801         /* coverity[ampersand_in_size] */
3802         match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3803                                  2*(pattern->groups+1));
3804         if (!match)
3805             return NULL;
3806 
3807         Py_INCREF(pattern);
3808         match->pattern = pattern;
3809 
3810         Py_INCREF(state->string);
3811         match->string = state->string;
3812 
3813         match->regs = NULL;
3814         match->groups = pattern->groups+1;
3815 
3816         /* fill in group slices */
3817 
3818         base = (char*) state->beginning;
3819         n = state->charsize;
3820 
3821         match->mark[0] = ((char*) state->start - base) / n;
3822         match->mark[1] = ((char*) state->ptr - base) / n;
3823 
3824         for (i = j = 0; i < pattern->groups; i++, j+=2)
3825             if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3826                 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3827                 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3828             } else
3829                 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3830 
3831         match->pos = state->pos;
3832         match->endpos = state->endpos;
3833 
3834         match->lastindex = state->lastindex;
3835 
3836         return (PyObject*) match;
3837 
3838     } else if (status == 0) {
3839 
3840         /* no match */
3841         Py_INCREF(Py_None);
3842         return Py_None;
3843 
3844     }
3845 
3846     /* internal error */
3847     pattern_error(status);
3848     return NULL;
3849 }
3850 
3851 
3852 /* -------------------------------------------------------------------- */
3853 /* scanner methods (experimental) */
3854 
3855 static void
scanner_dealloc(ScannerObject * self)3856 scanner_dealloc(ScannerObject* self)
3857 {
3858     state_fini(&self->state);
3859     Py_XDECREF(self->pattern);
3860     PyObject_DEL(self);
3861 }
3862 
3863 static PyObject*
scanner_match(ScannerObject * self,PyObject * unused)3864 scanner_match(ScannerObject* self, PyObject *unused)
3865 {
3866     SRE_STATE* state = &self->state;
3867     PyObject* match;
3868     int status;
3869 
3870     if (state->start == NULL)
3871         Py_RETURN_NONE;
3872 
3873     state_reset(state);
3874 
3875     state->ptr = state->start;
3876 
3877     if (state->charsize == 1) {
3878         status = sre_match(state, PatternObject_GetCode(self->pattern));
3879     } else {
3880 #if defined(HAVE_UNICODE)
3881         status = sre_umatch(state, PatternObject_GetCode(self->pattern));
3882 #endif
3883     }
3884     if (PyErr_Occurred())
3885         return NULL;
3886 
3887     match = pattern_new_match((PatternObject*) self->pattern,
3888                                state, status);
3889 
3890     if (status == 0)
3891         state->start = NULL;
3892     else if (state->ptr != state->start)
3893         state->start = state->ptr;
3894     else if (state->ptr != state->end)
3895         state->start = (void*) ((char*) state->ptr + state->charsize);
3896     else
3897         state->start = NULL;
3898 
3899     return match;
3900 }
3901 
3902 
3903 static PyObject*
scanner_search(ScannerObject * self,PyObject * unused)3904 scanner_search(ScannerObject* self, PyObject *unused)
3905 {
3906     SRE_STATE* state = &self->state;
3907     PyObject* match;
3908     int status;
3909 
3910     if (state->start == NULL)
3911         Py_RETURN_NONE;
3912 
3913     state_reset(state);
3914 
3915     state->ptr = state->start;
3916 
3917     if (state->charsize == 1) {
3918         status = sre_search(state, PatternObject_GetCode(self->pattern));
3919     } else {
3920 #if defined(HAVE_UNICODE)
3921         status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3922 #endif
3923     }
3924     if (PyErr_Occurred())
3925         return NULL;
3926 
3927     match = pattern_new_match((PatternObject*) self->pattern,
3928                                state, status);
3929 
3930     if (status == 0)
3931         state->start = NULL;
3932     else if (state->ptr != state->start)
3933         state->start = state->ptr;
3934     else if (state->ptr != state->end)
3935         state->start = (void*) ((char*) state->ptr + state->charsize);
3936     else
3937         state->start = NULL;
3938 
3939     return match;
3940 }
3941 
3942 static PyMethodDef scanner_methods[] = {
3943     {"match", (PyCFunction) scanner_match, METH_NOARGS},
3944     {"search", (PyCFunction) scanner_search, METH_NOARGS},
3945     {NULL, NULL}
3946 };
3947 
3948 #define SCAN_OFF(x) offsetof(ScannerObject, x)
3949 static PyMemberDef scanner_members[] = {
3950     {"pattern",	T_OBJECT,	SCAN_OFF(pattern),	READONLY},
3951     {NULL}  /* Sentinel */
3952 };
3953 
3954 statichere PyTypeObject Scanner_Type = {
3955     PyVarObject_HEAD_INIT(NULL, 0)
3956     "_" SRE_MODULE ".SRE_Scanner",
3957     sizeof(ScannerObject), 0,
3958     (destructor)scanner_dealloc, /*tp_dealloc*/
3959     0,				/* tp_print */
3960     0,				/* tp_getattr */
3961     0,				/* tp_setattr */
3962     0,				/* tp_reserved */
3963     0,				/* tp_repr */
3964     0,				/* tp_as_number */
3965     0,				/* tp_as_sequence */
3966     0,				/* tp_as_mapping */
3967     0,				/* tp_hash */
3968     0,				/* tp_call */
3969     0,				/* tp_str */
3970     0,				/* tp_getattro */
3971     0,				/* tp_setattro */
3972     0,				/* tp_as_buffer */
3973     Py_TPFLAGS_DEFAULT,		/* tp_flags */
3974     0,				/* tp_doc */
3975     0,				/* tp_traverse */
3976     0,				/* tp_clear */
3977     0,				/* tp_richcompare */
3978     0,				/* tp_weaklistoffset */
3979     0,				/* tp_iter */
3980     0,				/* tp_iternext */
3981     scanner_methods,		/* tp_methods */
3982     scanner_members,		/* tp_members */
3983     0,				/* tp_getset */
3984 };
3985 
3986 static PyObject*
pattern_scanner(PatternObject * pattern,PyObject * args)3987 pattern_scanner(PatternObject* pattern, PyObject* args)
3988 {
3989     /* create search state object */
3990 
3991     ScannerObject* self;
3992 
3993     PyObject* string;
3994     Py_ssize_t start = 0;
3995     Py_ssize_t end = PY_SSIZE_T_MAX;
3996     if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
3997         return NULL;
3998 
3999     /* create scanner object */
4000     self = PyObject_NEW(ScannerObject, &Scanner_Type);
4001     if (!self)
4002         return NULL;
4003     self->pattern = NULL;
4004 
4005     string = state_init(&self->state, pattern, string, start, end);
4006     if (!string) {
4007         Py_DECREF(self);
4008         return NULL;
4009     }
4010 
4011     Py_INCREF(pattern);
4012     self->pattern = (PyObject*) pattern;
4013 
4014     return (PyObject*) self;
4015 }
4016 
4017 static PyMethodDef _functions[] = {
4018     {"compile", _compile, METH_VARARGS},
4019     {"getcodesize", sre_codesize, METH_NOARGS},
4020     {"getlower", sre_getlower, METH_VARARGS},
4021     {NULL, NULL}
4022 };
4023 
4024 #if PY_VERSION_HEX < 0x02030000
init_sre(void)4025 DL_EXPORT(void) init_sre(void)
4026 #else
4027 PyMODINIT_FUNC init_sre(void)
4028 #endif
4029 {
4030     PyObject* m;
4031     PyObject* d;
4032     PyObject* x;
4033 
4034     /* Patch object types */
4035     if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
4036         PyType_Ready(&Scanner_Type))
4037         return;
4038 
4039     m = Py_InitModule("_" SRE_MODULE, _functions);
4040     if (m == NULL)
4041     	return;
4042     d = PyModule_GetDict(m);
4043 
4044     x = PyInt_FromLong(SRE_MAGIC);
4045     if (x) {
4046         PyDict_SetItemString(d, "MAGIC", x);
4047         Py_DECREF(x);
4048     }
4049 
4050     x = PyInt_FromLong(sizeof(SRE_CODE));
4051     if (x) {
4052         PyDict_SetItemString(d, "CODESIZE", x);
4053         Py_DECREF(x);
4054     }
4055 
4056     x = PyLong_FromUnsignedLong(SRE_MAXREPEAT);
4057     if (x) {
4058         PyDict_SetItemString(d, "MAXREPEAT", x);
4059         Py_DECREF(x);
4060     }
4061 
4062     x = PyString_FromString(copyright);
4063     if (x) {
4064         PyDict_SetItemString(d, "copyright", x);
4065         Py_DECREF(x);
4066     }
4067 }
4068 
4069 #endif /* !defined(SRE_RECURSIVE) */
4070 
4071 /* vim:ts=4:sw=4:et
4072 */
4073