• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Secret Labs' Regular Expression Engine
3  *
4  * regular expression matching engine
5  *
6  * partial history:
7  * 1999-10-24 fl  created (based on existing template matcher code)
8  * 2000-03-06 fl  first alpha, sort of
9  * 2000-08-01 fl  fixes for 1.6b1
10  * 2000-08-07 fl  use PyOS_CheckStack() if available
11  * 2000-09-20 fl  added expand method
12  * 2001-03-20 fl  lots of fixes for 2.1b2
13  * 2001-04-15 fl  export copyright as Python attribute, not global
14  * 2001-04-28 fl  added __copy__ methods (work in progress)
15  * 2001-05-14 fl  fixes for 1.5.2 compatibility
16  * 2001-07-01 fl  added BIGCHARSET support (from Martin von Loewis)
17  * 2001-10-18 fl  fixed group reset issue (from Matthew Mueller)
18  * 2001-10-20 fl  added split primitive; reenable unicode for 1.6/2.0/2.1
19  * 2001-10-21 fl  added sub/subn primitive
20  * 2001-10-24 fl  added finditer primitive (for 2.2 only)
21  * 2001-12-07 fl  fixed memory leak in sub/subn (Guido van Rossum)
22  * 2002-11-09 fl  fixed empty sub/subn return type
23  * 2003-04-18 mvl fully support 4-byte codes
24  * 2003-10-17 gn  implemented non recursive scheme
25  *
26  * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
27  *
28  * This version of the SRE library can be redistributed under CNRI's
29  * Python 1.6 license.  For any other use, please contact Secret Labs
30  * AB (info@pythonware.com).
31  *
32  * Portions of this engine have been developed in cooperation with
33  * CNRI.  Hewlett-Packard provided funding for 1.6 integration and
34  * other compatibility work.
35  */
36 
37 #ifndef SRE_RECURSIVE
38 
39 static char copyright[] =
40     " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
41 
42 #define PY_SSIZE_T_CLEAN
43 
44 #include "Python.h"
45 #include "structmember.h" /* offsetof */
46 
47 #include "sre.h"
48 
49 #include <ctype.h>
50 
51 /* name of this module, minus the leading underscore */
52 #if !defined(SRE_MODULE)
53 #define SRE_MODULE "sre"
54 #endif
55 
56 #define SRE_PY_MODULE "re"
57 
58 /* defining this one enables tracing */
59 #undef VERBOSE
60 
61 #if PY_VERSION_HEX >= 0x01060000
62 #if PY_VERSION_HEX  < 0x02020000 || defined(Py_USING_UNICODE)
63 /* defining this enables unicode support (default under 1.6a1 and later) */
64 #define HAVE_UNICODE
65 #endif
66 #endif
67 
68 /* -------------------------------------------------------------------- */
69 /* optional features */
70 
71 /* enables fast searching */
72 #define USE_FAST_SEARCH
73 
74 /* enables aggressive inlining (always on for Visual C) */
75 #undef USE_INLINE
76 
77 /* enables copy/deepcopy handling (work in progress) */
78 #undef USE_BUILTIN_COPY
79 
80 #if PY_VERSION_HEX < 0x01060000
81 #define PyObject_DEL(op) PyMem_DEL((op))
82 #endif
83 
84 /* -------------------------------------------------------------------- */
85 
86 #if defined(_MSC_VER)
87 #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
88 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
89 /* fastest possible local call under MSVC */
90 #define LOCAL(type) static __inline type __fastcall
91 #elif defined(USE_INLINE)
92 #define LOCAL(type) static inline type
93 #else
94 #define LOCAL(type) static type
95 #endif
96 
97 /* error codes */
98 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
99 #define SRE_ERROR_STATE -2 /* illegal state */
100 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
101 #define SRE_ERROR_MEMORY -9 /* out of memory */
102 #define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
103 
104 #if defined(VERBOSE)
105 #define TRACE(v) printf v
106 #else
107 #define TRACE(v)
108 #endif
109 
110 /* -------------------------------------------------------------------- */
111 /* search engine state */
112 
113 /* default character predicates (run sre_chars.py to regenerate tables) */
114 
115 #define SRE_DIGIT_MASK 1
116 #define SRE_SPACE_MASK 2
117 #define SRE_LINEBREAK_MASK 4
118 #define SRE_ALNUM_MASK 8
119 #define SRE_WORD_MASK 16
120 
121 /* FIXME: this assumes ASCII.  create tables in init_sre() instead */
122 
123 static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
124 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
125 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
126 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
127 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
128 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
129 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
130 
131 static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
132 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
133 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
134 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
135 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
136 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
137 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
138 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
139 120, 121, 122, 123, 124, 125, 126, 127 };
140 
141 #define SRE_IS_DIGIT(ch)\
142     ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
143 #define SRE_IS_SPACE(ch)\
144     ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
145 #define SRE_IS_LINEBREAK(ch)\
146     ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
147 #define SRE_IS_ALNUM(ch)\
148     ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
149 #define SRE_IS_WORD(ch)\
150     ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
151 
sre_lower(unsigned int ch)152 static unsigned int sre_lower(unsigned int ch)
153 {
154     return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
155 }
156 
157 /* locale-specific character predicates */
158 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
159  * warnings when c's type supports only numbers < N+1 */
160 #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
161 #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
162 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
163 #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
164 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
165 
sre_lower_locale(unsigned int ch)166 static unsigned int sre_lower_locale(unsigned int ch)
167 {
168     return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
169 }
170 
171 /* unicode-specific character predicates */
172 
173 #if defined(HAVE_UNICODE)
174 
175 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
176 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
177 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
178 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
179 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
180 
sre_lower_unicode(unsigned int ch)181 static unsigned int sre_lower_unicode(unsigned int ch)
182 {
183     return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
184 }
185 
186 #endif
187 
188 LOCAL(int)
sre_category(SRE_CODE category,unsigned int ch)189 sre_category(SRE_CODE category, unsigned int ch)
190 {
191     switch (category) {
192 
193     case SRE_CATEGORY_DIGIT:
194         return SRE_IS_DIGIT(ch);
195     case SRE_CATEGORY_NOT_DIGIT:
196         return !SRE_IS_DIGIT(ch);
197     case SRE_CATEGORY_SPACE:
198         return SRE_IS_SPACE(ch);
199     case SRE_CATEGORY_NOT_SPACE:
200         return !SRE_IS_SPACE(ch);
201     case SRE_CATEGORY_WORD:
202         return SRE_IS_WORD(ch);
203     case SRE_CATEGORY_NOT_WORD:
204         return !SRE_IS_WORD(ch);
205     case SRE_CATEGORY_LINEBREAK:
206         return SRE_IS_LINEBREAK(ch);
207     case SRE_CATEGORY_NOT_LINEBREAK:
208         return !SRE_IS_LINEBREAK(ch);
209 
210     case SRE_CATEGORY_LOC_WORD:
211         return SRE_LOC_IS_WORD(ch);
212     case SRE_CATEGORY_LOC_NOT_WORD:
213         return !SRE_LOC_IS_WORD(ch);
214 
215 #if defined(HAVE_UNICODE)
216     case SRE_CATEGORY_UNI_DIGIT:
217         return SRE_UNI_IS_DIGIT(ch);
218     case SRE_CATEGORY_UNI_NOT_DIGIT:
219         return !SRE_UNI_IS_DIGIT(ch);
220     case SRE_CATEGORY_UNI_SPACE:
221         return SRE_UNI_IS_SPACE(ch);
222     case SRE_CATEGORY_UNI_NOT_SPACE:
223         return !SRE_UNI_IS_SPACE(ch);
224     case SRE_CATEGORY_UNI_WORD:
225         return SRE_UNI_IS_WORD(ch);
226     case SRE_CATEGORY_UNI_NOT_WORD:
227         return !SRE_UNI_IS_WORD(ch);
228     case SRE_CATEGORY_UNI_LINEBREAK:
229         return SRE_UNI_IS_LINEBREAK(ch);
230     case SRE_CATEGORY_UNI_NOT_LINEBREAK:
231         return !SRE_UNI_IS_LINEBREAK(ch);
232 #else
233     case SRE_CATEGORY_UNI_DIGIT:
234         return SRE_IS_DIGIT(ch);
235     case SRE_CATEGORY_UNI_NOT_DIGIT:
236         return !SRE_IS_DIGIT(ch);
237     case SRE_CATEGORY_UNI_SPACE:
238         return SRE_IS_SPACE(ch);
239     case SRE_CATEGORY_UNI_NOT_SPACE:
240         return !SRE_IS_SPACE(ch);
241     case SRE_CATEGORY_UNI_WORD:
242         return SRE_LOC_IS_WORD(ch);
243     case SRE_CATEGORY_UNI_NOT_WORD:
244         return !SRE_LOC_IS_WORD(ch);
245     case SRE_CATEGORY_UNI_LINEBREAK:
246         return SRE_IS_LINEBREAK(ch);
247     case SRE_CATEGORY_UNI_NOT_LINEBREAK:
248         return !SRE_IS_LINEBREAK(ch);
249 #endif
250     }
251     return 0;
252 }
253 
254 /* helpers */
255 
256 static void
data_stack_dealloc(SRE_STATE * state)257 data_stack_dealloc(SRE_STATE* state)
258 {
259     if (state->data_stack) {
260         PyMem_FREE(state->data_stack);
261         state->data_stack = NULL;
262     }
263     state->data_stack_size = state->data_stack_base = 0;
264 }
265 
266 static int
data_stack_grow(SRE_STATE * state,Py_ssize_t size)267 data_stack_grow(SRE_STATE* state, Py_ssize_t size)
268 {
269     Py_ssize_t minsize, cursize;
270     minsize = state->data_stack_base+size;
271     cursize = state->data_stack_size;
272     if (cursize < minsize) {
273         void* stack;
274         cursize = minsize+minsize/4+1024;
275         TRACE(("allocate/grow stack %" PY_FORMAT_SIZE_T "d\n", cursize));
276         stack = PyMem_REALLOC(state->data_stack, cursize);
277         if (!stack) {
278             data_stack_dealloc(state);
279             return SRE_ERROR_MEMORY;
280         }
281         state->data_stack = (char *)stack;
282         state->data_stack_size = cursize;
283     }
284     return 0;
285 }
286 
287 /* generate 8-bit version */
288 
289 #define SRE_CHAR unsigned char
290 #define SRE_AT sre_at
291 #define SRE_COUNT sre_count
292 #define SRE_CHARSET sre_charset
293 #define SRE_INFO sre_info
294 #define SRE_MATCH sre_match
295 #define SRE_MATCH_CONTEXT sre_match_context
296 #define SRE_SEARCH sre_search
297 #define SRE_LITERAL_TEMPLATE sre_literal_template
298 
299 #if defined(HAVE_UNICODE)
300 
301 #define SRE_RECURSIVE
302 #include "_sre.c"
303 #undef SRE_RECURSIVE
304 
305 #undef SRE_LITERAL_TEMPLATE
306 #undef SRE_SEARCH
307 #undef SRE_MATCH
308 #undef SRE_MATCH_CONTEXT
309 #undef SRE_INFO
310 #undef SRE_CHARSET
311 #undef SRE_COUNT
312 #undef SRE_AT
313 #undef SRE_CHAR
314 
315 /* generate 16-bit unicode version */
316 
317 #define SRE_CHAR Py_UNICODE
318 #define SRE_AT sre_uat
319 #define SRE_COUNT sre_ucount
320 #define SRE_CHARSET sre_ucharset
321 #define SRE_INFO sre_uinfo
322 #define SRE_MATCH sre_umatch
323 #define SRE_MATCH_CONTEXT sre_umatch_context
324 #define SRE_SEARCH sre_usearch
325 #define SRE_LITERAL_TEMPLATE sre_uliteral_template
326 #endif
327 
328 #endif /* SRE_RECURSIVE */
329 
330 /* -------------------------------------------------------------------- */
331 /* String matching engine */
332 
333 /* the following section is compiled twice, with different character
334    settings */
335 
336 LOCAL(int)
SRE_AT(SRE_STATE * state,SRE_CHAR * ptr,SRE_CODE at)337 SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
338 {
339     /* check if pointer is at given position */
340 
341     Py_ssize_t thisp, thatp;
342 
343     switch (at) {
344 
345     case SRE_AT_BEGINNING:
346     case SRE_AT_BEGINNING_STRING:
347         return ((void*) ptr == state->beginning);
348 
349     case SRE_AT_BEGINNING_LINE:
350         return ((void*) ptr == state->beginning ||
351                 SRE_IS_LINEBREAK((int) ptr[-1]));
352 
353     case SRE_AT_END:
354         return (((SRE_CHAR *)state->end - ptr == 1 &&
355                  SRE_IS_LINEBREAK((int) ptr[0])) ||
356                 ((void*) ptr == state->end));
357 
358     case SRE_AT_END_LINE:
359         return ((void*) ptr == state->end ||
360                 SRE_IS_LINEBREAK((int) ptr[0]));
361 
362     case SRE_AT_END_STRING:
363         return ((void*) ptr == state->end);
364 
365     case SRE_AT_BOUNDARY:
366         if (state->beginning == state->end)
367             return 0;
368         thatp = ((void*) ptr > state->beginning) ?
369             SRE_IS_WORD((int) ptr[-1]) : 0;
370         thisp = ((void*) ptr < state->end) ?
371             SRE_IS_WORD((int) ptr[0]) : 0;
372         return thisp != thatp;
373 
374     case SRE_AT_NON_BOUNDARY:
375         if (state->beginning == state->end)
376             return 0;
377         thatp = ((void*) ptr > state->beginning) ?
378             SRE_IS_WORD((int) ptr[-1]) : 0;
379         thisp = ((void*) ptr < state->end) ?
380             SRE_IS_WORD((int) ptr[0]) : 0;
381         return thisp == thatp;
382 
383     case SRE_AT_LOC_BOUNDARY:
384         if (state->beginning == state->end)
385             return 0;
386         thatp = ((void*) ptr > state->beginning) ?
387             SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
388         thisp = ((void*) ptr < state->end) ?
389             SRE_LOC_IS_WORD((int) ptr[0]) : 0;
390         return thisp != thatp;
391 
392     case SRE_AT_LOC_NON_BOUNDARY:
393         if (state->beginning == state->end)
394             return 0;
395         thatp = ((void*) ptr > state->beginning) ?
396             SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
397         thisp = ((void*) ptr < state->end) ?
398             SRE_LOC_IS_WORD((int) ptr[0]) : 0;
399         return thisp == thatp;
400 
401 #if defined(HAVE_UNICODE)
402     case SRE_AT_UNI_BOUNDARY:
403         if (state->beginning == state->end)
404             return 0;
405         thatp = ((void*) ptr > state->beginning) ?
406             SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
407         thisp = ((void*) ptr < state->end) ?
408             SRE_UNI_IS_WORD((int) ptr[0]) : 0;
409         return thisp != thatp;
410 
411     case SRE_AT_UNI_NON_BOUNDARY:
412         if (state->beginning == state->end)
413             return 0;
414         thatp = ((void*) ptr > state->beginning) ?
415             SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
416         thisp = ((void*) ptr < state->end) ?
417             SRE_UNI_IS_WORD((int) ptr[0]) : 0;
418         return thisp == thatp;
419 #endif
420 
421     }
422 
423     return 0;
424 }
425 
426 LOCAL(int)
SRE_CHARSET(SRE_CODE * set,SRE_CODE ch)427 SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
428 {
429     /* check if character is a member of the given set */
430 
431     int ok = 1;
432 
433     for (;;) {
434         switch (*set++) {
435 
436         case SRE_OP_FAILURE:
437             return !ok;
438 
439         case SRE_OP_LITERAL:
440             /* <LITERAL> <code> */
441             if (ch == set[0])
442                 return ok;
443             set++;
444             break;
445 
446         case SRE_OP_CATEGORY:
447             /* <CATEGORY> <code> */
448             if (sre_category(set[0], (int) ch))
449                 return ok;
450             set += 1;
451             break;
452 
453         case SRE_OP_CHARSET:
454             if (sizeof(SRE_CODE) == 2) {
455                 /* <CHARSET> <bitmap> (16 bits per code word) */
456                 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
457                     return ok;
458                 set += 16;
459             }
460             else {
461                 /* <CHARSET> <bitmap> (32 bits per code word) */
462                 if (ch < 256 && (set[ch >> 5] & (1u << (ch & 31))))
463                     return ok;
464                 set += 8;
465             }
466             break;
467 
468         case SRE_OP_RANGE:
469             /* <RANGE> <lower> <upper> */
470             if (set[0] <= ch && ch <= set[1])
471                 return ok;
472             set += 2;
473             break;
474 
475         case SRE_OP_NEGATE:
476             ok = !ok;
477             break;
478 
479         case SRE_OP_BIGCHARSET:
480             /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
481         {
482             Py_ssize_t count, block;
483             count = *(set++);
484 
485             if (sizeof(SRE_CODE) == 2) {
486                 block = ((unsigned char*)set)[ch >> 8];
487                 set += 128;
488                 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
489                     return ok;
490                 set += count*16;
491             }
492             else {
493                 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
494                  * warnings when c's type supports only numbers < N+1 */
495                 if (!(ch & ~65535))
496                     block = ((unsigned char*)set)[ch >> 8];
497                 else
498                     block = -1;
499                 set += 64;
500                 if (block >=0 &&
501                     (set[block*8 + ((ch & 255)>>5)] & (1u << (ch & 31))))
502                     return ok;
503                 set += count*8;
504             }
505             break;
506         }
507 
508         default:
509             /* internal error -- there's not much we can do about it
510                here, so let's just pretend it didn't match... */
511             return 0;
512         }
513     }
514 }
515 
516 LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
517 
518 LOCAL(Py_ssize_t)
SRE_COUNT(SRE_STATE * state,SRE_CODE * pattern,Py_ssize_t maxcount)519 SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
520 {
521     SRE_CODE chr;
522     SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
523     SRE_CHAR* end = (SRE_CHAR *)state->end;
524     Py_ssize_t i;
525 
526     /* adjust end */
527     if (maxcount < end - ptr && maxcount != SRE_MAXREPEAT)
528         end = ptr + maxcount;
529 
530     switch (pattern[0]) {
531 
532     case SRE_OP_IN:
533         /* repeated set */
534         TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
535         while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
536             ptr++;
537         break;
538 
539     case SRE_OP_ANY:
540         /* repeated dot wildcard. */
541         TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
542         while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
543             ptr++;
544         break;
545 
546     case SRE_OP_ANY_ALL:
547         /* repeated dot wildcard.  skip to the end of the target
548            string, and backtrack from there */
549         TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
550         ptr = end;
551         break;
552 
553     case SRE_OP_LITERAL:
554         /* repeated literal */
555         chr = pattern[1];
556         TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
557         while (ptr < end && (SRE_CODE) *ptr == chr)
558             ptr++;
559         break;
560 
561     case SRE_OP_LITERAL_IGNORE:
562         /* repeated literal */
563         chr = pattern[1];
564         TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
565         while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
566             ptr++;
567         break;
568 
569     case SRE_OP_NOT_LITERAL:
570         /* repeated non-literal */
571         chr = pattern[1];
572         TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
573         while (ptr < end && (SRE_CODE) *ptr != chr)
574             ptr++;
575         break;
576 
577     case SRE_OP_NOT_LITERAL_IGNORE:
578         /* repeated non-literal */
579         chr = pattern[1];
580         TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
581         while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
582             ptr++;
583         break;
584 
585     default:
586         /* repeated single character pattern */
587         TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
588         while ((SRE_CHAR*) state->ptr < end) {
589             i = SRE_MATCH(state, pattern);
590             if (i < 0)
591                 return i;
592             if (!i)
593                 break;
594         }
595         TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
596                (SRE_CHAR*) state->ptr - ptr));
597         return (SRE_CHAR*) state->ptr - ptr;
598     }
599 
600     TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
601            ptr - (SRE_CHAR*) state->ptr));
602     return ptr - (SRE_CHAR*) state->ptr;
603 }
604 
605 #if 0 /* not used in this release */
606 LOCAL(int)
607 SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
608 {
609     /* check if an SRE_OP_INFO block matches at the current position.
610        returns the number of SRE_CODE objects to skip if successful, 0
611        if no match */
612 
613     SRE_CHAR* end = state->end;
614     SRE_CHAR* ptr = state->ptr;
615     Py_ssize_t i;
616 
617     /* check minimal length */
618     if (pattern[3] && (end - ptr) < pattern[3])
619         return 0;
620 
621     /* check known prefix */
622     if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
623         /* <length> <skip> <prefix data> <overlap data> */
624         for (i = 0; i < pattern[5]; i++)
625             if ((SRE_CODE) ptr[i] != pattern[7 + i])
626                 return 0;
627         return pattern[0] + 2 * pattern[6];
628     }
629     return pattern[0];
630 }
631 #endif
632 
633 /* The macros below should be used to protect recursive SRE_MATCH()
634  * calls that *failed* and do *not* return immediately (IOW, those
635  * that will backtrack). Explaining:
636  *
637  * - Recursive SRE_MATCH() returned true: that's usually a success
638  *   (besides atypical cases like ASSERT_NOT), therefore there's no
639  *   reason to restore lastmark;
640  *
641  * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
642  *   is returning to the caller: If the current SRE_MATCH() is the
643  *   top function of the recursion, returning false will be a matching
644  *   failure, and it doesn't matter where lastmark is pointing to.
645  *   If it's *not* the top function, it will be a recursive SRE_MATCH()
646  *   failure by itself, and the calling SRE_MATCH() will have to deal
647  *   with the failure by the same rules explained here (it will restore
648  *   lastmark by itself if necessary);
649  *
650  * - Recursive SRE_MATCH() returned false, and will continue the
651  *   outside 'for' loop: must be protected when breaking, since the next
652  *   OP could potentially depend on lastmark;
653  *
654  * - Recursive SRE_MATCH() returned false, and will be called again
655  *   inside a local for/while loop: must be protected between each
656  *   loop iteration, since the recursive SRE_MATCH() could do anything,
657  *   and could potentially depend on lastmark.
658  *
659  * For more information, check the discussion at SF patch #712900.
660  */
661 #define LASTMARK_SAVE()     \
662     do { \
663         ctx->lastmark = state->lastmark; \
664         ctx->lastindex = state->lastindex; \
665     } while (0)
666 #define LASTMARK_RESTORE()  \
667     do { \
668         state->lastmark = ctx->lastmark; \
669         state->lastindex = ctx->lastindex; \
670     } while (0)
671 
672 #define RETURN_ERROR(i) do { return i; } while(0)
673 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
674 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
675 
676 #define RETURN_ON_ERROR(i) \
677     do { if (i < 0) RETURN_ERROR(i); } while (0)
678 #define RETURN_ON_SUCCESS(i) \
679     do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
680 #define RETURN_ON_FAILURE(i) \
681     do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
682 
683 #define SFY(x) #x
684 
685 #define DATA_STACK_ALLOC(state, type, ptr) \
686 do { \
687     alloc_pos = state->data_stack_base; \
688     TRACE(("allocating %s in %" PY_FORMAT_SIZE_T "d " \
689            "(%" PY_FORMAT_SIZE_T "d)\n", \
690            SFY(type), alloc_pos, sizeof(type))); \
691     if (sizeof(type) > state->data_stack_size - alloc_pos) { \
692         int j = data_stack_grow(state, sizeof(type)); \
693         if (j < 0) return j; \
694         if (ctx_pos != -1) \
695             DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
696     } \
697     ptr = (type*)(state->data_stack+alloc_pos); \
698     state->data_stack_base += sizeof(type); \
699 } while (0)
700 
701 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
702 do { \
703     TRACE(("looking up %s at %" PY_FORMAT_SIZE_T "d\n", SFY(type), pos)); \
704     ptr = (type*)(state->data_stack+pos); \
705 } while (0)
706 
707 #define DATA_STACK_PUSH(state, data, size) \
708 do { \
709     TRACE(("copy data in %p to %" PY_FORMAT_SIZE_T "d " \
710            "(%" PY_FORMAT_SIZE_T "d)\n", \
711            data, state->data_stack_base, size)); \
712     if (size > state->data_stack_size - state->data_stack_base) { \
713         int j = data_stack_grow(state, size); \
714         if (j < 0) return j; \
715         if (ctx_pos != -1) \
716             DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
717     } \
718     memcpy(state->data_stack+state->data_stack_base, data, size); \
719     state->data_stack_base += size; \
720 } while (0)
721 
722 #define DATA_STACK_POP(state, data, size, discard) \
723 do { \
724     TRACE(("copy data to %p from %" PY_FORMAT_SIZE_T "d " \
725            "(%" PY_FORMAT_SIZE_T "d)\n", \
726            data, state->data_stack_base-size, size)); \
727     memcpy(data, state->data_stack+state->data_stack_base-size, size); \
728     if (discard) \
729         state->data_stack_base -= size; \
730 } while (0)
731 
732 #define DATA_STACK_POP_DISCARD(state, size) \
733 do { \
734     TRACE(("discard data from %" PY_FORMAT_SIZE_T "d " \
735            "(%" PY_FORMAT_SIZE_T "d)\n", \
736            state->data_stack_base-size, size)); \
737     state->data_stack_base -= size; \
738 } while(0)
739 
740 #define DATA_PUSH(x) \
741     DATA_STACK_PUSH(state, (x), sizeof(*(x)))
742 #define DATA_POP(x) \
743     DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
744 #define DATA_POP_DISCARD(x) \
745     DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
746 #define DATA_ALLOC(t,p) \
747     DATA_STACK_ALLOC(state, t, p)
748 #define DATA_LOOKUP_AT(t,p,pos) \
749     DATA_STACK_LOOKUP_AT(state,t,p,pos)
750 
751 #define MARK_PUSH(lastmark) \
752     do if (lastmark > 0) { \
753         i = lastmark; /* ctx->lastmark may change if reallocated */ \
754         DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
755     } while (0)
756 #define MARK_POP(lastmark) \
757     do if (lastmark > 0) { \
758         DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
759     } while (0)
760 #define MARK_POP_KEEP(lastmark) \
761     do if (lastmark > 0) { \
762         DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
763     } while (0)
764 #define MARK_POP_DISCARD(lastmark) \
765     do if (lastmark > 0) { \
766         DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
767     } while (0)
768 
769 #define JUMP_NONE            0
770 #define JUMP_MAX_UNTIL_1     1
771 #define JUMP_MAX_UNTIL_2     2
772 #define JUMP_MAX_UNTIL_3     3
773 #define JUMP_MIN_UNTIL_1     4
774 #define JUMP_MIN_UNTIL_2     5
775 #define JUMP_MIN_UNTIL_3     6
776 #define JUMP_REPEAT          7
777 #define JUMP_REPEAT_ONE_1    8
778 #define JUMP_REPEAT_ONE_2    9
779 #define JUMP_MIN_REPEAT_ONE  10
780 #define JUMP_BRANCH          11
781 #define JUMP_ASSERT          12
782 #define JUMP_ASSERT_NOT      13
783 
784 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
785     DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
786     nextctx->last_ctx_pos = ctx_pos; \
787     nextctx->jump = jumpvalue; \
788     nextctx->pattern = nextpattern; \
789     ctx_pos = alloc_pos; \
790     ctx = nextctx; \
791     goto entrance; \
792     jumplabel: \
793     while (0) /* gcc doesn't like labels at end of scopes */ \
794 
795 typedef struct {
796     Py_ssize_t last_ctx_pos;
797     Py_ssize_t jump;
798     SRE_CHAR* ptr;
799     SRE_CODE* pattern;
800     Py_ssize_t count;
801     Py_ssize_t lastmark;
802     Py_ssize_t lastindex;
803     union {
804         SRE_CODE chr;
805         SRE_REPEAT* rep;
806     } u;
807 } SRE_MATCH_CONTEXT;
808 
809 /* check if string matches the given pattern.  returns <0 for
810    error, 0 for failure, and 1 for success */
811 LOCAL(Py_ssize_t)
SRE_MATCH(SRE_STATE * state,SRE_CODE * pattern)812 SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
813 {
814     SRE_CHAR* end = (SRE_CHAR *)state->end;
815     Py_ssize_t alloc_pos, ctx_pos = -1;
816     Py_ssize_t i, ret = 0;
817     Py_ssize_t jump;
818     unsigned int sigcount=0;
819 
820     SRE_MATCH_CONTEXT* ctx;
821     SRE_MATCH_CONTEXT* nextctx;
822 
823     TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
824 
825     DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
826     ctx->last_ctx_pos = -1;
827     ctx->jump = JUMP_NONE;
828     ctx->pattern = pattern;
829     ctx_pos = alloc_pos;
830 
831 entrance:
832 
833     ctx->ptr = (SRE_CHAR *)state->ptr;
834 
835     if (ctx->pattern[0] == SRE_OP_INFO) {
836         /* optimization info block */
837         /* <INFO> <1=skip> <2=flags> <3=min> ... */
838         if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
839             TRACE(("reject (got %" PY_FORMAT_SIZE_T "d chars, "
840                    "need %" PY_FORMAT_SIZE_T "d)\n",
841                    (end - ctx->ptr), (Py_ssize_t) ctx->pattern[3]));
842             RETURN_FAILURE;
843         }
844         ctx->pattern += ctx->pattern[1] + 1;
845     }
846 
847     for (;;) {
848         ++sigcount;
849         if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
850             RETURN_ERROR(SRE_ERROR_INTERRUPTED);
851 
852         switch (*ctx->pattern++) {
853 
854         case SRE_OP_MARK:
855             /* set mark */
856             /* <MARK> <gid> */
857             TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
858                    ctx->ptr, ctx->pattern[0]));
859             i = ctx->pattern[0];
860             if (i & 1)
861                 state->lastindex = i/2 + 1;
862             if (i > state->lastmark) {
863                 /* state->lastmark is the highest valid index in the
864                    state->mark array.  If it is increased by more than 1,
865                    the intervening marks must be set to NULL to signal
866                    that these marks have not been encountered. */
867                 Py_ssize_t j = state->lastmark + 1;
868                 while (j < i)
869                     state->mark[j++] = NULL;
870                 state->lastmark = i;
871             }
872             state->mark[i] = ctx->ptr;
873             ctx->pattern++;
874             break;
875 
876         case SRE_OP_LITERAL:
877             /* match literal string */
878             /* <LITERAL> <code> */
879             TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
880                    ctx->ptr, *ctx->pattern));
881             if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
882                 RETURN_FAILURE;
883             ctx->pattern++;
884             ctx->ptr++;
885             break;
886 
887         case SRE_OP_NOT_LITERAL:
888             /* match anything that is not literal character */
889             /* <NOT_LITERAL> <code> */
890             TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
891                    ctx->ptr, *ctx->pattern));
892             if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
893                 RETURN_FAILURE;
894             ctx->pattern++;
895             ctx->ptr++;
896             break;
897 
898         case SRE_OP_SUCCESS:
899             /* end of pattern */
900             TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
901             state->ptr = ctx->ptr;
902             RETURN_SUCCESS;
903 
904         case SRE_OP_AT:
905             /* match at given position */
906             /* <AT> <code> */
907             TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
908             if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
909                 RETURN_FAILURE;
910             ctx->pattern++;
911             break;
912 
913         case SRE_OP_CATEGORY:
914             /* match at given category */
915             /* <CATEGORY> <code> */
916             TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
917                    ctx->ptr, *ctx->pattern));
918             if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
919                 RETURN_FAILURE;
920             ctx->pattern++;
921             ctx->ptr++;
922             break;
923 
924         case SRE_OP_ANY:
925             /* match anything (except a newline) */
926             /* <ANY> */
927             TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
928             if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
929                 RETURN_FAILURE;
930             ctx->ptr++;
931             break;
932 
933         case SRE_OP_ANY_ALL:
934             /* match anything */
935             /* <ANY_ALL> */
936             TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
937             if (ctx->ptr >= end)
938                 RETURN_FAILURE;
939             ctx->ptr++;
940             break;
941 
942         case SRE_OP_IN:
943             /* match set member (or non_member) */
944             /* <IN> <skip> <set> */
945             TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
946             if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
947                 RETURN_FAILURE;
948             ctx->pattern += ctx->pattern[0];
949             ctx->ptr++;
950             break;
951 
952         case SRE_OP_LITERAL_IGNORE:
953             TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
954                    ctx->pattern, ctx->ptr, ctx->pattern[0]));
955             if (ctx->ptr >= end ||
956                 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
957                 RETURN_FAILURE;
958             ctx->pattern++;
959             ctx->ptr++;
960             break;
961 
962         case SRE_OP_NOT_LITERAL_IGNORE:
963             TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
964                    ctx->pattern, ctx->ptr, *ctx->pattern));
965             if (ctx->ptr >= end ||
966                 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
967                 RETURN_FAILURE;
968             ctx->pattern++;
969             ctx->ptr++;
970             break;
971 
972         case SRE_OP_IN_IGNORE:
973             TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
974             if (ctx->ptr >= end
975                 || !SRE_CHARSET(ctx->pattern+1,
976                                 (SRE_CODE)state->lower(*ctx->ptr)))
977                 RETURN_FAILURE;
978             ctx->pattern += ctx->pattern[0];
979             ctx->ptr++;
980             break;
981 
982         case SRE_OP_JUMP:
983         case SRE_OP_INFO:
984             /* jump forward */
985             /* <JUMP> <offset> */
986             TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
987                    ctx->ptr, ctx->pattern[0]));
988             ctx->pattern += ctx->pattern[0];
989             break;
990 
991         case SRE_OP_BRANCH:
992             /* alternation */
993             /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
994             TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
995             LASTMARK_SAVE();
996             ctx->u.rep = state->repeat;
997             if (ctx->u.rep)
998                 MARK_PUSH(ctx->lastmark);
999             for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
1000                 if (ctx->pattern[1] == SRE_OP_LITERAL &&
1001                     (ctx->ptr >= end ||
1002                      (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
1003                     continue;
1004                 if (ctx->pattern[1] == SRE_OP_IN &&
1005                     (ctx->ptr >= end ||
1006                      !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
1007                     continue;
1008                 state->ptr = ctx->ptr;
1009                 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
1010                 if (ret) {
1011                     if (ctx->u.rep)
1012                         MARK_POP_DISCARD(ctx->lastmark);
1013                     RETURN_ON_ERROR(ret);
1014                     RETURN_SUCCESS;
1015                 }
1016                 if (ctx->u.rep)
1017                     MARK_POP_KEEP(ctx->lastmark);
1018                 LASTMARK_RESTORE();
1019             }
1020             if (ctx->u.rep)
1021                 MARK_POP_DISCARD(ctx->lastmark);
1022             RETURN_FAILURE;
1023 
1024         case SRE_OP_REPEAT_ONE:
1025             /* match repeated sequence (maximizing regexp) */
1026 
1027             /* this operator only works if the repeated item is
1028                exactly one character wide, and we're not already
1029                collecting backtracking points.  for other cases,
1030                use the MAX_REPEAT operator */
1031 
1032             /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1033 
1034             TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1035                    ctx->pattern[1], ctx->pattern[2]));
1036 
1037             if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr)
1038                 RETURN_FAILURE; /* cannot match */
1039 
1040             state->ptr = ctx->ptr;
1041 
1042             ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1043             RETURN_ON_ERROR(ret);
1044             DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1045             ctx->count = ret;
1046             ctx->ptr += ctx->count;
1047 
1048             /* when we arrive here, count contains the number of
1049                matches, and ctx->ptr points to the tail of the target
1050                string.  check if the rest of the pattern matches,
1051                and backtrack if not. */
1052 
1053             if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1054                 RETURN_FAILURE;
1055 
1056             if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1057                 /* tail is empty.  we're finished */
1058                 state->ptr = ctx->ptr;
1059                 RETURN_SUCCESS;
1060             }
1061 
1062             LASTMARK_SAVE();
1063 
1064             if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
1065                 /* tail starts with a literal. skip positions where
1066                    the rest of the pattern cannot possibly match */
1067                 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
1068                 for (;;) {
1069                     while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
1070                            (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1071                         ctx->ptr--;
1072                         ctx->count--;
1073                     }
1074                     if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1075                         break;
1076                     state->ptr = ctx->ptr;
1077                     DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1078                             ctx->pattern+ctx->pattern[0]);
1079                     if (ret) {
1080                         RETURN_ON_ERROR(ret);
1081                         RETURN_SUCCESS;
1082                     }
1083 
1084                     LASTMARK_RESTORE();
1085 
1086                     ctx->ptr--;
1087                     ctx->count--;
1088                 }
1089 
1090             } else {
1091                 /* general case */
1092                 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
1093                     state->ptr = ctx->ptr;
1094                     DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1095                             ctx->pattern+ctx->pattern[0]);
1096                     if (ret) {
1097                         RETURN_ON_ERROR(ret);
1098                         RETURN_SUCCESS;
1099                     }
1100                     ctx->ptr--;
1101                     ctx->count--;
1102                     LASTMARK_RESTORE();
1103                 }
1104             }
1105             RETURN_FAILURE;
1106 
1107         case SRE_OP_MIN_REPEAT_ONE:
1108             /* match repeated sequence (minimizing regexp) */
1109 
1110             /* this operator only works if the repeated item is
1111                exactly one character wide, and we're not already
1112                collecting backtracking points.  for other cases,
1113                use the MIN_REPEAT operator */
1114 
1115             /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1116 
1117             TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1118                    ctx->pattern[1], ctx->pattern[2]));
1119 
1120             if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr)
1121                 RETURN_FAILURE; /* cannot match */
1122 
1123             state->ptr = ctx->ptr;
1124 
1125             if (ctx->pattern[1] == 0)
1126                 ctx->count = 0;
1127             else {
1128                 /* count using pattern min as the maximum */
1129                 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1130                 RETURN_ON_ERROR(ret);
1131                 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1132                 if (ret < (Py_ssize_t) ctx->pattern[1])
1133                     /* didn't match minimum number of times */
1134                     RETURN_FAILURE;
1135                 /* advance past minimum matches of repeat */
1136                 ctx->count = ret;
1137                 ctx->ptr += ctx->count;
1138             }
1139 
1140             if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1141                 /* tail is empty.  we're finished */
1142                 state->ptr = ctx->ptr;
1143                 RETURN_SUCCESS;
1144 
1145             } else {
1146                 /* general case */
1147                 LASTMARK_SAVE();
1148                 while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT
1149                        || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
1150                     state->ptr = ctx->ptr;
1151                     DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1152                             ctx->pattern+ctx->pattern[0]);
1153                     if (ret) {
1154                         RETURN_ON_ERROR(ret);
1155                         RETURN_SUCCESS;
1156                     }
1157                     state->ptr = ctx->ptr;
1158                     ret = SRE_COUNT(state, ctx->pattern+3, 1);
1159                     RETURN_ON_ERROR(ret);
1160                     DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1161                     if (ret == 0)
1162                         break;
1163                     assert(ret == 1);
1164                     ctx->ptr++;
1165                     ctx->count++;
1166                     LASTMARK_RESTORE();
1167                 }
1168             }
1169             RETURN_FAILURE;
1170 
1171         case SRE_OP_REPEAT:
1172             /* create repeat context.  all the hard work is done
1173                by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1174             /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1175             TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1176                    ctx->pattern[1], ctx->pattern[2]));
1177 
1178             /* install new repeat context */
1179             ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
1180             if (!ctx->u.rep) {
1181                 PyErr_NoMemory();
1182                 RETURN_FAILURE;
1183             }
1184             ctx->u.rep->count = -1;
1185             ctx->u.rep->pattern = ctx->pattern;
1186             ctx->u.rep->prev = state->repeat;
1187             ctx->u.rep->last_ptr = NULL;
1188             state->repeat = ctx->u.rep;
1189 
1190             state->ptr = ctx->ptr;
1191             DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
1192             state->repeat = ctx->u.rep->prev;
1193             PyObject_FREE(ctx->u.rep);
1194 
1195             if (ret) {
1196                 RETURN_ON_ERROR(ret);
1197                 RETURN_SUCCESS;
1198             }
1199             RETURN_FAILURE;
1200 
1201         case SRE_OP_MAX_UNTIL:
1202             /* maximizing repeat */
1203             /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1204 
1205             /* FIXME: we probably need to deal with zero-width
1206                matches in here... */
1207 
1208             ctx->u.rep = state->repeat;
1209             if (!ctx->u.rep)
1210                 RETURN_ERROR(SRE_ERROR_STATE);
1211 
1212             state->ptr = ctx->ptr;
1213 
1214             ctx->count = ctx->u.rep->count+1;
1215 
1216             TRACE(("|%p|%p|MAX_UNTIL %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
1217                    ctx->ptr, ctx->count));
1218 
1219             if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
1220                 /* not enough matches */
1221                 ctx->u.rep->count = ctx->count;
1222                 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1223                         ctx->u.rep->pattern+3);
1224                 if (ret) {
1225                     RETURN_ON_ERROR(ret);
1226                     RETURN_SUCCESS;
1227                 }
1228                 ctx->u.rep->count = ctx->count-1;
1229                 state->ptr = ctx->ptr;
1230                 RETURN_FAILURE;
1231             }
1232 
1233             if ((ctx->count < (Py_ssize_t) ctx->u.rep->pattern[2] ||
1234                 ctx->u.rep->pattern[2] == SRE_MAXREPEAT) &&
1235                 state->ptr != ctx->u.rep->last_ptr) {
1236                 /* we may have enough matches, but if we can
1237                    match another item, do so */
1238                 ctx->u.rep->count = ctx->count;
1239                 LASTMARK_SAVE();
1240                 MARK_PUSH(ctx->lastmark);
1241                 /* zero-width match protection */
1242                 DATA_PUSH(&ctx->u.rep->last_ptr);
1243                 ctx->u.rep->last_ptr = state->ptr;
1244                 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1245                         ctx->u.rep->pattern+3);
1246                 DATA_POP(&ctx->u.rep->last_ptr);
1247                 if (ret) {
1248                     MARK_POP_DISCARD(ctx->lastmark);
1249                     RETURN_ON_ERROR(ret);
1250                     RETURN_SUCCESS;
1251                 }
1252                 MARK_POP(ctx->lastmark);
1253                 LASTMARK_RESTORE();
1254                 ctx->u.rep->count = ctx->count-1;
1255                 state->ptr = ctx->ptr;
1256             }
1257 
1258             /* cannot match more repeated items here.  make sure the
1259                tail matches */
1260             state->repeat = ctx->u.rep->prev;
1261             DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
1262             RETURN_ON_SUCCESS(ret);
1263             state->repeat = ctx->u.rep;
1264             state->ptr = ctx->ptr;
1265             RETURN_FAILURE;
1266 
1267         case SRE_OP_MIN_UNTIL:
1268             /* minimizing repeat */
1269             /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1270 
1271             ctx->u.rep = state->repeat;
1272             if (!ctx->u.rep)
1273                 RETURN_ERROR(SRE_ERROR_STATE);
1274 
1275             state->ptr = ctx->ptr;
1276 
1277             ctx->count = ctx->u.rep->count+1;
1278 
1279             TRACE(("|%p|%p|MIN_UNTIL %" PY_FORMAT_SIZE_T "d %p\n", ctx->pattern,
1280                    ctx->ptr, ctx->count, ctx->u.rep->pattern));
1281 
1282             if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
1283                 /* not enough matches */
1284                 ctx->u.rep->count = ctx->count;
1285                 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1286                         ctx->u.rep->pattern+3);
1287                 if (ret) {
1288                     RETURN_ON_ERROR(ret);
1289                     RETURN_SUCCESS;
1290                 }
1291                 ctx->u.rep->count = ctx->count-1;
1292                 state->ptr = ctx->ptr;
1293                 RETURN_FAILURE;
1294             }
1295 
1296             LASTMARK_SAVE();
1297 
1298             /* see if the tail matches */
1299             state->repeat = ctx->u.rep->prev;
1300             DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
1301             if (ret) {
1302                 RETURN_ON_ERROR(ret);
1303                 RETURN_SUCCESS;
1304             }
1305 
1306             state->repeat = ctx->u.rep;
1307             state->ptr = ctx->ptr;
1308 
1309             LASTMARK_RESTORE();
1310 
1311             if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2]
1312                 && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) ||
1313                 state->ptr == ctx->u.rep->last_ptr)
1314                 RETURN_FAILURE;
1315 
1316             ctx->u.rep->count = ctx->count;
1317             /* zero-width match protection */
1318             DATA_PUSH(&ctx->u.rep->last_ptr);
1319             ctx->u.rep->last_ptr = state->ptr;
1320             DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1321                     ctx->u.rep->pattern+3);
1322             DATA_POP(&ctx->u.rep->last_ptr);
1323             if (ret) {
1324                 RETURN_ON_ERROR(ret);
1325                 RETURN_SUCCESS;
1326             }
1327             ctx->u.rep->count = ctx->count-1;
1328             state->ptr = ctx->ptr;
1329             RETURN_FAILURE;
1330 
1331         case SRE_OP_GROUPREF:
1332             /* match backreference */
1333             TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1334                    ctx->ptr, ctx->pattern[0]));
1335             i = ctx->pattern[0];
1336             {
1337                 Py_ssize_t groupref = i+i;
1338                 if (groupref >= state->lastmark) {
1339                     RETURN_FAILURE;
1340                 } else {
1341                     SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1342                     SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1343                     if (!p || !e || e < p)
1344                         RETURN_FAILURE;
1345                     while (p < e) {
1346                         if (ctx->ptr >= end || *ctx->ptr != *p)
1347                             RETURN_FAILURE;
1348                         p++; ctx->ptr++;
1349                     }
1350                 }
1351             }
1352             ctx->pattern++;
1353             break;
1354 
1355         case SRE_OP_GROUPREF_IGNORE:
1356             /* match backreference */
1357             TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1358                    ctx->ptr, ctx->pattern[0]));
1359             i = ctx->pattern[0];
1360             {
1361                 Py_ssize_t groupref = i+i;
1362                 if (groupref >= state->lastmark) {
1363                     RETURN_FAILURE;
1364                 } else {
1365                     SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1366                     SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1367                     if (!p || !e || e < p)
1368                         RETURN_FAILURE;
1369                     while (p < e) {
1370                         if (ctx->ptr >= end ||
1371                             state->lower(*ctx->ptr) != state->lower(*p))
1372                             RETURN_FAILURE;
1373                         p++; ctx->ptr++;
1374                     }
1375                 }
1376             }
1377             ctx->pattern++;
1378             break;
1379 
1380         case SRE_OP_GROUPREF_EXISTS:
1381             TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1382                    ctx->ptr, ctx->pattern[0]));
1383             /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1384             i = ctx->pattern[0];
1385             {
1386                 Py_ssize_t groupref = i+i;
1387                 if (groupref >= state->lastmark) {
1388                     ctx->pattern += ctx->pattern[1];
1389                     break;
1390                 } else {
1391                     SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1392                     SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1393                     if (!p || !e || e < p) {
1394                         ctx->pattern += ctx->pattern[1];
1395                         break;
1396                     }
1397                 }
1398             }
1399             ctx->pattern += 2;
1400             break;
1401 
1402         case SRE_OP_ASSERT:
1403             /* assert subpattern */
1404             /* <ASSERT> <skip> <back> <pattern> */
1405             TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1406                    ctx->ptr, ctx->pattern[1]));
1407             if (ctx->ptr - (SRE_CHAR *)state->beginning < (Py_ssize_t)ctx->pattern[1])
1408                 RETURN_FAILURE;
1409             state->ptr = ctx->ptr - ctx->pattern[1];
1410             DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
1411             RETURN_ON_FAILURE(ret);
1412             ctx->pattern += ctx->pattern[0];
1413             break;
1414 
1415         case SRE_OP_ASSERT_NOT:
1416             /* assert not subpattern */
1417             /* <ASSERT_NOT> <skip> <back> <pattern> */
1418             TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1419                    ctx->ptr, ctx->pattern[1]));
1420             if (ctx->ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)ctx->pattern[1]) {
1421                 state->ptr = ctx->ptr - ctx->pattern[1];
1422                 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
1423                 if (ret) {
1424                     RETURN_ON_ERROR(ret);
1425                     RETURN_FAILURE;
1426                 }
1427             }
1428             ctx->pattern += ctx->pattern[0];
1429             break;
1430 
1431         case SRE_OP_FAILURE:
1432             /* immediate failure */
1433             TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1434             RETURN_FAILURE;
1435 
1436         default:
1437             TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1438                    ctx->pattern[-1]));
1439             RETURN_ERROR(SRE_ERROR_ILLEGAL);
1440         }
1441     }
1442 
1443 exit:
1444     ctx_pos = ctx->last_ctx_pos;
1445     jump = ctx->jump;
1446     DATA_POP_DISCARD(ctx);
1447     if (ctx_pos == -1)
1448         return ret;
1449     DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1450 
1451     switch (jump) {
1452         case JUMP_MAX_UNTIL_2:
1453             TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1454             goto jump_max_until_2;
1455         case JUMP_MAX_UNTIL_3:
1456             TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1457             goto jump_max_until_3;
1458         case JUMP_MIN_UNTIL_2:
1459             TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1460             goto jump_min_until_2;
1461         case JUMP_MIN_UNTIL_3:
1462             TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1463             goto jump_min_until_3;
1464         case JUMP_BRANCH:
1465             TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1466             goto jump_branch;
1467         case JUMP_MAX_UNTIL_1:
1468             TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1469             goto jump_max_until_1;
1470         case JUMP_MIN_UNTIL_1:
1471             TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1472             goto jump_min_until_1;
1473         case JUMP_REPEAT:
1474             TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1475             goto jump_repeat;
1476         case JUMP_REPEAT_ONE_1:
1477             TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1478             goto jump_repeat_one_1;
1479         case JUMP_REPEAT_ONE_2:
1480             TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1481             goto jump_repeat_one_2;
1482         case JUMP_MIN_REPEAT_ONE:
1483             TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1484             goto jump_min_repeat_one;
1485         case JUMP_ASSERT:
1486             TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1487             goto jump_assert;
1488         case JUMP_ASSERT_NOT:
1489             TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1490             goto jump_assert_not;
1491         case JUMP_NONE:
1492             TRACE(("|%p|%p|RETURN %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
1493                    ctx->ptr, ret));
1494             break;
1495     }
1496 
1497     return ret; /* should never get here */
1498 }
1499 
1500 LOCAL(Py_ssize_t)
SRE_SEARCH(SRE_STATE * state,SRE_CODE * pattern)1501 SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1502 {
1503     SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1504     SRE_CHAR* end = (SRE_CHAR *)state->end;
1505     Py_ssize_t status = 0;
1506     Py_ssize_t prefix_len = 0;
1507     Py_ssize_t prefix_skip = 0;
1508     SRE_CODE* prefix = NULL;
1509     SRE_CODE* charset = NULL;
1510     SRE_CODE* overlap = NULL;
1511     int flags = 0;
1512 
1513     if (ptr > end)
1514         return 0;
1515 
1516     if (pattern[0] == SRE_OP_INFO) {
1517         /* optimization info block */
1518         /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info>  */
1519 
1520         flags = pattern[2];
1521 
1522         if (pattern[3] && end - ptr < (Py_ssize_t)pattern[3]) {
1523             TRACE(("reject (got %u chars, need %u)\n",
1524                    (unsigned int)(end - ptr), pattern[3]));
1525             return 0;
1526         }
1527         if (pattern[3] > 1) {
1528             /* adjust end point (but make sure we leave at least one
1529                character in there, so literal search will work) */
1530             end -= pattern[3]-1;
1531             if (end <= ptr)
1532                 end = ptr+1;
1533         }
1534 
1535         if (flags & SRE_INFO_PREFIX) {
1536             /* pattern starts with a known prefix */
1537             /* <length> <skip> <prefix data> <overlap data> */
1538             prefix_len = pattern[5];
1539             prefix_skip = pattern[6];
1540             prefix = pattern + 7;
1541             overlap = prefix + prefix_len - 1;
1542         } else if (flags & SRE_INFO_CHARSET)
1543             /* pattern starts with a character from a known set */
1544             /* <charset> */
1545             charset = pattern + 5;
1546 
1547         pattern += 1 + pattern[1];
1548     }
1549 
1550     TRACE(("prefix = %p %" PY_FORMAT_SIZE_T "d %" PY_FORMAT_SIZE_T "d\n",
1551            prefix, prefix_len, prefix_skip));
1552     TRACE(("charset = %p\n", charset));
1553 
1554 #if defined(USE_FAST_SEARCH)
1555     if (prefix_len > 1) {
1556         /* pattern starts with a known prefix.  use the overlap
1557            table to skip forward as fast as we possibly can */
1558         Py_ssize_t i = 0;
1559         end = (SRE_CHAR *)state->end;
1560         while (ptr < end) {
1561             for (;;) {
1562                 if ((SRE_CODE) ptr[0] != prefix[i]) {
1563                     if (!i)
1564                         break;
1565                     else
1566                         i = overlap[i];
1567                 } else {
1568                     if (++i == prefix_len) {
1569                         /* found a potential match */
1570                         TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1571                         state->start = ptr + 1 - prefix_len;
1572                         state->ptr = ptr + 1 - prefix_len + prefix_skip;
1573                         if (flags & SRE_INFO_LITERAL)
1574                             return 1; /* we got all of it */
1575                         status = SRE_MATCH(state, pattern + 2*prefix_skip);
1576                         if (status != 0)
1577                             return status;
1578                         /* close but no cigar -- try again */
1579                         i = overlap[i];
1580                     }
1581                     break;
1582                 }
1583             }
1584             ptr++;
1585         }
1586         return 0;
1587     }
1588 #endif
1589 
1590     if (pattern[0] == SRE_OP_LITERAL) {
1591         /* pattern starts with a literal character.  this is used
1592            for short prefixes, and if fast search is disabled */
1593         SRE_CODE chr = pattern[1];
1594         end = (SRE_CHAR *)state->end;
1595         for (;;) {
1596             while (ptr < end && (SRE_CODE) ptr[0] != chr)
1597                 ptr++;
1598             if (ptr >= end)
1599                 return 0;
1600             TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1601             state->start = ptr;
1602             state->ptr = ++ptr;
1603             if (flags & SRE_INFO_LITERAL)
1604                 return 1; /* we got all of it */
1605             status = SRE_MATCH(state, pattern + 2);
1606             if (status != 0)
1607                 break;
1608         }
1609     } else if (charset) {
1610         /* pattern starts with a character from a known set */
1611         end = (SRE_CHAR *)state->end;
1612         for (;;) {
1613             while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
1614                 ptr++;
1615             if (ptr >= end)
1616                 return 0;
1617             TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1618             state->start = ptr;
1619             state->ptr = ptr;
1620             status = SRE_MATCH(state, pattern);
1621             if (status != 0)
1622                 break;
1623             ptr++;
1624         }
1625     } else {
1626         /* general case */
1627         assert(ptr <= end);
1628         while (1) {
1629             TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1630             state->start = state->ptr = ptr;
1631             status = SRE_MATCH(state, pattern);
1632             if (status != 0 || ptr >= end)
1633                 break;
1634             ptr++;
1635         }
1636     }
1637 
1638     return status;
1639 }
1640 
1641 LOCAL(int)
SRE_LITERAL_TEMPLATE(SRE_CHAR * ptr,Py_ssize_t len)1642 SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
1643 {
1644     /* check if given string is a literal template (i.e. no escapes) */
1645     while (len-- > 0)
1646         if (*ptr++ == '\\')
1647             return 0;
1648     return 1;
1649 }
1650 
1651 #if !defined(SRE_RECURSIVE)
1652 
1653 /* -------------------------------------------------------------------- */
1654 /* factories and destructors */
1655 
1656 /* see sre.h for object declarations */
1657 static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1658 static PyObject*pattern_scanner(PatternObject*, PyObject*);
1659 
1660 static PyObject *
sre_codesize(PyObject * self,PyObject * unused)1661 sre_codesize(PyObject* self, PyObject *unused)
1662 {
1663     return PyInt_FromSize_t(sizeof(SRE_CODE));
1664 }
1665 
1666 static PyObject *
sre_getlower(PyObject * self,PyObject * args)1667 sre_getlower(PyObject* self, PyObject* args)
1668 {
1669     int character, flags;
1670     if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1671         return NULL;
1672     if (flags & SRE_FLAG_LOCALE)
1673         return Py_BuildValue("i", sre_lower_locale(character));
1674     if (flags & SRE_FLAG_UNICODE)
1675 #if defined(HAVE_UNICODE)
1676         return Py_BuildValue("i", sre_lower_unicode(character));
1677 #else
1678         return Py_BuildValue("i", sre_lower_locale(character));
1679 #endif
1680     return Py_BuildValue("i", sre_lower(character));
1681 }
1682 
1683 LOCAL(void)
state_reset(SRE_STATE * state)1684 state_reset(SRE_STATE* state)
1685 {
1686     /* FIXME: dynamic! */
1687     /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1688 
1689     state->lastmark = -1;
1690     state->lastindex = -1;
1691 
1692     state->repeat = NULL;
1693 
1694     data_stack_dealloc(state);
1695 }
1696 
1697 static void*
getstring(PyObject * string,Py_ssize_t * p_length,int * p_charsize)1698 getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
1699 {
1700     /* given a python object, return a data pointer, a length (in
1701        characters), and a character size.  return NULL if the object
1702        is not a string (or not compatible) */
1703 
1704     PyBufferProcs *buffer;
1705     Py_ssize_t size, bytes;
1706     int charsize;
1707     void* ptr;
1708 
1709 #if defined(HAVE_UNICODE)
1710     if (PyUnicode_Check(string)) {
1711         /* unicode strings doesn't always support the buffer interface */
1712         ptr = (void*) PyUnicode_AS_DATA(string);
1713         /* bytes = PyUnicode_GET_DATA_SIZE(string); */
1714         size = PyUnicode_GET_SIZE(string);
1715         charsize = sizeof(Py_UNICODE);
1716 
1717     } else {
1718 #endif
1719 
1720     /* get pointer to string buffer */
1721     buffer = Py_TYPE(string)->tp_as_buffer;
1722     if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1723         buffer->bf_getsegcount(string, NULL) != 1) {
1724         PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1725         return NULL;
1726     }
1727 
1728     /* determine buffer size */
1729     bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1730     if (bytes < 0) {
1731         PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1732         return NULL;
1733     }
1734 
1735     /* determine character size */
1736 #if PY_VERSION_HEX >= 0x01060000
1737     size = PyObject_Size(string);
1738 #else
1739     size = PyObject_Length(string);
1740 #endif
1741 
1742     if (PyString_Check(string) || bytes == size)
1743         charsize = 1;
1744 #if defined(HAVE_UNICODE)
1745     else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
1746         charsize = sizeof(Py_UNICODE);
1747 #endif
1748     else {
1749         PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1750         return NULL;
1751     }
1752 
1753 #if defined(HAVE_UNICODE)
1754     }
1755 #endif
1756 
1757     *p_length = size;
1758     *p_charsize = charsize;
1759 
1760     return ptr;
1761 }
1762 
1763 LOCAL(PyObject*)
state_init(SRE_STATE * state,PatternObject * pattern,PyObject * string,Py_ssize_t start,Py_ssize_t end)1764 state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1765            Py_ssize_t start, Py_ssize_t end)
1766 {
1767     /* prepare state object */
1768 
1769     Py_ssize_t length;
1770     int charsize;
1771     void* ptr;
1772 
1773     memset(state, 0, sizeof(SRE_STATE));
1774 
1775     state->lastmark = -1;
1776     state->lastindex = -1;
1777 
1778     ptr = getstring(string, &length, &charsize);
1779     if (!ptr)
1780         return NULL;
1781 
1782     /* adjust boundaries */
1783     if (start < 0)
1784         start = 0;
1785     else if (start > length)
1786         start = length;
1787 
1788     if (end < 0)
1789         end = 0;
1790     else if (end > length)
1791         end = length;
1792 
1793     state->charsize = charsize;
1794 
1795     state->beginning = ptr;
1796 
1797     state->start = (void*) ((char*) ptr + start * state->charsize);
1798     state->end = (void*) ((char*) ptr + end * state->charsize);
1799 
1800     Py_INCREF(string);
1801     state->string = string;
1802     state->pos = start;
1803     state->endpos = end;
1804 
1805     if (pattern->flags & SRE_FLAG_LOCALE)
1806         state->lower = sre_lower_locale;
1807     else if (pattern->flags & SRE_FLAG_UNICODE)
1808 #if defined(HAVE_UNICODE)
1809         state->lower = sre_lower_unicode;
1810 #else
1811         state->lower = sre_lower_locale;
1812 #endif
1813     else
1814         state->lower = sre_lower;
1815 
1816     return string;
1817 }
1818 
1819 LOCAL(void)
state_fini(SRE_STATE * state)1820 state_fini(SRE_STATE* state)
1821 {
1822     Py_XDECREF(state->string);
1823     data_stack_dealloc(state);
1824 }
1825 
1826 /* calculate offset from start of string */
1827 #define STATE_OFFSET(state, member)\
1828     (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1829 
1830 LOCAL(PyObject*)
state_getslice(SRE_STATE * state,Py_ssize_t index,PyObject * string,int empty)1831 state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
1832 {
1833     Py_ssize_t i, j;
1834 
1835     index = (index - 1) * 2;
1836 
1837     if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
1838         if (empty)
1839             /* want empty string */
1840             i = j = 0;
1841         else {
1842             Py_INCREF(Py_None);
1843             return Py_None;
1844         }
1845     } else {
1846         i = STATE_OFFSET(state, state->mark[index]);
1847         j = STATE_OFFSET(state, state->mark[index+1]);
1848     }
1849 
1850     return PySequence_GetSlice(string, i, j);
1851 }
1852 
1853 static void
pattern_error(int status)1854 pattern_error(int status)
1855 {
1856     switch (status) {
1857     case SRE_ERROR_RECURSION_LIMIT:
1858         PyErr_SetString(
1859             PyExc_RuntimeError,
1860             "maximum recursion limit exceeded"
1861             );
1862         break;
1863     case SRE_ERROR_MEMORY:
1864         PyErr_NoMemory();
1865         break;
1866     case SRE_ERROR_INTERRUPTED:
1867     /* An exception has already been raised, so let it fly */
1868         break;
1869     default:
1870         /* other error codes indicate compiler/engine bugs */
1871         PyErr_SetString(
1872             PyExc_RuntimeError,
1873             "internal error in regular expression engine"
1874             );
1875     }
1876 }
1877 
1878 static void
pattern_dealloc(PatternObject * self)1879 pattern_dealloc(PatternObject* self)
1880 {
1881     if (self->weakreflist != NULL)
1882         PyObject_ClearWeakRefs((PyObject *) self);
1883     Py_XDECREF(self->pattern);
1884     Py_XDECREF(self->groupindex);
1885     Py_XDECREF(self->indexgroup);
1886     PyObject_DEL(self);
1887 }
1888 
1889 static int
check_args_size(const char * name,PyObject * args,PyObject * kw,int n)1890 check_args_size(const char *name, PyObject* args, PyObject* kw, int n)
1891 {
1892     Py_ssize_t m = PyTuple_GET_SIZE(args) + (kw ? PyDict_Size(kw) : 0);
1893     if (m <= n)
1894         return 1;
1895     PyErr_Format(PyExc_TypeError,
1896                  "%s() takes at most %d positional arguments (%zd given)",
1897                  name, n, m);
1898     return 0;
1899 }
1900 
1901 static PyObject*
fix_string_param(PyObject * string,PyObject * string2,const char * oldname)1902 fix_string_param(PyObject *string, PyObject *string2, const char *oldname)
1903 {
1904     if (string2 != NULL) {
1905         char buf[100];
1906         if (string != NULL) {
1907             PyErr_Format(PyExc_TypeError,
1908                          "Argument given by name ('%s') and position (1)",
1909                          oldname);
1910             return NULL;
1911         }
1912         sprintf(buf, "The '%s' keyword parameter name is deprecated.  "
1913                      "Use 'string' instead.", oldname);
1914         if (PyErr_Warn(PyExc_DeprecationWarning, buf) < 0)
1915             return NULL;
1916         return string2;
1917     }
1918     if (string == NULL) {
1919         PyErr_SetString(PyExc_TypeError,
1920                         "Required argument 'string' (pos 1) not found");
1921         return NULL;
1922     }
1923     return string;
1924 }
1925 
1926 static PyObject*
pattern_match(PatternObject * self,PyObject * args,PyObject * kw)1927 pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
1928 {
1929     SRE_STATE state;
1930     int status;
1931 
1932     PyObject *string = NULL, *string2 = NULL;
1933     Py_ssize_t start = 0;
1934     Py_ssize_t end = PY_SSIZE_T_MAX;
1935     static char* kwlist[] = { "string", "pos", "endpos", "pattern", NULL };
1936     if (!check_args_size("match", args, kw, 3))
1937         return NULL;
1938 
1939     if (!PyArg_ParseTupleAndKeywords(args, kw, "|OnnO:match", kwlist,
1940                                      &string, &start, &end, &string2))
1941         return NULL;
1942 
1943     string = fix_string_param(string, string2, "pattern");
1944     if (!string)
1945         return NULL;
1946 
1947     string = state_init(&state, self, string, start, end);
1948     if (!string)
1949         return NULL;
1950 
1951     state.ptr = state.start;
1952 
1953     TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1954 
1955     if (state.charsize == 1) {
1956         status = sre_match(&state, PatternObject_GetCode(self));
1957     } else {
1958 #if defined(HAVE_UNICODE)
1959         status = sre_umatch(&state, PatternObject_GetCode(self));
1960 #endif
1961     }
1962 
1963     TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1964     if (PyErr_Occurred())
1965         return NULL;
1966 
1967     state_fini(&state);
1968 
1969     return pattern_new_match(self, &state, status);
1970 }
1971 
1972 static PyObject*
pattern_search(PatternObject * self,PyObject * args,PyObject * kw)1973 pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
1974 {
1975     SRE_STATE state;
1976     int status;
1977 
1978     PyObject *string = NULL, *string2 = NULL;
1979     Py_ssize_t start = 0;
1980     Py_ssize_t end = PY_SSIZE_T_MAX;
1981     static char* kwlist[] = { "string", "pos", "endpos", "pattern", NULL };
1982     if (!check_args_size("search", args, kw, 3))
1983         return NULL;
1984 
1985     if (!PyArg_ParseTupleAndKeywords(args, kw, "|OnnO:search", kwlist,
1986                                      &string, &start, &end, &string2))
1987         return NULL;
1988 
1989     string = fix_string_param(string, string2, "pattern");
1990     if (!string)
1991         return NULL;
1992 
1993     string = state_init(&state, self, string, start, end);
1994     if (!string)
1995         return NULL;
1996 
1997     TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1998 
1999     if (state.charsize == 1) {
2000         status = sre_search(&state, PatternObject_GetCode(self));
2001     } else {
2002 #if defined(HAVE_UNICODE)
2003         status = sre_usearch(&state, PatternObject_GetCode(self));
2004 #endif
2005     }
2006 
2007     TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
2008 
2009     state_fini(&state);
2010 
2011     if (PyErr_Occurred())
2012         return NULL;
2013 
2014     return pattern_new_match(self, &state, status);
2015 }
2016 
2017 static PyObject*
call(char * module,char * function,PyObject * args)2018 call(char* module, char* function, PyObject* args)
2019 {
2020     PyObject* name;
2021     PyObject* mod;
2022     PyObject* func;
2023     PyObject* result;
2024 
2025     if (!args)
2026         return NULL;
2027     name = PyString_FromString(module);
2028     if (!name)
2029         return NULL;
2030     mod = PyImport_Import(name);
2031     Py_DECREF(name);
2032     if (!mod)
2033         return NULL;
2034     func = PyObject_GetAttrString(mod, function);
2035     Py_DECREF(mod);
2036     if (!func)
2037         return NULL;
2038     result = PyObject_CallObject(func, args);
2039     Py_DECREF(func);
2040     Py_DECREF(args);
2041     return result;
2042 }
2043 
2044 #ifdef USE_BUILTIN_COPY
2045 static int
deepcopy(PyObject ** object,PyObject * memo)2046 deepcopy(PyObject** object, PyObject* memo)
2047 {
2048     PyObject* copy;
2049 
2050     copy = call(
2051         "copy", "deepcopy",
2052         PyTuple_Pack(2, *object, memo)
2053         );
2054     if (!copy)
2055         return 0;
2056 
2057     Py_SETREF(*object, copy);
2058 
2059     return 1; /* success */
2060 }
2061 #endif
2062 
2063 static PyObject*
join_list(PyObject * list,PyObject * string)2064 join_list(PyObject* list, PyObject* string)
2065 {
2066     /* join list elements */
2067 
2068     PyObject* joiner;
2069 #if PY_VERSION_HEX >= 0x01060000
2070     PyObject* function;
2071     PyObject* args;
2072 #endif
2073     PyObject* result;
2074 
2075     joiner = PySequence_GetSlice(string, 0, 0);
2076     if (!joiner)
2077         return NULL;
2078 
2079     if (PyList_GET_SIZE(list) == 0) {
2080         Py_DECREF(list);
2081         return joiner;
2082     }
2083 
2084 #if PY_VERSION_HEX >= 0x01060000
2085     function = PyObject_GetAttrString(joiner, "join");
2086     if (!function) {
2087         Py_DECREF(joiner);
2088         return NULL;
2089     }
2090     args = PyTuple_New(1);
2091     if (!args) {
2092         Py_DECREF(function);
2093         Py_DECREF(joiner);
2094         return NULL;
2095     }
2096     PyTuple_SET_ITEM(args, 0, list);
2097     result = PyObject_CallObject(function, args);
2098     Py_DECREF(args); /* also removes list */
2099     Py_DECREF(function);
2100 #else
2101     result = call(
2102         "string", "join",
2103         PyTuple_Pack(2, list, joiner)
2104         );
2105 #endif
2106     Py_DECREF(joiner);
2107 
2108     return result;
2109 }
2110 
2111 static PyObject*
pattern_findall(PatternObject * self,PyObject * args,PyObject * kw)2112 pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
2113 {
2114     SRE_STATE state;
2115     PyObject* list;
2116     int status;
2117     Py_ssize_t i, b, e;
2118 
2119     PyObject *string = NULL, *string2 = NULL;
2120     Py_ssize_t start = 0;
2121     Py_ssize_t end = PY_SSIZE_T_MAX;
2122     static char* kwlist[] = { "string", "pos", "endpos", "source", NULL };
2123     if (!check_args_size("findall", args, kw, 3))
2124         return NULL;
2125 
2126     if (!PyArg_ParseTupleAndKeywords(args, kw, "|OnnO:findall", kwlist,
2127                                      &string, &start, &end, &string2))
2128         return NULL;
2129 
2130     string = fix_string_param(string, string2, "source");
2131     if (!string)
2132         return NULL;
2133 
2134     string = state_init(&state, self, string, start, end);
2135     if (!string)
2136         return NULL;
2137 
2138     list = PyList_New(0);
2139     if (!list) {
2140         state_fini(&state);
2141         return NULL;
2142     }
2143 
2144     while (state.start <= state.end) {
2145 
2146         PyObject* item;
2147 
2148         state_reset(&state);
2149 
2150         state.ptr = state.start;
2151 
2152         if (state.charsize == 1) {
2153             status = sre_search(&state, PatternObject_GetCode(self));
2154         } else {
2155 #if defined(HAVE_UNICODE)
2156             status = sre_usearch(&state, PatternObject_GetCode(self));
2157 #endif
2158         }
2159 
2160 	if (PyErr_Occurred())
2161 	    goto error;
2162 
2163         if (status <= 0) {
2164             if (status == 0)
2165                 break;
2166             pattern_error(status);
2167             goto error;
2168         }
2169 
2170         /* don't bother to build a match object */
2171         switch (self->groups) {
2172         case 0:
2173             b = STATE_OFFSET(&state, state.start);
2174             e = STATE_OFFSET(&state, state.ptr);
2175             item = PySequence_GetSlice(string, b, e);
2176             if (!item)
2177                 goto error;
2178             break;
2179         case 1:
2180             item = state_getslice(&state, 1, string, 1);
2181             if (!item)
2182                 goto error;
2183             break;
2184         default:
2185             item = PyTuple_New(self->groups);
2186             if (!item)
2187                 goto error;
2188             for (i = 0; i < self->groups; i++) {
2189                 PyObject* o = state_getslice(&state, i+1, string, 1);
2190                 if (!o) {
2191                     Py_DECREF(item);
2192                     goto error;
2193                 }
2194                 PyTuple_SET_ITEM(item, i, o);
2195             }
2196             break;
2197         }
2198 
2199         status = PyList_Append(list, item);
2200         Py_DECREF(item);
2201         if (status < 0)
2202             goto error;
2203 
2204         if (state.ptr == state.start)
2205             state.start = (void*) ((char*) state.ptr + state.charsize);
2206         else
2207             state.start = state.ptr;
2208 
2209     }
2210 
2211     state_fini(&state);
2212     return list;
2213 
2214 error:
2215     Py_DECREF(list);
2216     state_fini(&state);
2217     return NULL;
2218 
2219 }
2220 
2221 #if PY_VERSION_HEX >= 0x02020000
2222 static PyObject*
pattern_finditer(PatternObject * pattern,PyObject * args)2223 pattern_finditer(PatternObject* pattern, PyObject* args)
2224 {
2225     PyObject* scanner;
2226     PyObject* search;
2227     PyObject* iterator;
2228 
2229     scanner = pattern_scanner(pattern, args);
2230     if (!scanner)
2231         return NULL;
2232 
2233     search = PyObject_GetAttrString(scanner, "search");
2234     Py_DECREF(scanner);
2235     if (!search)
2236         return NULL;
2237 
2238     iterator = PyCallIter_New(search, Py_None);
2239     Py_DECREF(search);
2240 
2241     return iterator;
2242 }
2243 #endif
2244 
2245 static PyObject*
pattern_split(PatternObject * self,PyObject * args,PyObject * kw)2246 pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2247 {
2248     SRE_STATE state;
2249     PyObject* list;
2250     PyObject* item;
2251     int status;
2252     Py_ssize_t n;
2253     Py_ssize_t i;
2254     void* last;
2255 
2256     PyObject *string = NULL, *string2 = NULL;
2257     Py_ssize_t maxsplit = 0;
2258     static char* kwlist[] = { "string", "maxsplit", "source", NULL };
2259     if (!check_args_size("split", args, kw, 2))
2260         return NULL;
2261 
2262     if (!PyArg_ParseTupleAndKeywords(args, kw, "|OnO:split", kwlist,
2263                                      &string, &maxsplit, &string2))
2264         return NULL;
2265 
2266     string = fix_string_param(string, string2, "source");
2267     if (!string)
2268         return NULL;
2269 
2270     string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2271     if (!string)
2272         return NULL;
2273 
2274     list = PyList_New(0);
2275     if (!list) {
2276         state_fini(&state);
2277         return NULL;
2278     }
2279 
2280     n = 0;
2281     last = state.start;
2282 
2283     while (!maxsplit || n < maxsplit) {
2284 
2285         state_reset(&state);
2286 
2287         state.ptr = state.start;
2288 
2289         if (state.charsize == 1) {
2290             status = sre_search(&state, PatternObject_GetCode(self));
2291         } else {
2292 #if defined(HAVE_UNICODE)
2293             status = sre_usearch(&state, PatternObject_GetCode(self));
2294 #endif
2295         }
2296 
2297 	if (PyErr_Occurred())
2298 	    goto error;
2299 
2300         if (status <= 0) {
2301             if (status == 0)
2302                 break;
2303             pattern_error(status);
2304             goto error;
2305         }
2306 
2307         if (state.start == state.ptr) {
2308             if (last == state.end || state.ptr == state.end)
2309                 break;
2310             /* skip one character */
2311             state.start = (void*) ((char*) state.ptr + state.charsize);
2312             continue;
2313         }
2314 
2315         /* get segment before this match */
2316         item = PySequence_GetSlice(
2317             string, STATE_OFFSET(&state, last),
2318             STATE_OFFSET(&state, state.start)
2319             );
2320         if (!item)
2321             goto error;
2322         status = PyList_Append(list, item);
2323         Py_DECREF(item);
2324         if (status < 0)
2325             goto error;
2326 
2327         /* add groups (if any) */
2328         for (i = 0; i < self->groups; i++) {
2329             item = state_getslice(&state, i+1, string, 0);
2330             if (!item)
2331                 goto error;
2332             status = PyList_Append(list, item);
2333             Py_DECREF(item);
2334             if (status < 0)
2335                 goto error;
2336         }
2337 
2338         n = n + 1;
2339 
2340         last = state.start = state.ptr;
2341 
2342     }
2343 
2344     /* get segment following last match (even if empty) */
2345     item = PySequence_GetSlice(
2346         string, STATE_OFFSET(&state, last), state.endpos
2347         );
2348     if (!item)
2349         goto error;
2350     status = PyList_Append(list, item);
2351     Py_DECREF(item);
2352     if (status < 0)
2353         goto error;
2354 
2355     state_fini(&state);
2356     return list;
2357 
2358 error:
2359     Py_DECREF(list);
2360     state_fini(&state);
2361     return NULL;
2362 
2363 }
2364 
2365 static PyObject*
pattern_subx(PatternObject * self,PyObject * ptemplate,PyObject * string,Py_ssize_t count,Py_ssize_t subn)2366 pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
2367              Py_ssize_t count, Py_ssize_t subn)
2368 {
2369     SRE_STATE state;
2370     PyObject* list;
2371     PyObject* item;
2372     PyObject* filter;
2373     PyObject* args;
2374     PyObject* match;
2375     void* ptr;
2376     int status;
2377     Py_ssize_t n;
2378     Py_ssize_t i, b, e;
2379     int bint;
2380     int filter_is_callable;
2381 
2382     if (PyCallable_Check(ptemplate)) {
2383         /* sub/subn takes either a function or a template */
2384         filter = ptemplate;
2385         Py_INCREF(filter);
2386         filter_is_callable = 1;
2387     } else {
2388         /* if not callable, check if it's a literal string */
2389         int literal;
2390         ptr = getstring(ptemplate, &n, &bint);
2391         b = bint;
2392         if (ptr) {
2393             if (b == 1) {
2394 		    literal = sre_literal_template((unsigned char *)ptr, n);
2395             } else {
2396 #if defined(HAVE_UNICODE)
2397 		    literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
2398 #endif
2399             }
2400         } else {
2401             PyErr_Clear();
2402             literal = 0;
2403         }
2404         if (literal) {
2405             filter = ptemplate;
2406             Py_INCREF(filter);
2407             filter_is_callable = 0;
2408         } else {
2409             /* not a literal; hand it over to the template compiler */
2410             filter = call(
2411                 SRE_PY_MODULE, "_subx",
2412                 PyTuple_Pack(2, self, ptemplate)
2413                 );
2414             if (!filter)
2415                 return NULL;
2416             filter_is_callable = PyCallable_Check(filter);
2417         }
2418     }
2419 
2420     string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2421     if (!string) {
2422         Py_DECREF(filter);
2423         return NULL;
2424     }
2425 
2426     list = PyList_New(0);
2427     if (!list) {
2428         Py_DECREF(filter);
2429         state_fini(&state);
2430         return NULL;
2431     }
2432 
2433     n = i = 0;
2434 
2435     while (!count || n < count) {
2436 
2437         state_reset(&state);
2438 
2439         state.ptr = state.start;
2440 
2441         if (state.charsize == 1) {
2442             status = sre_search(&state, PatternObject_GetCode(self));
2443         } else {
2444 #if defined(HAVE_UNICODE)
2445             status = sre_usearch(&state, PatternObject_GetCode(self));
2446 #endif
2447         }
2448 
2449 	if (PyErr_Occurred())
2450 	    goto error;
2451 
2452         if (status <= 0) {
2453             if (status == 0)
2454                 break;
2455             pattern_error(status);
2456             goto error;
2457         }
2458 
2459         b = STATE_OFFSET(&state, state.start);
2460         e = STATE_OFFSET(&state, state.ptr);
2461 
2462         if (i < b) {
2463             /* get segment before this match */
2464             item = PySequence_GetSlice(string, i, b);
2465             if (!item)
2466                 goto error;
2467             status = PyList_Append(list, item);
2468             Py_DECREF(item);
2469             if (status < 0)
2470                 goto error;
2471 
2472         } else if (i == b && i == e && n > 0)
2473             /* ignore empty match on latest position */
2474             goto next;
2475 
2476         if (filter_is_callable) {
2477             /* pass match object through filter */
2478             match = pattern_new_match(self, &state, 1);
2479             if (!match)
2480                 goto error;
2481             args = PyTuple_Pack(1, match);
2482             if (!args) {
2483                 Py_DECREF(match);
2484                 goto error;
2485             }
2486             item = PyObject_CallObject(filter, args);
2487             Py_DECREF(args);
2488             Py_DECREF(match);
2489             if (!item)
2490                 goto error;
2491         } else {
2492             /* filter is literal string */
2493             item = filter;
2494             Py_INCREF(item);
2495         }
2496 
2497         /* add to list */
2498         if (item != Py_None) {
2499             status = PyList_Append(list, item);
2500             Py_DECREF(item);
2501             if (status < 0)
2502                 goto error;
2503         }
2504 
2505         i = e;
2506         n = n + 1;
2507 
2508 next:
2509         /* move on */
2510         if (state.ptr == state.end)
2511             break;
2512         if (state.ptr == state.start)
2513             state.start = (void*) ((char*) state.ptr + state.charsize);
2514         else
2515             state.start = state.ptr;
2516 
2517     }
2518 
2519     /* get segment following last match */
2520     if (i < state.endpos) {
2521         item = PySequence_GetSlice(string, i, state.endpos);
2522         if (!item)
2523             goto error;
2524         status = PyList_Append(list, item);
2525         Py_DECREF(item);
2526         if (status < 0)
2527             goto error;
2528     }
2529 
2530     state_fini(&state);
2531 
2532     Py_DECREF(filter);
2533 
2534     /* convert list to single string (also removes list) */
2535     item = join_list(list, string);
2536 
2537     if (!item)
2538         return NULL;
2539 
2540     if (subn)
2541         return Py_BuildValue("Nn", item, n);
2542 
2543     return item;
2544 
2545 error:
2546     Py_DECREF(list);
2547     state_fini(&state);
2548     Py_DECREF(filter);
2549     return NULL;
2550 
2551 }
2552 
2553 static PyObject*
pattern_sub(PatternObject * self,PyObject * args,PyObject * kw)2554 pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2555 {
2556     PyObject* ptemplate;
2557     PyObject* string;
2558     Py_ssize_t count = 0;
2559     static char* kwlist[] = { "repl", "string", "count", NULL };
2560     if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
2561                                      &ptemplate, &string, &count))
2562         return NULL;
2563 
2564     return pattern_subx(self, ptemplate, string, count, 0);
2565 }
2566 
2567 static PyObject*
pattern_subn(PatternObject * self,PyObject * args,PyObject * kw)2568 pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2569 {
2570     PyObject* ptemplate;
2571     PyObject* string;
2572     Py_ssize_t count = 0;
2573     static char* kwlist[] = { "repl", "string", "count", NULL };
2574     if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
2575                                      &ptemplate, &string, &count))
2576         return NULL;
2577 
2578     return pattern_subx(self, ptemplate, string, count, 1);
2579 }
2580 
2581 static PyObject*
pattern_copy(PatternObject * self,PyObject * unused)2582 pattern_copy(PatternObject* self, PyObject *unused)
2583 {
2584 #ifdef USE_BUILTIN_COPY
2585     PatternObject* copy;
2586     int offset;
2587 
2588     copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2589     if (!copy)
2590         return NULL;
2591 
2592     offset = offsetof(PatternObject, groups);
2593 
2594     Py_XINCREF(self->groupindex);
2595     Py_XINCREF(self->indexgroup);
2596     Py_XINCREF(self->pattern);
2597 
2598     memcpy((char*) copy + offset, (char*) self + offset,
2599            sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2600     copy->weakreflist = NULL;
2601 
2602     return (PyObject*) copy;
2603 #else
2604     PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2605     return NULL;
2606 #endif
2607 }
2608 
2609 static PyObject*
pattern_deepcopy(PatternObject * self,PyObject * memo)2610 pattern_deepcopy(PatternObject* self, PyObject* memo)
2611 {
2612 #ifdef USE_BUILTIN_COPY
2613     PatternObject* copy;
2614 
2615     copy = (PatternObject*) pattern_copy(self);
2616     if (!copy)
2617         return NULL;
2618 
2619     if (!deepcopy(&copy->groupindex, memo) ||
2620         !deepcopy(&copy->indexgroup, memo) ||
2621         !deepcopy(&copy->pattern, memo)) {
2622         Py_DECREF(copy);
2623         return NULL;
2624     }
2625 
2626 #else
2627     PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2628     return NULL;
2629 #endif
2630 }
2631 
2632 PyDoc_STRVAR(pattern_match_doc,
2633 "match(string[, pos[, endpos]]) --> match object or None.\n\
2634     Matches zero or more characters at the beginning of the string");
2635 
2636 PyDoc_STRVAR(pattern_search_doc,
2637 "search(string[, pos[, endpos]]) --> match object or None.\n\
2638     Scan through string looking for a match, and return a corresponding\n\
2639     match object instance. Return None if no position in the string matches.");
2640 
2641 PyDoc_STRVAR(pattern_split_doc,
2642 "split(string[, maxsplit = 0])  --> list.\n\
2643     Split string by the occurrences of pattern.");
2644 
2645 PyDoc_STRVAR(pattern_findall_doc,
2646 "findall(string[, pos[, endpos]]) --> list.\n\
2647    Return a list of all non-overlapping matches of pattern in string.");
2648 
2649 PyDoc_STRVAR(pattern_finditer_doc,
2650 "finditer(string[, pos[, endpos]]) --> iterator.\n\
2651     Return an iterator over all non-overlapping matches for the \n\
2652     RE pattern in string. For each match, the iterator returns a\n\
2653     match object.");
2654 
2655 PyDoc_STRVAR(pattern_sub_doc,
2656 "sub(repl, string[, count = 0]) --> newstring\n\
2657     Return the string obtained by replacing the leftmost non-overlapping\n\
2658     occurrences of pattern in string by the replacement repl.");
2659 
2660 PyDoc_STRVAR(pattern_subn_doc,
2661 "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2662     Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2663     the leftmost non-overlapping occurrences of pattern with the\n\
2664     replacement repl.");
2665 
2666 PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2667 
2668 static PyMethodDef pattern_methods[] = {
2669     {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
2670 	pattern_match_doc},
2671     {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
2672 	pattern_search_doc},
2673     {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2674 	pattern_sub_doc},
2675     {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2676 	pattern_subn_doc},
2677     {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
2678 	pattern_split_doc},
2679     {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
2680 	pattern_findall_doc},
2681 #if PY_VERSION_HEX >= 0x02020000
2682     {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2683 	pattern_finditer_doc},
2684 #endif
2685     {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
2686     {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2687     {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
2688     {NULL, NULL}
2689 };
2690 
2691 #define PAT_OFF(x) offsetof(PatternObject, x)
2692 static PyMemberDef pattern_members[] = {
2693     {"pattern",    T_OBJECT,    PAT_OFF(pattern),       READONLY},
2694     {"flags",      T_INT,       PAT_OFF(flags),         READONLY},
2695     {"groups",     T_PYSSIZET,  PAT_OFF(groups),        READONLY},
2696     {"groupindex", T_OBJECT,    PAT_OFF(groupindex),    READONLY},
2697     {NULL}  /* Sentinel */
2698 };
2699 
2700 statichere PyTypeObject Pattern_Type = {
2701     PyObject_HEAD_INIT(NULL)
2702     0, "_" SRE_MODULE ".SRE_Pattern",
2703     sizeof(PatternObject), sizeof(SRE_CODE),
2704     (destructor)pattern_dealloc, /*tp_dealloc*/
2705     0,                                  /* tp_print */
2706     0,                                  /* tp_getattrn */
2707     0,					/* tp_setattr */
2708     0,					/* tp_compare */
2709     0,					/* tp_repr */
2710     0,					/* tp_as_number */
2711     0,					/* tp_as_sequence */
2712     0,					/* tp_as_mapping */
2713     0,					/* tp_hash */
2714     0,					/* tp_call */
2715     0,					/* tp_str */
2716     0,					/* tp_getattro */
2717     0,					/* tp_setattro */
2718     0,					/* tp_as_buffer */
2719     Py_TPFLAGS_DEFAULT,		        /* tp_flags */
2720     pattern_doc,			/* tp_doc */
2721     0,					/* tp_traverse */
2722     0,					/* tp_clear */
2723     0,					/* tp_richcompare */
2724     offsetof(PatternObject, weakreflist),	/* tp_weaklistoffset */
2725     0,					/* tp_iter */
2726     0,					/* tp_iternext */
2727     pattern_methods,			/* tp_methods */
2728     pattern_members,			/* tp_members */
2729 };
2730 
2731 static int _validate(PatternObject *self); /* Forward */
2732 
2733 static PyObject *
_compile(PyObject * self_,PyObject * args)2734 _compile(PyObject* self_, PyObject* args)
2735 {
2736     /* "compile" pattern descriptor to pattern object */
2737 
2738     PatternObject* self;
2739     Py_ssize_t i, n;
2740 
2741     PyObject* pattern;
2742     int flags = 0;
2743     PyObject* code;
2744     Py_ssize_t groups = 0;
2745     PyObject* groupindex = NULL;
2746     PyObject* indexgroup = NULL;
2747     if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
2748                           &PyList_Type, &code, &groups,
2749                           &groupindex, &indexgroup))
2750         return NULL;
2751 
2752     n = PyList_GET_SIZE(code);
2753     /* coverity[ampersand_in_size] */
2754     self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2755     if (!self)
2756         return NULL;
2757     self->weakreflist = NULL;
2758     self->pattern = NULL;
2759     self->groupindex = NULL;
2760     self->indexgroup = NULL;
2761 
2762     self->codesize = n;
2763 
2764     for (i = 0; i < n; i++) {
2765         PyObject *o = PyList_GET_ITEM(code, i);
2766         unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o)
2767                                               : PyLong_AsUnsignedLong(o);
2768         if (value == (unsigned long)-1 && PyErr_Occurred()) {
2769             if (PyErr_ExceptionMatches(PyExc_OverflowError)) {
2770                 PyErr_SetString(PyExc_OverflowError,
2771                                 "regular expression code size limit exceeded");
2772             }
2773             break;
2774         }
2775         self->code[i] = (SRE_CODE) value;
2776         if ((unsigned long) self->code[i] != value) {
2777             PyErr_SetString(PyExc_OverflowError,
2778                             "regular expression code size limit exceeded");
2779             break;
2780         }
2781     }
2782 
2783     if (PyErr_Occurred()) {
2784         Py_DECREF(self);
2785         return NULL;
2786     }
2787 
2788     Py_INCREF(pattern);
2789     self->pattern = pattern;
2790 
2791     self->flags = flags;
2792 
2793     self->groups = groups;
2794 
2795     Py_XINCREF(groupindex);
2796     self->groupindex = groupindex;
2797 
2798     Py_XINCREF(indexgroup);
2799     self->indexgroup = indexgroup;
2800 
2801     self->weakreflist = NULL;
2802 
2803     if (!_validate(self)) {
2804         Py_DECREF(self);
2805         return NULL;
2806     }
2807 
2808     return (PyObject*) self;
2809 }
2810 
2811 /* -------------------------------------------------------------------- */
2812 /* Code validation */
2813 
2814 /* To learn more about this code, have a look at the _compile() function in
2815    Lib/sre_compile.py.  The validation functions below checks the code array
2816    for conformance with the code patterns generated there.
2817 
2818    The nice thing about the generated code is that it is position-independent:
2819    all jumps are relative jumps forward.  Also, jumps don't cross each other:
2820    the target of a later jump is always earlier than the target of an earlier
2821    jump.  IOW, this is okay:
2822 
2823    J---------J-------T--------T
2824     \         \_____/        /
2825      \______________________/
2826 
2827    but this is not:
2828 
2829    J---------J-------T--------T
2830     \_________\_____/        /
2831                \____________/
2832 
2833    It also helps that SRE_CODE is always an unsigned type.
2834 */
2835 
2836 /* Defining this one enables tracing of the validator */
2837 #undef VVERBOSE
2838 
2839 /* Trace macro for the validator */
2840 #if defined(VVERBOSE)
2841 #define VTRACE(v) printf v
2842 #else
2843 #define VTRACE(v) do {} while(0)  /* do nothing */
2844 #endif
2845 
2846 /* Report failure */
2847 #define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2848 
2849 /* Extract opcode, argument, or skip count from code array */
2850 #define GET_OP                                          \
2851     do {                                                \
2852         VTRACE(("%p: ", code));                         \
2853         if (code >= end) FAIL;                          \
2854         op = *code++;                                   \
2855         VTRACE(("%lu (op)\n", (unsigned long)op));      \
2856     } while (0)
2857 #define GET_ARG                                         \
2858     do {                                                \
2859         VTRACE(("%p= ", code));                         \
2860         if (code >= end) FAIL;                          \
2861         arg = *code++;                                  \
2862         VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
2863     } while (0)
2864 #define GET_SKIP_ADJ(adj)                               \
2865     do {                                                \
2866         VTRACE(("%p= ", code));                         \
2867         if (code >= end) FAIL;                          \
2868         skip = *code;                                   \
2869         VTRACE(("%lu (skip to %p)\n",                   \
2870                (unsigned long)skip, code+skip));        \
2871         if (skip-adj > end-code)                        \
2872             FAIL;                                       \
2873         code++;                                         \
2874     } while (0)
2875 #define GET_SKIP GET_SKIP_ADJ(0)
2876 
2877 static int
_validate_charset(SRE_CODE * code,SRE_CODE * end)2878 _validate_charset(SRE_CODE *code, SRE_CODE *end)
2879 {
2880     /* Some variables are manipulated by the macros above */
2881     SRE_CODE op;
2882     SRE_CODE arg;
2883     SRE_CODE offset;
2884     int i;
2885 
2886     while (code < end) {
2887         GET_OP;
2888         switch (op) {
2889 
2890         case SRE_OP_NEGATE:
2891             break;
2892 
2893         case SRE_OP_LITERAL:
2894             GET_ARG;
2895             break;
2896 
2897         case SRE_OP_RANGE:
2898             GET_ARG;
2899             GET_ARG;
2900             break;
2901 
2902         case SRE_OP_CHARSET:
2903             offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
2904             if (offset > end-code)
2905                 FAIL;
2906             code += offset;
2907             break;
2908 
2909         case SRE_OP_BIGCHARSET:
2910             GET_ARG; /* Number of blocks */
2911             offset = 256/sizeof(SRE_CODE); /* 256-byte table */
2912             if (offset > end-code)
2913                 FAIL;
2914             /* Make sure that each byte points to a valid block */
2915             for (i = 0; i < 256; i++) {
2916                 if (((unsigned char *)code)[i] >= arg)
2917                     FAIL;
2918             }
2919             code += offset;
2920             offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
2921             if (offset > end-code)
2922                 FAIL;
2923             code += offset;
2924             break;
2925 
2926         case SRE_OP_CATEGORY:
2927             GET_ARG;
2928             switch (arg) {
2929             case SRE_CATEGORY_DIGIT:
2930             case SRE_CATEGORY_NOT_DIGIT:
2931             case SRE_CATEGORY_SPACE:
2932             case SRE_CATEGORY_NOT_SPACE:
2933             case SRE_CATEGORY_WORD:
2934             case SRE_CATEGORY_NOT_WORD:
2935             case SRE_CATEGORY_LINEBREAK:
2936             case SRE_CATEGORY_NOT_LINEBREAK:
2937             case SRE_CATEGORY_LOC_WORD:
2938             case SRE_CATEGORY_LOC_NOT_WORD:
2939             case SRE_CATEGORY_UNI_DIGIT:
2940             case SRE_CATEGORY_UNI_NOT_DIGIT:
2941             case SRE_CATEGORY_UNI_SPACE:
2942             case SRE_CATEGORY_UNI_NOT_SPACE:
2943             case SRE_CATEGORY_UNI_WORD:
2944             case SRE_CATEGORY_UNI_NOT_WORD:
2945             case SRE_CATEGORY_UNI_LINEBREAK:
2946             case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2947                 break;
2948             default:
2949                 FAIL;
2950             }
2951             break;
2952 
2953         default:
2954             FAIL;
2955 
2956         }
2957     }
2958 
2959     return 1;
2960 }
2961 
2962 static int
_validate_inner(SRE_CODE * code,SRE_CODE * end,Py_ssize_t groups)2963 _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2964 {
2965     /* Some variables are manipulated by the macros above */
2966     SRE_CODE op;
2967     SRE_CODE arg;
2968     SRE_CODE skip;
2969 
2970     VTRACE(("code=%p, end=%p\n", code, end));
2971 
2972     if (code > end)
2973         FAIL;
2974 
2975     while (code < end) {
2976         GET_OP;
2977         switch (op) {
2978 
2979         case SRE_OP_MARK:
2980             /* We don't check whether marks are properly nested; the
2981                sre_match() code is robust even if they don't, and the worst
2982                you can get is nonsensical match results. */
2983             GET_ARG;
2984             if (arg > 2*groups+1) {
2985                 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
2986                 FAIL;
2987             }
2988             break;
2989 
2990         case SRE_OP_LITERAL:
2991         case SRE_OP_NOT_LITERAL:
2992         case SRE_OP_LITERAL_IGNORE:
2993         case SRE_OP_NOT_LITERAL_IGNORE:
2994             GET_ARG;
2995             /* The arg is just a character, nothing to check */
2996             break;
2997 
2998         case SRE_OP_SUCCESS:
2999         case SRE_OP_FAILURE:
3000             /* Nothing to check; these normally end the matching process */
3001             break;
3002 
3003         case SRE_OP_AT:
3004             GET_ARG;
3005             switch (arg) {
3006             case SRE_AT_BEGINNING:
3007             case SRE_AT_BEGINNING_STRING:
3008             case SRE_AT_BEGINNING_LINE:
3009             case SRE_AT_END:
3010             case SRE_AT_END_LINE:
3011             case SRE_AT_END_STRING:
3012             case SRE_AT_BOUNDARY:
3013             case SRE_AT_NON_BOUNDARY:
3014             case SRE_AT_LOC_BOUNDARY:
3015             case SRE_AT_LOC_NON_BOUNDARY:
3016             case SRE_AT_UNI_BOUNDARY:
3017             case SRE_AT_UNI_NON_BOUNDARY:
3018                 break;
3019             default:
3020                 FAIL;
3021             }
3022             break;
3023 
3024         case SRE_OP_ANY:
3025         case SRE_OP_ANY_ALL:
3026             /* These have no operands */
3027             break;
3028 
3029         case SRE_OP_IN:
3030         case SRE_OP_IN_IGNORE:
3031             GET_SKIP;
3032             /* Stop 1 before the end; we check the FAILURE below */
3033             if (!_validate_charset(code, code+skip-2))
3034                 FAIL;
3035             if (code[skip-2] != SRE_OP_FAILURE)
3036                 FAIL;
3037             code += skip-1;
3038             break;
3039 
3040         case SRE_OP_INFO:
3041             {
3042                 /* A minimal info field is
3043                    <INFO> <1=skip> <2=flags> <3=min> <4=max>;
3044                    If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
3045                    more follows. */
3046                 SRE_CODE flags, i;
3047                 SRE_CODE *newcode;
3048                 GET_SKIP;
3049                 newcode = code+skip-1;
3050                 GET_ARG; flags = arg;
3051                 GET_ARG; /* min */
3052                 GET_ARG; /* max */
3053                 /* Check that only valid flags are present */
3054                 if ((flags & ~(SRE_INFO_PREFIX |
3055                                SRE_INFO_LITERAL |
3056                                SRE_INFO_CHARSET)) != 0)
3057                     FAIL;
3058                 /* PREFIX and CHARSET are mutually exclusive */
3059                 if ((flags & SRE_INFO_PREFIX) &&
3060                     (flags & SRE_INFO_CHARSET))
3061                     FAIL;
3062                 /* LITERAL implies PREFIX */
3063                 if ((flags & SRE_INFO_LITERAL) &&
3064                     !(flags & SRE_INFO_PREFIX))
3065                     FAIL;
3066                 /* Validate the prefix */
3067                 if (flags & SRE_INFO_PREFIX) {
3068                     SRE_CODE prefix_len;
3069                     GET_ARG; prefix_len = arg;
3070                     GET_ARG; /* prefix skip */
3071                     /* Here comes the prefix string */
3072                     if (prefix_len > newcode-code)
3073                         FAIL;
3074                     code += prefix_len;
3075                     /* And here comes the overlap table */
3076                     if (prefix_len > newcode-code)
3077                         FAIL;
3078                     /* Each overlap value should be < prefix_len */
3079                     for (i = 0; i < prefix_len; i++) {
3080                         if (code[i] >= prefix_len)
3081                             FAIL;
3082                     }
3083                     code += prefix_len;
3084                 }
3085                 /* Validate the charset */
3086                 if (flags & SRE_INFO_CHARSET) {
3087                     if (!_validate_charset(code, newcode-1))
3088                         FAIL;
3089                     if (newcode[-1] != SRE_OP_FAILURE)
3090                         FAIL;
3091                     code = newcode;
3092                 }
3093                 else if (code != newcode) {
3094                   VTRACE(("code=%p, newcode=%p\n", code, newcode));
3095                     FAIL;
3096                 }
3097             }
3098             break;
3099 
3100         case SRE_OP_BRANCH:
3101             {
3102                 SRE_CODE *target = NULL;
3103                 for (;;) {
3104                     GET_SKIP;
3105                     if (skip == 0)
3106                         break;
3107                     /* Stop 2 before the end; we check the JUMP below */
3108                     if (!_validate_inner(code, code+skip-3, groups))
3109                         FAIL;
3110                     code += skip-3;
3111                     /* Check that it ends with a JUMP, and that each JUMP
3112                        has the same target */
3113                     GET_OP;
3114                     if (op != SRE_OP_JUMP)
3115                         FAIL;
3116                     GET_SKIP;
3117                     if (target == NULL)
3118                         target = code+skip-1;
3119                     else if (code+skip-1 != target)
3120                         FAIL;
3121                 }
3122             }
3123             break;
3124 
3125         case SRE_OP_REPEAT_ONE:
3126         case SRE_OP_MIN_REPEAT_ONE:
3127             {
3128                 SRE_CODE min, max;
3129                 GET_SKIP;
3130                 GET_ARG; min = arg;
3131                 GET_ARG; max = arg;
3132                 if (min > max)
3133                     FAIL;
3134                 if (max > SRE_MAXREPEAT)
3135                     FAIL;
3136                 if (!_validate_inner(code, code+skip-4, groups))
3137                     FAIL;
3138                 code += skip-4;
3139                 GET_OP;
3140                 if (op != SRE_OP_SUCCESS)
3141                     FAIL;
3142             }
3143             break;
3144 
3145         case SRE_OP_REPEAT:
3146             {
3147                 SRE_CODE min, max;
3148                 GET_SKIP;
3149                 GET_ARG; min = arg;
3150                 GET_ARG; max = arg;
3151                 if (min > max)
3152                     FAIL;
3153                 if (max > SRE_MAXREPEAT)
3154                     FAIL;
3155                 if (!_validate_inner(code, code+skip-3, groups))
3156                     FAIL;
3157                 code += skip-3;
3158                 GET_OP;
3159                 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3160                     FAIL;
3161             }
3162             break;
3163 
3164         case SRE_OP_GROUPREF:
3165         case SRE_OP_GROUPREF_IGNORE:
3166             GET_ARG;
3167             if (arg >= groups)
3168                 FAIL;
3169             break;
3170 
3171         case SRE_OP_GROUPREF_EXISTS:
3172             /* The regex syntax for this is: '(?(group)then|else)', where
3173                'group' is either an integer group number or a group name,
3174                'then' and 'else' are sub-regexes, and 'else' is optional. */
3175             GET_ARG;
3176             if (arg >= groups)
3177                 FAIL;
3178             GET_SKIP_ADJ(1);
3179             code--; /* The skip is relative to the first arg! */
3180             /* There are two possibilities here: if there is both a 'then'
3181                part and an 'else' part, the generated code looks like:
3182 
3183                GROUPREF_EXISTS
3184                <group>
3185                <skipyes>
3186                ...then part...
3187                JUMP
3188                <skipno>
3189                (<skipyes> jumps here)
3190                ...else part...
3191                (<skipno> jumps here)
3192 
3193                If there is only a 'then' part, it looks like:
3194 
3195                GROUPREF_EXISTS
3196                <group>
3197                <skip>
3198                ...then part...
3199                (<skip> jumps here)
3200 
3201                There is no direct way to decide which it is, and we don't want
3202                to allow arbitrary jumps anywhere in the code; so we just look
3203                for a JUMP opcode preceding our skip target.
3204             */
3205             if (skip >= 3 && skip-3 < end-code &&
3206                 code[skip-3] == SRE_OP_JUMP)
3207             {
3208                 VTRACE(("both then and else parts present\n"));
3209                 if (!_validate_inner(code+1, code+skip-3, groups))
3210                     FAIL;
3211                 code += skip-2; /* Position after JUMP, at <skipno> */
3212                 GET_SKIP;
3213                 if (!_validate_inner(code, code+skip-1, groups))
3214                     FAIL;
3215                 code += skip-1;
3216             }
3217             else {
3218                 VTRACE(("only a then part present\n"));
3219                 if (!_validate_inner(code+1, code+skip-1, groups))
3220                     FAIL;
3221                 code += skip-1;
3222             }
3223             break;
3224 
3225         case SRE_OP_ASSERT:
3226         case SRE_OP_ASSERT_NOT:
3227             GET_SKIP;
3228             GET_ARG; /* 0 for lookahead, width for lookbehind */
3229             code--; /* Back up over arg to simplify math below */
3230             if (arg & 0x80000000)
3231                 FAIL; /* Width too large */
3232             /* Stop 1 before the end; we check the SUCCESS below */
3233             if (!_validate_inner(code+1, code+skip-2, groups))
3234                 FAIL;
3235             code += skip-2;
3236             GET_OP;
3237             if (op != SRE_OP_SUCCESS)
3238                 FAIL;
3239             break;
3240 
3241         default:
3242             FAIL;
3243 
3244         }
3245     }
3246 
3247     VTRACE(("okay\n"));
3248     return 1;
3249 }
3250 
3251 static int
_validate_outer(SRE_CODE * code,SRE_CODE * end,Py_ssize_t groups)3252 _validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3253 {
3254     if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3255         FAIL;
3256     if (groups == 0)  /* fix for simplejson */
3257         groups = 100; /* 100 groups should always be safe */
3258     return _validate_inner(code, end-1, groups);
3259 }
3260 
3261 static int
_validate(PatternObject * self)3262 _validate(PatternObject *self)
3263 {
3264     if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3265     {
3266         PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3267         return 0;
3268     }
3269     else
3270         VTRACE(("Success!\n"));
3271     return 1;
3272 }
3273 
3274 /* -------------------------------------------------------------------- */
3275 /* match methods */
3276 
3277 static void
match_dealloc(MatchObject * self)3278 match_dealloc(MatchObject* self)
3279 {
3280     Py_XDECREF(self->regs);
3281     Py_XDECREF(self->string);
3282     Py_DECREF(self->pattern);
3283     PyObject_DEL(self);
3284 }
3285 
3286 static PyObject*
match_getslice_by_index(MatchObject * self,Py_ssize_t index,PyObject * def)3287 match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
3288 {
3289     if (index < 0 || index >= self->groups) {
3290         /* raise IndexError if we were given a bad group number */
3291         PyErr_SetString(
3292             PyExc_IndexError,
3293             "no such group"
3294             );
3295         return NULL;
3296     }
3297 
3298     index *= 2;
3299 
3300     if (self->string == Py_None || self->mark[index] < 0) {
3301         /* return default value if the string or group is undefined */
3302         Py_INCREF(def);
3303         return def;
3304     }
3305 
3306     return PySequence_GetSlice(
3307         self->string, self->mark[index], self->mark[index+1]
3308         );
3309 }
3310 
3311 static Py_ssize_t
match_getindex(MatchObject * self,PyObject * index)3312 match_getindex(MatchObject* self, PyObject* index)
3313 {
3314     Py_ssize_t i;
3315 
3316     if (PyInt_Check(index) || PyLong_Check(index))
3317         return PyInt_AsSsize_t(index);
3318 
3319     i = -1;
3320 
3321     if (self->pattern->groupindex) {
3322         index = PyObject_GetItem(self->pattern->groupindex, index);
3323         if (index) {
3324             if (PyInt_Check(index) || PyLong_Check(index))
3325                 i = PyInt_AsSsize_t(index);
3326             Py_DECREF(index);
3327         } else
3328             PyErr_Clear();
3329     }
3330 
3331     return i;
3332 }
3333 
3334 static PyObject*
match_getslice(MatchObject * self,PyObject * index,PyObject * def)3335 match_getslice(MatchObject* self, PyObject* index, PyObject* def)
3336 {
3337     return match_getslice_by_index(self, match_getindex(self, index), def);
3338 }
3339 
3340 static PyObject*
match_expand(MatchObject * self,PyObject * ptemplate)3341 match_expand(MatchObject* self, PyObject* ptemplate)
3342 {
3343     /* delegate to Python code */
3344     return call(
3345         SRE_PY_MODULE, "_expand",
3346         PyTuple_Pack(3, self->pattern, self, ptemplate)
3347         );
3348 }
3349 
3350 static PyObject*
match_group(MatchObject * self,PyObject * args)3351 match_group(MatchObject* self, PyObject* args)
3352 {
3353     PyObject* result;
3354     Py_ssize_t i, size;
3355 
3356     size = PyTuple_GET_SIZE(args);
3357 
3358     switch (size) {
3359     case 0:
3360         result = match_getslice(self, Py_False, Py_None);
3361         break;
3362     case 1:
3363         result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3364         break;
3365     default:
3366         /* fetch multiple items */
3367         result = PyTuple_New(size);
3368         if (!result)
3369             return NULL;
3370         for (i = 0; i < size; i++) {
3371             PyObject* item = match_getslice(
3372                 self, PyTuple_GET_ITEM(args, i), Py_None
3373                 );
3374             if (!item) {
3375                 Py_DECREF(result);
3376                 return NULL;
3377             }
3378             PyTuple_SET_ITEM(result, i, item);
3379         }
3380         break;
3381     }
3382     return result;
3383 }
3384 
3385 static PyObject*
match_groups(MatchObject * self,PyObject * args,PyObject * kw)3386 match_groups(MatchObject* self, PyObject* args, PyObject* kw)
3387 {
3388     PyObject* result;
3389     Py_ssize_t index;
3390 
3391     PyObject* def = Py_None;
3392     static char* kwlist[] = { "default", NULL };
3393     if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
3394         return NULL;
3395 
3396     result = PyTuple_New(self->groups-1);
3397     if (!result)
3398         return NULL;
3399 
3400     for (index = 1; index < self->groups; index++) {
3401         PyObject* item;
3402         item = match_getslice_by_index(self, index, def);
3403         if (!item) {
3404             Py_DECREF(result);
3405             return NULL;
3406         }
3407         PyTuple_SET_ITEM(result, index-1, item);
3408     }
3409 
3410     return result;
3411 }
3412 
3413 static PyObject*
match_groupdict(MatchObject * self,PyObject * args,PyObject * kw)3414 match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
3415 {
3416     PyObject* result;
3417     PyObject* keys;
3418     Py_ssize_t index;
3419 
3420     PyObject* def = Py_None;
3421     static char* kwlist[] = { "default", NULL };
3422     if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
3423         return NULL;
3424 
3425     result = PyDict_New();
3426     if (!result || !self->pattern->groupindex)
3427         return result;
3428 
3429     keys = PyMapping_Keys(self->pattern->groupindex);
3430     if (!keys)
3431         goto failed;
3432 
3433     for (index = 0; index < PyList_GET_SIZE(keys); index++) {
3434         int status;
3435         PyObject* key;
3436         PyObject* value;
3437         key = PyList_GET_ITEM(keys, index);
3438         if (!key)
3439             goto failed;
3440         value = match_getslice(self, key, def);
3441         if (!value)
3442             goto failed;
3443         status = PyDict_SetItem(result, key, value);
3444         Py_DECREF(value);
3445         if (status < 0)
3446             goto failed;
3447     }
3448 
3449     Py_DECREF(keys);
3450 
3451     return result;
3452 
3453 failed:
3454     Py_XDECREF(keys);
3455     Py_DECREF(result);
3456     return NULL;
3457 }
3458 
3459 static PyObject*
match_start(MatchObject * self,PyObject * args)3460 match_start(MatchObject* self, PyObject* args)
3461 {
3462     Py_ssize_t index;
3463 
3464     PyObject* index_ = Py_False; /* zero */
3465     if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
3466         return NULL;
3467 
3468     index = match_getindex(self, index_);
3469 
3470     if (index < 0 || index >= self->groups) {
3471         PyErr_SetString(
3472             PyExc_IndexError,
3473             "no such group"
3474             );
3475         return NULL;
3476     }
3477 
3478     /* mark is -1 if group is undefined */
3479     return PyInt_FromSsize_t(self->mark[index*2]);
3480 }
3481 
3482 static PyObject*
match_end(MatchObject * self,PyObject * args)3483 match_end(MatchObject* self, PyObject* args)
3484 {
3485     Py_ssize_t index;
3486 
3487     PyObject* index_ = Py_False; /* zero */
3488     if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
3489         return NULL;
3490 
3491     index = match_getindex(self, index_);
3492 
3493     if (index < 0 || index >= self->groups) {
3494         PyErr_SetString(
3495             PyExc_IndexError,
3496             "no such group"
3497             );
3498         return NULL;
3499     }
3500 
3501     /* mark is -1 if group is undefined */
3502     return PyInt_FromSsize_t(self->mark[index*2+1]);
3503 }
3504 
3505 LOCAL(PyObject*)
_pair(Py_ssize_t i1,Py_ssize_t i2)3506 _pair(Py_ssize_t i1, Py_ssize_t i2)
3507 {
3508     PyObject* pair;
3509     PyObject* item;
3510 
3511     pair = PyTuple_New(2);
3512     if (!pair)
3513         return NULL;
3514 
3515     item = PyInt_FromSsize_t(i1);
3516     if (!item)
3517         goto error;
3518     PyTuple_SET_ITEM(pair, 0, item);
3519 
3520     item = PyInt_FromSsize_t(i2);
3521     if (!item)
3522         goto error;
3523     PyTuple_SET_ITEM(pair, 1, item);
3524 
3525     return pair;
3526 
3527   error:
3528     Py_DECREF(pair);
3529     return NULL;
3530 }
3531 
3532 static PyObject*
match_span(MatchObject * self,PyObject * args)3533 match_span(MatchObject* self, PyObject* args)
3534 {
3535     Py_ssize_t index;
3536 
3537     PyObject* index_ = Py_False; /* zero */
3538     if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
3539         return NULL;
3540 
3541     index = match_getindex(self, index_);
3542 
3543     if (index < 0 || index >= self->groups) {
3544         PyErr_SetString(
3545             PyExc_IndexError,
3546             "no such group"
3547             );
3548         return NULL;
3549     }
3550 
3551     /* marks are -1 if group is undefined */
3552     return _pair(self->mark[index*2], self->mark[index*2+1]);
3553 }
3554 
3555 static PyObject*
match_regs(MatchObject * self)3556 match_regs(MatchObject* self)
3557 {
3558     PyObject* regs;
3559     PyObject* item;
3560     Py_ssize_t index;
3561 
3562     regs = PyTuple_New(self->groups);
3563     if (!regs)
3564         return NULL;
3565 
3566     for (index = 0; index < self->groups; index++) {
3567         item = _pair(self->mark[index*2], self->mark[index*2+1]);
3568         if (!item) {
3569             Py_DECREF(regs);
3570             return NULL;
3571         }
3572         PyTuple_SET_ITEM(regs, index, item);
3573     }
3574 
3575     Py_INCREF(regs);
3576     self->regs = regs;
3577 
3578     return regs;
3579 }
3580 
3581 static PyObject*
match_copy(MatchObject * self,PyObject * unused)3582 match_copy(MatchObject* self, PyObject *unused)
3583 {
3584 #ifdef USE_BUILTIN_COPY
3585     MatchObject* copy;
3586     Py_ssize_t slots, offset;
3587 
3588     slots = 2 * (self->pattern->groups+1);
3589 
3590     copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3591     if (!copy)
3592         return NULL;
3593 
3594     /* this value a constant, but any compiler should be able to
3595        figure that out all by itself */
3596     offset = offsetof(MatchObject, string);
3597 
3598     Py_XINCREF(self->pattern);
3599     Py_XINCREF(self->string);
3600     Py_XINCREF(self->regs);
3601 
3602     memcpy((char*) copy + offset, (char*) self + offset,
3603            sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
3604 
3605     return (PyObject*) copy;
3606 #else
3607     PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
3608     return NULL;
3609 #endif
3610 }
3611 
3612 static PyObject*
match_deepcopy(MatchObject * self,PyObject * memo)3613 match_deepcopy(MatchObject* self, PyObject* memo)
3614 {
3615 #ifdef USE_BUILTIN_COPY
3616     MatchObject* copy;
3617 
3618     copy = (MatchObject*) match_copy(self);
3619     if (!copy)
3620         return NULL;
3621 
3622     if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3623         !deepcopy(&copy->string, memo) ||
3624         !deepcopy(&copy->regs, memo)) {
3625         Py_DECREF(copy);
3626         return NULL;
3627     }
3628 
3629 #else
3630     PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3631     return NULL;
3632 #endif
3633 }
3634 
3635 PyDoc_STRVAR(match_doc,
3636 "The result of re.match() and re.search().\n\
3637 Match objects always have a boolean value of True.");
3638 
3639 PyDoc_STRVAR(match_group_doc,
3640 "group([group1, ...]) -> str or tuple.\n\
3641     Return subgroup(s) of the match by indices or names.\n\
3642     For 0 returns the entire match.");
3643 
3644 PyDoc_STRVAR(match_start_doc,
3645 "start([group=0]) -> int.\n\
3646     Return index of the start of the substring matched by group.");
3647 
3648 PyDoc_STRVAR(match_end_doc,
3649 "end([group=0]) -> int.\n\
3650     Return index of the end of the substring matched by group.");
3651 
3652 PyDoc_STRVAR(match_span_doc,
3653 "span([group]) -> tuple.\n\
3654     For MatchObject m, return the 2-tuple (m.start(group), m.end(group)).");
3655 
3656 PyDoc_STRVAR(match_groups_doc,
3657 "groups([default=None]) -> tuple.\n\
3658     Return a tuple containing all the subgroups of the match, from 1.\n\
3659     The default argument is used for groups\n\
3660     that did not participate in the match");
3661 
3662 PyDoc_STRVAR(match_groupdict_doc,
3663 "groupdict([default=None]) -> dict.\n\
3664     Return a dictionary containing all the named subgroups of the match,\n\
3665     keyed by the subgroup name. The default argument is used for groups\n\
3666     that did not participate in the match");
3667 
3668 PyDoc_STRVAR(match_expand_doc,
3669 "expand(template) -> str.\n\
3670     Return the string obtained by doing backslash substitution\n\
3671     on the string template, as done by the sub() method.");
3672 
3673 static PyMethodDef match_methods[] = {
3674     {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
3675     {"start", (PyCFunction) match_start, METH_VARARGS, match_start_doc},
3676     {"end", (PyCFunction) match_end, METH_VARARGS, match_end_doc},
3677     {"span", (PyCFunction) match_span, METH_VARARGS, match_span_doc},
3678     {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS,
3679         match_groups_doc},
3680     {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS,
3681         match_groupdict_doc},
3682     {"expand", (PyCFunction) match_expand, METH_O, match_expand_doc},
3683     {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3684     {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
3685     {NULL, NULL}
3686 };
3687 
3688 static PyObject *
match_lastindex_get(MatchObject * self)3689 match_lastindex_get(MatchObject *self)
3690 {
3691     if (self->lastindex >= 0)
3692         return PyInt_FromSsize_t(self->lastindex);
3693     Py_INCREF(Py_None);
3694     return Py_None;
3695 }
3696 
3697 static PyObject *
match_lastgroup_get(MatchObject * self)3698 match_lastgroup_get(MatchObject *self)
3699 {
3700     if (self->pattern->indexgroup && self->lastindex >= 0) {
3701         PyObject* result = PySequence_GetItem(
3702             self->pattern->indexgroup, self->lastindex
3703             );
3704         if (result)
3705             return result;
3706         PyErr_Clear();
3707     }
3708     Py_INCREF(Py_None);
3709     return Py_None;
3710 }
3711 
3712 static PyObject *
match_regs_get(MatchObject * self)3713 match_regs_get(MatchObject *self)
3714 {
3715     if (self->regs) {
3716         Py_INCREF(self->regs);
3717         return self->regs;
3718     } else
3719         return match_regs(self);
3720 }
3721 
3722 static PyGetSetDef match_getset[] = {
3723     {"lastindex", (getter)match_lastindex_get, (setter)NULL},
3724     {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3725     {"regs",      (getter)match_regs_get,      (setter)NULL},
3726     {NULL}
3727 };
3728 
3729 #define MATCH_OFF(x) offsetof(MatchObject, x)
3730 static PyMemberDef match_members[] = {
3731     {"string",  T_OBJECT,   MATCH_OFF(string),  READONLY},
3732     {"re",      T_OBJECT,   MATCH_OFF(pattern), READONLY},
3733     {"pos",     T_PYSSIZET, MATCH_OFF(pos),     READONLY},
3734     {"endpos",  T_PYSSIZET, MATCH_OFF(endpos),  READONLY},
3735     {NULL}
3736 };
3737 
3738 
3739 /* FIXME: implement setattr("string", None) as a special case (to
3740    detach the associated string, if any */
3741 
3742 static PyTypeObject Match_Type = {
3743     PyVarObject_HEAD_INIT(NULL, 0)
3744     "_" SRE_MODULE ".SRE_Match",
3745     sizeof(MatchObject), sizeof(Py_ssize_t),
3746     (destructor)match_dealloc,  /* tp_dealloc */
3747     0,                          /* tp_print */
3748     0,                          /* tp_getattr */
3749     0,                          /* tp_setattr */
3750     0,                          /* tp_compare */
3751     0,                          /* tp_repr */
3752     0,                          /* tp_as_number */
3753     0,                          /* tp_as_sequence */
3754     0,                          /* tp_as_mapping */
3755     0,                          /* tp_hash */
3756     0,                          /* tp_call */
3757     0,                          /* tp_str */
3758     0,                          /* tp_getattro */
3759     0,                          /* tp_setattro */
3760     0,                          /* tp_as_buffer */
3761     Py_TPFLAGS_DEFAULT,
3762     match_doc,                  /* tp_doc */
3763     0,                          /* tp_traverse */
3764     0,                          /* tp_clear */
3765     0,                          /* tp_richcompare */
3766     0,                          /* tp_weaklistoffset */
3767     0,                          /* tp_iter */
3768     0,                          /* tp_iternext */
3769     match_methods,		/* tp_methods */
3770     match_members,		/* tp_members */
3771     match_getset,  	        /* tp_getset */
3772 };
3773 
3774 static PyObject*
pattern_new_match(PatternObject * pattern,SRE_STATE * state,int status)3775 pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3776 {
3777     /* create match object (from state object) */
3778 
3779     MatchObject* match;
3780     Py_ssize_t i, j;
3781     char* base;
3782     int n;
3783 
3784     if (status > 0) {
3785 
3786         /* create match object (with room for extra group marks) */
3787         /* coverity[ampersand_in_size] */
3788         match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3789                                  2*(pattern->groups+1));
3790         if (!match)
3791             return NULL;
3792 
3793         Py_INCREF(pattern);
3794         match->pattern = pattern;
3795 
3796         Py_INCREF(state->string);
3797         match->string = state->string;
3798 
3799         match->regs = NULL;
3800         match->groups = pattern->groups+1;
3801 
3802         /* fill in group slices */
3803 
3804         base = (char*) state->beginning;
3805         n = state->charsize;
3806 
3807         match->mark[0] = ((char*) state->start - base) / n;
3808         match->mark[1] = ((char*) state->ptr - base) / n;
3809 
3810         for (i = j = 0; i < pattern->groups; i++, j+=2)
3811             if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3812                 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3813                 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3814             } else
3815                 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3816 
3817         match->pos = state->pos;
3818         match->endpos = state->endpos;
3819 
3820         match->lastindex = state->lastindex;
3821 
3822         return (PyObject*) match;
3823 
3824     } else if (status == 0) {
3825 
3826         /* no match */
3827         Py_INCREF(Py_None);
3828         return Py_None;
3829 
3830     }
3831 
3832     /* internal error */
3833     pattern_error(status);
3834     return NULL;
3835 }
3836 
3837 
3838 /* -------------------------------------------------------------------- */
3839 /* scanner methods (experimental) */
3840 
3841 static void
scanner_dealloc(ScannerObject * self)3842 scanner_dealloc(ScannerObject* self)
3843 {
3844     state_fini(&self->state);
3845     Py_XDECREF(self->pattern);
3846     PyObject_DEL(self);
3847 }
3848 
3849 static PyObject*
scanner_match(ScannerObject * self,PyObject * unused)3850 scanner_match(ScannerObject* self, PyObject *unused)
3851 {
3852     SRE_STATE* state = &self->state;
3853     PyObject* match;
3854     int status;
3855 
3856     if (state->start == NULL)
3857         Py_RETURN_NONE;
3858 
3859     state_reset(state);
3860 
3861     state->ptr = state->start;
3862 
3863     if (state->charsize == 1) {
3864         status = sre_match(state, PatternObject_GetCode(self->pattern));
3865     } else {
3866 #if defined(HAVE_UNICODE)
3867         status = sre_umatch(state, PatternObject_GetCode(self->pattern));
3868 #endif
3869     }
3870     if (PyErr_Occurred())
3871         return NULL;
3872 
3873     match = pattern_new_match((PatternObject*) self->pattern,
3874                                state, status);
3875 
3876     if (status == 0)
3877         state->start = NULL;
3878     else if (state->ptr != state->start)
3879         state->start = state->ptr;
3880     else if (state->ptr != state->end)
3881         state->start = (void*) ((char*) state->ptr + state->charsize);
3882     else
3883         state->start = NULL;
3884 
3885     return match;
3886 }
3887 
3888 
3889 static PyObject*
scanner_search(ScannerObject * self,PyObject * unused)3890 scanner_search(ScannerObject* self, PyObject *unused)
3891 {
3892     SRE_STATE* state = &self->state;
3893     PyObject* match;
3894     int status;
3895 
3896     if (state->start == NULL)
3897         Py_RETURN_NONE;
3898 
3899     state_reset(state);
3900 
3901     state->ptr = state->start;
3902 
3903     if (state->charsize == 1) {
3904         status = sre_search(state, PatternObject_GetCode(self->pattern));
3905     } else {
3906 #if defined(HAVE_UNICODE)
3907         status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3908 #endif
3909     }
3910     if (PyErr_Occurred())
3911         return NULL;
3912 
3913     match = pattern_new_match((PatternObject*) self->pattern,
3914                                state, status);
3915 
3916     if (status == 0)
3917         state->start = NULL;
3918     else if (state->ptr != state->start)
3919         state->start = state->ptr;
3920     else if (state->ptr != state->end)
3921         state->start = (void*) ((char*) state->ptr + state->charsize);
3922     else
3923         state->start = NULL;
3924 
3925     return match;
3926 }
3927 
3928 static PyMethodDef scanner_methods[] = {
3929     {"match", (PyCFunction) scanner_match, METH_NOARGS},
3930     {"search", (PyCFunction) scanner_search, METH_NOARGS},
3931     {NULL, NULL}
3932 };
3933 
3934 #define SCAN_OFF(x) offsetof(ScannerObject, x)
3935 static PyMemberDef scanner_members[] = {
3936     {"pattern",	T_OBJECT,	SCAN_OFF(pattern),	READONLY},
3937     {NULL}  /* Sentinel */
3938 };
3939 
3940 statichere PyTypeObject Scanner_Type = {
3941     PyObject_HEAD_INIT(NULL)
3942     0, "_" SRE_MODULE ".SRE_Scanner",
3943     sizeof(ScannerObject), 0,
3944     (destructor)scanner_dealloc, /*tp_dealloc*/
3945     0,				/* tp_print */
3946     0,				/* tp_getattr */
3947     0,				/* tp_setattr */
3948     0,				/* tp_reserved */
3949     0,				/* tp_repr */
3950     0,				/* tp_as_number */
3951     0,				/* tp_as_sequence */
3952     0,				/* tp_as_mapping */
3953     0,				/* tp_hash */
3954     0,				/* tp_call */
3955     0,				/* tp_str */
3956     0,				/* tp_getattro */
3957     0,				/* tp_setattro */
3958     0,				/* tp_as_buffer */
3959     Py_TPFLAGS_DEFAULT,		/* tp_flags */
3960     0,				/* tp_doc */
3961     0,				/* tp_traverse */
3962     0,				/* tp_clear */
3963     0,				/* tp_richcompare */
3964     0,				/* tp_weaklistoffset */
3965     0,				/* tp_iter */
3966     0,				/* tp_iternext */
3967     scanner_methods,		/* tp_methods */
3968     scanner_members,		/* tp_members */
3969     0,				/* tp_getset */
3970 };
3971 
3972 static PyObject*
pattern_scanner(PatternObject * pattern,PyObject * args)3973 pattern_scanner(PatternObject* pattern, PyObject* args)
3974 {
3975     /* create search state object */
3976 
3977     ScannerObject* self;
3978 
3979     PyObject* string;
3980     Py_ssize_t start = 0;
3981     Py_ssize_t end = PY_SSIZE_T_MAX;
3982     if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
3983         return NULL;
3984 
3985     /* create scanner object */
3986     self = PyObject_NEW(ScannerObject, &Scanner_Type);
3987     if (!self)
3988         return NULL;
3989     self->pattern = NULL;
3990 
3991     string = state_init(&self->state, pattern, string, start, end);
3992     if (!string) {
3993         Py_DECREF(self);
3994         return NULL;
3995     }
3996 
3997     Py_INCREF(pattern);
3998     self->pattern = (PyObject*) pattern;
3999 
4000     return (PyObject*) self;
4001 }
4002 
4003 static PyMethodDef _functions[] = {
4004     {"compile", _compile, METH_VARARGS},
4005     {"getcodesize", sre_codesize, METH_NOARGS},
4006     {"getlower", sre_getlower, METH_VARARGS},
4007     {NULL, NULL}
4008 };
4009 
4010 #if PY_VERSION_HEX < 0x02030000
init_sre(void)4011 DL_EXPORT(void) init_sre(void)
4012 #else
4013 PyMODINIT_FUNC init_sre(void)
4014 #endif
4015 {
4016     PyObject* m;
4017     PyObject* d;
4018     PyObject* x;
4019 
4020     /* Patch object types */
4021     if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
4022         PyType_Ready(&Scanner_Type))
4023         return;
4024 
4025     m = Py_InitModule("_" SRE_MODULE, _functions);
4026     if (m == NULL)
4027     	return;
4028     d = PyModule_GetDict(m);
4029 
4030     x = PyInt_FromLong(SRE_MAGIC);
4031     if (x) {
4032         PyDict_SetItemString(d, "MAGIC", x);
4033         Py_DECREF(x);
4034     }
4035 
4036     x = PyInt_FromLong(sizeof(SRE_CODE));
4037     if (x) {
4038         PyDict_SetItemString(d, "CODESIZE", x);
4039         Py_DECREF(x);
4040     }
4041 
4042     x = PyLong_FromUnsignedLong(SRE_MAXREPEAT);
4043     if (x) {
4044         PyDict_SetItemString(d, "MAXREPEAT", x);
4045         Py_DECREF(x);
4046     }
4047 
4048     x = PyString_FromString(copyright);
4049     if (x) {
4050         PyDict_SetItemString(d, "copyright", x);
4051         Py_DECREF(x);
4052     }
4053 }
4054 
4055 #endif /* !defined(SRE_RECURSIVE) */
4056 
4057 /* vim:ts=4:sw=4:et
4058 */
4059