1 /*
2 * Secret Labs' Regular Expression Engine
3 *
4 * regular expression matching engine
5 *
6 * partial history:
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-08-01 fl fixes for 1.6b1
10 * 2000-08-07 fl use PyOS_CheckStack() if available
11 * 2000-09-20 fl added expand method
12 * 2001-03-20 fl lots of fixes for 2.1b2
13 * 2001-04-15 fl export copyright as Python attribute, not global
14 * 2001-04-28 fl added __copy__ methods (work in progress)
15 * 2001-05-14 fl fixes for 1.5.2 compatibility
16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
19 * 2001-10-21 fl added sub/subn primitive
20 * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 * 2002-11-09 fl fixed empty sub/subn return type
23 * 2003-04-18 mvl fully support 4-byte codes
24 * 2003-10-17 gn implemented non recursive scheme
25 *
26 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
27 *
28 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
32 * Portions of this engine have been developed in cooperation with
33 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
34 * other compatibility work.
35 */
36
37 #ifndef SRE_RECURSIVE
38
39 static char copyright[] =
40 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
41
42 #define PY_SSIZE_T_CLEAN
43
44 #include "Python.h"
45 #include "structmember.h" /* offsetof */
46
47 #include "sre.h"
48
49 #include <ctype.h>
50
51 /* name of this module, minus the leading underscore */
52 #if !defined(SRE_MODULE)
53 #define SRE_MODULE "sre"
54 #endif
55
56 #define SRE_PY_MODULE "re"
57
58 /* defining this one enables tracing */
59 #undef VERBOSE
60
61 #if PY_VERSION_HEX >= 0x01060000
62 #if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
63 /* defining this enables unicode support (default under 1.6a1 and later) */
64 #define HAVE_UNICODE
65 #endif
66 #endif
67
68 /* -------------------------------------------------------------------- */
69 /* optional features */
70
71 /* enables fast searching */
72 #define USE_FAST_SEARCH
73
74 /* enables aggressive inlining (always on for Visual C) */
75 #undef USE_INLINE
76
77 /* enables copy/deepcopy handling (work in progress) */
78 #undef USE_BUILTIN_COPY
79
80 #if PY_VERSION_HEX < 0x01060000
81 #define PyObject_DEL(op) PyMem_DEL((op))
82 #endif
83
84 /* -------------------------------------------------------------------- */
85
86 #if defined(_MSC_VER)
87 #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
88 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
89 /* fastest possible local call under MSVC */
90 #define LOCAL(type) static __inline type __fastcall
91 #elif defined(USE_INLINE)
92 #define LOCAL(type) static inline type
93 #else
94 #define LOCAL(type) static type
95 #endif
96
97 /* error codes */
98 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
99 #define SRE_ERROR_STATE -2 /* illegal state */
100 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
101 #define SRE_ERROR_MEMORY -9 /* out of memory */
102 #define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
103
104 #if defined(VERBOSE)
105 #define TRACE(v) printf v
106 #else
107 #define TRACE(v)
108 #endif
109
110 /* -------------------------------------------------------------------- */
111 /* search engine state */
112
113 /* default character predicates (run sre_chars.py to regenerate tables) */
114
115 #define SRE_DIGIT_MASK 1
116 #define SRE_SPACE_MASK 2
117 #define SRE_LINEBREAK_MASK 4
118 #define SRE_ALNUM_MASK 8
119 #define SRE_WORD_MASK 16
120
121 /* FIXME: this assumes ASCII. create tables in init_sre() instead */
122
123 static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
124 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
125 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
126 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
127 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
128 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
129 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
130
131 static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
132 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
133 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
134 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
135 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
136 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
137 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
138 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
139 120, 121, 122, 123, 124, 125, 126, 127 };
140
141 #define SRE_IS_DIGIT(ch)\
142 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
143 #define SRE_IS_SPACE(ch)\
144 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
145 #define SRE_IS_LINEBREAK(ch)\
146 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
147 #define SRE_IS_ALNUM(ch)\
148 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
149 #define SRE_IS_WORD(ch)\
150 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
151
sre_lower(unsigned int ch)152 static unsigned int sre_lower(unsigned int ch)
153 {
154 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
155 }
156
157 /* locale-specific character predicates */
158 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
159 * warnings when c's type supports only numbers < N+1 */
160 #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
161 #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
162 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
163 #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
164 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
165
sre_lower_locale(unsigned int ch)166 static unsigned int sre_lower_locale(unsigned int ch)
167 {
168 return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
169 }
170
171 /* unicode-specific character predicates */
172
173 #if defined(HAVE_UNICODE)
174
175 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
176 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
177 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
178 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
179 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
180
sre_lower_unicode(unsigned int ch)181 static unsigned int sre_lower_unicode(unsigned int ch)
182 {
183 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
184 }
185
186 #endif
187
188 LOCAL(int)
sre_category(SRE_CODE category,unsigned int ch)189 sre_category(SRE_CODE category, unsigned int ch)
190 {
191 switch (category) {
192
193 case SRE_CATEGORY_DIGIT:
194 return SRE_IS_DIGIT(ch);
195 case SRE_CATEGORY_NOT_DIGIT:
196 return !SRE_IS_DIGIT(ch);
197 case SRE_CATEGORY_SPACE:
198 return SRE_IS_SPACE(ch);
199 case SRE_CATEGORY_NOT_SPACE:
200 return !SRE_IS_SPACE(ch);
201 case SRE_CATEGORY_WORD:
202 return SRE_IS_WORD(ch);
203 case SRE_CATEGORY_NOT_WORD:
204 return !SRE_IS_WORD(ch);
205 case SRE_CATEGORY_LINEBREAK:
206 return SRE_IS_LINEBREAK(ch);
207 case SRE_CATEGORY_NOT_LINEBREAK:
208 return !SRE_IS_LINEBREAK(ch);
209
210 case SRE_CATEGORY_LOC_WORD:
211 return SRE_LOC_IS_WORD(ch);
212 case SRE_CATEGORY_LOC_NOT_WORD:
213 return !SRE_LOC_IS_WORD(ch);
214
215 #if defined(HAVE_UNICODE)
216 case SRE_CATEGORY_UNI_DIGIT:
217 return SRE_UNI_IS_DIGIT(ch);
218 case SRE_CATEGORY_UNI_NOT_DIGIT:
219 return !SRE_UNI_IS_DIGIT(ch);
220 case SRE_CATEGORY_UNI_SPACE:
221 return SRE_UNI_IS_SPACE(ch);
222 case SRE_CATEGORY_UNI_NOT_SPACE:
223 return !SRE_UNI_IS_SPACE(ch);
224 case SRE_CATEGORY_UNI_WORD:
225 return SRE_UNI_IS_WORD(ch);
226 case SRE_CATEGORY_UNI_NOT_WORD:
227 return !SRE_UNI_IS_WORD(ch);
228 case SRE_CATEGORY_UNI_LINEBREAK:
229 return SRE_UNI_IS_LINEBREAK(ch);
230 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
231 return !SRE_UNI_IS_LINEBREAK(ch);
232 #else
233 case SRE_CATEGORY_UNI_DIGIT:
234 return SRE_IS_DIGIT(ch);
235 case SRE_CATEGORY_UNI_NOT_DIGIT:
236 return !SRE_IS_DIGIT(ch);
237 case SRE_CATEGORY_UNI_SPACE:
238 return SRE_IS_SPACE(ch);
239 case SRE_CATEGORY_UNI_NOT_SPACE:
240 return !SRE_IS_SPACE(ch);
241 case SRE_CATEGORY_UNI_WORD:
242 return SRE_LOC_IS_WORD(ch);
243 case SRE_CATEGORY_UNI_NOT_WORD:
244 return !SRE_LOC_IS_WORD(ch);
245 case SRE_CATEGORY_UNI_LINEBREAK:
246 return SRE_IS_LINEBREAK(ch);
247 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
248 return !SRE_IS_LINEBREAK(ch);
249 #endif
250 }
251 return 0;
252 }
253
254 /* helpers */
255
256 static void
data_stack_dealloc(SRE_STATE * state)257 data_stack_dealloc(SRE_STATE* state)
258 {
259 if (state->data_stack) {
260 PyMem_FREE(state->data_stack);
261 state->data_stack = NULL;
262 }
263 state->data_stack_size = state->data_stack_base = 0;
264 }
265
266 static int
data_stack_grow(SRE_STATE * state,Py_ssize_t size)267 data_stack_grow(SRE_STATE* state, Py_ssize_t size)
268 {
269 Py_ssize_t minsize, cursize;
270 minsize = state->data_stack_base+size;
271 cursize = state->data_stack_size;
272 if (cursize < minsize) {
273 void* stack;
274 cursize = minsize+minsize/4+1024;
275 TRACE(("allocate/grow stack %" PY_FORMAT_SIZE_T "d\n", cursize));
276 stack = PyMem_REALLOC(state->data_stack, cursize);
277 if (!stack) {
278 data_stack_dealloc(state);
279 return SRE_ERROR_MEMORY;
280 }
281 state->data_stack = (char *)stack;
282 state->data_stack_size = cursize;
283 }
284 return 0;
285 }
286
287 /* generate 8-bit version */
288
289 #define SRE_CHAR unsigned char
290 #define SRE_AT sre_at
291 #define SRE_COUNT sre_count
292 #define SRE_CHARSET sre_charset
293 #define SRE_INFO sre_info
294 #define SRE_MATCH sre_match
295 #define SRE_MATCH_CONTEXT sre_match_context
296 #define SRE_SEARCH sre_search
297 #define SRE_LITERAL_TEMPLATE sre_literal_template
298
299 #if defined(HAVE_UNICODE)
300
301 #define SRE_RECURSIVE
302 #include "_sre.c"
303 #undef SRE_RECURSIVE
304
305 #undef SRE_LITERAL_TEMPLATE
306 #undef SRE_SEARCH
307 #undef SRE_MATCH
308 #undef SRE_MATCH_CONTEXT
309 #undef SRE_INFO
310 #undef SRE_CHARSET
311 #undef SRE_COUNT
312 #undef SRE_AT
313 #undef SRE_CHAR
314
315 /* generate 16-bit unicode version */
316
317 #define SRE_CHAR Py_UNICODE
318 #define SRE_AT sre_uat
319 #define SRE_COUNT sre_ucount
320 #define SRE_CHARSET sre_ucharset
321 #define SRE_INFO sre_uinfo
322 #define SRE_MATCH sre_umatch
323 #define SRE_MATCH_CONTEXT sre_umatch_context
324 #define SRE_SEARCH sre_usearch
325 #define SRE_LITERAL_TEMPLATE sre_uliteral_template
326 #endif
327
328 #endif /* SRE_RECURSIVE */
329
330 /* -------------------------------------------------------------------- */
331 /* String matching engine */
332
333 /* the following section is compiled twice, with different character
334 settings */
335
336 LOCAL(int)
SRE_AT(SRE_STATE * state,SRE_CHAR * ptr,SRE_CODE at)337 SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
338 {
339 /* check if pointer is at given position */
340
341 Py_ssize_t thisp, thatp;
342
343 switch (at) {
344
345 case SRE_AT_BEGINNING:
346 case SRE_AT_BEGINNING_STRING:
347 return ((void*) ptr == state->beginning);
348
349 case SRE_AT_BEGINNING_LINE:
350 return ((void*) ptr == state->beginning ||
351 SRE_IS_LINEBREAK((int) ptr[-1]));
352
353 case SRE_AT_END:
354 return (((SRE_CHAR *)state->end - ptr == 1 &&
355 SRE_IS_LINEBREAK((int) ptr[0])) ||
356 ((void*) ptr == state->end));
357
358 case SRE_AT_END_LINE:
359 return ((void*) ptr == state->end ||
360 SRE_IS_LINEBREAK((int) ptr[0]));
361
362 case SRE_AT_END_STRING:
363 return ((void*) ptr == state->end);
364
365 case SRE_AT_BOUNDARY:
366 if (state->beginning == state->end)
367 return 0;
368 thatp = ((void*) ptr > state->beginning) ?
369 SRE_IS_WORD((int) ptr[-1]) : 0;
370 thisp = ((void*) ptr < state->end) ?
371 SRE_IS_WORD((int) ptr[0]) : 0;
372 return thisp != thatp;
373
374 case SRE_AT_NON_BOUNDARY:
375 if (state->beginning == state->end)
376 return 0;
377 thatp = ((void*) ptr > state->beginning) ?
378 SRE_IS_WORD((int) ptr[-1]) : 0;
379 thisp = ((void*) ptr < state->end) ?
380 SRE_IS_WORD((int) ptr[0]) : 0;
381 return thisp == thatp;
382
383 case SRE_AT_LOC_BOUNDARY:
384 if (state->beginning == state->end)
385 return 0;
386 thatp = ((void*) ptr > state->beginning) ?
387 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
388 thisp = ((void*) ptr < state->end) ?
389 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
390 return thisp != thatp;
391
392 case SRE_AT_LOC_NON_BOUNDARY:
393 if (state->beginning == state->end)
394 return 0;
395 thatp = ((void*) ptr > state->beginning) ?
396 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
397 thisp = ((void*) ptr < state->end) ?
398 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
399 return thisp == thatp;
400
401 #if defined(HAVE_UNICODE)
402 case SRE_AT_UNI_BOUNDARY:
403 if (state->beginning == state->end)
404 return 0;
405 thatp = ((void*) ptr > state->beginning) ?
406 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
407 thisp = ((void*) ptr < state->end) ?
408 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
409 return thisp != thatp;
410
411 case SRE_AT_UNI_NON_BOUNDARY:
412 if (state->beginning == state->end)
413 return 0;
414 thatp = ((void*) ptr > state->beginning) ?
415 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
416 thisp = ((void*) ptr < state->end) ?
417 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
418 return thisp == thatp;
419 #endif
420
421 }
422
423 return 0;
424 }
425
426 LOCAL(int)
SRE_CHARSET(SRE_CODE * set,SRE_CODE ch)427 SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
428 {
429 /* check if character is a member of the given set */
430
431 int ok = 1;
432
433 for (;;) {
434 switch (*set++) {
435
436 case SRE_OP_FAILURE:
437 return !ok;
438
439 case SRE_OP_LITERAL:
440 /* <LITERAL> <code> */
441 if (ch == set[0])
442 return ok;
443 set++;
444 break;
445
446 case SRE_OP_CATEGORY:
447 /* <CATEGORY> <code> */
448 if (sre_category(set[0], (int) ch))
449 return ok;
450 set += 1;
451 break;
452
453 case SRE_OP_CHARSET:
454 if (sizeof(SRE_CODE) == 2) {
455 /* <CHARSET> <bitmap> (16 bits per code word) */
456 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
457 return ok;
458 set += 16;
459 }
460 else {
461 /* <CHARSET> <bitmap> (32 bits per code word) */
462 if (ch < 256 && (set[ch >> 5] & (1u << (ch & 31))))
463 return ok;
464 set += 8;
465 }
466 break;
467
468 case SRE_OP_RANGE:
469 /* <RANGE> <lower> <upper> */
470 if (set[0] <= ch && ch <= set[1])
471 return ok;
472 set += 2;
473 break;
474
475 case SRE_OP_NEGATE:
476 ok = !ok;
477 break;
478
479 case SRE_OP_BIGCHARSET:
480 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
481 {
482 Py_ssize_t count, block;
483 count = *(set++);
484
485 if (sizeof(SRE_CODE) == 2) {
486 block = ((unsigned char*)set)[ch >> 8];
487 set += 128;
488 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
489 return ok;
490 set += count*16;
491 }
492 else {
493 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
494 * warnings when c's type supports only numbers < N+1 */
495 if (!(ch & ~65535))
496 block = ((unsigned char*)set)[ch >> 8];
497 else
498 block = -1;
499 set += 64;
500 if (block >=0 &&
501 (set[block*8 + ((ch & 255)>>5)] & (1u << (ch & 31))))
502 return ok;
503 set += count*8;
504 }
505 break;
506 }
507
508 default:
509 /* internal error -- there's not much we can do about it
510 here, so let's just pretend it didn't match... */
511 return 0;
512 }
513 }
514 }
515
516 LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
517
518 LOCAL(Py_ssize_t)
SRE_COUNT(SRE_STATE * state,SRE_CODE * pattern,Py_ssize_t maxcount)519 SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
520 {
521 SRE_CODE chr;
522 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
523 SRE_CHAR* end = (SRE_CHAR *)state->end;
524 Py_ssize_t i;
525
526 /* adjust end */
527 if (maxcount < end - ptr && maxcount != SRE_MAXREPEAT)
528 end = ptr + maxcount;
529
530 switch (pattern[0]) {
531
532 case SRE_OP_IN:
533 /* repeated set */
534 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
535 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
536 ptr++;
537 break;
538
539 case SRE_OP_ANY:
540 /* repeated dot wildcard. */
541 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
542 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
543 ptr++;
544 break;
545
546 case SRE_OP_ANY_ALL:
547 /* repeated dot wildcard. skip to the end of the target
548 string, and backtrack from there */
549 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
550 ptr = end;
551 break;
552
553 case SRE_OP_LITERAL:
554 /* repeated literal */
555 chr = pattern[1];
556 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
557 while (ptr < end && (SRE_CODE) *ptr == chr)
558 ptr++;
559 break;
560
561 case SRE_OP_LITERAL_IGNORE:
562 /* repeated literal */
563 chr = pattern[1];
564 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
565 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
566 ptr++;
567 break;
568
569 case SRE_OP_NOT_LITERAL:
570 /* repeated non-literal */
571 chr = pattern[1];
572 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
573 while (ptr < end && (SRE_CODE) *ptr != chr)
574 ptr++;
575 break;
576
577 case SRE_OP_NOT_LITERAL_IGNORE:
578 /* repeated non-literal */
579 chr = pattern[1];
580 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
581 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
582 ptr++;
583 break;
584
585 default:
586 /* repeated single character pattern */
587 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
588 while ((SRE_CHAR*) state->ptr < end) {
589 i = SRE_MATCH(state, pattern);
590 if (i < 0)
591 return i;
592 if (!i)
593 break;
594 }
595 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
596 (SRE_CHAR*) state->ptr - ptr));
597 return (SRE_CHAR*) state->ptr - ptr;
598 }
599
600 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
601 ptr - (SRE_CHAR*) state->ptr));
602 return ptr - (SRE_CHAR*) state->ptr;
603 }
604
605 #if 0 /* not used in this release */
606 LOCAL(int)
607 SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
608 {
609 /* check if an SRE_OP_INFO block matches at the current position.
610 returns the number of SRE_CODE objects to skip if successful, 0
611 if no match */
612
613 SRE_CHAR* end = state->end;
614 SRE_CHAR* ptr = state->ptr;
615 Py_ssize_t i;
616
617 /* check minimal length */
618 if (pattern[3] && (end - ptr) < pattern[3])
619 return 0;
620
621 /* check known prefix */
622 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
623 /* <length> <skip> <prefix data> <overlap data> */
624 for (i = 0; i < pattern[5]; i++)
625 if ((SRE_CODE) ptr[i] != pattern[7 + i])
626 return 0;
627 return pattern[0] + 2 * pattern[6];
628 }
629 return pattern[0];
630 }
631 #endif
632
633 /* The macros below should be used to protect recursive SRE_MATCH()
634 * calls that *failed* and do *not* return immediately (IOW, those
635 * that will backtrack). Explaining:
636 *
637 * - Recursive SRE_MATCH() returned true: that's usually a success
638 * (besides atypical cases like ASSERT_NOT), therefore there's no
639 * reason to restore lastmark;
640 *
641 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
642 * is returning to the caller: If the current SRE_MATCH() is the
643 * top function of the recursion, returning false will be a matching
644 * failure, and it doesn't matter where lastmark is pointing to.
645 * If it's *not* the top function, it will be a recursive SRE_MATCH()
646 * failure by itself, and the calling SRE_MATCH() will have to deal
647 * with the failure by the same rules explained here (it will restore
648 * lastmark by itself if necessary);
649 *
650 * - Recursive SRE_MATCH() returned false, and will continue the
651 * outside 'for' loop: must be protected when breaking, since the next
652 * OP could potentially depend on lastmark;
653 *
654 * - Recursive SRE_MATCH() returned false, and will be called again
655 * inside a local for/while loop: must be protected between each
656 * loop iteration, since the recursive SRE_MATCH() could do anything,
657 * and could potentially depend on lastmark.
658 *
659 * For more information, check the discussion at SF patch #712900.
660 */
661 #define LASTMARK_SAVE() \
662 do { \
663 ctx->lastmark = state->lastmark; \
664 ctx->lastindex = state->lastindex; \
665 } while (0)
666 #define LASTMARK_RESTORE() \
667 do { \
668 state->lastmark = ctx->lastmark; \
669 state->lastindex = ctx->lastindex; \
670 } while (0)
671
672 #define RETURN_ERROR(i) do { return i; } while(0)
673 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
674 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
675
676 #define RETURN_ON_ERROR(i) \
677 do { if (i < 0) RETURN_ERROR(i); } while (0)
678 #define RETURN_ON_SUCCESS(i) \
679 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
680 #define RETURN_ON_FAILURE(i) \
681 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
682
683 #define SFY(x) #x
684
685 #define DATA_STACK_ALLOC(state, type, ptr) \
686 do { \
687 alloc_pos = state->data_stack_base; \
688 TRACE(("allocating %s in %" PY_FORMAT_SIZE_T "d " \
689 "(%" PY_FORMAT_SIZE_T "d)\n", \
690 SFY(type), alloc_pos, sizeof(type))); \
691 if (sizeof(type) > state->data_stack_size - alloc_pos) { \
692 int j = data_stack_grow(state, sizeof(type)); \
693 if (j < 0) return j; \
694 if (ctx_pos != -1) \
695 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
696 } \
697 ptr = (type*)(state->data_stack+alloc_pos); \
698 state->data_stack_base += sizeof(type); \
699 } while (0)
700
701 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
702 do { \
703 TRACE(("looking up %s at %" PY_FORMAT_SIZE_T "d\n", SFY(type), pos)); \
704 ptr = (type*)(state->data_stack+pos); \
705 } while (0)
706
707 #define DATA_STACK_PUSH(state, data, size) \
708 do { \
709 TRACE(("copy data in %p to %" PY_FORMAT_SIZE_T "d " \
710 "(%" PY_FORMAT_SIZE_T "d)\n", \
711 data, state->data_stack_base, size)); \
712 if (size > state->data_stack_size - state->data_stack_base) { \
713 int j = data_stack_grow(state, size); \
714 if (j < 0) return j; \
715 if (ctx_pos != -1) \
716 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
717 } \
718 memcpy(state->data_stack+state->data_stack_base, data, size); \
719 state->data_stack_base += size; \
720 } while (0)
721
722 #define DATA_STACK_POP(state, data, size, discard) \
723 do { \
724 TRACE(("copy data to %p from %" PY_FORMAT_SIZE_T "d " \
725 "(%" PY_FORMAT_SIZE_T "d)\n", \
726 data, state->data_stack_base-size, size)); \
727 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
728 if (discard) \
729 state->data_stack_base -= size; \
730 } while (0)
731
732 #define DATA_STACK_POP_DISCARD(state, size) \
733 do { \
734 TRACE(("discard data from %" PY_FORMAT_SIZE_T "d " \
735 "(%" PY_FORMAT_SIZE_T "d)\n", \
736 state->data_stack_base-size, size)); \
737 state->data_stack_base -= size; \
738 } while(0)
739
740 #define DATA_PUSH(x) \
741 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
742 #define DATA_POP(x) \
743 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
744 #define DATA_POP_DISCARD(x) \
745 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
746 #define DATA_ALLOC(t,p) \
747 DATA_STACK_ALLOC(state, t, p)
748 #define DATA_LOOKUP_AT(t,p,pos) \
749 DATA_STACK_LOOKUP_AT(state,t,p,pos)
750
751 #define MARK_PUSH(lastmark) \
752 do if (lastmark > 0) { \
753 i = lastmark; /* ctx->lastmark may change if reallocated */ \
754 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
755 } while (0)
756 #define MARK_POP(lastmark) \
757 do if (lastmark > 0) { \
758 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
759 } while (0)
760 #define MARK_POP_KEEP(lastmark) \
761 do if (lastmark > 0) { \
762 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
763 } while (0)
764 #define MARK_POP_DISCARD(lastmark) \
765 do if (lastmark > 0) { \
766 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
767 } while (0)
768
769 #define JUMP_NONE 0
770 #define JUMP_MAX_UNTIL_1 1
771 #define JUMP_MAX_UNTIL_2 2
772 #define JUMP_MAX_UNTIL_3 3
773 #define JUMP_MIN_UNTIL_1 4
774 #define JUMP_MIN_UNTIL_2 5
775 #define JUMP_MIN_UNTIL_3 6
776 #define JUMP_REPEAT 7
777 #define JUMP_REPEAT_ONE_1 8
778 #define JUMP_REPEAT_ONE_2 9
779 #define JUMP_MIN_REPEAT_ONE 10
780 #define JUMP_BRANCH 11
781 #define JUMP_ASSERT 12
782 #define JUMP_ASSERT_NOT 13
783
784 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
785 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
786 nextctx->last_ctx_pos = ctx_pos; \
787 nextctx->jump = jumpvalue; \
788 nextctx->pattern = nextpattern; \
789 ctx_pos = alloc_pos; \
790 ctx = nextctx; \
791 goto entrance; \
792 jumplabel: \
793 while (0) /* gcc doesn't like labels at end of scopes */ \
794
795 typedef struct {
796 Py_ssize_t last_ctx_pos;
797 Py_ssize_t jump;
798 SRE_CHAR* ptr;
799 SRE_CODE* pattern;
800 Py_ssize_t count;
801 Py_ssize_t lastmark;
802 Py_ssize_t lastindex;
803 union {
804 SRE_CODE chr;
805 SRE_REPEAT* rep;
806 } u;
807 } SRE_MATCH_CONTEXT;
808
809 /* check if string matches the given pattern. returns <0 for
810 error, 0 for failure, and 1 for success */
811 LOCAL(Py_ssize_t)
SRE_MATCH(SRE_STATE * state,SRE_CODE * pattern)812 SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
813 {
814 SRE_CHAR* end = (SRE_CHAR *)state->end;
815 Py_ssize_t alloc_pos, ctx_pos = -1;
816 Py_ssize_t i, ret = 0;
817 Py_ssize_t jump;
818 unsigned int sigcount=0;
819
820 SRE_MATCH_CONTEXT* ctx;
821 SRE_MATCH_CONTEXT* nextctx;
822
823 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
824
825 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
826 ctx->last_ctx_pos = -1;
827 ctx->jump = JUMP_NONE;
828 ctx->pattern = pattern;
829 ctx_pos = alloc_pos;
830
831 entrance:
832
833 ctx->ptr = (SRE_CHAR *)state->ptr;
834
835 if (ctx->pattern[0] == SRE_OP_INFO) {
836 /* optimization info block */
837 /* <INFO> <1=skip> <2=flags> <3=min> ... */
838 if (ctx->pattern[3] && (Py_uintptr_t)(end - ctx->ptr) < ctx->pattern[3]) {
839 TRACE(("reject (got %" PY_FORMAT_SIZE_T "d chars, "
840 "need %" PY_FORMAT_SIZE_T "d)\n",
841 (end - ctx->ptr), (Py_ssize_t) ctx->pattern[3]));
842 RETURN_FAILURE;
843 }
844 ctx->pattern += ctx->pattern[1] + 1;
845 }
846
847 for (;;) {
848 ++sigcount;
849 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
850 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
851
852 switch (*ctx->pattern++) {
853
854 case SRE_OP_MARK:
855 /* set mark */
856 /* <MARK> <gid> */
857 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
858 ctx->ptr, ctx->pattern[0]));
859 i = ctx->pattern[0];
860 if (i & 1)
861 state->lastindex = i/2 + 1;
862 if (i > state->lastmark) {
863 /* state->lastmark is the highest valid index in the
864 state->mark array. If it is increased by more than 1,
865 the intervening marks must be set to NULL to signal
866 that these marks have not been encountered. */
867 Py_ssize_t j = state->lastmark + 1;
868 while (j < i)
869 state->mark[j++] = NULL;
870 state->lastmark = i;
871 }
872 state->mark[i] = ctx->ptr;
873 ctx->pattern++;
874 break;
875
876 case SRE_OP_LITERAL:
877 /* match literal string */
878 /* <LITERAL> <code> */
879 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
880 ctx->ptr, *ctx->pattern));
881 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
882 RETURN_FAILURE;
883 ctx->pattern++;
884 ctx->ptr++;
885 break;
886
887 case SRE_OP_NOT_LITERAL:
888 /* match anything that is not literal character */
889 /* <NOT_LITERAL> <code> */
890 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
891 ctx->ptr, *ctx->pattern));
892 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
893 RETURN_FAILURE;
894 ctx->pattern++;
895 ctx->ptr++;
896 break;
897
898 case SRE_OP_SUCCESS:
899 /* end of pattern */
900 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
901 state->ptr = ctx->ptr;
902 RETURN_SUCCESS;
903
904 case SRE_OP_AT:
905 /* match at given position */
906 /* <AT> <code> */
907 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
908 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
909 RETURN_FAILURE;
910 ctx->pattern++;
911 break;
912
913 case SRE_OP_CATEGORY:
914 /* match at given category */
915 /* <CATEGORY> <code> */
916 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
917 ctx->ptr, *ctx->pattern));
918 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
919 RETURN_FAILURE;
920 ctx->pattern++;
921 ctx->ptr++;
922 break;
923
924 case SRE_OP_ANY:
925 /* match anything (except a newline) */
926 /* <ANY> */
927 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
928 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
929 RETURN_FAILURE;
930 ctx->ptr++;
931 break;
932
933 case SRE_OP_ANY_ALL:
934 /* match anything */
935 /* <ANY_ALL> */
936 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
937 if (ctx->ptr >= end)
938 RETURN_FAILURE;
939 ctx->ptr++;
940 break;
941
942 case SRE_OP_IN:
943 /* match set member (or non_member) */
944 /* <IN> <skip> <set> */
945 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
946 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
947 RETURN_FAILURE;
948 ctx->pattern += ctx->pattern[0];
949 ctx->ptr++;
950 break;
951
952 case SRE_OP_LITERAL_IGNORE:
953 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
954 ctx->pattern, ctx->ptr, ctx->pattern[0]));
955 if (ctx->ptr >= end ||
956 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
957 RETURN_FAILURE;
958 ctx->pattern++;
959 ctx->ptr++;
960 break;
961
962 case SRE_OP_NOT_LITERAL_IGNORE:
963 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
964 ctx->pattern, ctx->ptr, *ctx->pattern));
965 if (ctx->ptr >= end ||
966 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
967 RETURN_FAILURE;
968 ctx->pattern++;
969 ctx->ptr++;
970 break;
971
972 case SRE_OP_IN_IGNORE:
973 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
974 if (ctx->ptr >= end
975 || !SRE_CHARSET(ctx->pattern+1,
976 (SRE_CODE)state->lower(*ctx->ptr)))
977 RETURN_FAILURE;
978 ctx->pattern += ctx->pattern[0];
979 ctx->ptr++;
980 break;
981
982 case SRE_OP_JUMP:
983 case SRE_OP_INFO:
984 /* jump forward */
985 /* <JUMP> <offset> */
986 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
987 ctx->ptr, ctx->pattern[0]));
988 ctx->pattern += ctx->pattern[0];
989 break;
990
991 case SRE_OP_BRANCH:
992 /* alternation */
993 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
994 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
995 LASTMARK_SAVE();
996 ctx->u.rep = state->repeat;
997 if (ctx->u.rep)
998 MARK_PUSH(ctx->lastmark);
999 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
1000 if (ctx->pattern[1] == SRE_OP_LITERAL &&
1001 (ctx->ptr >= end ||
1002 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
1003 continue;
1004 if (ctx->pattern[1] == SRE_OP_IN &&
1005 (ctx->ptr >= end ||
1006 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
1007 continue;
1008 state->ptr = ctx->ptr;
1009 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
1010 if (ret) {
1011 if (ctx->u.rep)
1012 MARK_POP_DISCARD(ctx->lastmark);
1013 RETURN_ON_ERROR(ret);
1014 RETURN_SUCCESS;
1015 }
1016 if (ctx->u.rep)
1017 MARK_POP_KEEP(ctx->lastmark);
1018 LASTMARK_RESTORE();
1019 }
1020 if (ctx->u.rep)
1021 MARK_POP_DISCARD(ctx->lastmark);
1022 RETURN_FAILURE;
1023
1024 case SRE_OP_REPEAT_ONE:
1025 /* match repeated sequence (maximizing regexp) */
1026
1027 /* this operator only works if the repeated item is
1028 exactly one character wide, and we're not already
1029 collecting backtracking points. for other cases,
1030 use the MAX_REPEAT operator */
1031
1032 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1033
1034 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1035 ctx->pattern[1], ctx->pattern[2]));
1036
1037 if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr)
1038 RETURN_FAILURE; /* cannot match */
1039
1040 state->ptr = ctx->ptr;
1041
1042 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1043 RETURN_ON_ERROR(ret);
1044 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1045 ctx->count = ret;
1046 ctx->ptr += ctx->count;
1047
1048 /* when we arrive here, count contains the number of
1049 matches, and ctx->ptr points to the tail of the target
1050 string. check if the rest of the pattern matches,
1051 and backtrack if not. */
1052
1053 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1054 RETURN_FAILURE;
1055
1056 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1057 /* tail is empty. we're finished */
1058 state->ptr = ctx->ptr;
1059 RETURN_SUCCESS;
1060 }
1061
1062 LASTMARK_SAVE();
1063
1064 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
1065 /* tail starts with a literal. skip positions where
1066 the rest of the pattern cannot possibly match */
1067 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
1068 for (;;) {
1069 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
1070 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1071 ctx->ptr--;
1072 ctx->count--;
1073 }
1074 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1075 break;
1076 state->ptr = ctx->ptr;
1077 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1078 ctx->pattern+ctx->pattern[0]);
1079 if (ret) {
1080 RETURN_ON_ERROR(ret);
1081 RETURN_SUCCESS;
1082 }
1083
1084 LASTMARK_RESTORE();
1085
1086 ctx->ptr--;
1087 ctx->count--;
1088 }
1089
1090 } else {
1091 /* general case */
1092 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
1093 state->ptr = ctx->ptr;
1094 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1095 ctx->pattern+ctx->pattern[0]);
1096 if (ret) {
1097 RETURN_ON_ERROR(ret);
1098 RETURN_SUCCESS;
1099 }
1100 ctx->ptr--;
1101 ctx->count--;
1102 LASTMARK_RESTORE();
1103 }
1104 }
1105 RETURN_FAILURE;
1106
1107 case SRE_OP_MIN_REPEAT_ONE:
1108 /* match repeated sequence (minimizing regexp) */
1109
1110 /* this operator only works if the repeated item is
1111 exactly one character wide, and we're not already
1112 collecting backtracking points. for other cases,
1113 use the MIN_REPEAT operator */
1114
1115 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1116
1117 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1118 ctx->pattern[1], ctx->pattern[2]));
1119
1120 if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr)
1121 RETURN_FAILURE; /* cannot match */
1122
1123 state->ptr = ctx->ptr;
1124
1125 if (ctx->pattern[1] == 0)
1126 ctx->count = 0;
1127 else {
1128 /* count using pattern min as the maximum */
1129 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1130 RETURN_ON_ERROR(ret);
1131 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1132 if (ret < (Py_ssize_t) ctx->pattern[1])
1133 /* didn't match minimum number of times */
1134 RETURN_FAILURE;
1135 /* advance past minimum matches of repeat */
1136 ctx->count = ret;
1137 ctx->ptr += ctx->count;
1138 }
1139
1140 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1141 /* tail is empty. we're finished */
1142 state->ptr = ctx->ptr;
1143 RETURN_SUCCESS;
1144
1145 } else {
1146 /* general case */
1147 LASTMARK_SAVE();
1148 while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT
1149 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
1150 state->ptr = ctx->ptr;
1151 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1152 ctx->pattern+ctx->pattern[0]);
1153 if (ret) {
1154 RETURN_ON_ERROR(ret);
1155 RETURN_SUCCESS;
1156 }
1157 state->ptr = ctx->ptr;
1158 ret = SRE_COUNT(state, ctx->pattern+3, 1);
1159 RETURN_ON_ERROR(ret);
1160 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1161 if (ret == 0)
1162 break;
1163 assert(ret == 1);
1164 ctx->ptr++;
1165 ctx->count++;
1166 LASTMARK_RESTORE();
1167 }
1168 }
1169 RETURN_FAILURE;
1170
1171 case SRE_OP_REPEAT:
1172 /* create repeat context. all the hard work is done
1173 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1174 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1175 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1176 ctx->pattern[1], ctx->pattern[2]));
1177
1178 /* install new repeat context */
1179 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
1180 if (!ctx->u.rep) {
1181 PyErr_NoMemory();
1182 RETURN_FAILURE;
1183 }
1184 ctx->u.rep->count = -1;
1185 ctx->u.rep->pattern = ctx->pattern;
1186 ctx->u.rep->prev = state->repeat;
1187 ctx->u.rep->last_ptr = NULL;
1188 state->repeat = ctx->u.rep;
1189
1190 state->ptr = ctx->ptr;
1191 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
1192 state->repeat = ctx->u.rep->prev;
1193 PyObject_FREE(ctx->u.rep);
1194
1195 if (ret) {
1196 RETURN_ON_ERROR(ret);
1197 RETURN_SUCCESS;
1198 }
1199 RETURN_FAILURE;
1200
1201 case SRE_OP_MAX_UNTIL:
1202 /* maximizing repeat */
1203 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1204
1205 /* FIXME: we probably need to deal with zero-width
1206 matches in here... */
1207
1208 ctx->u.rep = state->repeat;
1209 if (!ctx->u.rep)
1210 RETURN_ERROR(SRE_ERROR_STATE);
1211
1212 state->ptr = ctx->ptr;
1213
1214 ctx->count = ctx->u.rep->count+1;
1215
1216 TRACE(("|%p|%p|MAX_UNTIL %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
1217 ctx->ptr, ctx->count));
1218
1219 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
1220 /* not enough matches */
1221 ctx->u.rep->count = ctx->count;
1222 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1223 ctx->u.rep->pattern+3);
1224 if (ret) {
1225 RETURN_ON_ERROR(ret);
1226 RETURN_SUCCESS;
1227 }
1228 ctx->u.rep->count = ctx->count-1;
1229 state->ptr = ctx->ptr;
1230 RETURN_FAILURE;
1231 }
1232
1233 if ((ctx->count < (Py_ssize_t) ctx->u.rep->pattern[2] ||
1234 ctx->u.rep->pattern[2] == SRE_MAXREPEAT) &&
1235 state->ptr != ctx->u.rep->last_ptr) {
1236 /* we may have enough matches, but if we can
1237 match another item, do so */
1238 ctx->u.rep->count = ctx->count;
1239 LASTMARK_SAVE();
1240 MARK_PUSH(ctx->lastmark);
1241 /* zero-width match protection */
1242 DATA_PUSH(&ctx->u.rep->last_ptr);
1243 ctx->u.rep->last_ptr = state->ptr;
1244 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1245 ctx->u.rep->pattern+3);
1246 DATA_POP(&ctx->u.rep->last_ptr);
1247 if (ret) {
1248 MARK_POP_DISCARD(ctx->lastmark);
1249 RETURN_ON_ERROR(ret);
1250 RETURN_SUCCESS;
1251 }
1252 MARK_POP(ctx->lastmark);
1253 LASTMARK_RESTORE();
1254 ctx->u.rep->count = ctx->count-1;
1255 state->ptr = ctx->ptr;
1256 }
1257
1258 /* cannot match more repeated items here. make sure the
1259 tail matches */
1260 state->repeat = ctx->u.rep->prev;
1261 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
1262 RETURN_ON_SUCCESS(ret);
1263 state->repeat = ctx->u.rep;
1264 state->ptr = ctx->ptr;
1265 RETURN_FAILURE;
1266
1267 case SRE_OP_MIN_UNTIL:
1268 /* minimizing repeat */
1269 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1270
1271 ctx->u.rep = state->repeat;
1272 if (!ctx->u.rep)
1273 RETURN_ERROR(SRE_ERROR_STATE);
1274
1275 state->ptr = ctx->ptr;
1276
1277 ctx->count = ctx->u.rep->count+1;
1278
1279 TRACE(("|%p|%p|MIN_UNTIL %" PY_FORMAT_SIZE_T "d %p\n", ctx->pattern,
1280 ctx->ptr, ctx->count, ctx->u.rep->pattern));
1281
1282 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
1283 /* not enough matches */
1284 ctx->u.rep->count = ctx->count;
1285 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1286 ctx->u.rep->pattern+3);
1287 if (ret) {
1288 RETURN_ON_ERROR(ret);
1289 RETURN_SUCCESS;
1290 }
1291 ctx->u.rep->count = ctx->count-1;
1292 state->ptr = ctx->ptr;
1293 RETURN_FAILURE;
1294 }
1295
1296 LASTMARK_SAVE();
1297
1298 /* see if the tail matches */
1299 state->repeat = ctx->u.rep->prev;
1300 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
1301 if (ret) {
1302 RETURN_ON_ERROR(ret);
1303 RETURN_SUCCESS;
1304 }
1305
1306 state->repeat = ctx->u.rep;
1307 state->ptr = ctx->ptr;
1308
1309 LASTMARK_RESTORE();
1310
1311 if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2]
1312 && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) ||
1313 state->ptr == ctx->u.rep->last_ptr)
1314 RETURN_FAILURE;
1315
1316 ctx->u.rep->count = ctx->count;
1317 /* zero-width match protection */
1318 DATA_PUSH(&ctx->u.rep->last_ptr);
1319 ctx->u.rep->last_ptr = state->ptr;
1320 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1321 ctx->u.rep->pattern+3);
1322 DATA_POP(&ctx->u.rep->last_ptr);
1323 if (ret) {
1324 RETURN_ON_ERROR(ret);
1325 RETURN_SUCCESS;
1326 }
1327 ctx->u.rep->count = ctx->count-1;
1328 state->ptr = ctx->ptr;
1329 RETURN_FAILURE;
1330
1331 case SRE_OP_GROUPREF:
1332 /* match backreference */
1333 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1334 ctx->ptr, ctx->pattern[0]));
1335 i = ctx->pattern[0];
1336 {
1337 Py_ssize_t groupref = i+i;
1338 if (groupref >= state->lastmark) {
1339 RETURN_FAILURE;
1340 } else {
1341 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1342 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1343 if (!p || !e || e < p)
1344 RETURN_FAILURE;
1345 while (p < e) {
1346 if (ctx->ptr >= end || *ctx->ptr != *p)
1347 RETURN_FAILURE;
1348 p++; ctx->ptr++;
1349 }
1350 }
1351 }
1352 ctx->pattern++;
1353 break;
1354
1355 case SRE_OP_GROUPREF_IGNORE:
1356 /* match backreference */
1357 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1358 ctx->ptr, ctx->pattern[0]));
1359 i = ctx->pattern[0];
1360 {
1361 Py_ssize_t groupref = i+i;
1362 if (groupref >= state->lastmark) {
1363 RETURN_FAILURE;
1364 } else {
1365 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1366 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1367 if (!p || !e || e < p)
1368 RETURN_FAILURE;
1369 while (p < e) {
1370 if (ctx->ptr >= end ||
1371 state->lower(*ctx->ptr) != state->lower(*p))
1372 RETURN_FAILURE;
1373 p++; ctx->ptr++;
1374 }
1375 }
1376 }
1377 ctx->pattern++;
1378 break;
1379
1380 case SRE_OP_GROUPREF_EXISTS:
1381 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1382 ctx->ptr, ctx->pattern[0]));
1383 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1384 i = ctx->pattern[0];
1385 {
1386 Py_ssize_t groupref = i+i;
1387 if (groupref >= state->lastmark) {
1388 ctx->pattern += ctx->pattern[1];
1389 break;
1390 } else {
1391 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1392 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1393 if (!p || !e || e < p) {
1394 ctx->pattern += ctx->pattern[1];
1395 break;
1396 }
1397 }
1398 }
1399 ctx->pattern += 2;
1400 break;
1401
1402 case SRE_OP_ASSERT:
1403 /* assert subpattern */
1404 /* <ASSERT> <skip> <back> <pattern> */
1405 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1406 ctx->ptr, ctx->pattern[1]));
1407 if (ctx->ptr - (SRE_CHAR *)state->beginning < (Py_ssize_t)ctx->pattern[1])
1408 RETURN_FAILURE;
1409 state->ptr = ctx->ptr - ctx->pattern[1];
1410 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
1411 RETURN_ON_FAILURE(ret);
1412 ctx->pattern += ctx->pattern[0];
1413 break;
1414
1415 case SRE_OP_ASSERT_NOT:
1416 /* assert not subpattern */
1417 /* <ASSERT_NOT> <skip> <back> <pattern> */
1418 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1419 ctx->ptr, ctx->pattern[1]));
1420 if (ctx->ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)ctx->pattern[1]) {
1421 state->ptr = ctx->ptr - ctx->pattern[1];
1422 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
1423 if (ret) {
1424 RETURN_ON_ERROR(ret);
1425 RETURN_FAILURE;
1426 }
1427 }
1428 ctx->pattern += ctx->pattern[0];
1429 break;
1430
1431 case SRE_OP_FAILURE:
1432 /* immediate failure */
1433 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1434 RETURN_FAILURE;
1435
1436 default:
1437 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1438 ctx->pattern[-1]));
1439 RETURN_ERROR(SRE_ERROR_ILLEGAL);
1440 }
1441 }
1442
1443 exit:
1444 ctx_pos = ctx->last_ctx_pos;
1445 jump = ctx->jump;
1446 DATA_POP_DISCARD(ctx);
1447 if (ctx_pos == -1)
1448 return ret;
1449 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1450
1451 switch (jump) {
1452 case JUMP_MAX_UNTIL_2:
1453 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1454 goto jump_max_until_2;
1455 case JUMP_MAX_UNTIL_3:
1456 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1457 goto jump_max_until_3;
1458 case JUMP_MIN_UNTIL_2:
1459 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1460 goto jump_min_until_2;
1461 case JUMP_MIN_UNTIL_3:
1462 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1463 goto jump_min_until_3;
1464 case JUMP_BRANCH:
1465 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1466 goto jump_branch;
1467 case JUMP_MAX_UNTIL_1:
1468 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1469 goto jump_max_until_1;
1470 case JUMP_MIN_UNTIL_1:
1471 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1472 goto jump_min_until_1;
1473 case JUMP_REPEAT:
1474 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1475 goto jump_repeat;
1476 case JUMP_REPEAT_ONE_1:
1477 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1478 goto jump_repeat_one_1;
1479 case JUMP_REPEAT_ONE_2:
1480 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1481 goto jump_repeat_one_2;
1482 case JUMP_MIN_REPEAT_ONE:
1483 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1484 goto jump_min_repeat_one;
1485 case JUMP_ASSERT:
1486 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1487 goto jump_assert;
1488 case JUMP_ASSERT_NOT:
1489 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1490 goto jump_assert_not;
1491 case JUMP_NONE:
1492 TRACE(("|%p|%p|RETURN %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
1493 ctx->ptr, ret));
1494 break;
1495 }
1496
1497 return ret; /* should never get here */
1498 }
1499
1500 LOCAL(Py_ssize_t)
SRE_SEARCH(SRE_STATE * state,SRE_CODE * pattern)1501 SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1502 {
1503 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1504 SRE_CHAR* end = (SRE_CHAR *)state->end;
1505 Py_ssize_t status = 0;
1506 Py_ssize_t prefix_len = 0;
1507 Py_ssize_t prefix_skip = 0;
1508 SRE_CODE* prefix = NULL;
1509 SRE_CODE* charset = NULL;
1510 SRE_CODE* overlap = NULL;
1511 int flags = 0;
1512
1513 if (ptr > end)
1514 return 0;
1515
1516 if (pattern[0] == SRE_OP_INFO) {
1517 /* optimization info block */
1518 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1519
1520 flags = pattern[2];
1521
1522 if (pattern[3] && end - ptr < (Py_ssize_t)pattern[3]) {
1523 TRACE(("reject (got %u chars, need %u)\n",
1524 (unsigned int)(end - ptr), pattern[3]));
1525 return 0;
1526 }
1527 if (pattern[3] > 1) {
1528 /* adjust end point (but make sure we leave at least one
1529 character in there, so literal search will work) */
1530 end -= pattern[3]-1;
1531 if (end <= ptr)
1532 end = ptr+1;
1533 }
1534
1535 if (flags & SRE_INFO_PREFIX) {
1536 /* pattern starts with a known prefix */
1537 /* <length> <skip> <prefix data> <overlap data> */
1538 prefix_len = pattern[5];
1539 prefix_skip = pattern[6];
1540 prefix = pattern + 7;
1541 overlap = prefix + prefix_len - 1;
1542 } else if (flags & SRE_INFO_CHARSET)
1543 /* pattern starts with a character from a known set */
1544 /* <charset> */
1545 charset = pattern + 5;
1546
1547 pattern += 1 + pattern[1];
1548 }
1549
1550 TRACE(("prefix = %p %" PY_FORMAT_SIZE_T "d %" PY_FORMAT_SIZE_T "d\n",
1551 prefix, prefix_len, prefix_skip));
1552 TRACE(("charset = %p\n", charset));
1553
1554 #if defined(USE_FAST_SEARCH)
1555 if (prefix_len > 1) {
1556 /* pattern starts with a known prefix. use the overlap
1557 table to skip forward as fast as we possibly can */
1558 Py_ssize_t i = 0;
1559 end = (SRE_CHAR *)state->end;
1560 while (ptr < end) {
1561 for (;;) {
1562 if ((SRE_CODE) ptr[0] != prefix[i]) {
1563 if (!i)
1564 break;
1565 else
1566 i = overlap[i];
1567 } else {
1568 if (++i == prefix_len) {
1569 /* found a potential match */
1570 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1571 state->start = ptr + 1 - prefix_len;
1572 state->ptr = ptr + 1 - prefix_len + prefix_skip;
1573 if (flags & SRE_INFO_LITERAL)
1574 return 1; /* we got all of it */
1575 status = SRE_MATCH(state, pattern + 2*prefix_skip);
1576 if (status != 0)
1577 return status;
1578 /* close but no cigar -- try again */
1579 i = overlap[i];
1580 }
1581 break;
1582 }
1583 }
1584 ptr++;
1585 }
1586 return 0;
1587 }
1588 #endif
1589
1590 if (pattern[0] == SRE_OP_LITERAL) {
1591 /* pattern starts with a literal character. this is used
1592 for short prefixes, and if fast search is disabled */
1593 SRE_CODE chr = pattern[1];
1594 end = (SRE_CHAR *)state->end;
1595 for (;;) {
1596 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1597 ptr++;
1598 if (ptr >= end)
1599 return 0;
1600 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1601 state->start = ptr;
1602 state->ptr = ++ptr;
1603 if (flags & SRE_INFO_LITERAL)
1604 return 1; /* we got all of it */
1605 status = SRE_MATCH(state, pattern + 2);
1606 if (status != 0)
1607 break;
1608 }
1609 } else if (charset) {
1610 /* pattern starts with a character from a known set */
1611 end = (SRE_CHAR *)state->end;
1612 for (;;) {
1613 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
1614 ptr++;
1615 if (ptr >= end)
1616 return 0;
1617 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1618 state->start = ptr;
1619 state->ptr = ptr;
1620 status = SRE_MATCH(state, pattern);
1621 if (status != 0)
1622 break;
1623 ptr++;
1624 }
1625 } else {
1626 /* general case */
1627 assert(ptr <= end);
1628 while (1) {
1629 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1630 state->start = state->ptr = ptr;
1631 status = SRE_MATCH(state, pattern);
1632 if (status != 0 || ptr >= end)
1633 break;
1634 ptr++;
1635 }
1636 }
1637
1638 return status;
1639 }
1640
1641 LOCAL(int)
SRE_LITERAL_TEMPLATE(SRE_CHAR * ptr,Py_ssize_t len)1642 SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
1643 {
1644 /* check if given string is a literal template (i.e. no escapes) */
1645 while (len-- > 0)
1646 if (*ptr++ == '\\')
1647 return 0;
1648 return 1;
1649 }
1650
1651 #if !defined(SRE_RECURSIVE)
1652
1653 /* -------------------------------------------------------------------- */
1654 /* factories and destructors */
1655
1656 /* see sre.h for object declarations */
1657 static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1658 static PyObject*pattern_scanner(PatternObject*, PyObject*);
1659
1660 static PyObject *
sre_codesize(PyObject * self,PyObject * unused)1661 sre_codesize(PyObject* self, PyObject *unused)
1662 {
1663 return PyInt_FromSize_t(sizeof(SRE_CODE));
1664 }
1665
1666 static PyObject *
sre_getlower(PyObject * self,PyObject * args)1667 sre_getlower(PyObject* self, PyObject* args)
1668 {
1669 int character, flags;
1670 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1671 return NULL;
1672 if (flags & SRE_FLAG_LOCALE)
1673 return Py_BuildValue("i", sre_lower_locale(character));
1674 if (flags & SRE_FLAG_UNICODE)
1675 #if defined(HAVE_UNICODE)
1676 return Py_BuildValue("i", sre_lower_unicode(character));
1677 #else
1678 return Py_BuildValue("i", sre_lower_locale(character));
1679 #endif
1680 return Py_BuildValue("i", sre_lower(character));
1681 }
1682
1683 LOCAL(void)
state_reset(SRE_STATE * state)1684 state_reset(SRE_STATE* state)
1685 {
1686 /* FIXME: dynamic! */
1687 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1688
1689 state->lastmark = -1;
1690 state->lastindex = -1;
1691
1692 state->repeat = NULL;
1693
1694 data_stack_dealloc(state);
1695 }
1696
1697 static void*
getstring(PyObject * string,Py_ssize_t * p_length,int * p_charsize)1698 getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
1699 {
1700 /* given a python object, return a data pointer, a length (in
1701 characters), and a character size. return NULL if the object
1702 is not a string (or not compatible) */
1703
1704 PyBufferProcs *buffer;
1705 Py_ssize_t size, bytes;
1706 int charsize;
1707 void* ptr;
1708
1709 #if defined(HAVE_UNICODE)
1710 if (PyUnicode_Check(string)) {
1711 /* unicode strings doesn't always support the buffer interface */
1712 ptr = (void*) PyUnicode_AS_DATA(string);
1713 /* bytes = PyUnicode_GET_DATA_SIZE(string); */
1714 size = PyUnicode_GET_SIZE(string);
1715 charsize = sizeof(Py_UNICODE);
1716
1717 } else {
1718 #endif
1719
1720 /* get pointer to string buffer */
1721 buffer = Py_TYPE(string)->tp_as_buffer;
1722 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1723 buffer->bf_getsegcount(string, NULL) != 1) {
1724 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1725 return NULL;
1726 }
1727
1728 /* determine buffer size */
1729 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1730 if (bytes < 0) {
1731 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1732 return NULL;
1733 }
1734
1735 /* determine character size */
1736 #if PY_VERSION_HEX >= 0x01060000
1737 size = PyObject_Size(string);
1738 #else
1739 size = PyObject_Length(string);
1740 #endif
1741
1742 if (PyString_Check(string) || bytes == size)
1743 charsize = 1;
1744 #if defined(HAVE_UNICODE)
1745 else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
1746 charsize = sizeof(Py_UNICODE);
1747 #endif
1748 else {
1749 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1750 return NULL;
1751 }
1752
1753 #if defined(HAVE_UNICODE)
1754 }
1755 #endif
1756
1757 *p_length = size;
1758 *p_charsize = charsize;
1759
1760 return ptr;
1761 }
1762
1763 LOCAL(PyObject*)
state_init(SRE_STATE * state,PatternObject * pattern,PyObject * string,Py_ssize_t start,Py_ssize_t end)1764 state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1765 Py_ssize_t start, Py_ssize_t end)
1766 {
1767 /* prepare state object */
1768
1769 Py_ssize_t length;
1770 int charsize;
1771 void* ptr;
1772
1773 memset(state, 0, sizeof(SRE_STATE));
1774
1775 state->lastmark = -1;
1776 state->lastindex = -1;
1777
1778 ptr = getstring(string, &length, &charsize);
1779 if (!ptr)
1780 return NULL;
1781
1782 /* adjust boundaries */
1783 if (start < 0)
1784 start = 0;
1785 else if (start > length)
1786 start = length;
1787
1788 if (end < 0)
1789 end = 0;
1790 else if (end > length)
1791 end = length;
1792
1793 state->charsize = charsize;
1794
1795 state->beginning = ptr;
1796
1797 state->start = (void*) ((char*) ptr + start * state->charsize);
1798 state->end = (void*) ((char*) ptr + end * state->charsize);
1799
1800 Py_INCREF(string);
1801 state->string = string;
1802 state->pos = start;
1803 state->endpos = end;
1804
1805 if (pattern->flags & SRE_FLAG_LOCALE)
1806 state->lower = sre_lower_locale;
1807 else if (pattern->flags & SRE_FLAG_UNICODE)
1808 #if defined(HAVE_UNICODE)
1809 state->lower = sre_lower_unicode;
1810 #else
1811 state->lower = sre_lower_locale;
1812 #endif
1813 else
1814 state->lower = sre_lower;
1815
1816 return string;
1817 }
1818
1819 LOCAL(void)
state_fini(SRE_STATE * state)1820 state_fini(SRE_STATE* state)
1821 {
1822 Py_XDECREF(state->string);
1823 data_stack_dealloc(state);
1824 }
1825
1826 /* calculate offset from start of string */
1827 #define STATE_OFFSET(state, member)\
1828 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1829
1830 LOCAL(PyObject*)
state_getslice(SRE_STATE * state,Py_ssize_t index,PyObject * string,int empty)1831 state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
1832 {
1833 Py_ssize_t i, j;
1834
1835 index = (index - 1) * 2;
1836
1837 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
1838 if (empty)
1839 /* want empty string */
1840 i = j = 0;
1841 else {
1842 Py_INCREF(Py_None);
1843 return Py_None;
1844 }
1845 } else {
1846 i = STATE_OFFSET(state, state->mark[index]);
1847 j = STATE_OFFSET(state, state->mark[index+1]);
1848 }
1849
1850 return PySequence_GetSlice(string, i, j);
1851 }
1852
1853 static void
pattern_error(int status)1854 pattern_error(int status)
1855 {
1856 switch (status) {
1857 case SRE_ERROR_RECURSION_LIMIT:
1858 PyErr_SetString(
1859 PyExc_RuntimeError,
1860 "maximum recursion limit exceeded"
1861 );
1862 break;
1863 case SRE_ERROR_MEMORY:
1864 PyErr_NoMemory();
1865 break;
1866 case SRE_ERROR_INTERRUPTED:
1867 /* An exception has already been raised, so let it fly */
1868 break;
1869 default:
1870 /* other error codes indicate compiler/engine bugs */
1871 PyErr_SetString(
1872 PyExc_RuntimeError,
1873 "internal error in regular expression engine"
1874 );
1875 }
1876 }
1877
1878 static void
pattern_dealloc(PatternObject * self)1879 pattern_dealloc(PatternObject* self)
1880 {
1881 if (self->weakreflist != NULL)
1882 PyObject_ClearWeakRefs((PyObject *) self);
1883 Py_XDECREF(self->pattern);
1884 Py_XDECREF(self->groupindex);
1885 Py_XDECREF(self->indexgroup);
1886 PyObject_DEL(self);
1887 }
1888
1889 static int
check_args_size(const char * name,PyObject * args,PyObject * kw,int n)1890 check_args_size(const char *name, PyObject* args, PyObject* kw, int n)
1891 {
1892 Py_ssize_t m = PyTuple_GET_SIZE(args) + (kw ? PyDict_Size(kw) : 0);
1893 if (m <= n)
1894 return 1;
1895 PyErr_Format(PyExc_TypeError,
1896 "%s() takes at most %d positional arguments (%zd given)",
1897 name, n, m);
1898 return 0;
1899 }
1900
1901 static PyObject*
fix_string_param(PyObject * string,PyObject * string2,const char * oldname)1902 fix_string_param(PyObject *string, PyObject *string2, const char *oldname)
1903 {
1904 if (string2 != NULL) {
1905 char buf[100];
1906 if (string != NULL) {
1907 PyErr_Format(PyExc_TypeError,
1908 "Argument given by name ('%s') and position (1)",
1909 oldname);
1910 return NULL;
1911 }
1912 sprintf(buf, "The '%s' keyword parameter name is deprecated. "
1913 "Use 'string' instead.", oldname);
1914 if (PyErr_Warn(PyExc_DeprecationWarning, buf) < 0)
1915 return NULL;
1916 return string2;
1917 }
1918 if (string == NULL) {
1919 PyErr_SetString(PyExc_TypeError,
1920 "Required argument 'string' (pos 1) not found");
1921 return NULL;
1922 }
1923 return string;
1924 }
1925
1926 static PyObject*
pattern_match(PatternObject * self,PyObject * args,PyObject * kw)1927 pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
1928 {
1929 SRE_STATE state;
1930 int status;
1931
1932 PyObject *string = NULL, *string2 = NULL;
1933 Py_ssize_t start = 0;
1934 Py_ssize_t end = PY_SSIZE_T_MAX;
1935 static char* kwlist[] = { "string", "pos", "endpos", "pattern", NULL };
1936 if (!check_args_size("match", args, kw, 3))
1937 return NULL;
1938
1939 if (!PyArg_ParseTupleAndKeywords(args, kw, "|OnnO:match", kwlist,
1940 &string, &start, &end, &string2))
1941 return NULL;
1942
1943 string = fix_string_param(string, string2, "pattern");
1944 if (!string)
1945 return NULL;
1946
1947 string = state_init(&state, self, string, start, end);
1948 if (!string)
1949 return NULL;
1950
1951 state.ptr = state.start;
1952
1953 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1954
1955 if (state.charsize == 1) {
1956 status = sre_match(&state, PatternObject_GetCode(self));
1957 } else {
1958 #if defined(HAVE_UNICODE)
1959 status = sre_umatch(&state, PatternObject_GetCode(self));
1960 #endif
1961 }
1962
1963 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1964 if (PyErr_Occurred())
1965 return NULL;
1966
1967 state_fini(&state);
1968
1969 return pattern_new_match(self, &state, status);
1970 }
1971
1972 static PyObject*
pattern_search(PatternObject * self,PyObject * args,PyObject * kw)1973 pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
1974 {
1975 SRE_STATE state;
1976 int status;
1977
1978 PyObject *string = NULL, *string2 = NULL;
1979 Py_ssize_t start = 0;
1980 Py_ssize_t end = PY_SSIZE_T_MAX;
1981 static char* kwlist[] = { "string", "pos", "endpos", "pattern", NULL };
1982 if (!check_args_size("search", args, kw, 3))
1983 return NULL;
1984
1985 if (!PyArg_ParseTupleAndKeywords(args, kw, "|OnnO:search", kwlist,
1986 &string, &start, &end, &string2))
1987 return NULL;
1988
1989 string = fix_string_param(string, string2, "pattern");
1990 if (!string)
1991 return NULL;
1992
1993 string = state_init(&state, self, string, start, end);
1994 if (!string)
1995 return NULL;
1996
1997 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1998
1999 if (state.charsize == 1) {
2000 status = sre_search(&state, PatternObject_GetCode(self));
2001 } else {
2002 #if defined(HAVE_UNICODE)
2003 status = sre_usearch(&state, PatternObject_GetCode(self));
2004 #endif
2005 }
2006
2007 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
2008
2009 state_fini(&state);
2010
2011 if (PyErr_Occurred())
2012 return NULL;
2013
2014 return pattern_new_match(self, &state, status);
2015 }
2016
2017 static PyObject*
call(char * module,char * function,PyObject * args)2018 call(char* module, char* function, PyObject* args)
2019 {
2020 PyObject* name;
2021 PyObject* mod;
2022 PyObject* func;
2023 PyObject* result;
2024
2025 if (!args)
2026 return NULL;
2027 name = PyString_FromString(module);
2028 if (!name)
2029 return NULL;
2030 mod = PyImport_Import(name);
2031 Py_DECREF(name);
2032 if (!mod)
2033 return NULL;
2034 func = PyObject_GetAttrString(mod, function);
2035 Py_DECREF(mod);
2036 if (!func)
2037 return NULL;
2038 result = PyObject_CallObject(func, args);
2039 Py_DECREF(func);
2040 Py_DECREF(args);
2041 return result;
2042 }
2043
2044 #ifdef USE_BUILTIN_COPY
2045 static int
deepcopy(PyObject ** object,PyObject * memo)2046 deepcopy(PyObject** object, PyObject* memo)
2047 {
2048 PyObject* copy;
2049
2050 copy = call(
2051 "copy", "deepcopy",
2052 PyTuple_Pack(2, *object, memo)
2053 );
2054 if (!copy)
2055 return 0;
2056
2057 Py_SETREF(*object, copy);
2058
2059 return 1; /* success */
2060 }
2061 #endif
2062
2063 static PyObject*
join_list(PyObject * list,PyObject * string)2064 join_list(PyObject* list, PyObject* string)
2065 {
2066 /* join list elements */
2067
2068 PyObject* joiner;
2069 #if PY_VERSION_HEX >= 0x01060000
2070 PyObject* function;
2071 PyObject* args;
2072 #endif
2073 PyObject* result;
2074
2075 joiner = PySequence_GetSlice(string, 0, 0);
2076 if (!joiner)
2077 return NULL;
2078
2079 if (PyList_GET_SIZE(list) == 0) {
2080 Py_DECREF(list);
2081 return joiner;
2082 }
2083
2084 #if PY_VERSION_HEX >= 0x01060000
2085 function = PyObject_GetAttrString(joiner, "join");
2086 if (!function) {
2087 Py_DECREF(joiner);
2088 return NULL;
2089 }
2090 args = PyTuple_New(1);
2091 if (!args) {
2092 Py_DECREF(function);
2093 Py_DECREF(joiner);
2094 return NULL;
2095 }
2096 PyTuple_SET_ITEM(args, 0, list);
2097 result = PyObject_CallObject(function, args);
2098 Py_DECREF(args); /* also removes list */
2099 Py_DECREF(function);
2100 #else
2101 result = call(
2102 "string", "join",
2103 PyTuple_Pack(2, list, joiner)
2104 );
2105 #endif
2106 Py_DECREF(joiner);
2107
2108 return result;
2109 }
2110
2111 static PyObject*
pattern_findall(PatternObject * self,PyObject * args,PyObject * kw)2112 pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
2113 {
2114 SRE_STATE state;
2115 PyObject* list;
2116 int status;
2117 Py_ssize_t i, b, e;
2118
2119 PyObject *string = NULL, *string2 = NULL;
2120 Py_ssize_t start = 0;
2121 Py_ssize_t end = PY_SSIZE_T_MAX;
2122 static char* kwlist[] = { "string", "pos", "endpos", "source", NULL };
2123 if (!check_args_size("findall", args, kw, 3))
2124 return NULL;
2125
2126 if (!PyArg_ParseTupleAndKeywords(args, kw, "|OnnO:findall", kwlist,
2127 &string, &start, &end, &string2))
2128 return NULL;
2129
2130 string = fix_string_param(string, string2, "source");
2131 if (!string)
2132 return NULL;
2133
2134 string = state_init(&state, self, string, start, end);
2135 if (!string)
2136 return NULL;
2137
2138 list = PyList_New(0);
2139 if (!list) {
2140 state_fini(&state);
2141 return NULL;
2142 }
2143
2144 while (state.start <= state.end) {
2145
2146 PyObject* item;
2147
2148 state_reset(&state);
2149
2150 state.ptr = state.start;
2151
2152 if (state.charsize == 1) {
2153 status = sre_search(&state, PatternObject_GetCode(self));
2154 } else {
2155 #if defined(HAVE_UNICODE)
2156 status = sre_usearch(&state, PatternObject_GetCode(self));
2157 #endif
2158 }
2159
2160 if (PyErr_Occurred())
2161 goto error;
2162
2163 if (status <= 0) {
2164 if (status == 0)
2165 break;
2166 pattern_error(status);
2167 goto error;
2168 }
2169
2170 /* don't bother to build a match object */
2171 switch (self->groups) {
2172 case 0:
2173 b = STATE_OFFSET(&state, state.start);
2174 e = STATE_OFFSET(&state, state.ptr);
2175 item = PySequence_GetSlice(string, b, e);
2176 if (!item)
2177 goto error;
2178 break;
2179 case 1:
2180 item = state_getslice(&state, 1, string, 1);
2181 if (!item)
2182 goto error;
2183 break;
2184 default:
2185 item = PyTuple_New(self->groups);
2186 if (!item)
2187 goto error;
2188 for (i = 0; i < self->groups; i++) {
2189 PyObject* o = state_getslice(&state, i+1, string, 1);
2190 if (!o) {
2191 Py_DECREF(item);
2192 goto error;
2193 }
2194 PyTuple_SET_ITEM(item, i, o);
2195 }
2196 break;
2197 }
2198
2199 status = PyList_Append(list, item);
2200 Py_DECREF(item);
2201 if (status < 0)
2202 goto error;
2203
2204 if (state.ptr == state.start)
2205 state.start = (void*) ((char*) state.ptr + state.charsize);
2206 else
2207 state.start = state.ptr;
2208
2209 }
2210
2211 state_fini(&state);
2212 return list;
2213
2214 error:
2215 Py_DECREF(list);
2216 state_fini(&state);
2217 return NULL;
2218
2219 }
2220
2221 #if PY_VERSION_HEX >= 0x02020000
2222 static PyObject*
pattern_finditer(PatternObject * pattern,PyObject * args)2223 pattern_finditer(PatternObject* pattern, PyObject* args)
2224 {
2225 PyObject* scanner;
2226 PyObject* search;
2227 PyObject* iterator;
2228
2229 scanner = pattern_scanner(pattern, args);
2230 if (!scanner)
2231 return NULL;
2232
2233 search = PyObject_GetAttrString(scanner, "search");
2234 Py_DECREF(scanner);
2235 if (!search)
2236 return NULL;
2237
2238 iterator = PyCallIter_New(search, Py_None);
2239 Py_DECREF(search);
2240
2241 return iterator;
2242 }
2243 #endif
2244
2245 static PyObject*
pattern_split(PatternObject * self,PyObject * args,PyObject * kw)2246 pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2247 {
2248 SRE_STATE state;
2249 PyObject* list;
2250 PyObject* item;
2251 int status;
2252 Py_ssize_t n;
2253 Py_ssize_t i;
2254 void* last;
2255
2256 PyObject *string = NULL, *string2 = NULL;
2257 Py_ssize_t maxsplit = 0;
2258 static char* kwlist[] = { "string", "maxsplit", "source", NULL };
2259 if (!check_args_size("split", args, kw, 2))
2260 return NULL;
2261
2262 if (!PyArg_ParseTupleAndKeywords(args, kw, "|OnO:split", kwlist,
2263 &string, &maxsplit, &string2))
2264 return NULL;
2265
2266 string = fix_string_param(string, string2, "source");
2267 if (!string)
2268 return NULL;
2269
2270 if (Py_Py3kWarningFlag &&
2271 (self->code[0] != SRE_OP_INFO || self->code[3] == 0))
2272 {
2273 if (self->code[0] == SRE_OP_INFO && self->code[4] == 0) {
2274 if (PyErr_WarnPy3k("split() requires a non-empty pattern match.",
2275 1) < 0)
2276 return NULL;
2277 }
2278 else if (PyErr_WarnEx(PyExc_FutureWarning,
2279 "split() requires a non-empty pattern match.",
2280 1) < 0)
2281 return NULL;
2282 }
2283
2284 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2285 if (!string)
2286 return NULL;
2287
2288 list = PyList_New(0);
2289 if (!list) {
2290 state_fini(&state);
2291 return NULL;
2292 }
2293
2294 n = 0;
2295 last = state.start;
2296
2297 while (!maxsplit || n < maxsplit) {
2298
2299 state_reset(&state);
2300
2301 state.ptr = state.start;
2302
2303 if (state.charsize == 1) {
2304 status = sre_search(&state, PatternObject_GetCode(self));
2305 } else {
2306 #if defined(HAVE_UNICODE)
2307 status = sre_usearch(&state, PatternObject_GetCode(self));
2308 #endif
2309 }
2310
2311 if (PyErr_Occurred())
2312 goto error;
2313
2314 if (status <= 0) {
2315 if (status == 0)
2316 break;
2317 pattern_error(status);
2318 goto error;
2319 }
2320
2321 if (state.start == state.ptr) {
2322 if (last == state.end || state.ptr == state.end)
2323 break;
2324 /* skip one character */
2325 state.start = (void*) ((char*) state.ptr + state.charsize);
2326 continue;
2327 }
2328
2329 /* get segment before this match */
2330 item = PySequence_GetSlice(
2331 string, STATE_OFFSET(&state, last),
2332 STATE_OFFSET(&state, state.start)
2333 );
2334 if (!item)
2335 goto error;
2336 status = PyList_Append(list, item);
2337 Py_DECREF(item);
2338 if (status < 0)
2339 goto error;
2340
2341 /* add groups (if any) */
2342 for (i = 0; i < self->groups; i++) {
2343 item = state_getslice(&state, i+1, string, 0);
2344 if (!item)
2345 goto error;
2346 status = PyList_Append(list, item);
2347 Py_DECREF(item);
2348 if (status < 0)
2349 goto error;
2350 }
2351
2352 n = n + 1;
2353
2354 last = state.start = state.ptr;
2355
2356 }
2357
2358 /* get segment following last match (even if empty) */
2359 item = PySequence_GetSlice(
2360 string, STATE_OFFSET(&state, last), state.endpos
2361 );
2362 if (!item)
2363 goto error;
2364 status = PyList_Append(list, item);
2365 Py_DECREF(item);
2366 if (status < 0)
2367 goto error;
2368
2369 state_fini(&state);
2370 return list;
2371
2372 error:
2373 Py_DECREF(list);
2374 state_fini(&state);
2375 return NULL;
2376
2377 }
2378
2379 static PyObject*
pattern_subx(PatternObject * self,PyObject * ptemplate,PyObject * string,Py_ssize_t count,Py_ssize_t subn)2380 pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
2381 Py_ssize_t count, Py_ssize_t subn)
2382 {
2383 SRE_STATE state;
2384 PyObject* list;
2385 PyObject* item;
2386 PyObject* filter;
2387 PyObject* args;
2388 PyObject* match;
2389 void* ptr;
2390 int status;
2391 Py_ssize_t n;
2392 Py_ssize_t i, b, e;
2393 int bint;
2394 int filter_is_callable;
2395
2396 if (PyCallable_Check(ptemplate)) {
2397 /* sub/subn takes either a function or a template */
2398 filter = ptemplate;
2399 Py_INCREF(filter);
2400 filter_is_callable = 1;
2401 } else {
2402 /* if not callable, check if it's a literal string */
2403 int literal;
2404 ptr = getstring(ptemplate, &n, &bint);
2405 b = bint;
2406 if (ptr) {
2407 if (b == 1) {
2408 literal = sre_literal_template((unsigned char *)ptr, n);
2409 } else {
2410 #if defined(HAVE_UNICODE)
2411 literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
2412 #endif
2413 }
2414 } else {
2415 PyErr_Clear();
2416 literal = 0;
2417 }
2418 if (literal) {
2419 filter = ptemplate;
2420 Py_INCREF(filter);
2421 filter_is_callable = 0;
2422 } else {
2423 /* not a literal; hand it over to the template compiler */
2424 filter = call(
2425 SRE_PY_MODULE, "_subx",
2426 PyTuple_Pack(2, self, ptemplate)
2427 );
2428 if (!filter)
2429 return NULL;
2430 filter_is_callable = PyCallable_Check(filter);
2431 }
2432 }
2433
2434 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2435 if (!string) {
2436 Py_DECREF(filter);
2437 return NULL;
2438 }
2439
2440 list = PyList_New(0);
2441 if (!list) {
2442 Py_DECREF(filter);
2443 state_fini(&state);
2444 return NULL;
2445 }
2446
2447 n = i = 0;
2448
2449 while (!count || n < count) {
2450
2451 state_reset(&state);
2452
2453 state.ptr = state.start;
2454
2455 if (state.charsize == 1) {
2456 status = sre_search(&state, PatternObject_GetCode(self));
2457 } else {
2458 #if defined(HAVE_UNICODE)
2459 status = sre_usearch(&state, PatternObject_GetCode(self));
2460 #endif
2461 }
2462
2463 if (PyErr_Occurred())
2464 goto error;
2465
2466 if (status <= 0) {
2467 if (status == 0)
2468 break;
2469 pattern_error(status);
2470 goto error;
2471 }
2472
2473 b = STATE_OFFSET(&state, state.start);
2474 e = STATE_OFFSET(&state, state.ptr);
2475
2476 if (i < b) {
2477 /* get segment before this match */
2478 item = PySequence_GetSlice(string, i, b);
2479 if (!item)
2480 goto error;
2481 status = PyList_Append(list, item);
2482 Py_DECREF(item);
2483 if (status < 0)
2484 goto error;
2485
2486 } else if (i == b && i == e && n > 0)
2487 /* ignore empty match on latest position */
2488 goto next;
2489
2490 if (filter_is_callable) {
2491 /* pass match object through filter */
2492 match = pattern_new_match(self, &state, 1);
2493 if (!match)
2494 goto error;
2495 args = PyTuple_Pack(1, match);
2496 if (!args) {
2497 Py_DECREF(match);
2498 goto error;
2499 }
2500 item = PyObject_CallObject(filter, args);
2501 Py_DECREF(args);
2502 Py_DECREF(match);
2503 if (!item)
2504 goto error;
2505 } else {
2506 /* filter is literal string */
2507 item = filter;
2508 Py_INCREF(item);
2509 }
2510
2511 /* add to list */
2512 if (item != Py_None) {
2513 status = PyList_Append(list, item);
2514 Py_DECREF(item);
2515 if (status < 0)
2516 goto error;
2517 }
2518
2519 i = e;
2520 n = n + 1;
2521
2522 next:
2523 /* move on */
2524 if (state.ptr == state.end)
2525 break;
2526 if (state.ptr == state.start)
2527 state.start = (void*) ((char*) state.ptr + state.charsize);
2528 else
2529 state.start = state.ptr;
2530
2531 }
2532
2533 /* get segment following last match */
2534 if (i < state.endpos) {
2535 item = PySequence_GetSlice(string, i, state.endpos);
2536 if (!item)
2537 goto error;
2538 status = PyList_Append(list, item);
2539 Py_DECREF(item);
2540 if (status < 0)
2541 goto error;
2542 }
2543
2544 state_fini(&state);
2545
2546 Py_DECREF(filter);
2547
2548 /* convert list to single string (also removes list) */
2549 item = join_list(list, string);
2550
2551 if (!item)
2552 return NULL;
2553
2554 if (subn)
2555 return Py_BuildValue("Nn", item, n);
2556
2557 return item;
2558
2559 error:
2560 Py_DECREF(list);
2561 state_fini(&state);
2562 Py_DECREF(filter);
2563 return NULL;
2564
2565 }
2566
2567 static PyObject*
pattern_sub(PatternObject * self,PyObject * args,PyObject * kw)2568 pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2569 {
2570 PyObject* ptemplate;
2571 PyObject* string;
2572 Py_ssize_t count = 0;
2573 static char* kwlist[] = { "repl", "string", "count", NULL };
2574 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
2575 &ptemplate, &string, &count))
2576 return NULL;
2577
2578 return pattern_subx(self, ptemplate, string, count, 0);
2579 }
2580
2581 static PyObject*
pattern_subn(PatternObject * self,PyObject * args,PyObject * kw)2582 pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2583 {
2584 PyObject* ptemplate;
2585 PyObject* string;
2586 Py_ssize_t count = 0;
2587 static char* kwlist[] = { "repl", "string", "count", NULL };
2588 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
2589 &ptemplate, &string, &count))
2590 return NULL;
2591
2592 return pattern_subx(self, ptemplate, string, count, 1);
2593 }
2594
2595 static PyObject*
pattern_copy(PatternObject * self,PyObject * unused)2596 pattern_copy(PatternObject* self, PyObject *unused)
2597 {
2598 #ifdef USE_BUILTIN_COPY
2599 PatternObject* copy;
2600 int offset;
2601
2602 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2603 if (!copy)
2604 return NULL;
2605
2606 offset = offsetof(PatternObject, groups);
2607
2608 Py_XINCREF(self->groupindex);
2609 Py_XINCREF(self->indexgroup);
2610 Py_XINCREF(self->pattern);
2611
2612 memcpy((char*) copy + offset, (char*) self + offset,
2613 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2614 copy->weakreflist = NULL;
2615
2616 return (PyObject*) copy;
2617 #else
2618 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2619 return NULL;
2620 #endif
2621 }
2622
2623 static PyObject*
pattern_deepcopy(PatternObject * self,PyObject * memo)2624 pattern_deepcopy(PatternObject* self, PyObject* memo)
2625 {
2626 #ifdef USE_BUILTIN_COPY
2627 PatternObject* copy;
2628
2629 copy = (PatternObject*) pattern_copy(self);
2630 if (!copy)
2631 return NULL;
2632
2633 if (!deepcopy(©->groupindex, memo) ||
2634 !deepcopy(©->indexgroup, memo) ||
2635 !deepcopy(©->pattern, memo)) {
2636 Py_DECREF(copy);
2637 return NULL;
2638 }
2639
2640 #else
2641 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2642 return NULL;
2643 #endif
2644 }
2645
2646 PyDoc_STRVAR(pattern_match_doc,
2647 "match(string[, pos[, endpos]]) --> match object or None.\n\
2648 Matches zero or more characters at the beginning of the string");
2649
2650 PyDoc_STRVAR(pattern_search_doc,
2651 "search(string[, pos[, endpos]]) --> match object or None.\n\
2652 Scan through string looking for a match, and return a corresponding\n\
2653 match object instance. Return None if no position in the string matches.");
2654
2655 PyDoc_STRVAR(pattern_split_doc,
2656 "split(string[, maxsplit = 0]) --> list.\n\
2657 Split string by the occurrences of pattern.");
2658
2659 PyDoc_STRVAR(pattern_findall_doc,
2660 "findall(string[, pos[, endpos]]) --> list.\n\
2661 Return a list of all non-overlapping matches of pattern in string.");
2662
2663 PyDoc_STRVAR(pattern_finditer_doc,
2664 "finditer(string[, pos[, endpos]]) --> iterator.\n\
2665 Return an iterator over all non-overlapping matches for the \n\
2666 RE pattern in string. For each match, the iterator returns a\n\
2667 match object.");
2668
2669 PyDoc_STRVAR(pattern_sub_doc,
2670 "sub(repl, string[, count = 0]) --> newstring\n\
2671 Return the string obtained by replacing the leftmost non-overlapping\n\
2672 occurrences of pattern in string by the replacement repl.");
2673
2674 PyDoc_STRVAR(pattern_subn_doc,
2675 "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2676 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2677 the leftmost non-overlapping occurrences of pattern with the\n\
2678 replacement repl.");
2679
2680 PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2681
2682 static PyMethodDef pattern_methods[] = {
2683 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
2684 pattern_match_doc},
2685 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
2686 pattern_search_doc},
2687 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2688 pattern_sub_doc},
2689 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2690 pattern_subn_doc},
2691 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
2692 pattern_split_doc},
2693 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
2694 pattern_findall_doc},
2695 #if PY_VERSION_HEX >= 0x02020000
2696 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2697 pattern_finditer_doc},
2698 #endif
2699 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
2700 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2701 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
2702 {NULL, NULL}
2703 };
2704
2705 #define PAT_OFF(x) offsetof(PatternObject, x)
2706 static PyMemberDef pattern_members[] = {
2707 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
2708 {"flags", T_INT, PAT_OFF(flags), READONLY},
2709 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
2710 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
2711 {NULL} /* Sentinel */
2712 };
2713
2714 statichere PyTypeObject Pattern_Type = {
2715 PyVarObject_HEAD_INIT(NULL, 0)
2716 "_" SRE_MODULE ".SRE_Pattern",
2717 sizeof(PatternObject), sizeof(SRE_CODE),
2718 (destructor)pattern_dealloc, /*tp_dealloc*/
2719 0, /* tp_print */
2720 0, /* tp_getattrn */
2721 0, /* tp_setattr */
2722 0, /* tp_compare */
2723 0, /* tp_repr */
2724 0, /* tp_as_number */
2725 0, /* tp_as_sequence */
2726 0, /* tp_as_mapping */
2727 0, /* tp_hash */
2728 0, /* tp_call */
2729 0, /* tp_str */
2730 0, /* tp_getattro */
2731 0, /* tp_setattro */
2732 0, /* tp_as_buffer */
2733 Py_TPFLAGS_DEFAULT, /* tp_flags */
2734 pattern_doc, /* tp_doc */
2735 0, /* tp_traverse */
2736 0, /* tp_clear */
2737 0, /* tp_richcompare */
2738 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
2739 0, /* tp_iter */
2740 0, /* tp_iternext */
2741 pattern_methods, /* tp_methods */
2742 pattern_members, /* tp_members */
2743 };
2744
2745 static int _validate(PatternObject *self); /* Forward */
2746
2747 static PyObject *
_compile(PyObject * self_,PyObject * args)2748 _compile(PyObject* self_, PyObject* args)
2749 {
2750 /* "compile" pattern descriptor to pattern object */
2751
2752 PatternObject* self;
2753 Py_ssize_t i, n;
2754
2755 PyObject* pattern;
2756 int flags = 0;
2757 PyObject* code;
2758 Py_ssize_t groups = 0;
2759 PyObject* groupindex = NULL;
2760 PyObject* indexgroup = NULL;
2761 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
2762 &PyList_Type, &code, &groups,
2763 &groupindex, &indexgroup))
2764 return NULL;
2765
2766 n = PyList_GET_SIZE(code);
2767 /* coverity[ampersand_in_size] */
2768 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2769 if (!self)
2770 return NULL;
2771 self->weakreflist = NULL;
2772 self->pattern = NULL;
2773 self->groupindex = NULL;
2774 self->indexgroup = NULL;
2775
2776 self->codesize = n;
2777
2778 for (i = 0; i < n; i++) {
2779 PyObject *o = PyList_GET_ITEM(code, i);
2780 unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o)
2781 : PyLong_AsUnsignedLong(o);
2782 if (value == (unsigned long)-1 && PyErr_Occurred()) {
2783 if (PyErr_ExceptionMatches(PyExc_OverflowError)) {
2784 PyErr_SetString(PyExc_OverflowError,
2785 "regular expression code size limit exceeded");
2786 }
2787 break;
2788 }
2789 self->code[i] = (SRE_CODE) value;
2790 if ((unsigned long) self->code[i] != value) {
2791 PyErr_SetString(PyExc_OverflowError,
2792 "regular expression code size limit exceeded");
2793 break;
2794 }
2795 }
2796
2797 if (PyErr_Occurred()) {
2798 Py_DECREF(self);
2799 return NULL;
2800 }
2801
2802 Py_INCREF(pattern);
2803 self->pattern = pattern;
2804
2805 self->flags = flags;
2806
2807 self->groups = groups;
2808
2809 Py_XINCREF(groupindex);
2810 self->groupindex = groupindex;
2811
2812 Py_XINCREF(indexgroup);
2813 self->indexgroup = indexgroup;
2814
2815 self->weakreflist = NULL;
2816
2817 if (!_validate(self)) {
2818 Py_DECREF(self);
2819 return NULL;
2820 }
2821
2822 return (PyObject*) self;
2823 }
2824
2825 /* -------------------------------------------------------------------- */
2826 /* Code validation */
2827
2828 /* To learn more about this code, have a look at the _compile() function in
2829 Lib/sre_compile.py. The validation functions below checks the code array
2830 for conformance with the code patterns generated there.
2831
2832 The nice thing about the generated code is that it is position-independent:
2833 all jumps are relative jumps forward. Also, jumps don't cross each other:
2834 the target of a later jump is always earlier than the target of an earlier
2835 jump. IOW, this is okay:
2836
2837 J---------J-------T--------T
2838 \ \_____/ /
2839 \______________________/
2840
2841 but this is not:
2842
2843 J---------J-------T--------T
2844 \_________\_____/ /
2845 \____________/
2846
2847 It also helps that SRE_CODE is always an unsigned type.
2848 */
2849
2850 /* Defining this one enables tracing of the validator */
2851 #undef VVERBOSE
2852
2853 /* Trace macro for the validator */
2854 #if defined(VVERBOSE)
2855 #define VTRACE(v) printf v
2856 #else
2857 #define VTRACE(v) do {} while(0) /* do nothing */
2858 #endif
2859
2860 /* Report failure */
2861 #define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2862
2863 /* Extract opcode, argument, or skip count from code array */
2864 #define GET_OP \
2865 do { \
2866 VTRACE(("%p: ", code)); \
2867 if (code >= end) FAIL; \
2868 op = *code++; \
2869 VTRACE(("%lu (op)\n", (unsigned long)op)); \
2870 } while (0)
2871 #define GET_ARG \
2872 do { \
2873 VTRACE(("%p= ", code)); \
2874 if (code >= end) FAIL; \
2875 arg = *code++; \
2876 VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
2877 } while (0)
2878 #define GET_SKIP_ADJ(adj) \
2879 do { \
2880 VTRACE(("%p= ", code)); \
2881 if (code >= end) FAIL; \
2882 skip = *code; \
2883 VTRACE(("%lu (skip to %p)\n", \
2884 (unsigned long)skip, code+skip)); \
2885 if (skip-adj > (Py_uintptr_t)(end - code)) \
2886 FAIL; \
2887 code++; \
2888 } while (0)
2889 #define GET_SKIP GET_SKIP_ADJ(0)
2890
2891 static int
_validate_charset(SRE_CODE * code,SRE_CODE * end)2892 _validate_charset(SRE_CODE *code, SRE_CODE *end)
2893 {
2894 /* Some variables are manipulated by the macros above */
2895 SRE_CODE op;
2896 SRE_CODE arg;
2897 SRE_CODE offset;
2898 int i;
2899
2900 while (code < end) {
2901 GET_OP;
2902 switch (op) {
2903
2904 case SRE_OP_NEGATE:
2905 break;
2906
2907 case SRE_OP_LITERAL:
2908 GET_ARG;
2909 break;
2910
2911 case SRE_OP_RANGE:
2912 GET_ARG;
2913 GET_ARG;
2914 break;
2915
2916 case SRE_OP_CHARSET:
2917 offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
2918 if (offset > (Py_uintptr_t)(end - code))
2919 FAIL;
2920 code += offset;
2921 break;
2922
2923 case SRE_OP_BIGCHARSET:
2924 GET_ARG; /* Number of blocks */
2925 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
2926 if (offset > (Py_uintptr_t)(end - code))
2927 FAIL;
2928 /* Make sure that each byte points to a valid block */
2929 for (i = 0; i < 256; i++) {
2930 if (((unsigned char *)code)[i] >= arg)
2931 FAIL;
2932 }
2933 code += offset;
2934 offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
2935 if (offset > (Py_uintptr_t)(end - code))
2936 FAIL;
2937 code += offset;
2938 break;
2939
2940 case SRE_OP_CATEGORY:
2941 GET_ARG;
2942 switch (arg) {
2943 case SRE_CATEGORY_DIGIT:
2944 case SRE_CATEGORY_NOT_DIGIT:
2945 case SRE_CATEGORY_SPACE:
2946 case SRE_CATEGORY_NOT_SPACE:
2947 case SRE_CATEGORY_WORD:
2948 case SRE_CATEGORY_NOT_WORD:
2949 case SRE_CATEGORY_LINEBREAK:
2950 case SRE_CATEGORY_NOT_LINEBREAK:
2951 case SRE_CATEGORY_LOC_WORD:
2952 case SRE_CATEGORY_LOC_NOT_WORD:
2953 case SRE_CATEGORY_UNI_DIGIT:
2954 case SRE_CATEGORY_UNI_NOT_DIGIT:
2955 case SRE_CATEGORY_UNI_SPACE:
2956 case SRE_CATEGORY_UNI_NOT_SPACE:
2957 case SRE_CATEGORY_UNI_WORD:
2958 case SRE_CATEGORY_UNI_NOT_WORD:
2959 case SRE_CATEGORY_UNI_LINEBREAK:
2960 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2961 break;
2962 default:
2963 FAIL;
2964 }
2965 break;
2966
2967 default:
2968 FAIL;
2969
2970 }
2971 }
2972
2973 return 1;
2974 }
2975
2976 static int
_validate_inner(SRE_CODE * code,SRE_CODE * end,Py_ssize_t groups)2977 _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2978 {
2979 /* Some variables are manipulated by the macros above */
2980 SRE_CODE op;
2981 SRE_CODE arg;
2982 SRE_CODE skip;
2983
2984 VTRACE(("code=%p, end=%p\n", code, end));
2985
2986 if (code > end)
2987 FAIL;
2988
2989 while (code < end) {
2990 GET_OP;
2991 switch (op) {
2992
2993 case SRE_OP_MARK:
2994 /* We don't check whether marks are properly nested; the
2995 sre_match() code is robust even if they don't, and the worst
2996 you can get is nonsensical match results. */
2997 GET_ARG;
2998 if (arg > 2 * (size_t)groups + 1) {
2999 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
3000 FAIL;
3001 }
3002 break;
3003
3004 case SRE_OP_LITERAL:
3005 case SRE_OP_NOT_LITERAL:
3006 case SRE_OP_LITERAL_IGNORE:
3007 case SRE_OP_NOT_LITERAL_IGNORE:
3008 GET_ARG;
3009 /* The arg is just a character, nothing to check */
3010 break;
3011
3012 case SRE_OP_SUCCESS:
3013 case SRE_OP_FAILURE:
3014 /* Nothing to check; these normally end the matching process */
3015 break;
3016
3017 case SRE_OP_AT:
3018 GET_ARG;
3019 switch (arg) {
3020 case SRE_AT_BEGINNING:
3021 case SRE_AT_BEGINNING_STRING:
3022 case SRE_AT_BEGINNING_LINE:
3023 case SRE_AT_END:
3024 case SRE_AT_END_LINE:
3025 case SRE_AT_END_STRING:
3026 case SRE_AT_BOUNDARY:
3027 case SRE_AT_NON_BOUNDARY:
3028 case SRE_AT_LOC_BOUNDARY:
3029 case SRE_AT_LOC_NON_BOUNDARY:
3030 case SRE_AT_UNI_BOUNDARY:
3031 case SRE_AT_UNI_NON_BOUNDARY:
3032 break;
3033 default:
3034 FAIL;
3035 }
3036 break;
3037
3038 case SRE_OP_ANY:
3039 case SRE_OP_ANY_ALL:
3040 /* These have no operands */
3041 break;
3042
3043 case SRE_OP_IN:
3044 case SRE_OP_IN_IGNORE:
3045 GET_SKIP;
3046 /* Stop 1 before the end; we check the FAILURE below */
3047 if (!_validate_charset(code, code+skip-2))
3048 FAIL;
3049 if (code[skip-2] != SRE_OP_FAILURE)
3050 FAIL;
3051 code += skip-1;
3052 break;
3053
3054 case SRE_OP_INFO:
3055 {
3056 /* A minimal info field is
3057 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
3058 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
3059 more follows. */
3060 SRE_CODE flags, i;
3061 SRE_CODE *newcode;
3062 GET_SKIP;
3063 newcode = code+skip-1;
3064 GET_ARG; flags = arg;
3065 GET_ARG; /* min */
3066 GET_ARG; /* max */
3067 /* Check that only valid flags are present */
3068 if ((flags & ~(SRE_INFO_PREFIX |
3069 SRE_INFO_LITERAL |
3070 SRE_INFO_CHARSET)) != 0)
3071 FAIL;
3072 /* PREFIX and CHARSET are mutually exclusive */
3073 if ((flags & SRE_INFO_PREFIX) &&
3074 (flags & SRE_INFO_CHARSET))
3075 FAIL;
3076 /* LITERAL implies PREFIX */
3077 if ((flags & SRE_INFO_LITERAL) &&
3078 !(flags & SRE_INFO_PREFIX))
3079 FAIL;
3080 /* Validate the prefix */
3081 if (flags & SRE_INFO_PREFIX) {
3082 SRE_CODE prefix_len;
3083 GET_ARG; prefix_len = arg;
3084 GET_ARG; /* prefix skip */
3085 /* Here comes the prefix string */
3086 if (prefix_len > (Py_uintptr_t)(newcode - code))
3087 FAIL;
3088 code += prefix_len;
3089 /* And here comes the overlap table */
3090 if (prefix_len > (Py_uintptr_t)(newcode - code))
3091 FAIL;
3092 /* Each overlap value should be < prefix_len */
3093 for (i = 0; i < prefix_len; i++) {
3094 if (code[i] >= prefix_len)
3095 FAIL;
3096 }
3097 code += prefix_len;
3098 }
3099 /* Validate the charset */
3100 if (flags & SRE_INFO_CHARSET) {
3101 if (!_validate_charset(code, newcode-1))
3102 FAIL;
3103 if (newcode[-1] != SRE_OP_FAILURE)
3104 FAIL;
3105 code = newcode;
3106 }
3107 else if (code != newcode) {
3108 VTRACE(("code=%p, newcode=%p\n", code, newcode));
3109 FAIL;
3110 }
3111 }
3112 break;
3113
3114 case SRE_OP_BRANCH:
3115 {
3116 SRE_CODE *target = NULL;
3117 for (;;) {
3118 GET_SKIP;
3119 if (skip == 0)
3120 break;
3121 /* Stop 2 before the end; we check the JUMP below */
3122 if (!_validate_inner(code, code+skip-3, groups))
3123 FAIL;
3124 code += skip-3;
3125 /* Check that it ends with a JUMP, and that each JUMP
3126 has the same target */
3127 GET_OP;
3128 if (op != SRE_OP_JUMP)
3129 FAIL;
3130 GET_SKIP;
3131 if (target == NULL)
3132 target = code+skip-1;
3133 else if (code+skip-1 != target)
3134 FAIL;
3135 }
3136 }
3137 break;
3138
3139 case SRE_OP_REPEAT_ONE:
3140 case SRE_OP_MIN_REPEAT_ONE:
3141 {
3142 SRE_CODE min, max;
3143 GET_SKIP;
3144 GET_ARG; min = arg;
3145 GET_ARG; max = arg;
3146 if (min > max)
3147 FAIL;
3148 if (max > SRE_MAXREPEAT)
3149 FAIL;
3150 if (!_validate_inner(code, code+skip-4, groups))
3151 FAIL;
3152 code += skip-4;
3153 GET_OP;
3154 if (op != SRE_OP_SUCCESS)
3155 FAIL;
3156 }
3157 break;
3158
3159 case SRE_OP_REPEAT:
3160 {
3161 SRE_CODE min, max;
3162 GET_SKIP;
3163 GET_ARG; min = arg;
3164 GET_ARG; max = arg;
3165 if (min > max)
3166 FAIL;
3167 if (max > SRE_MAXREPEAT)
3168 FAIL;
3169 if (!_validate_inner(code, code+skip-3, groups))
3170 FAIL;
3171 code += skip-3;
3172 GET_OP;
3173 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3174 FAIL;
3175 }
3176 break;
3177
3178 case SRE_OP_GROUPREF:
3179 case SRE_OP_GROUPREF_IGNORE:
3180 GET_ARG;
3181 if (arg >= (size_t)groups)
3182 FAIL;
3183 break;
3184
3185 case SRE_OP_GROUPREF_EXISTS:
3186 /* The regex syntax for this is: '(?(group)then|else)', where
3187 'group' is either an integer group number or a group name,
3188 'then' and 'else' are sub-regexes, and 'else' is optional. */
3189 GET_ARG;
3190 if (arg >= (size_t)groups)
3191 FAIL;
3192 GET_SKIP_ADJ(1);
3193 code--; /* The skip is relative to the first arg! */
3194 /* There are two possibilities here: if there is both a 'then'
3195 part and an 'else' part, the generated code looks like:
3196
3197 GROUPREF_EXISTS
3198 <group>
3199 <skipyes>
3200 ...then part...
3201 JUMP
3202 <skipno>
3203 (<skipyes> jumps here)
3204 ...else part...
3205 (<skipno> jumps here)
3206
3207 If there is only a 'then' part, it looks like:
3208
3209 GROUPREF_EXISTS
3210 <group>
3211 <skip>
3212 ...then part...
3213 (<skip> jumps here)
3214
3215 There is no direct way to decide which it is, and we don't want
3216 to allow arbitrary jumps anywhere in the code; so we just look
3217 for a JUMP opcode preceding our skip target.
3218 */
3219 if (skip >= 3 && skip-3 < (Py_uintptr_t)(end - code) &&
3220 code[skip-3] == SRE_OP_JUMP)
3221 {
3222 VTRACE(("both then and else parts present\n"));
3223 if (!_validate_inner(code+1, code+skip-3, groups))
3224 FAIL;
3225 code += skip-2; /* Position after JUMP, at <skipno> */
3226 GET_SKIP;
3227 if (!_validate_inner(code, code+skip-1, groups))
3228 FAIL;
3229 code += skip-1;
3230 }
3231 else {
3232 VTRACE(("only a then part present\n"));
3233 if (!_validate_inner(code+1, code+skip-1, groups))
3234 FAIL;
3235 code += skip-1;
3236 }
3237 break;
3238
3239 case SRE_OP_ASSERT:
3240 case SRE_OP_ASSERT_NOT:
3241 GET_SKIP;
3242 GET_ARG; /* 0 for lookahead, width for lookbehind */
3243 code--; /* Back up over arg to simplify math below */
3244 if (arg & 0x80000000)
3245 FAIL; /* Width too large */
3246 /* Stop 1 before the end; we check the SUCCESS below */
3247 if (!_validate_inner(code+1, code+skip-2, groups))
3248 FAIL;
3249 code += skip-2;
3250 GET_OP;
3251 if (op != SRE_OP_SUCCESS)
3252 FAIL;
3253 break;
3254
3255 default:
3256 FAIL;
3257
3258 }
3259 }
3260
3261 VTRACE(("okay\n"));
3262 return 1;
3263 }
3264
3265 static int
_validate_outer(SRE_CODE * code,SRE_CODE * end,Py_ssize_t groups)3266 _validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3267 {
3268 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3269 FAIL;
3270 if (groups == 0) /* fix for simplejson */
3271 groups = 100; /* 100 groups should always be safe */
3272 return _validate_inner(code, end-1, groups);
3273 }
3274
3275 static int
_validate(PatternObject * self)3276 _validate(PatternObject *self)
3277 {
3278 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3279 {
3280 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3281 return 0;
3282 }
3283 else
3284 VTRACE(("Success!\n"));
3285 return 1;
3286 }
3287
3288 /* -------------------------------------------------------------------- */
3289 /* match methods */
3290
3291 static void
match_dealloc(MatchObject * self)3292 match_dealloc(MatchObject* self)
3293 {
3294 Py_XDECREF(self->regs);
3295 Py_XDECREF(self->string);
3296 Py_DECREF(self->pattern);
3297 PyObject_DEL(self);
3298 }
3299
3300 static PyObject*
match_getslice_by_index(MatchObject * self,Py_ssize_t index,PyObject * def)3301 match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
3302 {
3303 if (index < 0 || index >= self->groups) {
3304 /* raise IndexError if we were given a bad group number */
3305 PyErr_SetString(
3306 PyExc_IndexError,
3307 "no such group"
3308 );
3309 return NULL;
3310 }
3311
3312 index *= 2;
3313
3314 if (self->string == Py_None || self->mark[index] < 0) {
3315 /* return default value if the string or group is undefined */
3316 Py_INCREF(def);
3317 return def;
3318 }
3319
3320 return PySequence_GetSlice(
3321 self->string, self->mark[index], self->mark[index+1]
3322 );
3323 }
3324
3325 static Py_ssize_t
match_getindex(MatchObject * self,PyObject * index)3326 match_getindex(MatchObject* self, PyObject* index)
3327 {
3328 Py_ssize_t i;
3329
3330 if (_PyAnyInt_Check(index))
3331 return PyInt_AsSsize_t(index);
3332
3333 i = -1;
3334
3335 if (self->pattern->groupindex) {
3336 index = PyObject_GetItem(self->pattern->groupindex, index);
3337 if (index) {
3338 if (_PyAnyInt_Check(index))
3339 i = PyInt_AsSsize_t(index);
3340 Py_DECREF(index);
3341 } else
3342 PyErr_Clear();
3343 }
3344
3345 return i;
3346 }
3347
3348 static PyObject*
match_getslice(MatchObject * self,PyObject * index,PyObject * def)3349 match_getslice(MatchObject* self, PyObject* index, PyObject* def)
3350 {
3351 return match_getslice_by_index(self, match_getindex(self, index), def);
3352 }
3353
3354 static PyObject*
match_expand(MatchObject * self,PyObject * ptemplate)3355 match_expand(MatchObject* self, PyObject* ptemplate)
3356 {
3357 /* delegate to Python code */
3358 return call(
3359 SRE_PY_MODULE, "_expand",
3360 PyTuple_Pack(3, self->pattern, self, ptemplate)
3361 );
3362 }
3363
3364 static PyObject*
match_group(MatchObject * self,PyObject * args)3365 match_group(MatchObject* self, PyObject* args)
3366 {
3367 PyObject* result;
3368 Py_ssize_t i, size;
3369
3370 size = PyTuple_GET_SIZE(args);
3371
3372 switch (size) {
3373 case 0:
3374 result = match_getslice(self, Py_False, Py_None);
3375 break;
3376 case 1:
3377 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3378 break;
3379 default:
3380 /* fetch multiple items */
3381 result = PyTuple_New(size);
3382 if (!result)
3383 return NULL;
3384 for (i = 0; i < size; i++) {
3385 PyObject* item = match_getslice(
3386 self, PyTuple_GET_ITEM(args, i), Py_None
3387 );
3388 if (!item) {
3389 Py_DECREF(result);
3390 return NULL;
3391 }
3392 PyTuple_SET_ITEM(result, i, item);
3393 }
3394 break;
3395 }
3396 return result;
3397 }
3398
3399 static PyObject*
match_groups(MatchObject * self,PyObject * args,PyObject * kw)3400 match_groups(MatchObject* self, PyObject* args, PyObject* kw)
3401 {
3402 PyObject* result;
3403 Py_ssize_t index;
3404
3405 PyObject* def = Py_None;
3406 static char* kwlist[] = { "default", NULL };
3407 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
3408 return NULL;
3409
3410 result = PyTuple_New(self->groups-1);
3411 if (!result)
3412 return NULL;
3413
3414 for (index = 1; index < self->groups; index++) {
3415 PyObject* item;
3416 item = match_getslice_by_index(self, index, def);
3417 if (!item) {
3418 Py_DECREF(result);
3419 return NULL;
3420 }
3421 PyTuple_SET_ITEM(result, index-1, item);
3422 }
3423
3424 return result;
3425 }
3426
3427 static PyObject*
match_groupdict(MatchObject * self,PyObject * args,PyObject * kw)3428 match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
3429 {
3430 PyObject* result;
3431 PyObject* keys;
3432 Py_ssize_t index;
3433
3434 PyObject* def = Py_None;
3435 static char* kwlist[] = { "default", NULL };
3436 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
3437 return NULL;
3438
3439 result = PyDict_New();
3440 if (!result || !self->pattern->groupindex)
3441 return result;
3442
3443 keys = PyMapping_Keys(self->pattern->groupindex);
3444 if (!keys)
3445 goto failed;
3446
3447 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
3448 int status;
3449 PyObject* key;
3450 PyObject* value;
3451 key = PyList_GET_ITEM(keys, index);
3452 if (!key)
3453 goto failed;
3454 value = match_getslice(self, key, def);
3455 if (!value)
3456 goto failed;
3457 status = PyDict_SetItem(result, key, value);
3458 Py_DECREF(value);
3459 if (status < 0)
3460 goto failed;
3461 }
3462
3463 Py_DECREF(keys);
3464
3465 return result;
3466
3467 failed:
3468 Py_XDECREF(keys);
3469 Py_DECREF(result);
3470 return NULL;
3471 }
3472
3473 static PyObject*
match_start(MatchObject * self,PyObject * args)3474 match_start(MatchObject* self, PyObject* args)
3475 {
3476 Py_ssize_t index;
3477
3478 PyObject* index_ = Py_False; /* zero */
3479 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
3480 return NULL;
3481
3482 index = match_getindex(self, index_);
3483
3484 if (index < 0 || index >= self->groups) {
3485 PyErr_SetString(
3486 PyExc_IndexError,
3487 "no such group"
3488 );
3489 return NULL;
3490 }
3491
3492 /* mark is -1 if group is undefined */
3493 return PyInt_FromSsize_t(self->mark[index*2]);
3494 }
3495
3496 static PyObject*
match_end(MatchObject * self,PyObject * args)3497 match_end(MatchObject* self, PyObject* args)
3498 {
3499 Py_ssize_t index;
3500
3501 PyObject* index_ = Py_False; /* zero */
3502 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
3503 return NULL;
3504
3505 index = match_getindex(self, index_);
3506
3507 if (index < 0 || index >= self->groups) {
3508 PyErr_SetString(
3509 PyExc_IndexError,
3510 "no such group"
3511 );
3512 return NULL;
3513 }
3514
3515 /* mark is -1 if group is undefined */
3516 return PyInt_FromSsize_t(self->mark[index*2+1]);
3517 }
3518
3519 LOCAL(PyObject*)
_pair(Py_ssize_t i1,Py_ssize_t i2)3520 _pair(Py_ssize_t i1, Py_ssize_t i2)
3521 {
3522 PyObject* pair;
3523 PyObject* item;
3524
3525 pair = PyTuple_New(2);
3526 if (!pair)
3527 return NULL;
3528
3529 item = PyInt_FromSsize_t(i1);
3530 if (!item)
3531 goto error;
3532 PyTuple_SET_ITEM(pair, 0, item);
3533
3534 item = PyInt_FromSsize_t(i2);
3535 if (!item)
3536 goto error;
3537 PyTuple_SET_ITEM(pair, 1, item);
3538
3539 return pair;
3540
3541 error:
3542 Py_DECREF(pair);
3543 return NULL;
3544 }
3545
3546 static PyObject*
match_span(MatchObject * self,PyObject * args)3547 match_span(MatchObject* self, PyObject* args)
3548 {
3549 Py_ssize_t index;
3550
3551 PyObject* index_ = Py_False; /* zero */
3552 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
3553 return NULL;
3554
3555 index = match_getindex(self, index_);
3556
3557 if (index < 0 || index >= self->groups) {
3558 PyErr_SetString(
3559 PyExc_IndexError,
3560 "no such group"
3561 );
3562 return NULL;
3563 }
3564
3565 /* marks are -1 if group is undefined */
3566 return _pair(self->mark[index*2], self->mark[index*2+1]);
3567 }
3568
3569 static PyObject*
match_regs(MatchObject * self)3570 match_regs(MatchObject* self)
3571 {
3572 PyObject* regs;
3573 PyObject* item;
3574 Py_ssize_t index;
3575
3576 regs = PyTuple_New(self->groups);
3577 if (!regs)
3578 return NULL;
3579
3580 for (index = 0; index < self->groups; index++) {
3581 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3582 if (!item) {
3583 Py_DECREF(regs);
3584 return NULL;
3585 }
3586 PyTuple_SET_ITEM(regs, index, item);
3587 }
3588
3589 Py_INCREF(regs);
3590 self->regs = regs;
3591
3592 return regs;
3593 }
3594
3595 static PyObject*
match_copy(MatchObject * self,PyObject * unused)3596 match_copy(MatchObject* self, PyObject *unused)
3597 {
3598 #ifdef USE_BUILTIN_COPY
3599 MatchObject* copy;
3600 Py_ssize_t slots, offset;
3601
3602 slots = 2 * (self->pattern->groups+1);
3603
3604 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3605 if (!copy)
3606 return NULL;
3607
3608 /* this value a constant, but any compiler should be able to
3609 figure that out all by itself */
3610 offset = offsetof(MatchObject, string);
3611
3612 Py_XINCREF(self->pattern);
3613 Py_XINCREF(self->string);
3614 Py_XINCREF(self->regs);
3615
3616 memcpy((char*) copy + offset, (char*) self + offset,
3617 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
3618
3619 return (PyObject*) copy;
3620 #else
3621 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
3622 return NULL;
3623 #endif
3624 }
3625
3626 static PyObject*
match_deepcopy(MatchObject * self,PyObject * memo)3627 match_deepcopy(MatchObject* self, PyObject* memo)
3628 {
3629 #ifdef USE_BUILTIN_COPY
3630 MatchObject* copy;
3631
3632 copy = (MatchObject*) match_copy(self);
3633 if (!copy)
3634 return NULL;
3635
3636 if (!deepcopy((PyObject**) ©->pattern, memo) ||
3637 !deepcopy(©->string, memo) ||
3638 !deepcopy(©->regs, memo)) {
3639 Py_DECREF(copy);
3640 return NULL;
3641 }
3642
3643 #else
3644 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3645 return NULL;
3646 #endif
3647 }
3648
3649 PyDoc_STRVAR(match_doc,
3650 "The result of re.match() and re.search().\n\
3651 Match objects always have a boolean value of True.");
3652
3653 PyDoc_STRVAR(match_group_doc,
3654 "group([group1, ...]) -> str or tuple.\n\
3655 Return subgroup(s) of the match by indices or names.\n\
3656 For 0 returns the entire match.");
3657
3658 PyDoc_STRVAR(match_start_doc,
3659 "start([group=0]) -> int.\n\
3660 Return index of the start of the substring matched by group.");
3661
3662 PyDoc_STRVAR(match_end_doc,
3663 "end([group=0]) -> int.\n\
3664 Return index of the end of the substring matched by group.");
3665
3666 PyDoc_STRVAR(match_span_doc,
3667 "span([group]) -> tuple.\n\
3668 For MatchObject m, return the 2-tuple (m.start(group), m.end(group)).");
3669
3670 PyDoc_STRVAR(match_groups_doc,
3671 "groups([default=None]) -> tuple.\n\
3672 Return a tuple containing all the subgroups of the match, from 1.\n\
3673 The default argument is used for groups\n\
3674 that did not participate in the match");
3675
3676 PyDoc_STRVAR(match_groupdict_doc,
3677 "groupdict([default=None]) -> dict.\n\
3678 Return a dictionary containing all the named subgroups of the match,\n\
3679 keyed by the subgroup name. The default argument is used for groups\n\
3680 that did not participate in the match");
3681
3682 PyDoc_STRVAR(match_expand_doc,
3683 "expand(template) -> str.\n\
3684 Return the string obtained by doing backslash substitution\n\
3685 on the string template, as done by the sub() method.");
3686
3687 static PyMethodDef match_methods[] = {
3688 {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
3689 {"start", (PyCFunction) match_start, METH_VARARGS, match_start_doc},
3690 {"end", (PyCFunction) match_end, METH_VARARGS, match_end_doc},
3691 {"span", (PyCFunction) match_span, METH_VARARGS, match_span_doc},
3692 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS,
3693 match_groups_doc},
3694 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS,
3695 match_groupdict_doc},
3696 {"expand", (PyCFunction) match_expand, METH_O, match_expand_doc},
3697 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3698 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
3699 {NULL, NULL}
3700 };
3701
3702 static PyObject *
match_lastindex_get(MatchObject * self)3703 match_lastindex_get(MatchObject *self)
3704 {
3705 if (self->lastindex >= 0)
3706 return PyInt_FromSsize_t(self->lastindex);
3707 Py_INCREF(Py_None);
3708 return Py_None;
3709 }
3710
3711 static PyObject *
match_lastgroup_get(MatchObject * self)3712 match_lastgroup_get(MatchObject *self)
3713 {
3714 if (self->pattern->indexgroup && self->lastindex >= 0) {
3715 PyObject* result = PySequence_GetItem(
3716 self->pattern->indexgroup, self->lastindex
3717 );
3718 if (result)
3719 return result;
3720 PyErr_Clear();
3721 }
3722 Py_INCREF(Py_None);
3723 return Py_None;
3724 }
3725
3726 static PyObject *
match_regs_get(MatchObject * self)3727 match_regs_get(MatchObject *self)
3728 {
3729 if (self->regs) {
3730 Py_INCREF(self->regs);
3731 return self->regs;
3732 } else
3733 return match_regs(self);
3734 }
3735
3736 static PyGetSetDef match_getset[] = {
3737 {"lastindex", (getter)match_lastindex_get, (setter)NULL},
3738 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3739 {"regs", (getter)match_regs_get, (setter)NULL},
3740 {NULL}
3741 };
3742
3743 #define MATCH_OFF(x) offsetof(MatchObject, x)
3744 static PyMemberDef match_members[] = {
3745 {"string", T_OBJECT, MATCH_OFF(string), READONLY},
3746 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
3747 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
3748 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
3749 {NULL}
3750 };
3751
3752
3753 /* FIXME: implement setattr("string", None) as a special case (to
3754 detach the associated string, if any */
3755
3756 static PyTypeObject Match_Type = {
3757 PyVarObject_HEAD_INIT(NULL, 0)
3758 "_" SRE_MODULE ".SRE_Match",
3759 sizeof(MatchObject), sizeof(Py_ssize_t),
3760 (destructor)match_dealloc, /* tp_dealloc */
3761 0, /* tp_print */
3762 0, /* tp_getattr */
3763 0, /* tp_setattr */
3764 0, /* tp_compare */
3765 0, /* tp_repr */
3766 0, /* tp_as_number */
3767 0, /* tp_as_sequence */
3768 0, /* tp_as_mapping */
3769 0, /* tp_hash */
3770 0, /* tp_call */
3771 0, /* tp_str */
3772 0, /* tp_getattro */
3773 0, /* tp_setattro */
3774 0, /* tp_as_buffer */
3775 Py_TPFLAGS_DEFAULT,
3776 match_doc, /* tp_doc */
3777 0, /* tp_traverse */
3778 0, /* tp_clear */
3779 0, /* tp_richcompare */
3780 0, /* tp_weaklistoffset */
3781 0, /* tp_iter */
3782 0, /* tp_iternext */
3783 match_methods, /* tp_methods */
3784 match_members, /* tp_members */
3785 match_getset, /* tp_getset */
3786 };
3787
3788 static PyObject*
pattern_new_match(PatternObject * pattern,SRE_STATE * state,int status)3789 pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3790 {
3791 /* create match object (from state object) */
3792
3793 MatchObject* match;
3794 Py_ssize_t i, j;
3795 char* base;
3796 int n;
3797
3798 if (status > 0) {
3799
3800 /* create match object (with room for extra group marks) */
3801 /* coverity[ampersand_in_size] */
3802 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3803 2*(pattern->groups+1));
3804 if (!match)
3805 return NULL;
3806
3807 Py_INCREF(pattern);
3808 match->pattern = pattern;
3809
3810 Py_INCREF(state->string);
3811 match->string = state->string;
3812
3813 match->regs = NULL;
3814 match->groups = pattern->groups+1;
3815
3816 /* fill in group slices */
3817
3818 base = (char*) state->beginning;
3819 n = state->charsize;
3820
3821 match->mark[0] = ((char*) state->start - base) / n;
3822 match->mark[1] = ((char*) state->ptr - base) / n;
3823
3824 for (i = j = 0; i < pattern->groups; i++, j+=2)
3825 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3826 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3827 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3828 } else
3829 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3830
3831 match->pos = state->pos;
3832 match->endpos = state->endpos;
3833
3834 match->lastindex = state->lastindex;
3835
3836 return (PyObject*) match;
3837
3838 } else if (status == 0) {
3839
3840 /* no match */
3841 Py_INCREF(Py_None);
3842 return Py_None;
3843
3844 }
3845
3846 /* internal error */
3847 pattern_error(status);
3848 return NULL;
3849 }
3850
3851
3852 /* -------------------------------------------------------------------- */
3853 /* scanner methods (experimental) */
3854
3855 static void
scanner_dealloc(ScannerObject * self)3856 scanner_dealloc(ScannerObject* self)
3857 {
3858 state_fini(&self->state);
3859 Py_XDECREF(self->pattern);
3860 PyObject_DEL(self);
3861 }
3862
3863 static PyObject*
scanner_match(ScannerObject * self,PyObject * unused)3864 scanner_match(ScannerObject* self, PyObject *unused)
3865 {
3866 SRE_STATE* state = &self->state;
3867 PyObject* match;
3868 int status;
3869
3870 if (state->start == NULL)
3871 Py_RETURN_NONE;
3872
3873 state_reset(state);
3874
3875 state->ptr = state->start;
3876
3877 if (state->charsize == 1) {
3878 status = sre_match(state, PatternObject_GetCode(self->pattern));
3879 } else {
3880 #if defined(HAVE_UNICODE)
3881 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
3882 #endif
3883 }
3884 if (PyErr_Occurred())
3885 return NULL;
3886
3887 match = pattern_new_match((PatternObject*) self->pattern,
3888 state, status);
3889
3890 if (status == 0)
3891 state->start = NULL;
3892 else if (state->ptr != state->start)
3893 state->start = state->ptr;
3894 else if (state->ptr != state->end)
3895 state->start = (void*) ((char*) state->ptr + state->charsize);
3896 else
3897 state->start = NULL;
3898
3899 return match;
3900 }
3901
3902
3903 static PyObject*
scanner_search(ScannerObject * self,PyObject * unused)3904 scanner_search(ScannerObject* self, PyObject *unused)
3905 {
3906 SRE_STATE* state = &self->state;
3907 PyObject* match;
3908 int status;
3909
3910 if (state->start == NULL)
3911 Py_RETURN_NONE;
3912
3913 state_reset(state);
3914
3915 state->ptr = state->start;
3916
3917 if (state->charsize == 1) {
3918 status = sre_search(state, PatternObject_GetCode(self->pattern));
3919 } else {
3920 #if defined(HAVE_UNICODE)
3921 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3922 #endif
3923 }
3924 if (PyErr_Occurred())
3925 return NULL;
3926
3927 match = pattern_new_match((PatternObject*) self->pattern,
3928 state, status);
3929
3930 if (status == 0)
3931 state->start = NULL;
3932 else if (state->ptr != state->start)
3933 state->start = state->ptr;
3934 else if (state->ptr != state->end)
3935 state->start = (void*) ((char*) state->ptr + state->charsize);
3936 else
3937 state->start = NULL;
3938
3939 return match;
3940 }
3941
3942 static PyMethodDef scanner_methods[] = {
3943 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3944 {"search", (PyCFunction) scanner_search, METH_NOARGS},
3945 {NULL, NULL}
3946 };
3947
3948 #define SCAN_OFF(x) offsetof(ScannerObject, x)
3949 static PyMemberDef scanner_members[] = {
3950 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
3951 {NULL} /* Sentinel */
3952 };
3953
3954 statichere PyTypeObject Scanner_Type = {
3955 PyVarObject_HEAD_INIT(NULL, 0)
3956 "_" SRE_MODULE ".SRE_Scanner",
3957 sizeof(ScannerObject), 0,
3958 (destructor)scanner_dealloc, /*tp_dealloc*/
3959 0, /* tp_print */
3960 0, /* tp_getattr */
3961 0, /* tp_setattr */
3962 0, /* tp_reserved */
3963 0, /* tp_repr */
3964 0, /* tp_as_number */
3965 0, /* tp_as_sequence */
3966 0, /* tp_as_mapping */
3967 0, /* tp_hash */
3968 0, /* tp_call */
3969 0, /* tp_str */
3970 0, /* tp_getattro */
3971 0, /* tp_setattro */
3972 0, /* tp_as_buffer */
3973 Py_TPFLAGS_DEFAULT, /* tp_flags */
3974 0, /* tp_doc */
3975 0, /* tp_traverse */
3976 0, /* tp_clear */
3977 0, /* tp_richcompare */
3978 0, /* tp_weaklistoffset */
3979 0, /* tp_iter */
3980 0, /* tp_iternext */
3981 scanner_methods, /* tp_methods */
3982 scanner_members, /* tp_members */
3983 0, /* tp_getset */
3984 };
3985
3986 static PyObject*
pattern_scanner(PatternObject * pattern,PyObject * args)3987 pattern_scanner(PatternObject* pattern, PyObject* args)
3988 {
3989 /* create search state object */
3990
3991 ScannerObject* self;
3992
3993 PyObject* string;
3994 Py_ssize_t start = 0;
3995 Py_ssize_t end = PY_SSIZE_T_MAX;
3996 if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
3997 return NULL;
3998
3999 /* create scanner object */
4000 self = PyObject_NEW(ScannerObject, &Scanner_Type);
4001 if (!self)
4002 return NULL;
4003 self->pattern = NULL;
4004
4005 string = state_init(&self->state, pattern, string, start, end);
4006 if (!string) {
4007 Py_DECREF(self);
4008 return NULL;
4009 }
4010
4011 Py_INCREF(pattern);
4012 self->pattern = (PyObject*) pattern;
4013
4014 return (PyObject*) self;
4015 }
4016
4017 static PyMethodDef _functions[] = {
4018 {"compile", _compile, METH_VARARGS},
4019 {"getcodesize", sre_codesize, METH_NOARGS},
4020 {"getlower", sre_getlower, METH_VARARGS},
4021 {NULL, NULL}
4022 };
4023
4024 #if PY_VERSION_HEX < 0x02030000
init_sre(void)4025 DL_EXPORT(void) init_sre(void)
4026 #else
4027 PyMODINIT_FUNC init_sre(void)
4028 #endif
4029 {
4030 PyObject* m;
4031 PyObject* d;
4032 PyObject* x;
4033
4034 /* Patch object types */
4035 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
4036 PyType_Ready(&Scanner_Type))
4037 return;
4038
4039 m = Py_InitModule("_" SRE_MODULE, _functions);
4040 if (m == NULL)
4041 return;
4042 d = PyModule_GetDict(m);
4043
4044 x = PyInt_FromLong(SRE_MAGIC);
4045 if (x) {
4046 PyDict_SetItemString(d, "MAGIC", x);
4047 Py_DECREF(x);
4048 }
4049
4050 x = PyInt_FromLong(sizeof(SRE_CODE));
4051 if (x) {
4052 PyDict_SetItemString(d, "CODESIZE", x);
4053 Py_DECREF(x);
4054 }
4055
4056 x = PyLong_FromUnsignedLong(SRE_MAXREPEAT);
4057 if (x) {
4058 PyDict_SetItemString(d, "MAXREPEAT", x);
4059 Py_DECREF(x);
4060 }
4061
4062 x = PyString_FromString(copyright);
4063 if (x) {
4064 PyDict_SetItemString(d, "copyright", x);
4065 Py_DECREF(x);
4066 }
4067 }
4068
4069 #endif /* !defined(SRE_RECURSIVE) */
4070
4071 /* vim:ts=4:sw=4:et
4072 */
4073