• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // -V::506
2 
3 /* Copyright (c) 2014 by Ian Piumarta
4  * All rights reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the 'Software'),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, provided that the above copyright notice(s) and this
12  * permission notice appear in all copies of the Software.  Acknowledgement
13  * of the use of this Software in supporting documentation would be
14  * appreciated but is not required.
15  *
16  * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
17  */
18 
19 #include <ctype.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <setjmp.h>
23 #include <assert.h>
24 
25 #include "lwre.h"
26 
27 #ifndef RE_ERROR
28 # define RE_ERROR(RE, CODE, MESSAGE) { (RE)->error_message = (MESSAGE);  \
29                                        longjmp(*(RE)->error_env, (re->error_code = RE_ERROR_ ## CODE)); }
30 #endif
31 
32 #ifndef RE_MALLOC
33 # define RE_MALLOC(RE, SIZE) re__malloc((RE), (SIZE))
34 
re__malloc(struct re * re,size_t size)35 static void *re__malloc(struct re *re, size_t size) {
36   void *p = malloc(size);
37   if (!p) {
38     RE_ERROR(re, NOMEM, "out of memory");
39   }
40   return p;
41 }
42 
43 #endif
44 
45 #ifndef RE_CALLOC
46 # define RE_CALLOC(RE, NMEMB, SIZE) re__calloc((RE), (NMEMB), (SIZE))
47 
re__calloc(struct re * re,size_t nmemb,size_t size)48 static void *re__calloc(struct re *re, size_t nmemb, size_t size) {
49   void *p = calloc(nmemb, size);
50   if (p) {
51     return p;
52   }
53   if (re) {
54     RE_ERROR(re, NOMEM, "out of memory");
55   }
56   return p;
57 }
58 
59 #endif
60 
61 #ifndef RE_REALLOC
62 # define RE_REALLOC(RE, PTR, SIZE) re__realloc((RE), (PTR), (SIZE))
63 
re__realloc(struct re * re,void * ptr,size_t size)64 static inline void *re__realloc(struct re *re, void *ptr, size_t size) {
65   void *p = realloc(ptr, size);
66   if (!p) {
67     RE_ERROR(re, NOMEM, "out of memory");
68   }
69   return p;
70 }
71 
72 #endif
73 
74 #ifndef RE_FREE
75 # define RE_FREE(RE, PTR) free(PTR)
76 #endif
77 
78 /* arrays */
79 
80 #define re_array_of(TYPE) \
81   struct {      \
82     int   size;    \
83     int   capacity;  \
84     TYPE *at;    \
85   }
86 
87 #define RE_ARRAY_OF_INITIALISER { 0, 0, 0 }
88 
89 #define re_array_append(RE, ARRAY, ELEMENT)                 \
90   ((ARRAY).size++,                        \
91    (((ARRAY).size > (ARRAY).capacity)                   \
92     ? ((ARRAY).at = RE_REALLOC((RE), (ARRAY).at,               \
93                                sizeof(*(ARRAY).at) * ((ARRAY).capacity = ((ARRAY).capacity      \
94                                                                           ? ((ARRAY).capacity * 2)    \
95                                                                           : 8))))       \
96     : (ARRAY).at)                       \
97    [(ARRAY).size - 1] = (ELEMENT))
98 
99 #define re_array_copy(RE, ARRAY)          \
100   { (ARRAY).size,             \
101     (ARRAY).size,             \
102     memcpy(RE_MALLOC((RE), sizeof((ARRAY).at[0]) * (ARRAY).size), \
103            (ARRAY).at,            \
104            sizeof((ARRAY).at[0]) * (ARRAY).size) }
105 
106 #define re_array_release(RE, ARRAY) {   \
107     if ((ARRAY).at) {     \
108       RE_FREE((RE), (ARRAY).at);    \
109       (ARRAY).at = 0;      \
110     }         \
111 }
112 
113 /* bit sets */
114 
115 struct RE_BitSet;
116 typedef struct RE_BitSet RE_BitSet;
117 
118 struct RE_BitSet {
119   int inverted;
120   unsigned char bits[256 / sizeof(unsigned char)];
121 };
122 
re_bitset__includes(RE_BitSet * c,int i)123 static int re_bitset__includes(RE_BitSet *c, int i) {
124   if ((i < 0) || (255 < i)) {
125     return 0;
126   }
127   return (c->bits[i / 8] >> (i % 8)) & 1;
128 }
129 
re_bitset_includes(RE_BitSet * c,int i)130 static int re_bitset_includes(RE_BitSet *c, int i) {
131   int inc = re_bitset__includes(c, i);
132   if (c->inverted) {
133     inc = !inc;
134   }
135   return inc;
136 }
137 
re_bitset_add(RE_BitSet * c,int i)138 static void re_bitset_add(RE_BitSet *c, int i) {
139   if ((i < 0) || (255 < i)) {
140     return;
141   }
142   c->bits[i / 8] |= (1 << (i % 8));
143 }
144 
145 /* character classes */
146 
re_make_char(struct re * re)147 static int re_make_char(struct re *re) {
148   const char *p = re->position;
149   if (!*p) {
150     return 0;
151   }
152   int c = *p++;
153   if (('\\' == c) && *p) {
154     c = *p++;
155   }
156   re->position = p;
157   return c;
158 }
159 
re_make_class(struct re * re)160 static RE_BitSet *re_make_class(struct re *re) {
161   RE_BitSet *c = RE_CALLOC(re, 1, sizeof(RE_BitSet));
162   int last = -1;
163   c->inverted = ('^' == *re->position); // -V522
164   if (c->inverted) {
165     re->position++;
166   }
167   while (*re->position && (']' != *re->position)) {
168     int this = re->position[0];
169     if (('-' == this) && (last >= 0) && re->position[1] && (']' != re->position[1])) {
170       re->position++;
171       this = re_make_char(re);
172       do {
173         re_bitset_add(c, last++);
174       } while (last <= this);
175       last = -1;
176     } else {
177       this = re_make_char(re);
178       re_bitset_add(c, this);
179       last = this;
180     }
181   }
182   return c;
183 }
184 
185 /* instructions */
186 
187 enum { RE_Any, RE_Char, RE_Class, RE_Accept, RE_Jump, RE_Fork, RE_Begin, RE_End, };
188 
189 struct RE_Insn;
190 typedef struct RE_Insn RE_Insn;
191 
192 struct RE_Insn {
193   int  opcode;
194   long x;
195   union {
196     long       y;
197     RE_BitSet *c;
198   };
199   union {
200     RE_Insn *next;
201     char    *stamp;
202   };
203 };
204 
205 struct RE_Compiled;
206 typedef struct RE_Compiled RE_Compiled;
207 
208 /*
209    struct RE_Compiled
210    {
211     int    size;
212     RE_Insn *first;
213     RE_Insn *last;
214    };
215 
216  #define RE_COMPILED_INITIALISER { 0, 0, 0 }
217  */
218 
re_insn_new(struct re * re,int opc)219 static RE_Compiled re_insn_new(struct re *re, int opc) {
220   RE_Insn *insn = RE_CALLOC(re, 1, sizeof(RE_Insn));
221   insn->opcode = opc;  // -V522
222   RE_Compiled insns = { 1, insn, insn };
223   return insns;
224 }
225 
re_new_Any(struct re * re)226 static RE_Compiled re_new_Any(struct re *re) {
227   RE_Compiled insns = re_insn_new(re, RE_Any);
228   return insns;
229 }
230 
re_new_Char(struct re * re,int c)231 static RE_Compiled re_new_Char(struct re *re, int c) {
232   RE_Compiled insns = re_insn_new(re, RE_Char);
233   insns.first->x = c;
234   return insns;
235 }
236 
re_new_Class(struct re * re,RE_BitSet * c)237 static RE_Compiled re_new_Class(struct re *re, RE_BitSet *c) {
238   RE_Compiled insns = re_insn_new(re, RE_Class);
239   insns.first->c = c;
240   return insns;
241 }
242 
re_new_Accept(struct re * re)243 static RE_Compiled re_new_Accept(struct re *re) {
244   RE_Compiled insns = re_insn_new(re, RE_Accept);
245   return insns;
246 }
247 
re_new_Jump(struct re * re,int x)248 static RE_Compiled re_new_Jump(struct re *re, int x) {
249   RE_Compiled insns = re_insn_new(re, RE_Jump);
250   insns.first->x = x;
251   return insns;
252 }
253 
re_new_Fork(struct re * re,int x,int y)254 static RE_Compiled re_new_Fork(struct re *re, int x, int y) {
255   RE_Compiled insns = re_insn_new(re, RE_Fork);
256   insns.first->x = x;
257   insns.first->y = y;
258   return insns;
259 }
260 
re_new_Begin(struct re * re)261 static RE_Compiled re_new_Begin(struct re *re) {
262   RE_Compiled insns = re_insn_new(re, RE_Begin);
263   return insns;
264 }
265 
re_new_End(struct re * re)266 static RE_Compiled re_new_End(struct re *re) {
267   RE_Compiled insns = re_insn_new(re, RE_End);
268   return insns;
269 }
270 
re_program_append(RE_Compiled * insns,RE_Compiled tail)271 static void re_program_append(RE_Compiled *insns, RE_Compiled tail) {
272   insns->last->next = tail.first;
273   insns->last = tail.last;
274   insns->size += tail.size;
275 }
276 
re_program_prepend(RE_Compiled * insns,RE_Compiled head)277 static void re_program_prepend(RE_Compiled *insns, RE_Compiled head) {
278   head.last->next = insns->first;
279   insns->first = head.first;
280   insns->size += head.size;
281 }
282 
re_program_free(struct re * re,RE_Compiled * insns)283 static void re_program_free(struct re *re, RE_Compiled *insns) {
284   int i;
285   for (i = 0; i < insns->size; ++i) {
286     switch (insns->first[i].opcode) {
287       case RE_Class: {
288         RE_FREE(re, insns->first[i].c);
289         insns->first[i].c = 0;
290         break;
291       }
292     }
293   }
294   RE_FREE(re, insns->first);
295   insns->first = insns->last = 0;
296   insns->size = 0;
297 }
298 
299 /* compilation */
300 
301 /*
302    struct re
303    {
304     char     *expression;
305     char     *position;
306     jmp_buf    *error_env;
307     int       error_code;
308     char     *error_message;
309     struct RE_Compiled    code;
310     char    **matches;
311     int       nmatches;
312    };
313  */
314 
315 static RE_Compiled re_compile_expression(struct re *re);
316 
re_compile_primary(struct re * re)317 static RE_Compiled re_compile_primary(struct re *re) {
318   int c = *re->position++;
319   assert(0 != c);
320   switch (c) {
321     case '\\': {
322       if (*re->position) {
323         c = *re->position++;
324       }
325       break;
326     }
327     case '.': {
328       return re_new_Any(re);
329     }
330     case '[': {
331       RE_BitSet *cc = re_make_class(re);
332       if (']' != *re->position) {
333         RE_FREE(re, cc);
334         RE_ERROR(re, CHARSET, "expected ']' at end of character set");
335       }
336       re->position++;
337       return re_new_Class(re, cc);
338     };
339     case '(': {
340       RE_Compiled insns = re_compile_expression(re);
341       if (')' != *re->position) {
342         RE_Insn *insn, *next;
343         for (insn = insns.first; insn; insn = next) {
344           next = insn->next;
345           RE_FREE(re, insn);
346         }
347         RE_ERROR(re, SUBEXP, "expected ')' at end of subexpression");
348       }
349       re->position++;
350       return insns;
351     }
352     case '{': {
353       RE_Compiled insns = re_compile_expression(re);
354       if ('}' != *re->position) {
355         RE_Insn *insn, *next;
356         for (insn = insns.first; insn; insn = next) {
357           next = insn->next;
358           RE_FREE(re, insn);
359         }
360         RE_ERROR(re, SUBMATCH, "expected '}' at end of submatch");
361       }
362       re_program_prepend(&insns, re_new_Begin(re));
363       re_program_append(&insns, re_new_End(re));
364       re->position++;
365       return insns;
366     }
367   }
368   return re_new_Char(re, c);
369 }
370 
re_compile_suffix(struct re * re)371 static RE_Compiled re_compile_suffix(struct re *re) {
372   RE_Compiled insns = re_compile_primary(re);
373   switch (*re->position) {
374     case '?': {
375       re->position++;
376       if ('?' == *re->position) {
377         re->position++;
378         re_program_prepend(&insns, re_new_Fork(re, insns.size, 0));
379       } else {
380         re_program_prepend(&insns, re_new_Fork(re, 0, insns.size));
381       }
382       break;
383     }
384     case '*': {
385       re->position++;
386       if ('?' == *re->position) {
387         re->position++;
388         re_program_prepend(&insns, re_new_Fork(re, insns.size + 1, 0));
389       } else {
390         re_program_prepend(&insns, re_new_Fork(re, 0, insns.size + 1));
391       }
392       re_program_append(&insns, re_new_Jump(re, -(insns.size + 1)));
393       break;
394     }
395     case '+': {
396       re->position++;
397       if ('?' == *re->position) {
398         re->position++;
399         re_program_append(&insns, re_new_Fork(re, 0, -(insns.size + 1)));
400       } else {
401         re_program_append(&insns, re_new_Fork(re, -(insns.size + 1), 0));
402       }
403       break;
404     }
405   }
406   return insns;
407 }
408 
re_compile_sequence(struct re * re)409 static RE_Compiled re_compile_sequence(struct re *re) {
410   if (!*re->position) {
411     return re_new_Accept(re);
412   }
413   RE_Compiled head = re_compile_suffix(re);
414   while (*re->position && !strchr("|)}>", *re->position))
415     re_program_append(&head, re_compile_suffix(re));
416   if (!*re->position) {
417     re_program_append(&head, re_new_Accept(re));
418   }
419   return head;
420 }
421 
re_compile_expression(struct re * re)422 static RE_Compiled re_compile_expression(struct re *re) {
423   RE_Compiled head = re_compile_sequence(re);
424   while ('|' == *re->position) {
425     re->position++;
426     RE_Compiled tail = re_compile_sequence(re);
427     re_program_append(&head, re_new_Jump(re, tail.size));
428     re_program_prepend(&head, re_new_Fork(re, 0, head.size));
429     re_program_append(&head, tail);
430   }
431   return head;
432 }
433 
re_compile(struct re * re)434 static RE_Compiled re_compile(struct re *re) {
435   jmp_buf env;
436   RE_Compiled insns = RE_COMPILED_INITIALISER;
437   re->error_env = &env;
438   if (setjmp(env)) {  /* syntax error */
439     return insns;
440   }
441   insns = re_compile_expression(re);
442   re_array_of(RE_Insn) program = RE_ARRAY_OF_INITIALISER;
443   RE_Insn *insn, *next;
444   for (insn = insns.first; insn; insn = next) {
445     re_array_append(re, program, *insn);
446     next = insn->next;
447     RE_FREE(re, insn);
448   }
449 
450 #if 0
451   int i;
452   for (i = 0; i < program.size; ++i) {
453     RE_Insn *insn = &program.at[i];
454     printf("%03i ", i);
455     switch (insn->opcode) {
456       case RE_Any:
457         printf("Any\n");
458         break;
459       case RE_Char:
460         printf("Char   %li\n", insn->x);
461         break;
462       case RE_Class: {
463         printf("Class ");
464         {
465           RE_BitSet *c = insn->c;
466           int i;
467           putchar('[');
468           if (c->inverted) {
469             putchar('^');
470           }
471           for (i = 0; i < 256; ++i)
472             if (re_bitset__includes(c, i)) {
473               switch (i) {
474                 case '\a':
475                   printf("\\a");
476                   break;
477                 case '\b':
478                   printf("\\b");
479                   break;
480                 case '\t':
481                   printf("\\t");
482                   break;
483                 case '\n':
484                   printf("\\n");
485                   break;
486                 case '\v':
487                   printf("\\v");
488                   break;
489                 case '\f':
490                   printf("\\f");
491                   break;
492                 case '\r':
493                   printf("\\r");
494                   break;
495                 case '\\':
496                   printf("\\\\");
497                   break;
498                 case ']':
499                   printf("\\]");
500                   break;
501                 case '^':
502                   printf("\\^");
503                   break;
504                 case '-':
505                   printf("\\-");
506                   break;
507                 default:
508                   if (isprint(i)) {
509                     putchar(i);
510                   } else {
511                     printf("\\x%02x", i);
512                   }
513               }
514             }
515           putchar(']');
516         }
517         putchar('\n');
518         break;
519       }
520       case RE_Accept:
521         printf("Accept\n");
522         break;
523       case RE_Jump:
524         printf("Jump   %li -> %03li\n", insn->x, i + 1 + insn->x);
525         break;
526       case RE_Fork:
527         printf("Fork   %li %li -> %03li %03li\n", insn->x, insn->y, i + 1 + insn->x, i + 1 + insn->y);
528         break;
529       case RE_Begin:
530         printf("Begin\n");
531         break;
532       case RE_End:
533         printf("End\n");
534         break;
535       default:
536         printf("?%i\n", insn->opcode);
537         break;
538     }
539   }
540 #endif
541 
542   assert(program.size == insns.size);
543 
544   insns.first = program.at;
545   insns.last = insns.first + insns.size;
546 
547   return insns;
548 }
549 
550 /* submatch recording */
551 
552 typedef re_array_of(char*) re_array_of_charp;
553 
554 struct RE_Submatches;
555 typedef struct RE_Submatches RE_Submatches;
556 
557 struct RE_Submatches {
558   int refs;
559   re_array_of_charp beginnings;
560   re_array_of_charp endings;
561 };
562 
re_submatches_copy(struct re * re,RE_Submatches * orig)563 static RE_Submatches *re_submatches_copy(struct re *re, RE_Submatches *orig) {
564   RE_Submatches *subs = RE_CALLOC(re, 1, sizeof(RE_Submatches));
565   if (orig) {
566     subs->beginnings = (re_array_of_charp) re_array_copy(re, orig->beginnings); // -V522
567     subs->endings = (re_array_of_charp) re_array_copy(re, orig->endings);
568   }
569   return subs;
570 }
571 
re_submatches_free(struct re * re,RE_Submatches * subs)572 static void re_submatches_free(struct re *re, RE_Submatches *subs) {
573   assert(subs);
574   assert(!subs->refs);
575   re_array_release(re, subs->beginnings);
576   re_array_release(re, subs->endings);
577   RE_FREE(re, subs);
578 }
579 
re_submatches_link(RE_Submatches * subs)580 static inline RE_Submatches *re_submatches_link(RE_Submatches *subs) {
581   if (subs) {
582     subs->refs++;
583   }
584   return subs;
585 }
586 
re_submatches_unlink(struct re * re,RE_Submatches * subs)587 static inline void re_submatches_unlink(struct re *re, RE_Submatches *subs) {
588   if (subs && (0 == --(subs->refs))) {
589     re_submatches_free(re, subs);
590   }
591 }
592 
593 /* matching */
594 
595 struct RE_Thread;
596 typedef struct RE_Thread RE_Thread;
597 
598 struct RE_Thread {
599   RE_Insn       *pc;
600   RE_Submatches *submatches;
601 };
602 
603 struct RE_ThreadList;
604 typedef struct RE_ThreadList RE_ThreadList;
605 
606 struct RE_ThreadList {
607   int size;
608   RE_Thread *at;
609 };
610 
re_thread(RE_Insn * pc,RE_Submatches * subs)611 static inline RE_Thread re_thread(RE_Insn *pc, RE_Submatches *subs) {
612   return (RE_Thread) {
613            pc, subs
614   };
615 }
616 
re_thread_schedule(struct re * re,RE_ThreadList * threads,RE_Insn * pc,char * sp,RE_Submatches * subs)617 static void re_thread_schedule(struct re *re, RE_ThreadList *threads, RE_Insn *pc, char *sp, RE_Submatches *subs) {
618   if (pc->stamp == sp) {
619     return;
620   }
621   pc->stamp = sp;
622 
623   switch (pc->opcode) {
624     case RE_Jump:
625       re_thread_schedule(re, threads, pc + 1 + pc->x, sp, subs);
626       return;
627     case RE_Fork:
628       re_thread_schedule(re, threads, pc + 1 + pc->x, sp, subs);
629       re_thread_schedule(re, threads, pc + 1 + pc->y, sp, subs);
630       return;
631     case RE_Begin:
632       subs = re_submatches_copy(re, subs);
633       re_array_append(re, subs->beginnings, sp); // -V522
634       re_thread_schedule(re, threads, pc + 1, sp, subs);
635       if (!subs->refs) {
636         re_submatches_free(re, subs);
637       }
638       return;
639     case RE_End: {
640       subs = re_submatches_copy(re, subs);
641 #   if 0    /* non-nesting groups: ab{cd{ef}gh}ij => {cdef} {efgh}  */
642       re_array_append(re, subs->endings, sp);
643 #   else    /* nesting groups: ab{cd{ef}gh}ij => {cdefgh} {ef}  */
644       while (subs->endings.size < subs->beginnings.size) re_array_append(re, subs->endings, 0);
645       int i;
646       for (i = subs->endings.size; i--; ) {
647         if (!subs->endings.at[i]) {
648           subs->endings.at[i] = sp;
649           break;
650         }
651       }
652 #   endif
653       re_thread_schedule(re, threads, pc + 1, sp, subs);
654       if (!subs->refs) {
655         re_submatches_free(re, subs);
656       }
657       return;
658     }
659   }
660   threads->at[threads->size++] = re_thread(pc, re_submatches_link(subs));
661 }
662 
re_program_run(struct re * re,char * input,char *** saved,int * nsaved)663 static int re_program_run(struct re *re, char *input, char ***saved, int *nsaved) {
664   int matched = RE_ERROR_NOMATCH;
665   if (!re) {
666     return matched;
667   }
668 
669   RE_Submatches *submatches = 0;
670   RE_ThreadList a = { 0, 0 }, b = { 0, 0 }, *here = &a, *next = &b;
671 
672   char *sp = input;
673   re->position = 0;
674 
675   jmp_buf env;
676   re->error_env = &env;
677 
678   if (setjmp(env)) {  /* out of memory */
679     matched = re->error_code;
680     goto bailout;
681   }
682 
683   a.at = RE_CALLOC(re, re->code.size, sizeof(RE_Thread));
684   b.at = RE_CALLOC(re, re->code.size, sizeof(RE_Thread));
685 
686   re_thread_schedule(re, here, re->code.first, input, 0);
687 
688   {
689     int i;
690     for (i = 0; i < re->code.size; ++i)
691       re->code.first[i].stamp = 0;
692   }
693 
694   for (sp = input; here->size; ++sp) {
695     int i;
696     for (i = 0; i < here->size; ++i) {
697       RE_Thread t = here->at[i];
698       switch (t.pc->opcode) {
699         case RE_Any: {
700           if (*sp) {
701             re_thread_schedule(re, next, t.pc + 1, sp + 1, t.submatches);
702           }
703           break;
704         }
705         case RE_Char: {
706           if (*sp == t.pc->x) {
707             re_thread_schedule(re, next, t.pc + 1, sp + 1, t.submatches);
708           }
709           break;
710         }
711         case RE_Class: {
712           if (re_bitset_includes(t.pc->c, *sp)) {
713             re_thread_schedule(re, next, t.pc + 1, sp + 1, t.submatches);
714           }
715           break;
716         }
717         case RE_Accept: {
718           matched = sp - input;
719           re_submatches_unlink(re, submatches);
720           submatches = re_submatches_link(t.submatches);
721           while (i < here->size) re_submatches_unlink(re, here->at[i++].submatches);
722           goto nextchar;
723         }
724         default:
725           RE_ERROR(re, ENGINE, "illegal instruction in compiled regular expression (please report this bug)");
726       }
727       re_submatches_unlink(re, t.submatches);
728     }
729 nextchar:
730     ;
731     RE_ThreadList *tmp = here;
732     here = next;
733     next = tmp;
734     next->size = 0;
735     if (!*sp) {
736       break;
737     }
738   }
739 
740 bailout:
741   re->position = sp;
742 
743   {
744     int i;
745     for (i = 0; i < here->size; ++i)
746       re_submatches_unlink(re, here->at[i].submatches);
747   }
748 
749   RE_FREE(re, a.at);
750   RE_FREE(re, b.at);
751 
752   if (submatches) {
753     if (saved && nsaved && (matched >= 0)) {
754       assert(submatches->beginnings.size == submatches->endings.size);
755       *nsaved = submatches->beginnings.size * 2;
756       *saved = RE_CALLOC(re, *nsaved, sizeof(char*));
757       int i;
758       for (i = 0; i < *nsaved; i += 2) {
759         (*saved)[i + 0] = submatches->beginnings.at[i / 2];
760         (*saved)[i + 1] = submatches->endings.at[i / 2];
761       }
762     }
763     re_submatches_unlink(re, submatches);
764   }
765 
766   return matched;
767 }
768 
769 /* public interface */
770 
lwre_new(const char * expr)771 struct re *lwre_new(const char *expr) {
772   struct re *re = RE_CALLOC(0, 1, sizeof(struct re));
773   if (re) {
774     re->expression = expr;
775   }
776   return re;
777 }
778 
lwre_match(struct re * re,char * input)779 int lwre_match(struct re *re, char *input) {
780   RE_FREE(re, re->matches);
781   re->matches = 0;
782   re->nmatches = 0;
783   if (!re->expression) {
784     return 0;
785   }
786   if (!re->code.size) {
787     re->position = re->expression;
788     re->error_code = 0;
789     re->error_message = 0;
790     re->code = re_compile(re);
791     if (re->error_code) {
792       return re->error_code;
793     }
794     re->position = 0;
795   }
796   return re_program_run(re, input, &re->matches, &re->nmatches);
797 }
798 
lwre_release(struct re * re)799 void lwre_release(struct re *re) {
800   RE_FREE(re, re->matches);
801   if (re->code.first) {
802     re_program_free(re, &re->code);
803   }
804   memset(re, 0, sizeof(*re));
805 }
806 
lwre_reset(struct re * re,const char * expression)807 void lwre_reset(struct re *re, const char *expression) {
808   lwre_release(re);
809   re->expression = expression;
810 }
811 
lwre_free(struct re * re)812 void lwre_free(struct re *re) {
813   lwre_release(re);
814   RE_FREE(0, re);
815 }
816 
817 /* utility */
818 
re_digit(int c,int base)819 static int re_digit(int c, int base) {
820   if ((c >= '0') && (c <= '9')) {
821     c -= '0';
822   } else if ((c >= 'A') && (c <= 'Z')) {
823     c -= ('A' - 10);
824   } else if ((c >= 'a') && (c <= 'z')) {
825     c -= ('a' - 10);
826   } else {
827     return -1;
828   }
829   if (c >= base) {
830     return -1;
831   }
832   return c;
833 }
834 
re_byte(char ** sp,int least,int most,int base,int liberal)835 static int re_byte(char **sp, int least, int most, int base, int liberal) {
836   int c = 0;
837   char *s = *sp;
838   while (s - *sp < most) {
839     int d = re_digit(*s, base);
840     if (d < 0) {
841       break;
842     }
843     ++s;
844     c = c * base + d;
845   }
846   if (s - *sp < least) {
847     if (liberal) {
848       return (*sp)[-1];
849     }
850     --*sp;
851     return '\\';
852   }
853   *sp = s;
854   return c;
855 }
856 
re_log2floor(unsigned int n)857 static int re_log2floor(unsigned int n) {
858   if (!n) {
859     return 0;
860   }
861   int b = 1;
862 # define _do(x) if (n >= (1U << x)) (b += x), (n >>= x)
863   _do(16);
864   _do(8);
865   _do(4);
866   _do(2);
867   _do(1);
868 # undef _do
869   return b;
870 }
871 
re_escape_utf8(char ** sp,unsigned int c)872 static void re_escape_utf8(char **sp, unsigned int c) {
873   char *s = *sp;
874   if (c < 128) {
875     *s++ = c;
876   } else { /* this is good for up to 36 bits of c, which proves that Gordon Bell was right all along */
877     int n = re_log2floor((unsigned) c) / 6;
878     int m = 6 * n;
879     *s++ = (0xff << (7 - n)) + (c >> m);
880     while ((m -= 6) >= 0) *s++ = 0x80 + ((c >> m) & 0x3F);
881   }
882   *sp = s;
883 }
884 
lwre_escape(char * s,int liberal)885 char *lwre_escape(char *s, int liberal) {
886   char *in = s, *out = s;
887   int c;
888   while ((c = *in++)) {
889     int u = 0;
890     if ('\\' == c) {
891       c = *in++;
892       switch (c) {
893         case '0':
894         case '1':
895           c = re_byte(&in, 1, 3, 8, liberal);
896           break;
897         case '\\':
898           //c = '\\';
899           break;
900         case 'a':
901           c = '\a';
902           break;
903         case 'b':
904           c = '\b';
905           break;
906         case 'f':
907           c = '\f';
908           break;
909         case 'n':
910           c = '\n';
911           break;
912         case 'r':
913           c = '\r';
914           break;
915         case 't':
916           c = '\t';
917           break;
918         case 'U':
919           c = re_byte(&in, 8, 8, 16, liberal);
920           u = 1;
921           break;
922         case 'u':
923           c = re_byte(&in, 4, 4, 16, liberal);
924           u = 1;
925           break;
926         case 'v':
927           c = '\v';
928           break;
929         case 'x':
930           c = re_byte(&in, 1, 2, 16, liberal);
931           break;
932         default: {
933           if (!liberal) { /* pass escape character through unharmed */
934             --in;
935             c = '\\';
936           }
937           break;
938         }
939       }
940     }
941     if (u) {
942       re_escape_utf8(&out, c);
943     } else {
944       *out++ = c;
945     }
946   }
947   assert(out <= in);
948   *out = 0;
949   return s;
950 }
951 
952 /* testing */
953 
954 #ifdef LWRE_TEST
955 
956 #include <stdio.h>
957 
958 /* echo stdin to stout with ANSI terminal escapes to turn every number red */
959 
main(int argc,char ** argv)960 int main(int argc, char **argv) {
961   static struct re re = RE_INITIALISER("(.*?{[0-9]+})*");
962   char buf[1024];
963   while (fgets(buf, sizeof(buf), stdin)) {
964     int n;
965     if (((n = lwre_match(&re, buf)) < 0) && (RE_ERROR_NOMATCH != n)) {
966       fprintf(stderr, "%i %ss: %s\n", n, re.error_message, re.position);
967       break;
968     } else {
969       char *p = buf;
970       int n = 0;
971       while (*p) {
972         if ((n < re.nmatches) && (p == re.matches[n])) {
973           if (n & 1) {
974             printf("\033[0m");            /* end of match: clear all attributes */
975           } else {
976             printf("\033[1;31m");         /* start of match: bold and foreground red */
977           }
978           ++n;
979         }
980         putchar(*p++);
981       }
982     }
983   }
984   lwre_release(&re);
985   return 0;
986 }
987 
988 #endif /* REGEXP_TEST */
989