• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/* Regular expression tests.
   Copyright (C) 2003 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Jakub Jelinek <jakub@redhat.com>, 2003.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301 USA.  */
20 
21 #ifdef HAVE_CONFIG_H
22 #include "config.h"
23 #endif
24 
25 #include <sys/types.h>
26 #ifdef HAVE_MCHECK_H
27 #include <mcheck.h>
28 #endif
29 #include <regex.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <locale.h>
34 #include <getopt.h>
35 
/* Expand the escape letters of the test file in STR, in place:
   N becomes a newline, T a tab, S a space and Z a NUL byte.

   The search resumes one byte past each replaced letter, so after a Z
   has been turned into NUL the bytes that follow it (up to the string's
   original terminator) are still scanned and converted.  */
static void
replace_special_chars (char *str)
{
  static const char escapes[] = "NTSZ";
  char *hit;

  for (hit = strpbrk (str, escapes); hit != NULL;
       hit = strpbrk (hit + 1, escapes))
    switch (*hit)
      {
      case 'N':
        *hit = '\n';
        break;
      case 'T':
        *hit = '\t';
        break;
      case 'S':
        *hit = ' ';
        break;
      case 'Z':
        *hit = '\0';
        break;
      default:
        /* Not reachable: strpbrk only stops on one of the letters above.  */
        break;
      }
}
48 
/* Rewrite the word-boundary bracket expressions used by the test file
   into the backslash operators understood by this regex implementation:
   "[[:<:]]" becomes "\<" and "[[:>:]]" becomes "\>".

   STR is edited in place.  Every rewrite shortens the string by five
   bytes; the tail, including the terminating NUL, is shifted down.  Any
   other "[[:" sequence (e.g. "[[:alpha:]]") is left untouched.  */
static void
glibc_re_syntax (char *str)
{
  char *cur = str;

  while ((cur = strstr (cur, "[[:")) != NULL)
    {
      const char kind = cur[3];

      if ((kind != '<' && kind != '>') || strncmp (cur + 4, ":]]", 3) != 0)
        {
          /* Some other bracket expression: step over the "[[:".  */
          cur += 3;
          continue;
        }

      cur[0] = '\\';
      cur[1] = kind;
      /* Close the gap left by the five dropped bytes; the + 1 carries
         the terminating NUL along.  */
      memmove (cur + 2, cur + 7, strlen (cur + 7) + 1);
      cur += 2;
    }
}
67 
/* Store at DST the two-byte UTF-8 sequence that stands in for the ASCII
   letter C and return the position just past it:

     a -> \'a     A -> \'A
     b -> \v{c}   B -> \v{C}
     c -> \v{d}   C -> \v{D}
     d -> \'e     D -> \'E

   For any other value of C nothing is written and DST is returned
   unchanged.  */
static char *
mb_replace (char *dst, const char c)
{
  static const struct
  {
    char ascii;
    char utf8[2];
  } map[] =
    {
      { 'a', { '\xc3', '\xa1' } },
      { 'A', { '\xc3', '\x81' } },
      { 'b', { '\xc4', '\x8d' } },
      { 'B', { '\xc4', '\x8c' } },
      { 'c', { '\xc4', '\x8f' } },
      { 'C', { '\xc4', '\x8e' } },
      { 'd', { '\xc3', '\xa9' } },
      { 'D', { '\xc3', '\x89' } }
    };
  size_t i;

  for (i = 0; i < sizeof (map) / sizeof (map[0]); ++i)
    if (map[i].ascii == c)
      {
        *dst++ = map[i].utf8[0];
        *dst++ = map[i].utf8[1];
        break;
      }

  return dst;
}
112 
/* Return a newly malloc'ed copy of STR in which every character that
   occurs in LETTERS has been replaced by its multibyte stand-in (see
   mb_replace); all other characters are copied verbatim.

   Returns NULL if STR is NULL or if the allocation fails.  The caller
   owns the result and must free it.  */
static char *
mb_frob_string (const char *str, const char *letters)
{
  char *ret, *dst;
  const char *src;

  if (str == NULL)
    return NULL;

  /* A replacement is two bytes, so the result is at most twice as long
     as the input.  */
  ret = malloc (2 * strlen (str) + 1);
  if (ret == NULL)
    return NULL;

  dst = ret;
  for (src = str; *src != '\0'; ++src)
    {
      if (strchr (letters, *src) != NULL)
        dst = mb_replace (dst, *src);
      else
        *dst++ = *src;
    }
  *dst = '\0';

  return ret;
}
134 
/* Like mb_frob_string, but don't replace anything between
   [: and :], [. and .] or [= and =], so that the names inside such
   bracket sub-expressions (for instance "alpha" in [:alpha:]) stay
   intact.

   Returns a newly malloc'ed string that the caller must free, or NULL
   if STR is NULL or the allocation fails.  */

static char *
mb_frob_pattern (const char *str, const char *letters)
{
  char *ret, *dst;
  const char *src;
  int in_class = 0;

  if (str == NULL)
    return NULL;

  /* A replacement is two bytes, so the result is at most twice as long
     as the input.  */
  ret = malloc (2 * strlen (str) + 1);
  if (ret == NULL)
    return NULL;

  dst = ret;
  for (src = str; *src != '\0'; ++src)
    {
      if (in_class)
        {
          /* Copy verbatim until the closing ":]", ".]" or "=]".  in_class
             is only ever set after a '[' has been seen, so src[-1] is
             always inside STR here.  */
          if (*src == ']' && strchr (":.=", src[-1]) != NULL)
            in_class = 0;
          *dst++ = *src;
        }
      else if (strchr (letters, *src) != NULL)
        dst = mb_replace (dst, *src);
      else
        {
          /* "[:", "[." or "[=" opens a protected region.  */
          if (*src == '[' && strchr (":.=", src[1]) != NULL)
            in_class = 1;
          *dst++ = *src;
        }
    }
  *dst = '\0';

  return ret;
}
166 
/* Compare the match recorded in RM[IDX] against the expectation MATCH.

   STRING is the subject string the offsets in RM refer to.  MATCH is
   one of:
     "-"      the (sub)expression must not have matched at all;
     "@TEXT"  the match must be empty and STRING must continue with TEXT
              at that position (a bare "@" requires the match to sit at
              the end of STRING);
     "TEXT"   the matched substring must be exactly TEXT.

   Returns 0 if the expectation holds.  Otherwise prints a diagnostic
   prefixed with FAIL to stdout and returns 1.  */
static int
check_match (regmatch_t *rm, int idx, const char *string,
             const char *match, const char *fail)
{
  const regoff_t so = rm[idx].rm_so;
  const regoff_t eo = rm[idx].rm_eo;
  size_t len;

  if (strcmp (match, "-") == 0)
    {
      if (so == -1 && eo == -1)
        return 0;
      printf ("%s rm[%d] unexpectedly matched\n", fail, idx);
      return 1;
    }

  if (so == -1 || eo == -1)
    {
      printf ("%s rm[%d] unexpectedly did not match\n", fail, idx);
      return 1;
    }

  if (match[0] == '@')
    {
      const char *follow = match + 1;

      if (so != eo)
        {
          printf ("%s rm[%d] not empty\n", fail, idx);
          return 1;
        }

      /* With nothing after the '@', still compare one byte: that byte
         is FOLLOW's terminator, so the match must be at end of STRING.  */
      len = strlen (follow);
      if (len == 0)
        len = 1;

      if (strncmp (string + so, follow, len) != 0)
        {
          printf ("%s rm[%d] not matching %s\n", fail, idx, match);
          return 1;
        }
      return 0;
    }

  len = (size_t) (eo - so);
  if (len != strlen (match) || strncmp (string + so, match, len) != 0)
    {
      printf ("%s rm[%d] not matching %s\n", fail, idx, match);
      return 1;
    }

  return 0;
}
212 
/* Run a single test case and return 0 on success, 1 on failure.  Every
   failure prints one line to stdout that starts with FAIL.

   PATTERN is compiled with CFLAGS.  When EFLAGS is -1 the case expects
   regcomp itself to fail, and STRING holds the name of the expected
   error code without its "REG_" prefix.  Otherwise STRING is matched
   with EFLAGS: EXPECT describes the overall match (NULL means that no
   match is expected) and MATCHES, if not NULL, is a comma-separated
   list of expectations for subexpressions 1 through 9, in the notation
   understood by check_match.  Subexpressions beyond the end of the
   list must not have matched.

   MATCHES is split in place while it is being walked, but every comma
   is put back before the function returns.  */
static int
test (const char *pattern, int cflags, const char *string, int eflags,
      char *expect, char *matches, const char *fail)
{
  regex_t re;
  regmatch_t rm[10];
  int err, n, ret;

  err = regcomp (&re, pattern, cflags);
  if (err != 0)
    {
      char buf[500];

      if (eflags == -1)
        {
          /* Map the returned code to its symbolic name and compare that
             with the name the test file asked for.  */
#define C(x) { REG_##x, #x }
          static const struct { reg_errcode_t code; const char *name; }
            codes[] =
              { C(NOERROR), C(NOMATCH), C(BADPAT), C(ECOLLATE),
                C(ECTYPE), C(EESCAPE), C(ESUBREG), C(EBRACK),
                C(EPAREN), C(EBRACE), C(BADBR), C(ERANGE),
                C(ESPACE), C(BADRPT) };
#undef C
          size_t i;

          for (i = 0; i < sizeof (codes) / sizeof (codes[0]); ++i)
            if ((int) codes[i].code == err)
              {
                if (strcmp (string, codes[i].name) != 0)
                  {
                    printf ("%s regcomp returned REG_%s (expected REG_%s)\n",
                            fail, codes[i].name, string);
                    return 1;
                  }
                return 0;
              }

          printf ("%s regcomp return value REG_%d\n", fail, err);
          return 1;
        }

      regerror (err, &re, buf, sizeof (buf));
      printf ("%s regcomp failed: %s\n", fail, buf);
      return 1;
    }

  if (eflags == -1)
    {
      regfree (&re);

      /* The test case file assumes something only guaranteed by the
         rxspencer regex implementation.  Namely that for empty
         expressions regcomp() return REG_EMPTY.  This is not the case
         for us and so we ignore this error.  */
      if (strcmp (string, "EMPTY") == 0)
        return 0;

      printf ("%s regcomp unexpectedly succeeded\n", fail);
      return 1;
    }

  /* RM is filled in by regexec; the compiled pattern is not needed
     afterwards, whatever the outcome.  */
  err = regexec (&re, string, 10, rm, eflags);
  regfree (&re);

  if (err != 0)
    {
      if (expect == NULL)
        return 0;
      printf ("%s regexec failed\n", fail);
      return 1;
    }

  if (expect == NULL)
    {
      printf ("%s regexec unexpectedly succeeded\n", fail);
      return 1;
    }

  /* With REG_NOSUB no offsets are reported, so there is nothing left
     to compare.  */
  if (cflags & REG_NOSUB)
    return 0;

  ret = check_match (rm, 0, string, expect, fail);
  if (matches == NULL)
    return ret;

  for (n = 1; ret == 0 && n < 10; ++n)
    {
      char *comma = NULL;

      /* Temporarily terminate the current list item at its comma.  */
      if (matches != NULL)
        {
          comma = strchr (matches, ',');
          if (comma != NULL)
            *comma = '\0';
        }

      /* Once the list is exhausted the remaining subexpressions are
         expected to be unmatched.  */
      ret = check_match (rm, n, string, matches != NULL ? matches : "-",
                         fail);

      if (comma != NULL)
        {
          *comma = ',';
          matches = comma + 1;
        }
      else
        matches = NULL;
    }

  return ret;
}
317 
/* Run one test case after substituting multibyte characters for the
   ASCII letters listed in LETTERS.

   PATTERN is converted with mb_frob_pattern; EXPECT and MATCHES with
   mb_frob_string.  STRING is converted as well, unless EFLAGS is -1, in
   which case it is passed through unchanged (test() then treats it as
   an error-code name, not as text to match).  The converted copies are
   handed to test() together with CFLAGS, EFLAGS and FAIL, and freed
   again before returning.

   Returns the result of test(), or 1 if one of the conversions failed.  */
static int
mb_test (const char *pattern, int cflags, const char *string, int eflags,
         char *expect, const char *matches, const char *letters,
         const char *fail)
{
  char *pattern_mb = mb_frob_pattern (pattern, letters);
  /* Only set when a converted copy of STRING was actually made.  */
  char *string_copy
    = eflags == -1 ? NULL : mb_frob_string (string, letters);
  const char *string_mb = eflags == -1 ? string : string_copy;
  char *expect_mb = mb_frob_string (expect, letters);
  char *matches_mb = mb_frob_string (matches, letters);
  int ret;

  /* EXPECT and MATCHES may legitimately be NULL, in which case their
     converted forms are NULL too; that is not an error.  */
  if (pattern_mb == NULL || string_mb == NULL
      || (expect != NULL && expect_mb == NULL)
      || (matches != NULL && matches_mb == NULL))
    {
      /* %m is the glibc printf extension that prints strerror (errno).  */
      printf ("%s %m\n", fail);
      ret = 1;
    }
  else
    ret = test (pattern_mb, cflags, string_mb, eflags, expect_mb,
                matches_mb, fail);

  free (matches_mb);
  free (expect_mb);
  free (string_copy);
  free (pattern_mb);
  return ret;
}
347 
/* Repeat one test case with multibyte characters substituted for every
   non-empty combination of the letter pairs a/A, b/B, c/C and d/D.

   A combination is skipped when one of its pairs appears in neither
   PATTERN nor STRING.  Each remaining combination is run through
   mb_test with a failure prefix of the form "UTF-8 <letters> FAIL".

   Returns 0 if all combinations passed (or none were run), non-zero
   otherwise.  */
static int
mb_tests (const char *pattern, int cflags, const char *string, int eflags,
          char *expect, const char *matches)
{
  static const char pairs[4][2] =
    { { 'a', 'A' }, { 'b', 'B' }, { 'c', 'C' }, { 'd', 'D' } };
  /* Room for all eight letters plus the terminator.  */
  char letters[sizeof (pairs) + 1];
  /* "UTF-8 " + up to eight letters + " FAIL" + terminator.  */
  char fail[sizeof ("UTF-8  FAIL") + sizeof (pairs)];
  int ret = 0;
  int mask;

  /* The tests aren't supposed to work with xdigit, since a-dA-D are
     hex digits while \'a \'A \v{c}\v{C}\v{d}\v{D}\'e \'E are not.  */
  if (strstr (pattern, "[:xdigit:]") != NULL)
    return 0;

  /* XXX: regex ATM handles only single byte equivalence classes.  */
  if (strstr (pattern, "[[=b=]]") != NULL)
    return 0;

  /* Bit K of MASK selects pairs[K].  */
  for (mask = 1; mask < 16; ++mask)
    {
      char *p = letters;
      int usable = 1;
      int k;

      for (k = 0; k < 4 && usable; ++k)
        {
          const char lower = pairs[k][0];
          const char upper = pairs[k][1];

          if ((mask & (1 << k)) == 0)
            continue;

          if (strchr (pattern, lower) == NULL
              && strchr (string, lower) == NULL
              && strchr (pattern, upper) == NULL
              && strchr (string, upper) == NULL)
            usable = 0;
          else
            {
              *p++ = lower;
              *p++ = upper;
            }
        }

      if (!usable)
        continue;

      *p = '\0';
      snprintf (fail, sizeof (fail), "UTF-8 %s FAIL", letters);
      ret |= mb_test (pattern, cflags, string, eflags, expect, matches,
                      letters, fail);
    }

  return ret;
}
403 
/* Test driver.  Usage: PROGRAM [--utf8] TESTFILE

   TESTFILE is read line by line.  Empty lines and lines starting with
   '#' are skipped; every other line is echoed to stdout and split into
   tab-separated fields:

     pattern  flags  string  [expect  [matches]]

   A pattern or string written as "" (two double quotes) stands for the
   empty string.  The flag letters are:

     -   no effect            b   clear REG_EXTENDED
     &   run the case a second time with REG_EXTENDED cleared
     C   regcomp is expected to fail (string names the error)
     i   REG_ICASE            s   REG_NOSUB
     n   REG_NEWLINE          ^   REG_NOTBOL
     $   REG_NOTEOL           m p #   unsupported: the line is skipped

   Each case is run in the C locale.  With --utf8, a case that passed
   is run again in the cs_CZ.UTF-8 locale, first as is and then with
   multibyte letters substituted (see mb_tests).

   Returns 0 if everything passed, 1 on a usage error, if the file
   cannot be opened, or if any case failed.  */
int
main (int argc, char **argv)
{
  /* Both objects are static so that the address of test_utf8 can be
     used in the initializer of the option table.  */
  static int test_utf8 = 0;
  static const struct option options[] =
    {
      {"utf8", no_argument, &test_utf8, 1},
      {NULL, 0, NULL, 0}
    };
  char *line = NULL;
  size_t line_len = 0;
  ssize_t len;
  FILE *f;
  int ret = 0;

#ifdef HAVE_MCHECK_H
  mtrace ();
#endif

  /* The only option is a flag that getopt_long stores by itself, so
     there is nothing to do per option.  */
  while (getopt_long (argc, argv, "", options, NULL) >= 0)
    continue;

  if (optind + 1 != argc)
    {
      fprintf (stderr, "Missing test filename\n");
      return 1;
    }

  f = fopen (argv[optind], "r");
  if (f == NULL)
    {
      fprintf (stderr, "Couldn't open %s\n", argv[optind]);
      return 1;
    }

  while ((len = getline (&line, &line_len, f)) > 0)
    {
      char *pattern, *flagstr, *string, *expect, *matches, *p;
      int cflags = REG_EXTENDED, eflags = 0, try_bre_ere = 0;
      int unsupported = 0;
      int bre_cflags;

      if (line[len - 1] == '\n')
        line[len - 1] = '\0';

      /* Skip comments and empty lines.  */
      if (*line == '#' || *line == '\0')
        continue;

      puts (line);
      fflush (stdout);

      /* The first three fields are mandatory; a line without them is
         silently ignored.  */
      pattern = strtok (line, "\t");
      if (pattern == NULL)
        continue;
      if (strcmp (pattern, "\"\"") == 0)
        pattern += 2;           /* Now points at the terminator.  */

      flagstr = strtok (NULL, "\t");
      if (flagstr == NULL)
        continue;

      string = strtok (NULL, "\t");
      if (string == NULL)
        continue;
      if (strcmp (string, "\"\"") == 0)
        string += 2;

      for (p = flagstr; *p != '\0'; ++p)
        switch (*p)
          {
          case '-':
            break;
          case 'b':
            cflags &= ~REG_EXTENDED;
            break;
          case '&':
            try_bre_ere = 1;
            break;
          case 'C':
            eflags = -1;
            break;
          case 'i':
            cflags |= REG_ICASE;
            break;
          case 's':
            cflags |= REG_NOSUB;
            break;
          case 'n':
            cflags |= REG_NEWLINE;
            break;
          case '^':
            eflags |= REG_NOTBOL;
            break;
          case '$':
            eflags |= REG_NOTEOL;
            break;
          case 'm':
          case 'p':
          case '#':
            /* Not supported.  */
            unsupported = 1;
            break;
          default:
            break;
          }

      if (unsupported)
        continue;

      replace_special_chars (pattern);
      glibc_re_syntax (pattern);
      /* With the C flag the string is an error-code name, not text.  */
      if (eflags != -1)
        replace_special_chars (string);

      expect = strtok (NULL, "\t");
      matches = NULL;
      if (expect != NULL)
        {
          replace_special_chars (expect);
          matches = strtok (NULL, "\t");
          if (matches != NULL)
            replace_special_chars (matches);
        }

      bre_cflags = cflags & ~REG_EXTENDED;

      if (setlocale (LC_ALL, "C") == NULL)
        {
          puts ("setlocale C failed");
          ret = 1;
        }

      if (test (pattern, cflags, string, eflags, expect, matches, "FAIL")
          || (try_bre_ere
              && test (pattern, bre_cflags, string, eflags, expect, matches,
                       "FAIL")))
        {
          ret = 1;
          continue;
        }

      /* The UTF-8 runs only happen for cases that pass in the C locale.  */
      if (!test_utf8)
        continue;

      if (setlocale (LC_ALL, "cs_CZ.UTF-8") == NULL)
        {
          puts ("setlocale cs_CZ.UTF-8 failed");
          ret = 1;
          continue;
        }

      if (test (pattern, cflags, string, eflags, expect, matches,
                "UTF-8 FAIL")
          || (try_bre_ere
              && test (pattern, bre_cflags, string, eflags, expect, matches,
                       "UTF-8 FAIL")))
        {
          ret = 1;
          continue;
        }

      if (mb_tests (pattern, cflags, string, eflags, expect, matches)
          || (try_bre_ere
              && mb_tests (pattern, bre_cflags, string, eflags, expect,
                           matches)))
        ret = 1;
    }

  free (line);
  fclose (f);
  return ret;
}
561