1 /* Regular expression tests.
2 Copyright (C) 2003 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Jakub Jelinek <jakub@redhat.com>, 2003.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA. */
20
21 #ifdef HAVE_CONFIG_H
22 #include "config.h"
23 #endif
24
#include <sys/types.h>
#ifdef HAVE_MCHECK_H
#include <mcheck.h>
#endif
#include <errno.h>
#include <getopt.h>
#include <locale.h>
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
35
/* Rewrite, in place, the placeholder letters used by the test file:
   'N' becomes a newline, 'T' a tab, 'S' a space and 'Z' a NUL byte.
   Scanning resumes right after each replaced byte, so text following
   a 'Z' inside the original string is still translated.  */
static void
replace_special_chars (char *str)
{
  char *hit = strpbrk (str, "NTSZ");

  while (hit != NULL)
    {
      if (*hit == 'N')
        *hit = '\n';
      else if (*hit == 'T')
        *hit = '\t';
      else if (*hit == 'S')
        *hit = ' ';
      else if (*hit == 'Z')
        *hit = '\0';

      hit = strpbrk (hit + 1, "NTSZ");
    }
}
48
/* Convert, in place, the word-boundary spellings "[[:<:]]" and
   "[[:>:]]" found in STR into "\<" and "\>".  Any other text starting
   with "[[:" is left untouched.  */
static void
glibc_re_syntax (char *str)
{
  /* One past the terminating NUL, so that every move below also
     carries the terminator along.  */
  char *limit = str + strlen (str) + 1;
  char *cur = str;

  while ((cur = strstr (cur, "[[:")) != NULL)
    {
      int word_edge = ((cur[3] == '<' || cur[3] == '>')
                       && strncmp (cur + 4, ":]]", 3) == 0);

      if (!word_edge)
        {
          /* Step over the "[[:" that was just examined.  */
          cur += 3;
          continue;
        }

      cur[0] = '\\';
      cur[1] = cur[3];
      /* Close the gap left by the five bytes that were dropped.  */
      memmove (cur + 2, cur + 7, limit - cur - 7);
      limit -= 5;
      cur += 2;
    }
}
67
/* Store at DST the two-byte UTF-8 sequence that stands in for the
   ASCII letter C and return the address just past the bytes written.
   The mapping is:
     a/A -> a/A with acute   (c3 a1 / c3 81)
     b/B -> c/C with caron   (c4 8d / c4 8c)
     c/C -> d/D with caron   (c4 8f / c4 8e)
     d/D -> e/E with acute   (c3 a9 / c3 89)
   For any other value of C nothing is written and DST is returned
   unchanged.  */
static char *
mb_replace (char *dst, const char c)
{
  static const struct
  {
    char ascii;
    char utf8[2];
  } map[] =
    {
      { 'a', { '\xc3', '\xa1' } },
      { 'A', { '\xc3', '\x81' } },
      { 'b', { '\xc4', '\x8d' } },
      { 'B', { '\xc4', '\x8c' } },
      { 'c', { '\xc4', '\x8f' } },
      { 'C', { '\xc4', '\x8e' } },
      { 'd', { '\xc3', '\xa9' } },
      { 'D', { '\xc3', '\x89' } }
    };
  unsigned int k;

  for (k = 0; k < sizeof (map) / sizeof (map[0]); ++k)
    if (map[k].ascii == c)
      {
        *dst++ = map[k].utf8[0];
        *dst++ = map[k].utf8[1];
        break;
      }

  return dst;
}
112
/* Return a newly malloc'ed copy of STR in which every character that
   occurs in LETTERS has been passed through mb_replace; all other
   characters are copied verbatim.  The buffer is sized for the worst
   case of two output bytes per input byte.  Returns NULL when STR is
   NULL or when the allocation fails; otherwise the caller frees the
   result.  */
static char *
mb_frob_string (const char *str, const char *letters)
{
  char *out, *wr;
  const char *rd;

  if (str == NULL)
    return NULL;

  out = malloc (2 * strlen (str) + 1);
  if (out == NULL)
    return NULL;

  wr = out;
  rd = str;
  while (*rd != '\0')
    {
      const char ch = *rd++;

      if (strchr (letters, ch) != NULL)
        wr = mb_replace (wr, ch);
      else
        *wr++ = ch;
    }

  *wr = '\0';
  return out;
}
134
/* Variant of mb_frob_string for regex patterns: characters listed in
   LETTERS are passed through mb_replace, except while inside a
   "[:" ... ":]", "[." ... ".]" or "[=" ... "=]" construct, whose
   contents are copied unchanged.  Returns a newly malloc'ed string,
   or NULL when STR is NULL or the allocation fails.  */

static char *
mb_frob_pattern (const char *str, const char *letters)
{
  char *out, *wr;
  const char *rd;
  int inside = 0;       /* Nonzero while within one of the constructs.  */

  if (str == NULL)
    return NULL;

  out = malloc (2 * strlen (str) + 1);
  if (out == NULL)
    return NULL;

  wr = out;
  for (rd = str; *rd != '\0'; ++rd)
    {
      if (!inside && strchr (letters, *rd) != NULL)
        {
          wr = mb_replace (wr, *rd);
          continue;
        }

      if (inside)
        {
          /* A ']' directly preceded by ':', '.' or '=' closes it.  */
          if (*rd == ']' && strchr (":.=", rd[-1]) != NULL)
            inside = 0;
        }
      else if (*rd == '[' && strchr (":.=", rd[1]) != NULL)
        inside = 1;

      *wr++ = *rd;
    }

  *wr = '\0';
  return out;
}
166
/* Compare the match recorded in RM[IDX] against the expectation MATCH
   for the subject STRING.  MATCH is one of:
     "-"      the subexpression must not have matched;
     "@text"  the match must be empty and STRING must continue with
              TEXT at that offset (a bare "@" compares one byte, i.e.
              the match must sit at the end of STRING);
     other    the matched substring must equal MATCH exactly.
   Returns 0 when the expectation holds; otherwise prints a diagnostic
   prefixed with FAIL and returns 1.  */
static int
check_match (regmatch_t *rm, int idx, const char *string,
             const char *match, const char *fail)
{
  const regmatch_t *slot = &rm[idx];
  const int unset_both = slot->rm_so == -1 && slot->rm_eo == -1;
  const int unset_any = slot->rm_so == -1 || slot->rm_eo == -1;

  if (match[0] == '-' && match[1] == '\0')
    {
      if (!unset_both)
        {
          printf ("%s rm[%d] unexpectedly matched\n", fail, idx);
          return 1;
        }
      return 0;
    }

  if (unset_any)
    {
      printf ("%s rm[%d] unexpectedly did not match\n", fail, idx);
      return 1;
    }

  if (match[0] == '@')
    {
      const char *tail = match + 1;
      size_t cmp_len = strlen (tail);

      if (slot->rm_so != slot->rm_eo)
        {
          printf ("%s rm[%d] not empty\n", fail, idx);
          return 1;
        }

      /* With nothing after '@', still compare a single byte so the
         terminating NUL of TAIL is matched against STRING.  */
      if (cmp_len == 0)
        cmp_len = 1;

      if (strncmp (string + slot->rm_so, tail, cmp_len) != 0)
        {
          printf ("%s rm[%d] not matching %s\n", fail, idx, match);
          return 1;
        }
      return 0;
    }

  if (slot->rm_eo - slot->rm_so != strlen (match)
      || strncmp (string + slot->rm_so, match,
                  slot->rm_eo - slot->rm_so) != 0)
    {
      printf ("%s rm[%d] not matching %s\n", fail, idx, match);
      return 1;
    }

  return 0;
}
212
/* Run one test case: compile PATTERN with CFLAGS and, unless a
   compile error is expected, execute it on STRING with EFLAGS.

   EFLAGS == -1 means regcomp is expected to fail; STRING then holds
   the expected error name without its "REG_" prefix ("EMPTY" is
   tolerated even when compilation succeeds).  EXPECT is the expected
   overall match in check_match notation, or NULL when regexec is
   expected to fail.  MATCHES, if not NULL, is a comma-separated list
   of expectations for rm[1] onwards; slots beyond the end of the list,
   up to rm[9], must be unmatched.  MATCHES is split in place while it
   is examined and restored afterwards.

   Returns 0 on success; otherwise prints a diagnostic prefixed with
   FAIL and returns 1.  */
static int
test (const char *pattern, int cflags, const char *string, int eflags,
      char *expect, char *matches, const char *fail)
{
  regex_t re;
  regmatch_t rm[10];
  char *cursor;
  int sub, status;
  int rc = regcomp (&re, pattern, cflags);

  if (rc != 0)
    {
      char errbuf[500];

      if (eflags == -1)
        {
          /* Error codes the test file may name, keyed by the part of
             the identifier that follows "REG_".  */
          static const struct
          {
            reg_errcode_t code;
            const char *name;
          } known[] =
            {
              { REG_NOERROR, "NOERROR" },
              { REG_NOMATCH, "NOMATCH" },
              { REG_BADPAT, "BADPAT" },
              { REG_ECOLLATE, "ECOLLATE" },
              { REG_ECTYPE, "ECTYPE" },
              { REG_EESCAPE, "EESCAPE" },
              { REG_ESUBREG, "ESUBREG" },
              { REG_EBRACK, "EBRACK" },
              { REG_EPAREN, "EPAREN" },
              { REG_EBRACE, "EBRACE" },
              { REG_BADBR, "BADBR" },
              { REG_ERANGE, "ERANGE" },
              { REG_ESPACE, "ESPACE" },
              { REG_BADRPT, "BADRPT" }
            };
          unsigned int k;

          for (k = 0; k < sizeof (known) / sizeof (known[0]); ++k)
            {
              if (rc != known[k].code)
                continue;

              if (strcmp (string, known[k].name) == 0)
                return 0;

              printf ("%s regcomp returned REG_%s (expected REG_%s)\n",
                      fail, known[k].name, string);
              return 1;
            }

          printf ("%s regcomp return value REG_%d\n", fail, rc);
          return 1;
        }

      regerror (rc, &re, errbuf, sizeof (errbuf));
      printf ("%s regcomp failed: %s\n", fail, errbuf);
      return 1;
    }

  if (eflags == -1)
    {
      /* A compile error was expected but regcomp succeeded.  */
      regfree (&re);

      /* The test case file assumes something only guaranteed by the
         rxspencer regex implementation: that regcomp() returns
         REG_EMPTY for empty expressions.  That is not the case here,
         so this particular expectation is ignored.  */
      if (strcmp (string, "EMPTY") == 0)
        return 0;

      printf ("%s regcomp unexpectedly succeeded\n", fail);
      return 1;
    }

  rc = regexec (&re, string, 10, rm, eflags);
  regfree (&re);

  if (rc != 0)
    {
      if (expect == NULL)
        return 0;
      printf ("%s regexec failed\n", fail);
      return 1;
    }

  if (expect == NULL)
    {
      printf ("%s regexec unexpectedly succeeded\n", fail);
      return 1;
    }

  /* With REG_NOSUB there are no offsets to verify.  */
  if (cflags & REG_NOSUB)
    return 0;

  status = check_match (rm, 0, string, expect, fail);
  if (matches == NULL)
    return status;

  cursor = matches;
  for (sub = 1; status == 0 && sub < 10; ++sub)
    {
      char *comma = cursor != NULL ? strchr (cursor, ',') : NULL;
      const char *want = cursor != NULL ? cursor : "-";

      /* Temporarily cut the list at the comma so WANT is one item.  */
      if (comma != NULL)
        *comma = '\0';

      status = check_match (rm, sub, string, want, fail);

      if (comma != NULL)
        {
          *comma = ',';
          cursor = comma + 1;
        }
      else
        cursor = NULL;
    }

  return status;
}
317
/* Run test() on multibyte variants of one test case.  PATTERN is
   converted with mb_frob_pattern; STRING, EXPECT and MATCHES are
   converted with mb_frob_string, replacing the characters listed in
   LETTERS.  When EFLAGS is -1 STRING is passed through unconverted,
   since test() then compares it against an error name.  All converted
   copies are freed before returning.

   Returns the result of test(), or 1 after printing a diagnostic
   prefixed with FAIL when one of the conversions could not be
   allocated.  */
static int
mb_test (const char *pattern, int cflags, const char *string, int eflags,
         char *expect, const char *matches, const char *letters,
         const char *fail)
{
  char *pattern_mb = mb_frob_pattern (pattern, letters);
  const char *string_mb
    = eflags == -1 ? string : mb_frob_string (string, letters);
  char *expect_mb = mb_frob_string (expect, letters);
  char *matches_mb = mb_frob_string (matches, letters);
  int ret = 0;

  /* EXPECT and MATCHES may legitimately be NULL, in which case their
     converted copies are NULL too; only treat NULL as a failure when
     the input was present.  */
  if (!pattern_mb || !string_mb
      || (expect && !expect_mb) || (matches && !matches_mb))
    {
      /* Use strerror instead of the glibc-specific "%m" conversion and
         terminate the line like every other diagnostic does.  */
      printf ("%s %s\n", fail, strerror (errno));
      ret = 1;
    }
  else
    ret = test (pattern_mb, cflags, string_mb, eflags, expect_mb,
                matches_mb, fail);

  free (matches_mb);
  free (expect_mb);
  /* STRING_MB aliases the caller's STRING when no conversion was made.  */
  if (string_mb != string)
    free ((char *) string_mb);
  free (pattern_mb);
  return ret;
}
347
/* Run mb_test for every non-empty combination of the letter pairs
   a/A, b/B, c/C and d/D in which each selected pair occurs at least
   once in PATTERN or STRING.  Patterns containing "[:xdigit:]" or
   "[[=b=]]" are skipped altogether.  Returns nonzero if any of the
   mb_test runs failed.  */
static int
mb_tests (const char *pattern, int cflags, const char *string, int eflags,
          char *expect, const char *matches)
{
  static const char pairs[4][2] =
    {
      { 'a', 'A' }, { 'b', 'B' }, { 'c', 'C' }, { 'd', 'D' }
    };
  char letters[9], fail[20];
  int failed = 0;
  int mask;

  /* The tests aren't supposed to work with xdigit, since a-dA-D are
     hex digits while their multibyte replacements are not.  */
  if (strstr (pattern, "[:xdigit:]") != NULL)
    return 0;

  /* XXX: regex ATM handles only single byte equivalence classes.  */
  if (strstr (pattern, "[[=b=]]") != NULL)
    return 0;

  /* Bit N of MASK selects pairs[N].  */
  for (mask = 1; mask < 16; ++mask)
    {
      char *out = letters;
      int usable = 1;
      int bit;

      for (bit = 0; usable && bit < 4; ++bit)
        {
          const char lower = pairs[bit][0];
          const char upper = pairs[bit][1];

          if ((mask & (1 << bit)) == 0)
            continue;

          if (strchr (pattern, lower) || strchr (string, lower)
              || strchr (pattern, upper) || strchr (string, upper))
            {
              *out++ = lower;
              *out++ = upper;
            }
          else
            /* A selected pair that never occurs: skip this mask.  */
            usable = 0;
        }

      if (!usable)
        continue;

      *out = '\0';
      sprintf (fail, "UTF-8 %s FAIL", letters);
      failed |= mb_test (pattern, cflags, string, eflags, expect, matches,
                         letters, fail);
    }

  return failed;
}
403
/* Test driver.  Usage: PROGRAM [--utf8] TESTFILE.

   Each line of TESTFILE that is neither empty nor a '#' comment is
   echoed and split on tabs into pattern, flags, subject string and the
   optional expected match and subexpression list.  A field consisting
   of "" denotes the empty string.  Lines using the unsupported flags
   'm', 'p' or '#' are skipped.  Each case runs in the "C" locale and,
   when --utf8 was given and that passed, again in cs_CZ.UTF-8 followed
   by the multibyte variants.  Returns 0 when everything passed and 1
   otherwise.  */
int
main (int argc, char **argv)
{
  static int test_utf8 = 0;
  static const struct option options[] =
    {
      {"utf8", no_argument, &test_utf8, 1},
      {NULL, 0, NULL, 0 }
    };
  int status = 0;
  char *line = NULL;
  size_t line_cap = 0;
  ssize_t nread;
  FILE *input;

#ifdef HAVE_MCHECK_H
  mtrace ();
#endif

  /* Only long options exist; getopt_long stores the flag itself.  */
  while (getopt_long (argc, argv, "", options, NULL) >= 0)
    continue;

  if (argc != optind + 1)
    {
      fprintf (stderr, "Missing test filename\n");
      return 1;
    }

  input = fopen (argv[optind], "r");
  if (input == NULL)
    {
      fprintf (stderr, "Couldn't open %s\n", argv[optind]);
      return 1;
    }

  while ((nread = getline (&line, &line_cap, input)) > 0)
    {
      char *pattern, *flagstr, *string, *expect, *matches, *flag;
      int cflags = REG_EXTENDED, eflags = 0, try_bre_ere = 0;
      int unsupported = 0;
      int bre_cflags;

      if (line[nread - 1] == '\n')
        line[nread - 1] = '\0';

      /* Skip comments and empty lines.  */
      if (*line == '\0' || *line == '#')
        continue;

      puts (line);
      fflush (stdout);

      pattern = strtok (line, "\t");
      if (pattern == NULL)
        continue;
      if (strcmp (pattern, "\"\"") == 0)
        pattern += 2;

      flagstr = strtok (NULL, "\t");
      if (flagstr == NULL)
        continue;

      string = strtok (NULL, "\t");
      if (string == NULL)
        continue;
      if (strcmp (string, "\"\"") == 0)
        string += 2;

      for (flag = flagstr; *flag != '\0'; ++flag)
        switch (*flag)
          {
          case '-':
            break;
          case 'b':
            cflags &= ~REG_EXTENDED;
            break;
          case '&':
            try_bre_ere = 1;
            break;
          case 'C':
            /* Marks a case where regcomp is expected to fail.  */
            eflags = -1;
            break;
          case 'i':
            cflags |= REG_ICASE;
            break;
          case 's':
            cflags |= REG_NOSUB;
            break;
          case 'n':
            cflags |= REG_NEWLINE;
            break;
          case '^':
            eflags |= REG_NOTBOL;
            break;
          case '$':
            eflags |= REG_NOTEOL;
            break;
          case 'm':
          case 'p':
          case '#':
            /* Not supported.  */
            unsupported = 1;
            break;
          }

      if (unsupported)
        continue;

      replace_special_chars (pattern);
      glibc_re_syntax (pattern);
      if (eflags != -1)
        replace_special_chars (string);

      matches = NULL;
      expect = strtok (NULL, "\t");
      if (expect != NULL)
        {
          replace_special_chars (expect);
          matches = strtok (NULL, "\t");
          if (matches != NULL)
            replace_special_chars (matches);
        }

      bre_cflags = cflags & ~REG_EXTENDED;

      if (setlocale (LC_ALL, "C") == NULL)
        {
          puts ("setlocale C failed");
          status = 1;
        }

      if (test (pattern, cflags, string, eflags, expect, matches, "FAIL")
          || (try_bre_ere
              && test (pattern, bre_cflags, string, eflags,
                       expect, matches, "FAIL")))
        {
          status = 1;
          continue;
        }

      if (!test_utf8)
        continue;

      if (setlocale (LC_ALL, "cs_CZ.UTF-8") == NULL)
        {
          puts ("setlocale cs_CZ.UTF-8 failed");
          status = 1;
          continue;
        }

      if (test (pattern, cflags, string, eflags, expect, matches,
                "UTF-8 FAIL")
          || (try_bre_ere
              && test (pattern, bre_cflags, string, eflags,
                       expect, matches, "UTF-8 FAIL")))
        {
          status = 1;
          continue;
        }

      if (mb_tests (pattern, cflags, string, eflags, expect, matches)
          || (try_bre_ere
              && mb_tests (pattern, bre_cflags, string, eflags,
                           expect, matches)))
        status = 1;
    }

  free (line);
  fclose (input);
  return status;
}
561