1# regular expression test set 2# Lines are at least three fields, separated by one or more tabs. "" stands 3# for an empty field. First field is an RE. Second field is flags. If 4# C flag given, regcomp() is expected to fail, and the third field is the 5# error name (minus the leading REG_). 6# 7# Otherwise it is expected to succeed, and the third field is the string to 8# try matching it against. If there is no fourth field, the match is 9# expected to fail. If there is a fourth field, it is the substring that 10# the RE is expected to match. If there is a fifth field, it is a comma- 11# separated list of what the subexpressions should match, with - indicating 12# no match for that one. In both the fourth and fifth fields, a (sub)field 13# starting with @ indicates that the (sub)expression is expected to match 14# a null string followed by the stuff after the @; this provides a way to 15# test where null strings match. The character `N' in REs and strings 16# is newline, `S' is space, `T' is tab, `Z' is NUL. 17# 18# The full list of flags: 19# - placeholder, does nothing 20# b RE is a BRE, not an ERE 21# & try it as both an ERE and a BRE 22# C regcomp() error expected, third field is error name 23# i REG_ICASE 24# m ("mundane") REG_NOSPEC 25# s REG_NOSUB (not really testable) 26# n REG_NEWLINE 27# ^ REG_NOTBOL 28# $ REG_NOTEOL 29# # REG_STARTEND (see below) 30# p REG_PEND 31# 32# For REG_STARTEND, the start/end offsets are those of the substring 33# enclosed in (). 
34 35# basics 36a & a a 37abc & abc abc 38abc|de - abc abc 39a|b|c - abc a 40 41# parentheses and perversions thereof 42a(b)c - abc abc 43a\(b\)c b abc abc 44a( C EPAREN 45a( b a( a( 46a\( - a( a( 47a\( bC EPAREN 48a\(b bC EPAREN 49a(b C EPAREN 50a(b b a(b a(b 51# gag me with a right parenthesis -- 1003.2 goofed here (my fault, partly) 52a) - a) a) 53) - ) ) 54# end gagging (in a just world, those *should* give EPAREN) 55a) b a) a) 56a\) bC EPAREN 57\) bC EPAREN 58a()b - ab ab 59a\(\)b b ab ab 60 61# anchoring and REG_NEWLINE 62^abc$ & abc abc 63a^b - a^b 64a^b b a^b a^b 65a$b - a$b 66a$b b a$b a$b 67^ & abc @abc 68$ & abc @ 69^$ & "" @ 70$^ - "" @ 71\($\)\(^\) b "" @ 72# stop retching, those are legitimate (although disgusting) 73^^ - "" @ 74$$ - "" @ 75b$ & abNc 76b$ &n abNc b 77^b$ & aNbNc 78^b$ &n aNbNc b 79^$ &n aNNb @Nb 80^$ n abc 81^$ n abcN @ 82$^ n aNNb @Nb 83\($\)\(^\) bn aNNb @Nb 84^^ n^ aNNb @Nb 85$$ n aNNb @NN 86^a ^ a 87a$ $ a 88^a ^n aNb 89^b ^n aNb b 90a$ $n bNa 91b$ $n bNa b 92a*(^b$)c* - b b 93a*\(^b$\)c* b b b 94 95# certain syntax errors and non-errors 96| C EMPTY 97| b | | 98* C BADRPT 99* b * * 100+ C BADRPT 101? C BADRPT 102"" &C EMPTY 103() - abc @abc 104\(\) b abc @abc 105a||b C EMPTY 106|ab C EMPTY 107ab| C EMPTY 108(|a)b C EMPTY 109(a|)b C EMPTY 110(*a) C BADRPT 111(+a) C BADRPT 112(?a) C BADRPT 113({1}a) C BADRPT 114\(\{1\}a\) bC BADRPT 115(a|*b) C BADRPT 116(a|+b) C BADRPT 117(a|?b) C BADRPT 118(a|{1}b) C BADRPT 119^* C BADRPT 120^* b * * 121^+ C BADRPT 122^? C BADRPT 123^{1} C BADRPT 124^\{1\} bC BADRPT 125 126# metacharacters, backslashes 127a.c & abc abc 128a[bc]d & abd abd 129a\*c & a*c a*c 130a\\b & a\b a\b 131a\\\*b & a\*b a\*b 132# The following test is wrong. Using \b in a BRE or ERE is undefined. 
133# a\bc & abc abc 134a\ &C EESCAPE 135a\\bc & a\bc a\bc 136\{ bC BADRPT 137a\[b & a[b a[b 138a[b &C EBRACK 139# trailing $ is a peculiar special case for the BRE code 140a$ & a a 141a$ & a$ 142a\$ & a 143a\$ & a$ a$ 144a\\$ & a 145a\\$ & a$ 146a\\$ & a\$ 147a\\$ & a\ a\ 148 149# back references, ugh 150a\(b\)\2c bC ESUBREG 151a\(b\1\)c bC ESUBREG 152a\(b*\)c\1d b abbcbbd abbcbbd bb 153a\(b*\)c\1d b abbcbd 154a\(b*\)c\1d b abbcbbbd 155^\(.\)\1 b abc 156a\([bc]\)\1d b abcdabbd abbd b 157a\(\([bc]\)\2\)*d b abbccd abbccd 158a\(\([bc]\)\2\)*d b abbcbd 159# actually, this next one probably ought to fail, but the spec is unclear 160a\(\(b\)*\2\)*d b abbbd abbbd 161# here is a case that no NFA implementation does right 162\(ab*\)[ab]*\1 b ababaaa ababaaa a 163# check out normal matching in the presence of back refs 164\(a\)\1bcd b aabcd aabcd 165\(a\)\1bc*d b aabcd aabcd 166\(a\)\1bc*d b aabd aabd 167\(a\)\1bc*d b aabcccd aabcccd 168\(a\)\1bc*[ce]d b aabcccd aabcccd 169^\(a\)\1b\(c\)*cd$ b aabcccd aabcccd 170 171# ordinary repetitions 172ab*c & abc abc 173ab+c - abc abc 174ab?c - abc abc 175a\(*\)b b a*b a*b 176a\(**\)b b ab ab 177a\(***\)b bC BADRPT 178*a b *a *a 179**a b a a 180***a bC BADRPT 181 182# the dreaded bounded repetitions 183# The following two tests are not correct: 184#{ & { { 185#{abc & {abc {abc 186# '{' is always a special char outside bracket expressions. So test only BRE: 187{ b { { 188{abc b {abc {abc 189{1 C BADRPT 190{1} C BADRPT 191# Same reason as for the two tests above: 192#a{b & a{b a{b 193a{b b a{b a{b 194a{1}b - ab ab 195a\{1\}b b ab ab 196a{1,}b - ab ab 197a\{1,\}b b ab ab 198a{1,2}b - aab aab 199a\{1,2\}b b aab aab 200a{1 C EBRACE 201a\{1 bC EBRACE 202a{1a C EBRACE 203a\{1a bC EBRACE 204a{1a} C BADBR 205a\{1a\} bC BADBR 206# These four tests check for undefined behavior. Our implementation does 207# something different. 
208#a{,2} - a{,2} a{,2} 209#a\{,2\} bC BADBR 210#a{,} - a{,} a{,} 211#a\{,\} bC BADBR 212a{1,x} C BADBR 213a\{1,x\} bC BADBR 214a{1,x C EBRACE 215a\{1,x bC EBRACE 216# These two tests probably fail due to an arbitrary limit on the number of 217# repetitions in the other implementation. 218#a{300} C BADBR 219#a\{300\} bC BADBR 220a{1,0} C BADBR 221a\{1,0\} bC BADBR 222ab{0,0}c - abcac ac 223ab\{0,0\}c b abcac ac 224ab{0,1}c - abcac abc 225ab\{0,1\}c b abcac abc 226ab{0,3}c - abbcac abbc 227ab\{0,3\}c b abbcac abbc 228ab{1,1}c - acabc abc 229ab\{1,1\}c b acabc abc 230ab{1,3}c - acabc abc 231ab\{1,3\}c b acabc abc 232ab{2,2}c - abcabbc abbc 233ab\{2,2\}c b abcabbc abbc 234ab{2,4}c - abcabbc abbc 235ab\{2,4\}c b abcabbc abbc 236((a{1,10}){1,10}){1,10} - a a a,a 237 238# multiple repetitions 239# Wow, there is a serious disconnect here. The ERE grammar is like this: 240# ERE_expression : one_char_or_coll_elem_ERE 241# | '^' 242# | '$' 243# | '(' extended_reg_exp ')' 244# | ERE_expression ERE_dupl_symbol 245# ; 246# where ERE_dupl_symbol is any of the repetition methods. It is clear from 247# this that consecutive repetition is OK. On top of this, the one test not 248# marked as failing must fail. For BREs the situation is different, so we 249# use the four tests. 250#a** &C BADRPT 251a** bC BADRPT 252#a++ C BADRPT 253#a?? C BADRPT 254#a*+ C BADRPT 255#a*? C BADRPT 256#a+* C BADRPT 257#a+? C BADRPT 258#a?* C BADRPT 259#a?+ C BADRPT 260#a{1}{1} C BADRPT 261#a*{1} C BADRPT 262#a+{1} C BADRPT 263#a?{1} C BADRPT 264#a{1}* C BADRPT 265#a{1}+ C BADRPT 266#a{1}? 
C BADRPT 267#a*{b} - a{b} a{b} 268a\{1\}\{1\} bC BADRPT 269a*\{1\} bC BADRPT 270a\{1\}* bC BADRPT 271 272# brackets, and numerous perversions thereof 273a[b]c & abc abc 274a[ab]c & abc abc 275a[^ab]c & adc adc 276a[]b]c & a]c a]c 277a[[b]c & a[c a[c 278a[-b]c & a-c a-c 279a[^]b]c & adc adc 280a[^-b]c & adc adc 281a[b-]c & a-c a-c 282a[b &C EBRACK 283a[] &C EBRACK 284a[1-3]c & a2c a2c 285a[3-1]c &C ERANGE 286a[1-3-5]c &C ERANGE 287a[[.-.]--]c & a-c a-c 288# I don't think the error value should be ERANGE since a[1-] would be 289# valid, too. Expect EBRACK. 290#a[1- &C ERANGE 291a[1- &C EBRACK 292a[[. &C EBRACK 293a[[.x &C EBRACK 294a[[.x. &C EBRACK 295a[[.x.] &C EBRACK 296a[[.x.]] & ax ax 297a[[.x,.]] &C ECOLLATE 298# This test is invalid. "one" is not a collating symbol in any standardized 299# locale. 300# a[[.one.]]b & a1b a1b 301a[[.notdef.]]b &C ECOLLATE 302a[[.].]]b & a]b a]b 303a[[:alpha:]]c & abc abc 304a[[:notdef:]]c &C ECTYPE 305a[[: &C EBRACK 306a[[:alpha &C EBRACK 307a[[:alpha:] &C EBRACK 308a[[:alpha,:] &C ECTYPE 309a[[:]:]]b &C ECTYPE 310a[[:-:]]b &C ECTYPE 311a[[:alph:]] &C ECTYPE 312a[[:alphabet:]] &C ECTYPE 313[[:alnum:]]+ - -%@a0X- a0X 314[[:alpha:]]+ - -%@aX0- aX 315[[:blank:]]+ - aSSTb SST 316[[:cntrl:]]+ - aNTb NT 317[[:digit:]]+ - a019b 019 318[[:graph:]]+ - Sa%bS a%b 319[[:lower:]]+ - AabC ab 320[[:print:]]+ - NaSbN aSb 321[[:punct:]]+ - S%-&T %-& 322[[:space:]]+ - aSNTb SNT 323[[:upper:]]+ - aBCd BC 324[[:xdigit:]]+ - p0f3Cq 0f3C 325a[[=b=]]c & abc abc 326a[[= &C EBRACK 327a[[=b &C EBRACK 328a[[=b= &C EBRACK 329a[[=b=] &C EBRACK 330a[[=b,=]] &C ECOLLATE 331# This test is invalid. "one" is not a collating symbol in any standardized 332# locale. 
333#a[[=one=]]b & a1b a1b 334 335# complexities 336a(((b)))c - abc abc 337a(b|(c))d - abd abd 338a(b*|c)d - abbd abbd 339# just gotta have one DFA-buster, of course 340a[ab]{20} - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab 341# and an inline expansion in case somebody gets tricky 342a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab] - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab 343# and in case somebody just slips in an NFA... 344a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab](wee|week)(knights|night) - aaaaabaaaabaaaabaaaabweeknights aaaaabaaaabaaaabaaaabweeknights 345# fish for anomalies as the number of states passes 32 34612345678901234567890123456789 - a12345678901234567890123456789b 12345678901234567890123456789 347123456789012345678901234567890 - a123456789012345678901234567890b 123456789012345678901234567890 3481234567890123456789012345678901 - a1234567890123456789012345678901b 1234567890123456789012345678901 34912345678901234567890123456789012 - a12345678901234567890123456789012b 12345678901234567890123456789012 350123456789012345678901234567890123 - a123456789012345678901234567890123b 123456789012345678901234567890123 351# and one really big one, beyond any plausible word width 3521234567890123456789012345678901234567890123456789012345678901234567890 - a1234567890123456789012345678901234567890123456789012345678901234567890b 1234567890123456789012345678901234567890123456789012345678901234567890 353# fish for problems as brackets go past 8 354[ab][cd][ef][gh][ij][kl][mn] - xacegikmoq acegikm 355[ab][cd][ef][gh][ij][kl][mn][op] - xacegikmoq acegikmo 356[ab][cd][ef][gh][ij][kl][mn][op][qr] - xacegikmoqy acegikmoq 357[ab][cd][ef][gh][ij][kl][mn][op][q] - xacegikmoqy acegikmoq 358 359# subtleties of matching 360abc & xabcy abc 361a\(b\)?c\1d b acd 362aBc i Abc Abc 363a[Bc]*d i abBCcd abBCcd 3640[[:upper:]]1 &i 0a1 0a1 3650[[:lower:]]1 &i 0A1 0A1 366a[^b]c &i abc 367a[^b]c &i aBc 368a[^b]c &i adc 
adc 369[a]b[c] - abc abc 370[a]b[a] - aba aba 371[abc]b[abc] - abc abc 372[abc]b[abd] - abd abd 373a(b?c)+d - accd accd 374(wee|week)(knights|night) - weeknights weeknights 375(we|wee|week|frob)(knights|night|day) - weeknights weeknights 376a[bc]d - xyzaaabcaababdacd abd 377a[ab]c - aaabc abc 378abc s abc abc 379() s abc @abc 380a* & b @b 381 382# Let's have some fun -- try to match a C comment. 383# first the obvious, which looks okay at first glance... 384/\*.*\*/ - /*x*/ /*x*/ 385# but... 386/\*.*\*/ - /*x*/y/*z*/ /*x*/y/*z*/ 387# okay, we must not match */ inside; try to do that... 388/\*([^*]|\*[^/])*\*/ - /*x*/ /*x*/ 389/\*([^*]|\*[^/])*\*/ - /*x*/y/*z*/ /*x*/ 390# but... 391/\*([^*]|\*[^/])*\*/ - /*x**/y/*z*/ /*x**/y/*z*/ 392# and a still fancier version, which does it right (I think)... 393/\*([^*]|\*+[^*/])*\*+/ - /*x*/ /*x*/ 394/\*([^*]|\*+[^*/])*\*+/ - /*x*/y/*z*/ /*x*/ 395/\*([^*]|\*+[^*/])*\*+/ - /*x**/y/*z*/ /*x**/ 396/\*([^*]|\*+[^*/])*\*+/ - /*x****/y/*z*/ /*x****/ 397/\*([^*]|\*+[^*/])*\*+/ - /*x**x*/y/*z*/ /*x**x*/ 398/\*([^*]|\*+[^*/])*\*+/ - /*x***x/y/*z*/ /*x***x/y/*z*/ 399 400# subexpressions 401.* - abc abc - 402a(b)(c)d - abcd abcd b,c 403a(((b)))c - abc abc b,b,b 404a(b|(c))d - abd abd b,- 405a(b*|c|e)d - abbd abbd bb 406a(b*|c|e)d - acd acd c 407a(b*|c|e)d - ad ad @d 408a(b?)c - abc abc b 409a(b?)c - ac ac @c 410a(b+)c - abc abc b 411a(b+)c - abbbc abbbc bbb 412a(b*)c - ac ac @c 413(a|ab)(bc([de]+)f|cde) - abcdef abcdef a,bcdef,de 414# the regression tester only asks for 9 subexpressions 415a(b)(c)(d)(e)(f)(g)(h)(i)(j)k - abcdefghijk abcdefghijk b,c,d,e,f,g,h,i,j 416a(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)l - abcdefghijkl abcdefghijkl b,c,d,e,f,g,h,i,j,k 417a([bc]?)c - abc abc b 418a([bc]?)c - ac ac @c 419a([bc]+)c - abc abc b 420a([bc]+)c - abcc abcc bc 421a([bc]+)bc - abcbc abcbc bc 422a(bb+|b)b - abb abb b 423a(bbb+|bb+|b)b - abb abb b 424a(bbb+|bb+|b)b - abbb abbb bb 425a(bbb+|bb+|b)bb - abbb abbb b 426(.*).* - abcdef abcdef abcdef 427(a*)* - 
bc @b @b 428 429# do we get the right subexpression when it is used more than once? 430a(b|c)*d - ad ad - 431a(b|c)*d - abcd abcd c 432a(b|c)+d - abd abd b 433a(b|c)+d - abcd abcd c 434a(b|c?)+d - ad ad @d 435a(b|c?)+d - abcd abcd c 436a(b|c){0,0}d - ad ad - 437a(b|c){0,1}d - ad ad - 438a(b|c){0,1}d - abd abd b 439a(b|c){0,2}d - ad ad - 440a(b|c){0,2}d - abcd abcd c 441a(b|c){0,}d - ad ad - 442a(b|c){0,}d - abcd abcd c 443a(b|c){1,1}d - abd abd b 444a(b|c){1,1}d - acd acd c 445a(b|c){1,2}d - abd abd b 446a(b|c){1,2}d - abcd abcd c 447a(b|c){1,}d - abd abd b 448a(b|c){1,}d - abcd abcd c 449a(b|c){2,2}d - acbd acbd b 450a(b|c){2,2}d - abcd abcd c 451a(b|c){2,4}d - abcd abcd c 452a(b|c){2,4}d - abcbd abcbd b 453a(b|c){2,4}d - abcbcd abcbcd c 454a(b|c){2,}d - abcd abcd c 455a(b|c){2,}d - abcbd abcbd b 456a(b+|((c)*))+d - abd abd b,-,- 457a(b+|((c)*))+d - abcd abcd c,c,c 458 459# check out the STARTEND option 460[abc] &# a(b)c b 461[abc] &# a(d)c 462[abc] &# a(bc)d b 463[abc] &# a(dc)d c 464. 
&# a()c 465b.*c &# b(bc)c bc 466b.* &# b(bc)c bc 467.*c &# b(bc)c bc 468 469# plain strings, with the NOSPEC flag 470abc m abc abc 471abc m xabcy abc 472abc m xyz 473a*b m aba*b a*b 474a*b m ab 475"" mC EMPTY 476 477# cases involving NULs 478aZb & a a 479aZb &p a 480aZb &p# (aZb) aZb 481aZ*b &p# (ab) ab 482a.b &# (aZb) aZb 483a.* &# (aZb)c aZb 484 485# word boundaries (ick) 486[[:<:]]a & a a 487[[:<:]]a & ba 488[[:<:]]a & -a a 489a[[:>:]] & a a 490a[[:>:]] & ab 491a[[:>:]] & a- a 492[[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc abc 493[[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc-q abc 494[[:<:]]a.c[[:>:]] & axc-dayc-dazce-abc axc 495[[:<:]]b.c[[:>:]] & a_bxc-byc_d-bzc-q bzc 496[[:<:]].x..[[:>:]] & y_xa_-_xb_y-_xc_-axdc _xc_ 497[[:<:]]a_b[[:>:]] & x_a_b 498 499# past problems, and suspected problems 500(A[1])|(A[2])|(A[3])|(A[4])|(A[5])|(A[6])|(A[7])|(A[8])|(A[9])|(A[A]) - A1 A1 501abcdefghijklmnop i abcdefghijklmnop abcdefghijklmnop 502abcdefghijklmnopqrstuv i abcdefghijklmnopqrstuv abcdefghijklmnopqrstuv 503(ALAK)|(ALT[AB])|(CC[123]1)|(CM[123]1)|(GAMC)|(LC[23][EO ])|(SEM[1234])|(SL[ES][12])|(SLWW)|(SLF )|(SLDT)|(VWH[12])|(WH[34][EW])|(WP1[ESN]) - CC11 CC11 504CC[13]1|a{21}[23][EO][123][Es][12]a{15}aa[34][EW]aaaaaaa[X]a - CC11 CC11 505Char \([a-z0-9_]*\)\[.* b Char xyz[k Char xyz[k xyz 506a?b - ab ab 507-\{0,1\}[0-9]*$ b -5 -5 508a*a*a*a*a*a*a* & aaaaaa aaaaaa 509(\b){0} - x @x - 510\(\b\)\{0,0\} b abc @abc - 511a(\b){0}c - ac ac - 512a(.*)b(\1){0}c - abc abc @bc,- 513a(.*)b(\1){0}c - axbc axbc x,- 514 515a\(\(b*\)\)c\1d b abbcbbd abbcbbd bb,bb 516a\(\([bc]\)\)\2d b abcdabbd abbd b,b 517a\(\(\(\([bc]\)\)\3\)\)*d b abbccd abbccd cc,cc,c,c 518a(b)(c)d - abcd abcd b,c 519a(((b)))c - abc abc b,b,b 520a(((b|(((c))))))d - abd abd b,b,b,-,-,- 521a(((b*|c|e)))d - abbd abbd bb,bb,bb 522a((b|c)){0,0}d - ad ad -,- 523a((b|c)){0,1}d - abd abd b,b 524a((b|c)){0,2}d - abcd abcd c,c 525a((b+|((c)*)))+d - abd abd b,b,-,- 526a((b+|((c)*)))+d - abcd abcd c,c,c,c 527(((\b))){0} - x @x -,-,- 
528a(((.*)))b((\2)){0}c - abc abc @bc,@bc,@bc,-,- 529a(((.*)))b((\1)){0}c - axbc axbc x,x,x,-,- 530 531\b & SaT @aT 532\b & aT @aT 533a.*\b & abT ab 534\b & STSS 535\B & abc @bc 536\B & aSbTc 537\B & SaT @SaT 538\B & aSTSb @TSb 539 540o$($|.) - oN 541o$($|.) - op 542o$($|.) - o o 543