1#! /bin/sh 2 3# Script for testing regular expressions with perl to check that PCRE2 handles 4# them the same. The Perl code has to have "use utf8" and "require Encode" at 5# the start when running UTF-8 tests, but *not* for non-utf8 tests. (The 6# "require" would actually be OK for non-utf8-tests, but is not always 7# installed, so this way the script will always run for these tests.) 8# 9# The desired effect is achieved by making this a shell script that passes the 10# Perl script to Perl through a pipe. If the first argument is "-utf8", a 11# suitable prefix is set up. 12# 13# The remaining arguments, if any, are passed to Perl. They are an input file 14# and an output file. If there is one argument, the output is written to 15# STDOUT. If Perl receives no arguments, it opens /dev/tty as input, and writes 16# output to STDOUT. (I haven't found a way of getting it to use STDIN, because 17# of the contorted piping input.) 18 19perl=perl 20prefix='' 21if [ $# -gt 0 -a "$1" = "-utf8" ] ; then 22 prefix="use utf8; require Encode;" 23 shift 24fi 25 26 27# The Perl script that follows has a similar specification to pcre2test, and so 28# can be given identical input, except that input patterns can be followed only 29# by Perl's lower case modifiers and certain other pcre2test modifiers that are 30# either handled or ignored: 31# 32# aftertext interpreted as "print $' afterwards" 33# afteralltext ignored 34# dupnames ignored (Perl always allows) 35# mark ignored 36# no_auto_possess ignored 37# no_start_optimize ignored 38# ucp sets Perl's /u modifier 39# utf invoke UTF-8 functionality 40# 41# The data lines must not have any pcre2test modifiers. They are processed as 42# Perl double-quoted strings, so if they contain " $ or @ characters, these 43# have to be escaped. For this reason, all such characters in the 44# Perl-compatible testinput1 and testinput4 files are escaped so that they can 45# be used for perltest as well as for pcre2test. The output from this script 46# should be same as from pcre2test, apart from the initial identifying banner. 47# 48# The other testinput files are not suitable for feeding to perltest.sh, 49# because they make use of the special modifiers that pcre2test uses for 50# testing features of PCRE2. Some of these files also contain malformed regular 51# expressions, in order to check that PCRE2 diagnoses them correctly. 52 53(echo "$prefix" ; cat <<'PERLEND' 54 55# Function for turning a string into a string of printing chars. 56 57sub pchars { 58my($t) = ""; 59if ($utf8) 60 { 61 @p = unpack('U*', $_[0]); 62 foreach $c (@p) 63 { 64 if ($c >= 32 && $c < 127) { $t .= chr $c; } 65 else { $t .= sprintf("\\x{%02x}", $c); 66 } 67 } 68 } 69else 70 { 71 foreach $c (split(//, $_[0])) 72 { 73 if (ord $c >= 32 && ord $c < 127) { $t .= $c; } 74 else { $t .= sprintf("\\x%02x", ord $c); } 75 } 76 } 77$t; 78} 79 80 81# Read lines from a named file or stdin and write to a named file or stdout; 82# lines consist of a regular expression, in delimiters and optionally followed 83# by options, followed by a set of test data, terminated by an empty line. 84 85# Sort out the input and output files 86 87if (@ARGV > 0) 88 { 89 open(INFILE, "<$ARGV[0]") || die "Failed to open $ARGV[0]\n"; 90 $infile = "INFILE"; 91 $interact = 0; 92 } 93else 94 { 95 open(INFILE, "</dev/tty") || die "Failed to open /dev/tty\n"; 96 $infile = "INFILE"; 97 $interact = 1; 98 } 99 100if (@ARGV > 1) 101 { 102 open(OUTFILE, ">$ARGV[1]") || die "Failed to open $ARGV[1]\n"; 103 $outfile = "OUTFILE"; 104 } 105else { $outfile = "STDOUT"; } 106 107printf($outfile "Perl $] Regular Expressions\n\n"); 108 109# Main loop 110 111NEXT_RE: 112for (;;) 113 { 114 printf " re> " if $interact; 115 last if ! ($_ = <$infile>); 116 printf $outfile "$_" if ! $interact; 117 next if ($_ =~ /^\s*$/ || $_ =~ /^#/); 118 119 $pattern = $_; 120 121 while ($pattern !~ /^\s*(.).*\1/s) 122 { 123 printf " > " if $interact; 124 last if ! ($_ = <$infile>); 125 printf $outfile "$_" if ! $interact; 126 $pattern .= $_; 127 } 128 129 chomp($pattern); 130 $pattern =~ s/\s+$//; 131 132 # Split the pattern from the modifiers and adjust them as necessary. 133 134 $pattern =~ /^\s*((.).*\2)(.*)$/s; 135 $pat = $1; 136 $mod = $3; 137 138 # The private "aftertext" modifier means "print $' afterwards". 139 140 $showrest = ($mod =~ s/aftertext,?//); 141 142 # "allaftertext" is used by pcre2test to print remainders after captures 143 144 $mod =~ s/allaftertext,?//; 145 146 # Detect utf 147 148 $utf8 = $mod =~ s/utf,?//; 149 150 # Remove "dupnames". 151 152 $mod =~ s/dupnames,?//; 153 154 # Remove "mark" (asks pcre2test to check MARK data) */ 155 156 $mod =~ s/mark,?//; 157 158 # "ucp" asks pcre2test to set PCRE2_UCP; change this to /u for Perl 159 160 $mod =~ s/ucp,?/u/; 161 162 # Remove "no_auto_possess" and "no_start_optimize" (disable PCRE2 optimizations) 163 164 $mod =~ s/no_auto_possess,?//; 165 $mod =~ s/no_start_optimize,?//; 166 167 # Add back retained modifiers and check that the pattern is valid. 168 169 $mod =~ s/,//g; 170 $pattern = "$pat$mod"; 171 eval "\$_ =~ ${pattern}"; 172 if ($@) 173 { 174 printf $outfile "Error: $@"; 175 if (! $interact) 176 { 177 for (;;) 178 { 179 last if ! ($_ = <$infile>); 180 last if $_ =~ /^\s*$/; 181 } 182 } 183 next NEXT_RE; 184 } 185 186 # If the /g modifier is present, we want to put a loop round the matching; 187 # otherwise just a single "if". 188 189 $cmd = ($pattern =~ /g[a-z]*$/)? "while" : "if"; 190 191 # If the pattern is actually the null string, Perl uses the most recently 192 # executed (and successfully compiled) regex is used instead. This is a 193 # nasty trap for the unwary! The PCRE2 test suite does contain null strings 194 # in places - if they are allowed through here all sorts of weird and 195 # unexpected effects happen. To avoid this, we replace such patterns with 196 # a non-null pattern that has the same effect. 197 198 $pattern = "/(?#)/$2" if ($pattern =~ /^(.)\1(.*)$/); 199 200 # Read data lines and test them 201 202 for (;;) 203 { 204 printf "data> " if $interact; 205 last NEXT_RE if ! ($_ = <$infile>); 206 chomp; 207 printf $outfile "%s", "$_\n" if ! $interact; 208 209 s/\s+$//; # Remove trailing space 210 s/^\s+//; # Remove leading space 211 212 last if ($_ eq ""); 213 next if $_ =~ /^\\=(?:\s|$)/; # Comment line 214 215 $x = eval "\"$_\""; # To get escapes processed 216 217 # Empty array for holding results, ensure $REGERROR and $REGMARK are 218 # unset, then do the matching. 219 220 @subs = (); 221 222 $pushes = "push \@subs,\$&;" . 223 "push \@subs,\$1;" . 224 "push \@subs,\$2;" . 225 "push \@subs,\$3;" . 226 "push \@subs,\$4;" . 227 "push \@subs,\$5;" . 228 "push \@subs,\$6;" . 229 "push \@subs,\$7;" . 230 "push \@subs,\$8;" . 231 "push \@subs,\$9;" . 232 "push \@subs,\$10;" . 233 "push \@subs,\$11;" . 234 "push \@subs,\$12;" . 235 "push \@subs,\$13;" . 236 "push \@subs,\$14;" . 237 "push \@subs,\$15;" . 238 "push \@subs,\$16;" . 239 "push \@subs,\$'; }"; 240 241 undef $REGERROR; 242 undef $REGMARK; 243 244 eval "${cmd} (\$x =~ ${pattern}) {" . $pushes; 245 246 if ($@) 247 { 248 printf $outfile "Error: $@\n"; 249 next NEXT_RE; 250 } 251 elsif (scalar(@subs) == 0) 252 { 253 printf $outfile "No match"; 254 if (defined $REGERROR && $REGERROR != 1) 255 { printf $outfile (", mark = %s", &pchars($REGERROR)); } 256 printf $outfile "\n"; 257 } 258 else 259 { 260 while (scalar(@subs) != 0) 261 { 262 printf $outfile (" 0: %s\n", &pchars($subs[0])); 263 printf $outfile (" 0+ %s\n", &pchars($subs[17])) if $showrest; 264 $last_printed = 0; 265 for ($i = 1; $i <= 16; $i++) 266 { 267 if (defined $subs[$i]) 268 { 269 while ($last_printed++ < $i-1) 270 { printf $outfile ("%2d: <unset>\n", $last_printed); } 271 printf $outfile ("%2d: %s\n", $i, &pchars($subs[$i])); 272 $last_printed = $i; 273 } 274 } 275 splice(@subs, 0, 18); 276 } 277 278 # It seems that $REGMARK is not marked as UTF-8 even when use utf8 is 279 # set and the input pattern was a UTF-8 string. We can, however, force 280 # it to be so marked. 281 282 if (defined $REGMARK && $REGMARK != 1) 283 { 284 $xx = $REGMARK; 285 $xx = Encode::decode_utf8($xx) if $utf8; 286 printf $outfile ("MK: %s\n", &pchars($xx)); 287 } 288 } 289 } 290 } 291 292# printf $outfile "\n"; 293 294PERLEND 295) | $perl - $@ 296 297# End 298