#!/usr/bin/perl
# Copyright 2008 The RE2 Authors.  All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

# Generate table entries giving character ranges
# for POSIX/Perl character classes.  Rather than
# figure out what the definition is, it is easier to ask
# Perl about each letter from 0-128 and write down
# its answer.

# Only the plain strictures are enabled.  A `use v5.12` (or later) bundle
# is avoided on purpose: it turns on the unicode_strings feature, which
# could change how chr(128) is classified and therefore the generated tables.
use strict;
use warnings;

# POSIX bracket-class names; each one is probed as "[[:name:]]".
my @posixclasses = (
    "[:alnum:]",
    "[:alpha:]",
    "[:ascii:]",
    "[:blank:]",
    "[:cntrl:]",
    "[:digit:]",
    "[:graph:]",
    "[:lower:]",
    "[:print:]",
    "[:punct:]",
    "[:space:]",
    "[:upper:]",
    "[:word:]",
    "[:xdigit:]",
);

# Perl escape classes; each one is probed as "[\x]".
my @perlclasses = (
    "\\d",
    "\\s",
    "\\w",
);

# ComputeClass($class)
#
# Asks Perl's regex engine which code points in 0..128 match "[$class]"
# and collapses the matches into maximal runs.
#
# Returns a list of [first, last] array refs in ascending order (empty
# list if nothing matches).
sub ComputeClass {
    my ($class) = @_;

    # Compile once; the pattern does not change inside the loop.
    my $regexp = qr/[$class]/;

    my @ranges;
    my $start = -1;    # first code point of the open run, or -1 if none

    # 256 is a sentinel that never matches: it only serves to close a run
    # that is still open after 128.  Such a run ends at 256 - 1 = 0xff.
    for my $i (0 .. 128, 256) {
        if ($i <= 128 && chr($i) =~ $regexp) {
            $start = $i if $start < 0;
        }
        else {
            push @ranges, [$start, $i - 1] if $start >= 0;
            $start = -1;
        }
    }
    return @ranges;
}

# PrintClass($cname, $name, @ranges)
#
# Prints a "static URange16 code<cname>[]" array definition holding the
# given [first, last] ranges, with $name in a trailing comment.
#
# Returns two strings, each a brace-enclosed group-table entry that refers
# to the printed array: one for the class itself (sign +1) and one for its
# negation (sign -1).  The negated name is formed by inserting "^" after
# the first ":" (POSIX names) or, if there is no ":", by upper-casing the
# letters (Perl escapes such as \d -> \D).
sub PrintClass {
    my ($cname, $name, @ranges) = @_;

    print "static URange16 code${cname}[] = { /* $name */\n";
    for my $range (@ranges) {
        my ($lo, $hi) = @{$range};
        printf "\t{ 0x%x, 0x%x },\n", $lo, $hi;
    }
    print "};\n";

    my $n = scalar @ranges;

    # Double the backslashes so the name survives as a C string literal.
    my $escname = $name;
    $escname =~ s/\\/\\\\/g;

    # Lexical here; the original assigned an undeclared package global.
    my $negname = $escname;
    if ($negname =~ /:/) {
        $negname =~ s/:/:^/;
    }
    else {
        $negname =~ tr/a-z/A-Z/;
    }

    return "{ \"$escname\", +1, code${cname}, $n }",
           "{ \"$negname\", -1, code${cname}, $n }";
}

# Running counter used to give every emitted range array a unique name
# (code1, code2, ...) across all PrintClasses calls.
my $gen = 0;

# PrintClasses($cname, @classes)
#
# For each class, computes its ranges and prints its range array, then
# prints the "UGroup <cname>_groups[]" table listing the positive and
# negated entry of every class, followed by "int num_<cname>_groups".
sub PrintClasses {
    my ($cname, @classes) = @_;

    my @entries;
    for my $cl (@classes) {
        my @ranges = ComputeClass($cl);
        push @entries, PrintClass(++$gen, $cl, @ranges);
    }

    print "UGroup ${cname}_groups[] = {\n";
    print "\t$_,\n" for @entries;
    print "};\n";

    my $count = scalar @entries;
    print "int num_${cname}_groups = $count;\n";
    return;
}

# NOTE(review): the text of this heredoc was partly lost in the copy this
# was restored from (only "perl_groups.cc", the #include and the namespace
# opener survived).  The two leading comment lines are a reconstruction --
# confirm against the upstream script before regenerating checked-in files.
print <<EOF;
// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
// make_perl_groups.pl >perl_groups.cc

#include "re2/unicode_groups.h"

namespace re2 {

EOF

PrintClasses("perl", @perlclasses);
PrintClasses("posix", @posixclasses);

# NOTE(review): the closing heredoc was truncated in the damaged copy; this
# closes the namespace opened above.  Confirm exact text against upstream.
print <<EOF;

}  // namespace re2
EOF