1#!/usr/bin/python 2# coding=utf-8 3# 4# Copyright 2008 The RE2 Authors. All Rights Reserved. 5# Use of this source code is governed by a BSD-style 6# license that can be found in the LICENSE file. 7 8# See unicode_casefold.h for description of case folding tables. 9 10"""Generate C++ table for Unicode case folding.""" 11 12from __future__ import absolute_import 13from __future__ import division 14from __future__ import print_function 15 16import sys 17import unicode 18 19_header = """ 20// GENERATED BY make_unicode_casefold.py; DO NOT EDIT. 21// make_unicode_casefold.py >unicode_casefold.cc 22 23#include "re2/unicode_casefold.h" 24 25namespace re2 { 26 27""" 28 29_trailer = """ 30 31} // namespace re2 32 33""" 34 35def _Delta(a, b): 36 """Compute the delta for b - a. Even/odd and odd/even 37 are handled specially, as described above.""" 38 if a+1 == b: 39 if a%2 == 0: 40 return 'EvenOdd' 41 else: 42 return 'OddEven' 43 if a == b+1: 44 if a%2 == 0: 45 return 'OddEven' 46 else: 47 return 'EvenOdd' 48 return b - a 49 50def _AddDelta(a, delta): 51 """Return a + delta, handling EvenOdd and OddEven specially.""" 52 if type(delta) == int: 53 return a+delta 54 if delta == 'EvenOdd': 55 if a%2 == 0: 56 return a+1 57 else: 58 return a-1 59 if delta == 'OddEven': 60 if a%2 == 1: 61 return a+1 62 else: 63 return a-1 64 print("Bad Delta:", delta, file=sys.stderr) 65 raise unicode.Error("Bad Delta") 66 67def _MakeRanges(pairs): 68 """Turn a list like [(65,97), (66, 98), ..., (90,122)] 69 into [(65, 90, +32)].""" 70 ranges = [] 71 last = -100 72 73 def evenodd(last, a, b, r): 74 if a != last+1 or b != _AddDelta(a, r[2]): 75 return False 76 r[1] = a 77 return True 78 79 def evenoddpair(last, a, b, r): 80 if a != last+2: 81 return False 82 delta = r[2] 83 d = delta 84 if type(delta) is not str: 85 return False 86 if delta.endswith('Skip'): 87 d = delta[:-4] 88 else: 89 delta = d + 'Skip' 90 if b != _AddDelta(a, d): 91 return False 92 r[1] = a 93 r[2] = delta 94 return True 95 96 for a, b in pairs: 97 if ranges and evenodd(last, a, b, ranges[-1]): 98 pass 99 elif ranges and evenoddpair(last, a, b, ranges[-1]): 100 pass 101 else: 102 ranges.append([a, a, _Delta(a, b)]) 103 last = a 104 return ranges 105 106# The maximum size of a case-folding group. 107# Case folding is implemented in parse.cc by a recursive process 108# with a recursion depth equal to the size of the largest 109# case-folding group, so it is important that this bound be small. 110# The current tables have no group bigger than 4. 111# If there are ever groups bigger than 10 or so, it will be 112# time to rework the code in parse.cc. 113MaxCasefoldGroup = 4 114 115def main(): 116 lowergroups, casegroups = unicode.CaseGroups() 117 foldpairs = [] 118 seen = {} 119 for c in casegroups: 120 if len(c) > MaxCasefoldGroup: 121 raise unicode.Error("casefold group too long: %s" % (c,)) 122 for i in range(len(c)): 123 if c[i-1] in seen: 124 raise unicode.Error("bad casegroups %d -> %d" % (c[i-1], c[i])) 125 seen[c[i-1]] = True 126 foldpairs.append([c[i-1], c[i]]) 127 128 lowerpairs = [] 129 for lower, group in lowergroups.items(): 130 for g in group: 131 if g != lower: 132 lowerpairs.append([g, lower]) 133 134 def printpairs(name, foldpairs): 135 foldpairs.sort() 136 foldranges = _MakeRanges(foldpairs) 137 print("// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges))) 138 print("const CaseFold unicode_%s[] = {" % (name,)) 139 for lo, hi, delta in foldranges: 140 print("\t{ %d, %d, %s }," % (lo, hi, delta)) 141 print("};") 142 print("const int num_unicode_%s = %d;" % (name, len(foldranges))) 143 print("") 144 145 print(_header) 146 printpairs("casefold", foldpairs) 147 printpairs("tolower", lowerpairs) 148 print(_trailer) 149 150if __name__ == '__main__': 151 main() 152