• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/python
2# coding=utf-8
3#
4# Copyright 2008 The RE2 Authors.  All Rights Reserved.
5# Use of this source code is governed by a BSD-style
6# license that can be found in the LICENSE file.
7
8# See unicode_casefold.h for description of case folding tables.
9
10"""Generate C++ table for Unicode case folding."""
11
12from __future__ import absolute_import
13from __future__ import division
14from __future__ import print_function
15
16import sys
17import unicode
18
19_header = """
20// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
21// make_unicode_casefold.py >unicode_casefold.cc
22
23#include "re2/unicode_casefold.h"
24
25namespace re2 {
26
27"""
28
29_trailer = """
30
31} // namespace re2
32
33"""
34
35def _Delta(a, b):
36  """Compute the delta for b - a.  Even/odd and odd/even
37     are handled specially, as described above."""
38  if a+1 == b:
39    if a%2 == 0:
40      return 'EvenOdd'
41    else:
42      return 'OddEven'
43  if a == b+1:
44    if a%2 == 0:
45      return 'OddEven'
46    else:
47      return 'EvenOdd'
48  return b - a
49
50def _AddDelta(a, delta):
51  """Return a + delta, handling EvenOdd and OddEven specially."""
52  if type(delta) == int:
53    return a+delta
54  if delta == 'EvenOdd':
55    if a%2 == 0:
56      return a+1
57    else:
58      return a-1
59  if delta == 'OddEven':
60    if a%2 == 1:
61      return a+1
62    else:
63      return a-1
64  print("Bad Delta:", delta, file=sys.stderr)
65  raise unicode.Error("Bad Delta")
66
67def _MakeRanges(pairs):
68  """Turn a list like [(65,97), (66, 98), ..., (90,122)]
69     into [(65, 90, +32)]."""
70  ranges = []
71  last = -100
72
73  def evenodd(last, a, b, r):
74    if a != last+1 or b != _AddDelta(a, r[2]):
75      return False
76    r[1] = a
77    return True
78
79  def evenoddpair(last, a, b, r):
80    if a != last+2:
81      return False
82    delta = r[2]
83    d = delta
84    if type(delta) is not str:
85      return False
86    if delta.endswith('Skip'):
87      d = delta[:-4]
88    else:
89      delta = d + 'Skip'
90    if b != _AddDelta(a, d):
91      return False
92    r[1] = a
93    r[2] = delta
94    return True
95
96  for a, b in pairs:
97    if ranges and evenodd(last, a, b, ranges[-1]):
98      pass
99    elif ranges and evenoddpair(last, a, b, ranges[-1]):
100      pass
101    else:
102      ranges.append([a, a, _Delta(a, b)])
103    last = a
104  return ranges
105
106# The maximum size of a case-folding group.
107# Case folding is implemented in parse.cc by a recursive process
108# with a recursion depth equal to the size of the largest
109# case-folding group, so it is important that this bound be small.
110# The current tables have no group bigger than 4.
111# If there are ever groups bigger than 10 or so, it will be
112# time to rework the code in parse.cc.
113MaxCasefoldGroup = 4
114
115def main():
116  lowergroups, casegroups = unicode.CaseGroups()
117  foldpairs = []
118  seen = {}
119  for c in casegroups:
120    if len(c) > MaxCasefoldGroup:
121      raise unicode.Error("casefold group too long: %s" % (c,))
122    for i in range(len(c)):
123      if c[i-1] in seen:
124        raise unicode.Error("bad casegroups %d -> %d" % (c[i-1], c[i]))
125      seen[c[i-1]] = True
126      foldpairs.append([c[i-1], c[i]])
127
128  lowerpairs = []
129  for lower, group in lowergroups.items():
130    for g in group:
131      if g != lower:
132        lowerpairs.append([g, lower])
133
134  def printpairs(name, foldpairs):
135    foldpairs.sort()
136    foldranges = _MakeRanges(foldpairs)
137    print("// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges)))
138    print("const CaseFold unicode_%s[] = {" % (name,))
139    for lo, hi, delta in foldranges:
140      print("\t{ %d, %d, %s }," % (lo, hi, delta))
141    print("};")
142    print("const int num_unicode_%s = %d;" % (name, len(foldranges)))
143    print("")
144
145  print(_header)
146  printpairs("casefold", foldpairs)
147  printpairs("tolower", lowerpairs)
148  print(_trailer)
149
150if __name__ == '__main__':
151  main()
152