1 // Copyright 2008 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 #ifndef RE2_UNICODE_CASEFOLD_H_ 6 #define RE2_UNICODE_CASEFOLD_H_ 7 8 // Unicode case folding tables. 9 10 // The Unicode case folding tables encode the mapping from one Unicode point 11 // to the next largest Unicode point with equivalent folding. The largest 12 // point wraps back to the first. For example, the tables map: 13 // 14 // 'A' -> 'a' 15 // 'a' -> 'A' 16 // 17 // 'K' -> 'k' 18 // 'k' -> 'K' (Kelvin symbol) 19 // 'K' -> 'K' 20 // 21 // Like everything Unicode, these tables are big. If we represent the table 22 // as a sorted list of uint32_t pairs, it has 2049 entries and is 16 kB. 23 // Most table entries look like the ones around them: 24 // 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc. 25 // Instead of listing all the pairs explicitly, we make a list of ranges 26 // and deltas, so that the table entries for 'A' through 'Z' can be represented 27 // as a single entry { 'A', 'Z', +32 }. 28 // 29 // In addition to blocks that map to each other (A-Z mapping to a-z) 30 // there are blocks of pairs that individually map to each other 31 // (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...). 32 // For those, the special delta value EvenOdd marks even/odd pairs 33 // (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs. 34 // 35 // In this form, the table has 274 entries, about 3kB. If we were to split 36 // the table into one for 16-bit codes and an overflow table for larger ones, 37 // we could get it down to about 1.5kB, but that's not worth the complexity. 38 // 39 // The grouped form also allows for efficient fold range calculations 40 // rather than looping one character at a time. 41 42 #include <stdint.h> 43 44 #include "util/util.h" 45 #include "util/utf.h" 46 47 namespace re2 { 48 49 enum { 50 EvenOdd = 1, 51 OddEven = -1, 52 EvenOddSkip = 1<<30, 53 OddEvenSkip, 54 }; 55 56 struct CaseFold { 57 Rune lo; 58 Rune hi; 59 int32_t delta; 60 }; 61 62 extern const CaseFold unicode_casefold[]; 63 extern const int num_unicode_casefold; 64 65 extern const CaseFold unicode_tolower[]; 66 extern const int num_unicode_tolower; 67 68 // Returns the CaseFold* in the tables that contains rune. 69 // If rune is not in the tables, returns the first CaseFold* after rune. 70 // If rune is larger than any value in the tables, returns NULL. 71 extern const CaseFold* LookupCaseFold(const CaseFold*, int, Rune rune); 72 73 // Returns the result of applying the fold f to the rune r. 74 extern Rune ApplyFold(const CaseFold *f, Rune r); 75 76 } // namespace re2 77 78 #endif // RE2_UNICODE_CASEFOLD_H_ 79