1 // Copyright 2008 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 #ifndef RE2_UNICODE_GROUPS_H_ 6 #define RE2_UNICODE_GROUPS_H_ 7 8 // Unicode character groups. 9 10 // The codes get split into ranges of 16-bit codes 11 // and ranges of 32-bit codes. It would be simpler 12 // to use only 32-bit ranges, but these tables are large 13 // enough to warrant extra care. 14 // 15 // Using just 32-bit ranges gives 27 kB of data. 16 // Adding 16-bit ranges gives 18 kB of data. 17 // Adding an extra table of 16-bit singletons would reduce 18 // to 16.5 kB of data but make the data harder to use; 19 // we don't bother. 20 21 #include <stdint.h> 22 23 #include "util/util.h" 24 #include "util/utf.h" 25 26 namespace re2 { 27 28 struct URange16 29 { 30 uint16_t lo; 31 uint16_t hi; 32 }; 33 34 struct URange32 35 { 36 Rune lo; 37 Rune hi; 38 }; 39 40 struct UGroup 41 { 42 const char *name; 43 int sign; // +1 for [abc], -1 for [^abc] 44 const URange16 *r16; 45 int nr16; 46 const URange32 *r32; 47 int nr32; 48 }; 49 50 // Named by property or script name (e.g., "Nd", "N", "Han"). 51 // Negated groups are not included. 52 extern const UGroup unicode_groups[]; 53 extern const int num_unicode_groups; 54 55 // Named by POSIX name (e.g., "[:alpha:]", "[:^lower:]"). 56 // Negated groups are included. 57 extern const UGroup posix_groups[]; 58 extern const int num_posix_groups; 59 60 // Named by Perl name (e.g., "\\d", "\\D"). 61 // Negated groups are included. 62 extern const UGroup perl_groups[]; 63 extern const int num_perl_groups; 64 65 } // namespace re2 66 67 #endif // RE2_UNICODE_GROUPS_H_ 68