1 /*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #ifndef LIBTEXTCLASSIFIER_LANG_ID_SCRIPT_DETECTOR_H_
18 #define LIBTEXTCLASSIFIER_LANG_ID_SCRIPT_DETECTOR_H_
19
20 namespace libtextclassifier {
21 namespace nlp_core {
22 namespace lang_id {
23
// The Unicode scripts distinguished by the script detector.  Only a small set
// of scripts is recognized: the ones that are a strong hint about the language
// of the text (e.g., Hiragana -> Japanese).  Keeping the set small keeps the
// detection code compact and fast.
enum Script {
  // Reported when the script detection code runs into an internal error.
  kScriptError = 0,

  // Catch-all values for every Unicode script that is not detected
  // individually.  There is one value per UTF8 encoding length (1, 2, 3 or 4
  // bytes) because that length is already known to the detector, so it is
  // reported as well.  kScriptOtherUtf8OneByte roughly means Latin and
  // kScriptOtherUtf8FourBytes roughly means Han.
  kScriptOtherUtf8OneByte,
  kScriptOtherUtf8TwoBytes,
  kScriptOtherUtf8ThreeBytes,
  kScriptOtherUtf8FourBytes,

  // Scripts that are detected individually.
  kScriptGreek,
  kScriptCyrillic,
  kScriptHebrew,
  kScriptArabic,
  kScriptHangulJamo,  // Used primarily for Korean.
  kScriptHiragana,    // Used primarily for Japanese.
  kScriptKatakana,    // Used primarily for Japanese.

  // Add new scripts here.

  // Must remain the last enumerator: do not add any script after it.  Its
  // value equals the number of other enumerators in Script, which makes it
  // easy to iterate over all scripts.
  kNumRelevantScripts,
};
55
// Returns true if and only if value lies in the closed interval [low, hi],
// i.e., both bounds are inclusive.
template <typename IntType>
inline bool InRange(IntType value, IntType low, IntType hi) {
  // Reject values below the lower bound first; the upper bound is only
  // examined for values that passed the lower-bound check.
  if (!(value >= low)) {
    return false;
  }
  return value <= hi;
}
60
61 // Returns Script for the UTF8 character that starts at address p.
62 // Precondition: p points to a valid UTF8 character of num_bytes bytes.
GetScript(const unsigned char * p,int num_bytes)63 inline Script GetScript(const unsigned char *p, int num_bytes) {
64 switch (num_bytes) {
65 case 1:
66 return kScriptOtherUtf8OneByte;
67
68 case 2: {
69 // 2-byte UTF8 characters have 11 bits of information. unsigned int has
70 // at least 16 bits (http://en.cppreference.com/w/cpp/language/types) so
71 // it's enough. It's also usually the fastest int type on the current
72 // CPU, so it's better to use than int32.
73 static const unsigned int kGreekStart = 0x370;
74
75 // Commented out (unsued in the code): kGreekEnd = 0x3FF;
76 static const unsigned int kCyrillicStart = 0x400;
77 static const unsigned int kCyrillicEnd = 0x4FF;
78 static const unsigned int kHebrewStart = 0x590;
79
80 // Commented out (unsued in the code): kHebrewEnd = 0x5FF;
81 static const unsigned int kArabicStart = 0x600;
82 static const unsigned int kArabicEnd = 0x6FF;
83 const unsigned int codepoint = ((p[0] & 0x1F) << 6) | (p[1] & 0x3F);
84 if (codepoint > kCyrillicEnd) {
85 if (codepoint >= kArabicStart) {
86 if (codepoint <= kArabicEnd) {
87 return kScriptArabic;
88 }
89 } else {
90 // At this point, codepoint < kArabicStart = kHebrewEnd + 1, so
91 // codepoint <= kHebrewEnd.
92 if (codepoint >= kHebrewStart) {
93 return kScriptHebrew;
94 }
95 }
96 } else {
97 if (codepoint >= kCyrillicStart) {
98 return kScriptCyrillic;
99 } else {
100 // At this point, codepoint < kCyrillicStart = kGreekEnd + 1, so
101 // codepoint <= kGreekEnd.
102 if (codepoint >= kGreekStart) {
103 return kScriptGreek;
104 }
105 }
106 }
107 return kScriptOtherUtf8TwoBytes;
108 }
109
110 case 3: {
111 // 3-byte UTF8 characters have 16 bits of information. unsigned int has
112 // at least 16 bits.
113 static const unsigned int kHangulJamoStart = 0x1100;
114 static const unsigned int kHangulJamoEnd = 0x11FF;
115 static const unsigned int kHiraganaStart = 0x3041;
116 static const unsigned int kHiraganaEnd = 0x309F;
117
118 // Commented out (unsued in the code): kKatakanaStart = 0x30A0;
119 static const unsigned int kKatakanaEnd = 0x30FF;
120 const unsigned int codepoint =
121 ((p[0] & 0x0F) << 12) | ((p[1] & 0x3F) << 6) | (p[2] & 0x3F);
122 if (codepoint > kHiraganaEnd) {
123 // On this branch, codepoint > kHiraganaEnd = kKatakanaStart - 1, so
124 // codepoint >= kKatakanaStart.
125 if (codepoint <= kKatakanaEnd) {
126 return kScriptKatakana;
127 }
128 } else {
129 if (codepoint >= kHiraganaStart) {
130 return kScriptHiragana;
131 } else {
132 if (InRange(codepoint, kHangulJamoStart, kHangulJamoEnd)) {
133 return kScriptHangulJamo;
134 }
135 }
136 }
137 return kScriptOtherUtf8ThreeBytes;
138 }
139
140 case 4:
141 return kScriptOtherUtf8FourBytes;
142
143 default:
144 return kScriptError;
145 }
146 }
147
148 // Returns Script for the UTF8 character that starts at address p. Similar to
149 // the previous version of GetScript, except for "char" vs "unsigned char".
150 // Most code works with "char *" pointers, ignoring the fact that char is
151 // unsigned (by default) on most platforms, but signed on iOS. This code takes
152 // care of making sure we always treat chars as unsigned.
GetScript(const char * p,int num_bytes)153 inline Script GetScript(const char *p, int num_bytes) {
154 return GetScript(reinterpret_cast<const unsigned char *>(p),
155 num_bytes);
156 }
157
158 } // namespace lang_id
159 } // namespace nlp_core
160 } // namespace libtextclassifier
161
162 #endif // LIBTEXTCLASSIFIER_LANG_ID_SCRIPT_DETECTOR_H_
163