/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LIBTEXTCLASSIFIER_LANG_ID_SCRIPT_DETECTOR_H_
#define LIBTEXTCLASSIFIER_LANG_ID_SCRIPT_DETECTOR_H_

namespace libtextclassifier {
namespace nlp_core {
namespace lang_id {

// Unicode scripts we care about.  To get compact and fast code, we detect only
// a few Unicode scripts that offer a strong indication about the language of
// the text (e.g., Hiragana -> Japanese).
enum Script {
  // Special value to indicate internal errors in the script detection code.
  // GetScript() returns it when num_bytes is not in the range 1..4.
  kScriptError,

  // Special values for all Unicode scripts that we do not detect.  One special
  // value for Unicode characters of 1, 2, 3, respectively 4 bytes (as we
  // already have that information, we use it).  kScriptOtherUtf8OneByte means
  // ~Latin and kScriptOtherUtf8FourBytes means ~Han.
  kScriptOtherUtf8OneByte,
  kScriptOtherUtf8TwoBytes,
  kScriptOtherUtf8ThreeBytes,
  kScriptOtherUtf8FourBytes,

  kScriptGreek,
  kScriptCyrillic,
  kScriptHebrew,
  kScriptArabic,
  kScriptHangulJamo,  // Used primarily for Korean.
  kScriptHiragana,    // Used primarily for Japanese.
  kScriptKatakana,    // Used primarily for Japanese.

  // Add new scripts here.

  // Do not add any script after kNumRelevantScripts.  This value indicates the
  // number of elements in this enum Script (except this value) such that we can
  // easily iterate over the scripts.
  kNumRelevantScripts,
};

// Returns true iff low <= value <= hi, i.e., both bounds are inclusive.
template <typename IntType>
inline bool InRange(IntType value, IntType low, IntType hi) {
  return (value >= low) && (value <= hi);
}

// Returns Script for the UTF8 character that starts at address p.
//
// Precondition: p points to a valid UTF8 character of num_bytes bytes.  The
// lead / continuation bytes are not validated here; only their payload bits
// are extracted.
//
// Returns kScriptError if num_bytes is not 1, 2, 3 or 4.  Otherwise returns one
// of the detected scripts, or the kScriptOtherUtf8* value that matches
// num_bytes when the codepoint is outside all detected ranges.
inline Script GetScript(const unsigned char *p, int num_bytes) {
  switch (num_bytes) {
    case 1:
      return kScriptOtherUtf8OneByte;

    case 2: {
      // 2-byte UTF8 characters have 11 bits of information.  unsigned int has
      // at least 16 bits (http://en.cppreference.com/w/cpp/language/types) so
      // it's enough.  It's also usually the fastest int type on the current
      // CPU, so it's better to use than int32.
      static const unsigned int kGreekStart = 0x370;

      // Commented out (unused in the code): kGreekEnd = 0x3FF;
      static const unsigned int kCyrillicStart = 0x400;
      static const unsigned int kCyrillicEnd = 0x4FF;
      static const unsigned int kHebrewStart = 0x590;

      // Commented out (unused in the code): kHebrewEnd = 0x5FF;
      static const unsigned int kArabicStart = 0x600;
      static const unsigned int kArabicEnd = 0x6FF;

      // Payload: low 5 bits of the lead byte, low 6 bits of the continuation.
      const unsigned int codepoint = ((p[0] & 0x1F) << 6) | (p[1] & 0x3F);

      // The detected ranges are adjacent or ordered (Greek, Cyrillic, Hebrew,
      // Arabic), so we split on kCyrillicEnd to keep the number of comparisons
      // small.
      if (codepoint > kCyrillicEnd) {
        if (codepoint >= kArabicStart) {
          if (codepoint <= kArabicEnd) {
            return kScriptArabic;
          }
        } else if (codepoint >= kHebrewStart) {
          // At this point, codepoint < kArabicStart = kHebrewEnd + 1, so
          // codepoint <= kHebrewEnd.
          return kScriptHebrew;
        }
      } else if (codepoint >= kCyrillicStart) {
        return kScriptCyrillic;
      } else if (codepoint >= kGreekStart) {
        // At this point, codepoint < kCyrillicStart = kGreekEnd + 1, so
        // codepoint <= kGreekEnd.
        return kScriptGreek;
      }
      return kScriptOtherUtf8TwoBytes;
    }

    case 3: {
      // 3-byte UTF8 characters have 16 bits of information.  unsigned int has
      // at least 16 bits.
      static const unsigned int kHangulJamoStart = 0x1100;
      static const unsigned int kHangulJamoEnd = 0x11FF;
      static const unsigned int kHiraganaStart = 0x3041;
      static const unsigned int kHiraganaEnd = 0x309F;

      // Commented out (unused in the code): kKatakanaStart = 0x30A0;
      static const unsigned int kKatakanaEnd = 0x30FF;

      // Payload: low 4 bits of the lead byte, low 6 bits of each continuation.
      const unsigned int codepoint =
          ((p[0] & 0x0F) << 12) | ((p[1] & 0x3F) << 6) | (p[2] & 0x3F);

      if (codepoint > kHiraganaEnd) {
        // On this branch, codepoint > kHiraganaEnd = kKatakanaStart - 1, so
        // codepoint >= kKatakanaStart.
        if (codepoint <= kKatakanaEnd) {
          return kScriptKatakana;
        }
      } else if (codepoint >= kHiraganaStart) {
        return kScriptHiragana;
      } else if (InRange(codepoint, kHangulJamoStart, kHangulJamoEnd)) {
        return kScriptHangulJamo;
      }
      return kScriptOtherUtf8ThreeBytes;
    }

    case 4:
      return kScriptOtherUtf8FourBytes;

    default:
      return kScriptError;
  }
}

// Returns Script for the UTF8 character that starts at address p.  Similar to
// the previous version of GetScript, except for "char" vs "unsigned char".
// Most code works with "char *" pointers, but whether plain char is signed or
// unsigned is implementation-defined.  This overload reinterprets the bytes as
// unsigned char so that both overloads always treat chars as unsigned.
inline Script GetScript(const char *p, int num_bytes) {
  return GetScript(reinterpret_cast<const unsigned char *>(p),
                   num_bytes);
}

}  // namespace lang_id
}  // namespace nlp_core
}  // namespace libtextclassifier

#endif  // LIBTEXTCLASSIFIER_LANG_ID_SCRIPT_DETECTOR_H_