/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LIBTEXTCLASSIFIER_LANG_ID_SCRIPT_DETECTOR_H_
#define LIBTEXTCLASSIFIER_LANG_ID_SCRIPT_DETECTOR_H_

namespace libtextclassifier {
namespace nlp_core {
namespace lang_id {

// Unicode scripts we care about.  To get compact and fast code, we detect only
// a few Unicode scripts that offer a strong indication about the language of
// the text (e.g., Hiragana -> Japanese).
enum Script {
  // Special value to indicate internal errors in the script detection code.
  kScriptError,

  // Special values for all Unicode scripts that we do not detect.  One special
  // value for Unicode characters of 1, 2, 3, respectively 4 bytes (as we
  // already have that information, we use it).  kScriptOtherUtf8OneByte means
  // ~Latin and kScriptOtherUtf8FourBytes means ~Han.
  kScriptOtherUtf8OneByte,
  kScriptOtherUtf8TwoBytes,
  kScriptOtherUtf8ThreeBytes,
  kScriptOtherUtf8FourBytes,

  kScriptGreek,
  kScriptCyrillic,
  kScriptHebrew,
  kScriptArabic,
  kScriptHangulJamo,  // Used primarily for Korean.
  kScriptHiragana,    // Used primarily for Japanese.
  kScriptKatakana,    // Used primarily for Japanese.

  // Add new scripts here.

  // Do not add any script after kNumRelevantScripts.  This value indicates the
  // number of elements in this enum Script (except this value) such that we can
  // easily iterate over the scripts.
  kNumRelevantScripts,
};

// Returns true if and only if low <= value <= hi (both bounds inclusive).
template <typename IntType>
inline bool InRange(IntType value, IntType low, IntType hi) {
  return (value >= low) && (value <= hi);
}

// Returns Script for the UTF8 character that starts at address p.
//
// Only the payload bits of each byte are decoded; the lead/continuation
// markers are masked off without being validated.  A num_bytes value outside
// 1..4 yields kScriptError.
//
// Precondition: p points to a valid UTF8 character of num_bytes bytes.
inline Script GetScript(const unsigned char *p, int num_bytes) {
  switch (num_bytes) {
    case 1:
      return kScriptOtherUtf8OneByte;

    case 2: {
      // 2-byte UTF8 characters have 11 bits of information.  unsigned int has
      // at least 16 bits (http://en.cppreference.com/w/cpp/language/types) so
      // it's enough.  It's also usually the fastest int type on the current
      // CPU, so it's better to use than int32.
      static const unsigned int kGreekStart = 0x370;

      // Commented out (unused in the code): kGreekEnd = 0x3FF;
      static const unsigned int kCyrillicStart = 0x400;
      static const unsigned int kCyrillicEnd = 0x4FF;
      static const unsigned int kHebrewStart = 0x590;

      // Commented out (unused in the code): kHebrewEnd = 0x5FF;
      static const unsigned int kArabicStart = 0x600;
      static const unsigned int kArabicEnd = 0x6FF;

      // 5 payload bits from the lead byte, 6 from the continuation byte.
      const unsigned int codepoint = ((p[0] & 0x1F) << 6) | (p[1] & 0x3F);
      if (codepoint > kCyrillicEnd) {
        if (codepoint >= kArabicStart) {
          if (codepoint <= kArabicEnd) {
            return kScriptArabic;
          }
        } else {
          // At this point, codepoint < kArabicStart = kHebrewEnd + 1, so
          // codepoint <= kHebrewEnd.
          if (codepoint >= kHebrewStart) {
            return kScriptHebrew;
          }
        }
      } else {
        if (codepoint >= kCyrillicStart) {
          return kScriptCyrillic;
        } else {
          // At this point, codepoint < kCyrillicStart = kGreekEnd + 1, so
          // codepoint <= kGreekEnd.
          if (codepoint >= kGreekStart) {
            return kScriptGreek;
          }
        }
      }
      return kScriptOtherUtf8TwoBytes;
    }

    case 3: {
      // 3-byte UTF8 characters have 16 bits of information.  unsigned int has
      // at least 16 bits.
      static const unsigned int kHangulJamoStart = 0x1100;
      static const unsigned int kHangulJamoEnd = 0x11FF;
      static const unsigned int kHiraganaStart = 0x3041;
      static const unsigned int kHiraganaEnd = 0x309F;

      // Commented out (unused in the code): kKatakanaStart = 0x30A0;
      static const unsigned int kKatakanaEnd = 0x30FF;

      // 4 payload bits from the lead byte, 6 from each continuation byte.
      const unsigned int codepoint =
          ((p[0] & 0x0F) << 12) | ((p[1] & 0x3F) << 6) | (p[2] & 0x3F);
      if (codepoint > kHiraganaEnd) {
        // On this branch, codepoint > kHiraganaEnd = kKatakanaStart - 1, so
        // codepoint >= kKatakanaStart.
        if (codepoint <= kKatakanaEnd) {
          return kScriptKatakana;
        }
      } else {
        if (codepoint >= kHiraganaStart) {
          return kScriptHiragana;
        } else {
          if (InRange(codepoint, kHangulJamoStart, kHangulJamoEnd)) {
            return kScriptHangulJamo;
          }
        }
      }
      return kScriptOtherUtf8ThreeBytes;
    }

    case 4:
      return kScriptOtherUtf8FourBytes;

    default:
      return kScriptError;
  }
}

// Returns Script for the UTF8 character that starts at address p.  Similar to
// the previous version of GetScript, except for "char" vs "unsigned char".
// Most code works with "char *" pointers, but whether plain char is signed or
// unsigned is implementation-defined.  This overload reinterprets the bytes as
// unsigned char so that the bit masks and shifts in the overload above always
// operate on unsigned values.
inline Script GetScript(const char *p, int num_bytes) {
  return GetScript(reinterpret_cast<const unsigned char *>(p), num_bytes);
}

}  // namespace lang_id
}  // namespace nlp_core
}  // namespace libtextclassifier

#endif  // LIBTEXTCLASSIFIER_LANG_ID_SCRIPT_DETECTOR_H_