1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "lang_id/script/approx-script.h"
18
19 #include "lang_id/common/lite_base/integral-types.h"
20 #include "lang_id/common/lite_base/logging.h"
21 #include "lang_id/common/utf8.h"
22 #include "lang_id/script/approx-script-data.h"
23
24 namespace libtextclassifier3 {
25 namespace mobile {
26
// Integer value of USCRIPT_UNKNOWN from enum UScriptCode (declared in
// unicode/uscript.h).  The value is hard-coded here instead of being taken
// from that header; a test checks that USCRIPT_UNKNOWN evaluates to 103.
//
// NOTE(review): this constant lives outside the anonymous namespace below,
// presumably because the header declares it for external use -- confirm.
const int kUnknownUscript{103};
31
32 namespace {
33 using approx_script_internal::kNumRanges;
34 using approx_script_internal::kRangeFirst;
35 using approx_script_internal::kRangeScript;
36 using approx_script_internal::kRangeSizeMinusOne;
37
Utf8ToCodepoint(const unsigned char * s,int num_bytes)38 uint32 Utf8ToCodepoint(const unsigned char *s, int num_bytes) {
39 switch (num_bytes) {
40 case 1:
41 return s[0];
42 case 2:
43 return ((s[0] & 0x1F) << 6) | (s[1] & 0x3F);
44 case 3:
45 return (((s[0] & 0x0F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F));
46 case 4:
47 return (((s[0] & 0x07) << 18) | ((s[1] & 0x3F) << 12) |
48 ((s[2] & 0x3F) << 6) | (s[3] & 0x3F));
49 default:
50 SAFTM_DLOG(FATAL) << "Illegal num_bytes: " << num_bytes;
51 return 0;
52 }
53 }
54
BinarySearch(uint32 codepoint,int start,int end)55 inline int BinarySearch(uint32 codepoint, int start, int end) {
56 while (end > start + 1) {
57 // Due to the while loop condition, middle > start and middle < end. Hence,
58 // on both branches of the if below, we strictly reduce the end - start
59 // value, so we eventually get that difference below 1 and complete the
60 // while loop.
61 int middle = (start + end) / 2;
62 if (codepoint < kRangeFirst[middle]) {
63 end = middle;
64 } else {
65 start = middle;
66 }
67 }
68
69 if (end == start + 1) {
70 const uint32 range_start = kRangeFirst[start];
71 if ((codepoint >= range_start) &&
72 (codepoint <= range_start + kRangeSizeMinusOne[start])) {
73 return kRangeScript[start];
74 }
75 }
76
77 return kUnknownUscript;
78 }
79 } // namespace
80
GetApproxScript(const unsigned char * s,int num_bytes)81 int GetApproxScript(const unsigned char *s, int num_bytes) {
82 SAFTM_DCHECK_NE(s, nullptr);
83 SAFTM_DCHECK_EQ(num_bytes,
84 utils::OneCharLen(reinterpret_cast<const char *>(s)));
85 uint32 codepoint = Utf8ToCodepoint(s, num_bytes);
86 return BinarySearch(codepoint, 0, kNumRanges);
87 }
88
GetMaxApproxScriptResult()89 int GetMaxApproxScriptResult() { return approx_script_internal::kMaxScript; }
90
// Applies the SAFTM_STATIC_REGISTRATION macro to ApproxScriptDetector; both
// the macro and the class are declared outside this file.
// NOTE(review): presumably this registers the detector so it can be looked up
// at runtime -- confirm against the macro definition.
SAFTM_STATIC_REGISTRATION(ApproxScriptDetector);
92
93 } // namespace mobile
}  // namespace libtextclassifier3
95