• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "lang_id/custom-tokenizer.h"
18 
19 #include <ctype.h>
20 
21 #include <string>
22 
23 #include "lang_id/common/lite_base/attributes.h"
24 #include "lang_id/common/lite_base/logging.h"
25 #include "lang_id/common/utf8.h"
26 #include "utf.h"
27 
28 namespace libtextclassifier3 {
29 namespace mobile {
30 namespace lang_id {
31 
32 namespace {
// Returns true if the UTF8 character that starts at |curr| and has |num_bytes|
// bytes is a token separator.  Only one-byte characters can be separators: a
// one-byte character is a separator iff it is not alphabetic according to
// isalpha().  Multi-byte characters are never separators.
inline bool IsTokenSeparator(int num_bytes, const char *curr) {
  if (num_bytes != 1) {
    return false;
  }
  // The <ctype.h> functions require an argument representable as unsigned char
  // (or EOF).  Plain char may be signed, so a byte >= 0x80 would otherwise be
  // passed as a negative value, which is undefined behavior.
  return !isalpha(static_cast<unsigned char>(*curr));
}
39 
40 // Appends to *word the UTF8 encoding for the lowercase version of the UTF8
41 // character that starts at |curr| and has |num_bytes| bytes.
42 //
43 // NOTE: if the current UTF8 character does not have a lowercase version, then
44 // we append the original UTF8 character.
AppendLowerCase(const char * curr,int num_bytes,string * word)45 inline SAFTM_ATTRIBUTE_ALWAYS_INLINE void AppendLowerCase(const char *curr,
46                                                           int num_bytes,
47                                                           string *word) {
48   if (num_bytes == 1) {
49     // Optimize the ASCII case.
50     word->push_back(tolower(*curr));
51     return;
52   }
53 
54   // Harder, general case.
55   //
56   // NOTE: for lowercasing, we use the utils from utf.h:
57   // charntorune + tolowerrune + runetochar.  Unfortunately, that library does
58   // not contain any fast util for determining the number of bytes for the UTF8
59   // character that starts at a given address *without* converting to a full
60   // codepoint (like our utils::OneCharLen, which is used intensively by the
61   // rest of our code, including by the performance-critical char ngram
62   // feature).  Hence, the rest of our code continues to use utils::OneCharLen,
63   // and here, when we append the bytes to *word, we make sure that's consistent
64   // with utils::OneCharLen.
65 
66   // charntorune() below reads the UTF8 character that starts at curr (using at
67   // most num_bytes bytes) and stores the corresponding codepoint into rune.
68   Rune rune;
69   charntorune(&rune, curr, num_bytes);
70   if (rune != Runeerror) {
71     Rune lower = tolowerrune(rune);
72     char lower_buf[UTFmax];
73     runetochar(lower_buf, &lower);
74 
75     // When appending the UTF8 bytes to word, we do not use the number of bytes
76     // returned by runetochar(); instead, we use utils::OneCharLen(), the same
77     // method used by the char ngram feature.  We expect them to be equal, but
78     // just in case.
79     int lower_num_bytes = utils::OneCharLen(lower_buf);
80 
81     // Using lower_num_bytes below is safe, because, by definition of UTFmax,
82     SAFTM_DCHECK_GE(UTFmax, 4);
83 
84     // And, by implementation of utils::OneCharLen():
85     SAFTM_DCHECK_GT(lower_num_bytes, 0);
86     SAFTM_DCHECK_LE(lower_num_bytes, 4);
87     word->append(lower_buf, lower_num_bytes);
88   } else {
89     // There are sequences of bytes that charntorune() can't convert into a
90     // valid Rune (a special case is [0xEF, 0xBF, 0xBD], the UTF8 encoding for
91     // the U+FFFD special Unicode character, which is also the value of
92     // Runeerror).  We keep those bytes unchanged.
93     word->append(curr, num_bytes);
94   }
95 }
96 }  // namespace
97 
Setup(TaskContext * context)98 void TokenizerForLangId::Setup(TaskContext *context) {
99   lowercase_input_ = context->Get("lang_id_lowercase_input", false);
100 }
101 
Tokenize(StringPiece text,LightSentence * sentence) const102 void TokenizerForLangId::Tokenize(StringPiece text,
103                                   LightSentence *sentence) const {
104   const char *const start = text.data();
105   const char *curr = start;
106   const char *end = utils::GetSafeEndOfUtf8String(start, text.size());
107 
108   // Corner case: the safe part of the text is empty ("").
109   if (curr >= end) {
110     return;
111   }
112 
113   // Number of bytes for UTF8 character starting at *curr.  Note: the loop below
114   // is guaranteed to terminate because in each iteration, we move curr by at
115   // least num_bytes, and num_bytes is guaranteed to be > 0.
116   int num_bytes = utils::OneCharLen(curr);
117   while (curr < end) {
118     // Jump over consecutive token separators.
119     while (IsTokenSeparator(num_bytes, curr)) {
120       curr += num_bytes;
121       if (curr >= end) {
122         return;
123       }
124       num_bytes = utils::OneCharLen(curr);
125     }
126 
127     // If control reaches this point, we are at beginning of a non-empty token.
128     sentence->emplace_back();
129     string *word = &(sentence->back());
130 
131     // Add special token-start character.
132     word->push_back('^');
133 
134     // Add UTF8 characters to word, until we hit the end of the safe text or a
135     // token separator.
136     while (true) {
137       if (lowercase_input_) {
138         AppendLowerCase(curr, num_bytes, word);
139       } else {
140         word->append(curr, num_bytes);
141       }
142       curr += num_bytes;
143       if (curr >= end) {
144         break;
145       }
146       num_bytes = utils::OneCharLen(curr);
147       if (IsTokenSeparator(num_bytes, curr)) {
148         curr += num_bytes;
149         if (curr >= end) {
150           break;
151         }
152         num_bytes = utils::OneCharLen(curr);
153         break;
154       }
155     }
156     word->push_back('$');
157   }
158 }
159 
}  // namespace lang_id
}  // namespace mobile
}  // namespace libtextclassifier3
163