1 /*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "lang_id/custom-tokenizer.h"
18
19 #include <ctype.h>
20
21 #include <string>
22
23 #include "util/strings/utf8.h"
24
25 namespace libtextclassifier {
26 namespace nlp_core {
27 namespace lang_id {
28
29 namespace {
// Returns true if the character starting at *curr acts as a token separator.
//
// num_bytes is the length in bytes of that character, as computed by the
// caller.  Only single-byte characters can be separators: any character
// reported as longer than one byte is always treated as part of a token.  A
// single-byte character is a separator iff it is not a letter according to
// isalpha() (so digits, punctuation and whitespace all separate tokens).
inline bool IsTokenSeparator(int num_bytes, const char *curr) {
  if (num_bytes != 1) {
    return false;
  }
  // The <ctype.h> functions require an argument that is representable as
  // unsigned char (or EOF).  Plain char may be signed, so a byte >= 0x80 would
  // otherwise be passed as a negative value, which is undefined behavior.
  return !isalpha(static_cast<unsigned char>(*curr));
}
36 } // namespace
37
GetSafeEndOfString(const char * data,size_t size)38 const char *GetSafeEndOfString(const char *data, size_t size) {
39 const char *const hard_end = data + size;
40 const char *curr = data;
41 while (curr < hard_end) {
42 int num_bytes = GetNumBytesForUTF8Char(curr);
43 if (num_bytes == 0) {
44 break;
45 }
46 const char *new_curr = curr + num_bytes;
47 if (new_curr > hard_end) {
48 return curr;
49 }
50 curr = new_curr;
51 }
52 return curr;
53 }
54
TokenizeTextForLangId(const std::string & text,LightSentence * sentence)55 void TokenizeTextForLangId(const std::string &text, LightSentence *sentence) {
56 const char *const start = text.data();
57 const char *curr = start;
58 const char *end = GetSafeEndOfString(start, text.size());
59
60 // Corner case: empty safe part of the text.
61 if (curr >= end) {
62 return;
63 }
64
65 // Number of bytes for UTF8 character starting at *curr. Note: the loop below
66 // is guaranteed to terminate because in each iteration, we move curr by at
67 // least num_bytes, and num_bytes is guaranteed to be > 0.
68 int num_bytes = GetNumBytesForNonZeroUTF8Char(curr);
69 while (curr < end) {
70 // Jump over consecutive token separators.
71 while (IsTokenSeparator(num_bytes, curr)) {
72 curr += num_bytes;
73 if (curr >= end) {
74 return;
75 }
76 num_bytes = GetNumBytesForNonZeroUTF8Char(curr);
77 }
78
79 // If control reaches this point, we are at beginning of a non-empty token.
80 std::string *word = sentence->add_word();
81
82 // Add special token-start character.
83 word->push_back('^');
84
85 // Add UTF8 characters to word, until we hit the end of the safe text or a
86 // token separator.
87 while (true) {
88 word->append(curr, num_bytes);
89 curr += num_bytes;
90 if (curr >= end) {
91 break;
92 }
93 num_bytes = GetNumBytesForNonZeroUTF8Char(curr);
94 if (IsTokenSeparator(num_bytes, curr)) {
95 curr += num_bytes;
96 num_bytes = GetNumBytesForNonZeroUTF8Char(curr);
97 break;
98 }
99 }
100 word->push_back('$');
101
102 // Note: we intentionally do not token.set_start()/end(), as those fields
103 // are not used by the langid model.
104 }
105 }
106
107 } // namespace lang_id
108 } // namespace nlp_core
109 } // namespace libtextclassifier
110