• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "lang_id/custom-tokenizer.h"
18 
19 #include <ctype.h>
20 
21 #include <string>
22 
23 #include "util/strings/utf8.h"
24 
25 namespace libtextclassifier {
26 namespace nlp_core {
27 namespace lang_id {
28 
29 namespace {
// Returns true if the character that starts at curr and occupies num_bytes
// bytes is a token separator.
//
// Only single-byte characters can be separators: any character with
// num_bytes != 1 is reported as a non-separator.  A single-byte character is a
// separator iff isalpha() rejects it.
inline bool IsTokenSeparator(int num_bytes, const char *curr) {
  if (num_bytes != 1) {
    return false;
  }
  // isalpha() is only defined for EOF and values representable as unsigned
  // char.  Plain char may be signed, so a byte >= 0x80 would otherwise be
  // passed as a negative value, which is undefined behavior.
  return !isalpha(static_cast<unsigned char>(*curr));
}
36 }  // namespace
37 
GetSafeEndOfString(const char * data,size_t size)38 const char *GetSafeEndOfString(const char *data, size_t size) {
39   const char *const hard_end = data + size;
40   const char *curr = data;
41   while (curr < hard_end) {
42     int num_bytes = GetNumBytesForUTF8Char(curr);
43     if (num_bytes == 0) {
44       break;
45     }
46     const char *new_curr = curr + num_bytes;
47     if (new_curr > hard_end) {
48       return curr;
49     }
50     curr = new_curr;
51   }
52   return curr;
53 }
54 
TokenizeTextForLangId(const std::string & text,LightSentence * sentence)55 void TokenizeTextForLangId(const std::string &text, LightSentence *sentence) {
56   const char *const start = text.data();
57   const char *curr = start;
58   const char *end = GetSafeEndOfString(start, text.size());
59 
60   // Corner case: empty safe part of the text.
61   if (curr >= end) {
62     return;
63   }
64 
65   // Number of bytes for UTF8 character starting at *curr.  Note: the loop below
66   // is guaranteed to terminate because in each iteration, we move curr by at
67   // least num_bytes, and num_bytes is guaranteed to be > 0.
68   int num_bytes = GetNumBytesForNonZeroUTF8Char(curr);
69   while (curr < end) {
70     // Jump over consecutive token separators.
71     while (IsTokenSeparator(num_bytes, curr)) {
72       curr += num_bytes;
73       if (curr >= end) {
74         return;
75       }
76       num_bytes = GetNumBytesForNonZeroUTF8Char(curr);
77     }
78 
79     // If control reaches this point, we are at beginning of a non-empty token.
80     std::string *word = sentence->add_word();
81 
82     // Add special token-start character.
83     word->push_back('^');
84 
85     // Add UTF8 characters to word, until we hit the end of the safe text or a
86     // token separator.
87     while (true) {
88       word->append(curr, num_bytes);
89       curr += num_bytes;
90       if (curr >= end) {
91         break;
92       }
93       num_bytes = GetNumBytesForNonZeroUTF8Char(curr);
94       if (IsTokenSeparator(num_bytes, curr)) {
95         curr += num_bytes;
96         num_bytes = GetNumBytesForNonZeroUTF8Char(curr);
97         break;
98       }
99     }
100     word->push_back('$');
101 
102     // Note: we intentionally do not token.set_start()/end(), as those fields
103     // are not used by the langid model.
104   }
105 }
106 
107 }  // namespace lang_id
108 }  // namespace nlp_core
109 }  // namespace libtextclassifier
110