/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "lang_id/custom-tokenizer.h"

#include <ctype.h>

#include <string>

#include "lang_id/common/lite_base/attributes.h"
#include "lang_id/common/lite_base/logging.h"
#include "lang_id/common/utf8.h"
#include "utf.h"

namespace libtextclassifier3 {
namespace mobile {
namespace lang_id {

namespace {

// Returns true iff the UTF8 character that starts at |curr| and has
// |num_bytes| bytes is a token separator.  Only single-byte characters can be
// separators: a single-byte character is a separator iff isalpha() rejects it.
inline bool IsTokenSeparator(int num_bytes, const char *curr) {
  if (num_bytes != 1) {
    return false;
  }
  // The <ctype.h> functions require an argument representable as unsigned
  // char (or EOF).  Plain char may be signed, so a byte with the high bit set
  // would otherwise be passed as a negative value, which is undefined
  // behavior.
  return !isalpha(static_cast<unsigned char>(*curr));
}

// Appends to *word the UTF8 encoding for the lowercase version of the UTF8
// character that starts at |curr| and has |num_bytes| bytes.
//
// NOTE: if the current UTF8 character does not have a lowercase version, then
// we append the original UTF8 character.
inline SAFTM_ATTRIBUTE_ALWAYS_INLINE void AppendLowerCase(const char *curr,
                                                          int num_bytes,
                                                          string *word) {
  if (num_bytes == 1) {
    // Optimize the ASCII case.  As for isalpha(), tolower() must be given a
    // value representable as unsigned char; its int result is narrowed back
    // to char explicitly.
    word->push_back(
        static_cast<char>(tolower(static_cast<unsigned char>(*curr))));
    return;
  }

  // Harder, general case.
  //
  // NOTE: for lowercasing, we use the utils from utf.h:
  // charntorune + tolowerrune + runetochar.  Unfortunately, that library does
  // not contain any fast util for determining the number of bytes for the UTF8
  // character that starts at a given address *without* converting to a full
  // codepoint (like our utils::OneCharLen, which is used intensively by the
  // rest of our code, including by the performance-critical char ngram
  // feature).  Hence, the rest of our code continues to use utils::OneCharLen,
  // and here, when we append the bytes to *word, we make sure that's consistent
  // with utils::OneCharLen.

  // charntorune() below reads the UTF8 character that starts at curr (using at
  // most num_bytes bytes) and stores the corresponding codepoint into rune.
  Rune rune;
  charntorune(&rune, curr, num_bytes);
  if (rune != Runeerror) {
    Rune lower = tolowerrune(rune);
    char lower_buf[UTFmax];
    runetochar(lower_buf, &lower);

    // When appending the UTF8 bytes to word, we do not use the number of bytes
    // returned by runetochar(); instead, we use utils::OneCharLen(), the same
    // method used by the char ngram feature.  We expect them to be equal, but
    // just in case.
    int lower_num_bytes = utils::OneCharLen(lower_buf);

    // Using lower_num_bytes below is safe, because, by definition of UTFmax,
    SAFTM_DCHECK_GE(UTFmax, 4);

    // And, by implementation of utils::OneCharLen():
    SAFTM_DCHECK_GT(lower_num_bytes, 0);
    SAFTM_DCHECK_LE(lower_num_bytes, 4);
    word->append(lower_buf, lower_num_bytes);
  } else {
    // There are sequences of bytes that charntorune() can't convert into a
    // valid Rune (a special case is [0xEF, 0xBF, 0xBD], the UTF8 encoding for
    // the U+FFFD special Unicode character, which is also the value of
    // Runeerror).  We keep those bytes unchanged.
    word->append(curr, num_bytes);
  }
}

}  // namespace

// Reads the tokenizer configuration from |context|: the boolean parameter
// "lang_id_lowercase_input" (default false) controls whether Tokenize()
// lowercases the characters it copies into tokens.
void TokenizerForLangId::Setup(TaskContext *context) {
  lowercase_input_ = context->Get("lang_id_lowercase_input", false);
}

// Splits |text| into tokens and appends them to *sentence.
//
// Only the prefix of |text| reported as safe by
// utils::GetSafeEndOfUtf8String() is processed.  Tokens are maximal runs of
// characters that are not token separators (see IsTokenSeparator()); each
// token is stored wrapped between a '^' start marker and a '$' end marker.
// Characters are lowercased first if lowercase_input_ is set.
void TokenizerForLangId::Tokenize(StringPiece text,
                                  LightSentence *sentence) const {
  const char *const start = text.data();
  const char *curr = start;
  const char *end = utils::GetSafeEndOfUtf8String(start, text.size());

  // Corner case: the safe part of the text is empty ("").
  if (curr >= end) {
    return;
  }

  // Number of bytes for UTF8 character starting at *curr.  Note: the loop below
  // is guaranteed to terminate because in each iteration, we move curr by at
  // least num_bytes, and num_bytes is guaranteed to be > 0.
  int num_bytes = utils::OneCharLen(curr);
  while (curr < end) {
    // Jump over consecutive token separators.
    while (IsTokenSeparator(num_bytes, curr)) {
      curr += num_bytes;
      if (curr >= end) {
        return;
      }
      num_bytes = utils::OneCharLen(curr);
    }

    // If control reaches this point, we are at beginning of a non-empty token.
    sentence->emplace_back();
    string *word = &(sentence->back());

    // Add special token-start character.
    word->push_back('^');

    // Add UTF8 characters to word, until we hit the end of the safe text or a
    // token separator.
    while (true) {
      if (lowercase_input_) {
        AppendLowerCase(curr, num_bytes, word);
      } else {
        word->append(curr, num_bytes);
      }
      curr += num_bytes;
      if (curr >= end) {
        break;
      }
      num_bytes = utils::OneCharLen(curr);
      if (IsTokenSeparator(num_bytes, curr)) {
        // Consume the separator that ends this token; any further separators
        // are skipped at the top of the outer loop.
        curr += num_bytes;
        if (curr >= end) {
          break;
        }
        num_bytes = utils::OneCharLen(curr);
        break;
      }
    }

    // Add special token-end character.
    word->push_back('$');
  }
}

}  // namespace lang_id
}  // namespace mobile
}  // namespace libtextclassifier3