1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_NORMALIZER_H_ 18 #define LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_NORMALIZER_H_ 19 20 #include <memory> 21 #include <string> 22 23 #include "utils/sentencepiece/double_array_trie.h" 24 #include "utils/strings/stringpiece.h" 25 26 namespace libtextclassifier3 { 27 28 // Normalizer implements a simple text normalizer with user-defined 29 // string-to-string rules and leftmost longest matching. 30 class SentencePieceNormalizer { 31 public: 32 // charsmap_trie and charsmap_normalized specify the normalization/replacement 33 // string-to-string rules in the following way: 34 // A match in the trie for a string will return the offset in 35 // charsmap_normalized that contains the replacement string. 36 // 37 // add_dummy_prefix: Whether to add dummy whitespace at the beginning of the 38 // text in order to treat "world" in "world" and "hello world" uniformly. 39 // 40 // remove_extra_whitespaces: Whether to remove leading, trailing and duplicate 41 // internal whitespace. 42 // 43 // escape_whitespaces: Whether to replace whitespace with a meta symbol. 44 SentencePieceNormalizer(const DoubleArrayTrie& charsmap_trie, 45 StringPiece charsmap_normalized, 46 bool add_dummy_prefix = true, 47 bool remove_extra_whitespaces = true, 48 bool escape_whitespaces = true) charsmap_trie_(charsmap_trie)49 : charsmap_trie_(charsmap_trie), 50 charsmap_normalized_(charsmap_normalized), 51 add_dummy_prefix_(add_dummy_prefix), 52 remove_extra_whitespaces_(remove_extra_whitespaces), 53 escape_whitespaces_(escape_whitespaces) {} 54 55 // Normalizes a plain utf8 string into an internal representation for 56 // Sentencepiece model. 57 bool Normalize(StringPiece input, std::string* normalized_input) const; 58 59 private: 60 // Normalizes the prefix of `input` and returns the pair of 61 // normalized prefix and the length of the prefix of `input` processed in the 62 // normalization. 63 bool NormalizePrefix(StringPiece input, 64 std::pair<StringPiece, int>* prefix) const; 65 66 // Internal trie for efficient longest prefix string matching. 67 DoubleArrayTrie charsmap_trie_; 68 69 // "\0" delimitered concatenated normalized strings. 70 // the value of `charsmap_trie_` stores offsets into this string. 71 StringPiece charsmap_normalized_; 72 73 const bool add_dummy_prefix_; 74 const bool remove_extra_whitespaces_; 75 const bool escape_whitespaces_; 76 }; 77 78 } // namespace libtextclassifier3 79 80 #endif // LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_NORMALIZER_H_ 81