• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_NORMALIZER_H_
18 #define LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_NORMALIZER_H_
19 
#include <memory>
#include <string>
#include <utility>

#include "utils/container/double-array-trie.h"
#include "utils/strings/stringpiece.h"
25 
26 namespace libtextclassifier3 {
27 
28 // Normalizer implements a simple text normalizer with user-defined
29 // string-to-string rules and leftmost longest matching.
30 class SentencePieceNormalizer {
31  public:
32   // charsmap_trie and charsmap_normalized specify the normalization/replacement
33   // string-to-string rules in the following way:
34   // A match in the trie for a string will return the offset in
35   // charsmap_normalized that contains the replacement string.
36   //
37   // add_dummy_prefix: Whether to add dummy whitespace at the beginning of the
38   //   text in order to treat "world" in "world" and "hello world" uniformly.
39   //
40   // remove_extra_whitespaces: Whether to remove leading, trailing and duplicate
41   //   internal whitespace.
42   //
43   // escape_whitespaces: Whether to replace whitespace with a meta symbol.
44   SentencePieceNormalizer(const DoubleArrayTrie& charsmap_trie,
45                           StringPiece charsmap_normalized,
46                           bool add_dummy_prefix = true,
47                           bool remove_extra_whitespaces = true,
48                           bool escape_whitespaces = true)
charsmap_trie_(charsmap_trie)49       : charsmap_trie_(charsmap_trie),
50         charsmap_normalized_(charsmap_normalized),
51         add_dummy_prefix_(add_dummy_prefix),
52         remove_extra_whitespaces_(remove_extra_whitespaces),
53         escape_whitespaces_(escape_whitespaces) {}
54 
55   // Normalizes a plain utf8 string into an internal representation for
56   // Sentencepiece model.
57   bool Normalize(StringPiece input, std::string* normalized_input) const;
58 
59  private:
60   // Normalizes the prefix of `input` and returns the pair of
61   // normalized prefix and the length of the prefix of `input` processed in the
62   // normalization.
63   bool NormalizePrefix(StringPiece input,
64                        std::pair<StringPiece, int>* prefix) const;
65 
66   // Internal trie for efficient longest prefix string matching.
67   DoubleArrayTrie charsmap_trie_;
68 
69   // "\0" delimitered concatenated normalized strings.
70   // the value of `charsmap_trie_` stores offsets into this string.
71   StringPiece charsmap_normalized_;
72 
73   const bool add_dummy_prefix_;
74   const bool remove_extra_whitespaces_;
75   const bool escape_whitespaces_;
76 };
77 
78 }  // namespace libtextclassifier3
79 
80 #endif  // LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_NORMALIZER_H_
81