/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// This is a lexer that runs off the tokenizer and outputs the tokens to a
// grammar matcher. The tokens it forwards are the same as the ones produced
// by the tokenizer, but possibly further split and normalized (downcased).
// Examples:
//
//    - single character tokens for punctuation (e.g., AddTerminal("?"))
//
//    - a string of letters (e.g., "Foo" -- it calls AddTerminal() on "foo")
//
//    - a string of digits (e.g., AddTerminal("37"))
//
// In addition to the terminal tokens above, it also outputs certain
// special nonterminals:
//
//    - a <token> nonterminal, which it outputs in addition to the
//      regular AddTerminal() call for every token
//
//    - a <digits> nonterminal, which it outputs in addition to
//      the regular AddTerminal() call for each string of digits
//
//    - <N_digits> nonterminals, where N is the length of the string of
//      digits. By default the maximum N that will be output is 20. This
//      may be changed at compile time by kMaxNDigitsLength. For instance,
//      "123" will produce a <3_digits> nonterminal, "1234567" will produce
//      a <7_digits> nonterminal.
//
// It does not output any whitespace. Instead, whitespace gets absorbed into
// the token that follows it in the text.
45 // For example, if the text contains: 46 // 47 // ...hello there world... 48 // | | | 49 // offset=16 39 52 50 // 51 // then the output will be: 52 // 53 // "hello" [?, 16) 54 // "there" [16, 44) <-- note "16" NOT "39" 55 // "world" [44, ?) <-- note "44" NOT "52" 56 // 57 // This makes it appear to the Matcher as if the tokens are adjacent -- so 58 // whitespace is simply ignored. 59 // 60 // A minor optimization: We don't bother to output nonterminals if the grammar 61 // rules don't reference them. 62 63 #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_LEXER_H_ 64 #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_LEXER_H_ 65 66 #include "annotator/types.h" 67 #include "utils/grammar/matcher.h" 68 #include "utils/grammar/rules_generated.h" 69 #include "utils/grammar/types.h" 70 #include "utils/strings/stringpiece.h" 71 #include "utils/utf8/unicodetext.h" 72 #include "utils/utf8/unilib.h" 73 74 namespace libtextclassifier3::grammar { 75 76 class Lexer { 77 public: 78 explicit Lexer(const UniLib* unilib, const RulesSet* rules); 79 80 // Processes a tokenized text. Classifies the tokens and feeds them to the 81 // matcher. 82 // The provided annotations will be fed to the matcher alongside the tokens. 83 // NOTE: The `annotations` need to outlive any dependent processing. 84 void Process(const UnicodeText& text, const std::vector<Token>& tokens, 85 const std::vector<AnnotatedSpan>* annotations, 86 Matcher* matcher) const; 87 void Process(const UnicodeText& text, 88 const std::vector<Token>::const_iterator& begin, 89 const std::vector<Token>::const_iterator& end, 90 const std::vector<AnnotatedSpan>* annotations, 91 Matcher* matcher) const; 92 93 private: 94 // A lexical symbol with an identified meaning that represents raw tokens, 95 // token categories or predefined text matches. 96 // It is the unit fed to the grammar matcher. 97 struct Symbol { 98 // The type of the lexical symbol. 99 enum class Type { 100 // A raw token. 
101 TYPE_TERM, 102 103 // A symbol representing a string of digits. 104 TYPE_DIGITS, 105 106 // Punctuation characters. 107 TYPE_PUNCTUATION, 108 109 // A predefined match. 110 TYPE_MATCH 111 }; 112 113 explicit Symbol() = default; 114 115 // Constructs a symbol of a given type with an anchor in the text. SymbolSymbol116 Symbol(const Type type, const CodepointSpan codepoint_span, 117 const int match_offset, StringPiece lexeme) 118 : type(type), 119 codepoint_span(codepoint_span), 120 match_offset(match_offset), 121 lexeme(lexeme) {} 122 123 // Constructs a symbol from a pre-defined match. SymbolSymbol124 explicit Symbol(Match* match) 125 : type(Type::TYPE_MATCH), 126 codepoint_span(match->codepoint_span), 127 match_offset(match->match_offset), 128 match(match) {} 129 130 // The type of the symbole. 131 Type type; 132 133 // The span in the text as codepoint offsets. 134 CodepointSpan codepoint_span; 135 136 // The match start offset (including preceding whitespace) as codepoint 137 // offset. 138 int match_offset; 139 140 // The symbol text value. 141 StringPiece lexeme; 142 143 // The predefined match. 144 Match* match; 145 }; 146 147 // Processes a single token: the token is split and classified into symbols. 148 void ProcessToken(const StringPiece value, const int prev_token_end, 149 const CodepointSpan codepoint_span, 150 std::vector<Symbol>* symbols) const; 151 152 // Emits a token to the matcher. 153 void Emit(const Symbol& symbol, const RulesSet_::Nonterminals* nonterms, 154 Matcher* matcher) const; 155 156 // Gets the type of a character. 157 Symbol::Type GetSymbolType(const UnicodeText::const_iterator& it) const; 158 159 private: 160 struct RegexAnnotator { 161 std::unique_ptr<UniLib::RegexPattern> pattern; 162 Nonterm nonterm; 163 }; 164 165 // Uncompress and build the defined regex annotators. 
166 std::vector<RegexAnnotator> BuildRegexAnnotator(const UniLib& unilib, 167 const RulesSet* rules) const; 168 169 const UniLib& unilib_; 170 const RulesSet* rules_; 171 std::vector<RegexAnnotator> regex_annotators_; 172 }; 173 174 } // namespace libtextclassifier3::grammar 175 176 #endif // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_LEXER_H_ 177