• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 // This is a lexer that runs off the tokenizer and outputs the tokens to a
18 // grammar matcher. The tokens it forwards are the same as the ones produced
19 // by the tokenizer, but possibly further split and normalized (downcased).
20 // Examples:
21 //
22 //    - single character tokens for punctuation (e.g., AddTerminal("?"))
23 //
24 //    - a string of letters (e.g., "Foo" -- it calls AddTerminal() on "foo")
25 //
26 //    - a string of digits (e.g., AddTerminal("37"))
27 //
28 // In addition to the terminal tokens above, it also outputs certain
29 // special nonterminals:
30 //
31 //    - a <token> nonterminal, which it outputs in addition to the
32 //      regular AddTerminal() call for every token
33 //
34 //    - a <digits> nonterminal, which it outputs in addition to
35 //      the regular AddTerminal() call for each string of digits
36 //
37 //    - <N_digits> nonterminals, where N is the length of the string of
38 //      digits. By default the maximum N that will be output is 20. This
39 //      may be changed at compile time by kMaxNDigitsLength. For instance,
40 //      "123" will produce a <3_digits> nonterminal, "1234567" will produce
41 //      a <7_digits> nonterminal.
42 //
43 // It does not output any whitespace.  Instead, whitespace gets absorbed into
44 // the token that follows it in the text.
45 // For example, if the text contains:
46 //
47 //      ...hello                       there        world...
48 //              |                      |            |
49 //              offset=16              39           52
50 //
51 // then the output will be:
52 //
53 //      "hello" [?, 16)
54 //      "there" [16, 44)      <-- note "16" NOT "39"
55 //      "world" [44, ?)       <-- note "44" NOT "52"
56 //
57 // This makes it appear to the Matcher as if the tokens are adjacent -- so
58 // whitespace is simply ignored.
59 //
60 // A minor optimization:  We don't bother to output nonterminals if the grammar
61 // rules don't reference them.
62 
63 #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_LEXER_H_
64 #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_LEXER_H_
65 
66 #include "annotator/types.h"
67 #include "utils/grammar/matcher.h"
68 #include "utils/grammar/rules_generated.h"
69 #include "utils/grammar/types.h"
70 #include "utils/strings/stringpiece.h"
71 #include "utils/utf8/unicodetext.h"
72 #include "utils/utf8/unilib.h"
73 
74 namespace libtextclassifier3::grammar {
75 
76 class Lexer {
77  public:
78   explicit Lexer(const UniLib* unilib, const RulesSet* rules);
79 
80   // Processes a tokenized text. Classifies the tokens and feeds them to the
81   // matcher.
82   // The provided annotations will be fed to the matcher alongside the tokens.
83   // NOTE: The `annotations` need to outlive any dependent processing.
84   void Process(const UnicodeText& text, const std::vector<Token>& tokens,
85                const std::vector<AnnotatedSpan>* annotations,
86                Matcher* matcher) const;
87   void Process(const UnicodeText& text,
88                const std::vector<Token>::const_iterator& begin,
89                const std::vector<Token>::const_iterator& end,
90                const std::vector<AnnotatedSpan>* annotations,
91                Matcher* matcher) const;
92 
93  private:
94   // A lexical symbol with an identified meaning that represents raw tokens,
95   // token categories or predefined text matches.
96   // It is the unit fed to the grammar matcher.
97   struct Symbol {
98     // The type of the lexical symbol.
99     enum class Type {
100       // A raw token.
101       TYPE_TERM,
102 
103       // A symbol representing a string of digits.
104       TYPE_DIGITS,
105 
106       // Punctuation characters.
107       TYPE_PUNCTUATION,
108 
109       // A predefined match.
110       TYPE_MATCH
111     };
112 
113     explicit Symbol() = default;
114 
115     // Constructs a symbol of a given type with an anchor in the text.
SymbolSymbol116     Symbol(const Type type, const CodepointSpan codepoint_span,
117            const int match_offset, StringPiece lexeme)
118         : type(type),
119           codepoint_span(codepoint_span),
120           match_offset(match_offset),
121           lexeme(lexeme) {}
122 
123     // Constructs a symbol from a pre-defined match.
SymbolSymbol124     explicit Symbol(Match* match)
125         : type(Type::TYPE_MATCH),
126           codepoint_span(match->codepoint_span),
127           match_offset(match->match_offset),
128           match(match) {}
129 
130     // The type of the symbole.
131     Type type;
132 
133     // The span in the text as codepoint offsets.
134     CodepointSpan codepoint_span;
135 
136     // The match start offset (including preceding whitespace) as codepoint
137     // offset.
138     int match_offset;
139 
140     // The symbol text value.
141     StringPiece lexeme;
142 
143     // The predefined match.
144     Match* match;
145   };
146 
147   // Processes a single token: the token is split and classified into symbols.
148   void ProcessToken(const StringPiece value, const int prev_token_end,
149                     const CodepointSpan codepoint_span,
150                     std::vector<Symbol>* symbols) const;
151 
152   // Emits a token to the matcher.
153   void Emit(const Symbol& symbol, const RulesSet_::Nonterminals* nonterms,
154             Matcher* matcher) const;
155 
156   // Gets the type of a character.
157   Symbol::Type GetSymbolType(const UnicodeText::const_iterator& it) const;
158 
159  private:
160   struct RegexAnnotator {
161     std::unique_ptr<UniLib::RegexPattern> pattern;
162     Nonterm nonterm;
163   };
164 
165   // Uncompress and build the defined regex annotators.
166   std::vector<RegexAnnotator> BuildRegexAnnotator(const UniLib& unilib,
167                                                   const RulesSet* rules) const;
168 
169   const UniLib& unilib_;
170   const RulesSet* rules_;
171   std::vector<RegexAnnotator> regex_annotators_;
172 };
173 
174 }  // namespace libtextclassifier3::grammar
175 
176 #endif  // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_LEXER_H_
177