• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// A token matcher based on context-free grammars.
//
// For example, a grammar for dates of the form "February 2, 2007" could be:
//
//     <date> ::= <month> <space> <day> "," <space> <year>
//     <day>  ::= <string_of_digits>     // must be between 1 and 31
//     <year> ::= <string_of_digits>
//
// A lexer passes tokens to the matcher: literal terminal strings and token
// types. It passes tokens to the matcher by calling AddTerminal() and
// AddMatch() for literal terminals and token types, respectively.
// The lexer passes each token along with the [begin, end) position range
// in which it occurs.  So for an input string "Groundhog February 2, 2007", the
// lexer would tell the matcher that:
//
//     "Groundhog" occurs at [0, 9)
//     <space> occurs at [9, 10)
//     "February" occurs at [10, 18)
//     <space> occurs at [18, 19)
//     <string_of_digits> occurs at [19, 20)
//     "," occurs at [20, 21)
//     <space> occurs at [21, 22)
//     <string_of_digits> occurs at [22, 26)
//
// The lexer passes tokens to the matcher by calling AddTerminal() and
// AddMatch() for literal terminals and token types, respectively.
//
// Although it is unnecessary for this example grammar, a lexer can
// output multiple tokens for the same input range.  So our lexer could
// additionally output:
//     "2" occurs at [19, 20)        // a second token for [19, 20)
//     "2007" occurs at [22, 26)
//     <syllable> occurs at [0, 6)   // overlaps with (Groundhog [0, 9))
//     <syllable> occurs at [6, 9)
// The only constraint on the lexer's output is that it has to pass tokens
// to the matcher in left-to-right order; strictly speaking, their "end"
// positions must be nondecreasing.  (This constraint allows a more
// efficient matching algorithm.)  The "begin" positions can be in any
// order.
//
// There are two kinds of supported callbacks:
// (1) OUTPUT:  Callbacks are the only output mechanism a matcher has.  For each
// "top-level" rule in your grammar, like the rule for <date> above -- something
// you're trying to find instances of -- you use a callback which the matcher
// will invoke every time it finds an instance of <date>.
// (2) FILTERS:
// Callbacks allow you to put extra conditions on when a grammar rule
// applies.  In the example grammar, the rule
//
//       <day> ::= <string_of_digits>     // must be between 1 and 31
//
// should only apply for *some* <string_of_digits> tokens, not others.
// By using a filter callback on this rule, you can tell the matcher that
// an instance of the rule's RHS is only *sometimes* considered an
// instance of its LHS.  The filter callback will get invoked whenever
// the matcher finds an instance of <string_of_digits>.  The callback can
// look at the digits and decide whether they represent a number between
// 1 and 31.  If so, the callback calls Matcher::AddMatch() to tell the
// matcher there's a <day> there.  If not, the callback simply exits
// without calling AddMatch().
//
// Technically, a FILTER callback can make any number of calls to
// AddMatch() or even AddTerminal().  But the expected usage is to just
// make zero or one call to AddMatch().  OUTPUT callbacks are not expected
// to call either of these -- output callbacks are invoked merely as a
// side-effect, not in order to decide whether a rule applies or not.
//
// In the above example, you would probably use three callbacks.  Filter
// callbacks on the rules for <day> and <year> would check the numeric
// value of the <string_of_digits>.  An output callback on the rule for
// <date> would simply increment the counter of dates found on the page.
//
// Note that callbacks are attached to rules, not to nonterminals.  You
// could have two alternative rules for <date> and use a different
// callback for each one.
86 
87 #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_MATCHER_H_
88 #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_MATCHER_H_
89 
90 #include <array>
91 #include <functional>
92 #include <vector>
93 
94 #include "annotator/types.h"
95 #include "utils/base/arena.h"
96 #include "utils/grammar/callback-delegate.h"
97 #include "utils/grammar/match.h"
98 #include "utils/grammar/rules_generated.h"
99 #include "utils/strings/stringpiece.h"
100 #include "utils/utf8/unilib.h"
101 
102 namespace libtextclassifier3::grammar {
103 
104 class Matcher {
105  public:
Matcher(const UniLib * unilib,const RulesSet * rules,const std::vector<const RulesSet_::Rules * > rules_shards,CallbackDelegate * delegate)106   explicit Matcher(const UniLib* unilib, const RulesSet* rules,
107                    const std::vector<const RulesSet_::Rules*> rules_shards,
108                    CallbackDelegate* delegate)
109       : state_(STATE_DEFAULT),
110         unilib_(*unilib),
111         arena_(kBlocksize),
112         rules_(rules),
113         rules_shards_(rules_shards),
114         delegate_(delegate) {
115     TC3_CHECK(rules_ != nullptr);
116     Reset();
117   }
Matcher(const UniLib * unilib,const RulesSet * rules,CallbackDelegate * delegate)118   explicit Matcher(const UniLib* unilib, const RulesSet* rules,
119                    CallbackDelegate* delegate)
120       : Matcher(unilib, rules, {}, delegate) {
121     rules_shards_.reserve(rules->rules()->size());
122     rules_shards_.insert(rules_shards_.end(), rules->rules()->begin(),
123                          rules->rules()->end());
124   }
125 
126   // Resets the matcher.
127   void Reset();
128 
129   // Finish the matching.
130   void Finish();
131 
132   // Tells the matcher that the given terminal was found occupying position
133   // range [begin, end) in the input.
134   // The matcher may invoke callback functions before returning, if this
135   // terminal triggers any new matches for rules in the grammar.
136   // Calls to AddTerminal() and AddMatch() must be in left-to-right order,
137   // that is, the sequence of `end` values must be non-decreasing.
138   void AddTerminal(const CodepointSpan codepoint_span, const int match_offset,
139                    StringPiece terminal);
AddTerminal(const CodepointIndex begin,const CodepointIndex end,StringPiece terminal)140   void AddTerminal(const CodepointIndex begin, const CodepointIndex end,
141                    StringPiece terminal) {
142     AddTerminal(CodepointSpan{begin, end}, begin, terminal);
143   }
144 
145   // Adds a nonterminal match to the chart.
146   // This can be invoked by the lexer if the lexer needs to add nonterminals to
147   // the chart.
148   void AddMatch(Match* match);
149 
150   // Allocates memory from an area for a new match.
151   // The `size` parameter is there to allow subclassing of the match object
152   // with additional fields.
AllocateMatch(const size_t size)153   Match* AllocateMatch(const size_t size) {
154     return reinterpret_cast<Match*>(arena_.Alloc(size));
155   }
156 
157   template <typename T>
AllocateMatch()158   T* AllocateMatch() {
159     return reinterpret_cast<T*>(arena_.Alloc(sizeof(T)));
160   }
161 
162   template <typename T, typename... Args>
AllocateAndInitMatch(Args...args)163   T* AllocateAndInitMatch(Args... args) {
164     T* match = AllocateMatch<T>();
165     match->Init(args...);
166     return match;
167   }
168 
169   // Returns the current number of bytes allocated for all match objects.
ArenaSize()170   size_t ArenaSize() const { return arena_.status().bytes_allocated(); }
171 
172  private:
173   static constexpr int kBlocksize = 16 << 10;
174 
175   // The state of the matcher.
176   enum State {
177     // The matcher is in the default state.
178     STATE_DEFAULT = 0,
179 
180     // The matcher is currently processing queued match items.
181     STATE_PROCESSING = 1,
182   };
183   State state_;
184 
185   // Process matches from lhs set.
186   void ExecuteLhsSet(const CodepointSpan codepoint_span, const int match_offset,
187                      const int whitespace_gap,
188                      const std::function<void(Match*)>& initializer,
189                      const RulesSet_::LhsSet* lhs_set,
190                      CallbackDelegate* delegate);
191 
192   // Queues a newly created match item.
193   void QueueForProcessing(Match* item);
194 
195   // Queues a match item for later post checking of the exclusion condition.
196   // For exclusions we need to check that the `item->excluded_nonterminal`
197   // doesn't match the same span. As we cannot know which matches have already
198   // been added, we queue the item for later post checking - once all matches
199   // up to `item->codepoint_span.second` have been added.
200   void QueueForPostCheck(ExclusionMatch* item);
201 
202   // Adds pending items to the chart, possibly generating new matches as a
203   // result.
204   void ProcessPendingSet();
205 
206   // Returns whether the chart contains a match for a given nonterminal.
207   bool ContainsMatch(const Nonterm nonterm, const CodepointSpan& span) const;
208 
209   // Checks all pending exclusion matches that their exclusion condition is
210   // fulfilled.
211   void ProcessPendingExclusionMatches();
212 
213   UniLib unilib_;
214 
215   // Memory arena for match allocation.
216   UnsafeArena arena_;
217 
218   // The end position of the most recent match or terminal, for sanity
219   // checking.
220   int last_end_;
221 
222   // Rules.
223   const RulesSet* rules_;
224 
225   // The set of items pending to be added to the chart as a singly-linked list.
226   Match* pending_items_;
227 
228   // The set of items pending to be post-checked as a singly-linked list.
229   ExclusionMatch* pending_exclusion_items_;
230 
231   // The chart data structure: a hashtable containing all matches, indexed by
232   // their end positions.
233   static constexpr int kChartHashTableNumBuckets = 1 << 8;
234   static constexpr int kChartHashTableBitmask = kChartHashTableNumBuckets - 1;
235   std::array<Match*, kChartHashTableNumBuckets> chart_;
236 
237   // The active rule shards.
238   std::vector<const RulesSet_::Rules*> rules_shards_;
239 
240   // The callback handler.
241   CallbackDelegate* delegate_;
242 };
243 
244 }  // namespace libtextclassifier3::grammar
245 
246 #endif  // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_MATCHER_H_
247