/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// A token matcher based on context-free grammars.
//
// A lexer passes tokens to the matcher by calling AddTerminal() and
// AddMatch() for literal terminal strings and token types, respectively.
// The lexer passes each token along with the [begin, end) position range
// in which it occurs. So for an input string "Groundhog February 2, 2007", the
// lexer would tell the matcher that:
//
//     "Groundhog" occurs at [0, 9)
//     <space> occurs at [9, 10)
//     "February" occurs at [10, 18)
//     <space> occurs at [18, 19)
//     <string_of_digits> occurs at [19, 20)
//     "," occurs at [20, 21)
//     <space> occurs at [21, 22)
//     <string_of_digits> occurs at [22, 26)
//
// Although it is unnecessary for this example grammar, a lexer can
// output multiple tokens for the same input range.
// So our lexer could additionally output:
//     "2" occurs at [19, 20)        // a second token for [19, 20)
//     "2007" occurs at [22, 26)
//     <syllable> occurs at [0, 6)   // overlaps with (Groundhog [0, 9))
//     <syllable> occurs at [6, 9)
// The only constraint on the lexer's output is that it has to pass tokens
// to the matcher in left-to-right order; strictly speaking, their "end"
// positions must be nondecreasing. (This constraint allows a more
// efficient matching algorithm.) The "begin" positions can be in any
// order.
//
// There are two kinds of supported callbacks:
// (1) OUTPUT: Callbacks are the only output mechanism a matcher has. For each
// "top-level" rule in your grammar, like the rule for <date> above -- something
// you're trying to find instances of -- you use a callback which the matcher
// will invoke every time it finds an instance of <date>.
// (2) FILTERS: Callbacks allow you to put extra conditions on when a grammar
// rule applies. In the example grammar, the rule
//
//     <day> ::= <string_of_digits>   // must be between 1 and 31
//
// should only apply for *some* <string_of_digits> tokens, not others.
// By using a filter callback on this rule, you can tell the matcher that
// an instance of the rule's RHS is only *sometimes* considered an
// instance of its LHS. The filter callback will get invoked whenever
// the matcher finds an instance of <string_of_digits>. The callback can
// look at the digits and decide whether they represent a number between
// 1 and 31. If so, the callback calls Matcher::AddMatch() to tell the
// matcher there's a <day> there. If not, the callback simply exits
// without calling AddMatch().
//
// Technically, a FILTER callback can make any number of calls to
// AddMatch() or even AddTerminal(). But the expected usage is to just
// make zero or one call to AddMatch().
OUTPUT callbacks are not expected 75 // to call either of these -- output callbacks are invoked merely as a 76 // side-effect, not in order to decide whether a rule applies or not. 77 // 78 // In the above example, you would probably use three callbacks. Filter 79 // callbacks on the rules for <day> and <year> would check the numeric 80 // value of the <string_of_digits>. An output callback on the rule for 81 // <date> would simply increment the counter of dates found on the page. 82 // 83 // Note that callbacks are attached to rules, not to nonterminals. You 84 // could have two alternative rules for <date> and use a different 85 // callback for each one. 86 87 #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_MATCHER_H_ 88 #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_MATCHER_H_ 89 90 #include <array> 91 #include <functional> 92 #include <vector> 93 94 #include "annotator/types.h" 95 #include "utils/base/arena.h" 96 #include "utils/grammar/callback-delegate.h" 97 #include "utils/grammar/match.h" 98 #include "utils/grammar/rules_generated.h" 99 #include "utils/strings/stringpiece.h" 100 #include "utils/utf8/unilib.h" 101 102 namespace libtextclassifier3::grammar { 103 104 class Matcher { 105 public: Matcher(const UniLib * unilib,const RulesSet * rules,const std::vector<const RulesSet_::Rules * > rules_shards,CallbackDelegate * delegate)106 explicit Matcher(const UniLib* unilib, const RulesSet* rules, 107 const std::vector<const RulesSet_::Rules*> rules_shards, 108 CallbackDelegate* delegate) 109 : state_(STATE_DEFAULT), 110 unilib_(*unilib), 111 arena_(kBlocksize), 112 rules_(rules), 113 rules_shards_(rules_shards), 114 delegate_(delegate) { 115 TC3_CHECK(rules_ != nullptr); 116 Reset(); 117 } Matcher(const UniLib * unilib,const RulesSet * rules,CallbackDelegate * delegate)118 explicit Matcher(const UniLib* unilib, const RulesSet* rules, 119 CallbackDelegate* delegate) 120 : Matcher(unilib, rules, {}, delegate) { 121 rules_shards_.reserve(rules->rules()->size()); 122 
rules_shards_.insert(rules_shards_.end(), rules->rules()->begin(), 123 rules->rules()->end()); 124 } 125 126 // Resets the matcher. 127 void Reset(); 128 129 // Finish the matching. 130 void Finish(); 131 132 // Tells the matcher that the given terminal was found occupying position 133 // range [begin, end) in the input. 134 // The matcher may invoke callback functions before returning, if this 135 // terminal triggers any new matches for rules in the grammar. 136 // Calls to AddTerminal() and AddMatch() must be in left-to-right order, 137 // that is, the sequence of `end` values must be non-decreasing. 138 void AddTerminal(const CodepointSpan codepoint_span, const int match_offset, 139 StringPiece terminal); AddTerminal(const CodepointIndex begin,const CodepointIndex end,StringPiece terminal)140 void AddTerminal(const CodepointIndex begin, const CodepointIndex end, 141 StringPiece terminal) { 142 AddTerminal(CodepointSpan{begin, end}, begin, terminal); 143 } 144 145 // Adds a nonterminal match to the chart. 146 // This can be invoked by the lexer if the lexer needs to add nonterminals to 147 // the chart. 148 void AddMatch(Match* match); 149 150 // Allocates memory from an area for a new match. 151 // The `size` parameter is there to allow subclassing of the match object 152 // with additional fields. AllocateMatch(const size_t size)153 Match* AllocateMatch(const size_t size) { 154 return reinterpret_cast<Match*>(arena_.Alloc(size)); 155 } 156 157 template <typename T> AllocateMatch()158 T* AllocateMatch() { 159 return reinterpret_cast<T*>(arena_.Alloc(sizeof(T))); 160 } 161 162 template <typename T, typename... Args> AllocateAndInitMatch(Args...args)163 T* AllocateAndInitMatch(Args... args) { 164 T* match = AllocateMatch<T>(); 165 match->Init(args...); 166 return match; 167 } 168 169 // Returns the current number of bytes allocated for all match objects. 
ArenaSize()170 size_t ArenaSize() const { return arena_.status().bytes_allocated(); } 171 172 private: 173 static constexpr int kBlocksize = 16 << 10; 174 175 // The state of the matcher. 176 enum State { 177 // The matcher is in the default state. 178 STATE_DEFAULT = 0, 179 180 // The matcher is currently processing queued match items. 181 STATE_PROCESSING = 1, 182 }; 183 State state_; 184 185 // Process matches from lhs set. 186 void ExecuteLhsSet(const CodepointSpan codepoint_span, const int match_offset, 187 const int whitespace_gap, 188 const std::function<void(Match*)>& initializer, 189 const RulesSet_::LhsSet* lhs_set, 190 CallbackDelegate* delegate); 191 192 // Queues a newly created match item. 193 void QueueForProcessing(Match* item); 194 195 // Queues a match item for later post checking of the exclusion condition. 196 // For exclusions we need to check that the `item->excluded_nonterminal` 197 // doesn't match the same span. As we cannot know which matches have already 198 // been added, we queue the item for later post checking - once all matches 199 // up to `item->codepoint_span.second` have been added. 200 void QueueForPostCheck(ExclusionMatch* item); 201 202 // Adds pending items to the chart, possibly generating new matches as a 203 // result. 204 void ProcessPendingSet(); 205 206 // Returns whether the chart contains a match for a given nonterminal. 207 bool ContainsMatch(const Nonterm nonterm, const CodepointSpan& span) const; 208 209 // Checks all pending exclusion matches that their exclusion condition is 210 // fulfilled. 211 void ProcessPendingExclusionMatches(); 212 213 UniLib unilib_; 214 215 // Memory arena for match allocation. 216 UnsafeArena arena_; 217 218 // The end position of the most recent match or terminal, for sanity 219 // checking. 220 int last_end_; 221 222 // Rules. 223 const RulesSet* rules_; 224 225 // The set of items pending to be added to the chart as a singly-linked list. 
226 Match* pending_items_; 227 228 // The set of items pending to be post-checked as a singly-linked list. 229 ExclusionMatch* pending_exclusion_items_; 230 231 // The chart data structure: a hashtable containing all matches, indexed by 232 // their end positions. 233 static constexpr int kChartHashTableNumBuckets = 1 << 8; 234 static constexpr int kChartHashTableBitmask = kChartHashTableNumBuckets - 1; 235 std::array<Match*, kChartHashTableNumBuckets> chart_; 236 237 // The active rule shards. 238 std::vector<const RulesSet_::Rules*> rules_shards_; 239 240 // The callback handler. 241 CallbackDelegate* delegate_; 242 }; 243 244 } // namespace libtextclassifier3::grammar 245 246 #endif // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_MATCHER_H_ 247