1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 // A token based context-free grammar matcher. 18 // 19 // A parser passes token to the matcher: literal terminal strings and token 20 // types. 21 // The parser passes each token along with the [begin, end) position range 22 // in which it occurs. So for an input string "Groundhog February 2, 2007", the 23 // parser would tell the matcher that: 24 // 25 // "Groundhog" occurs at [0, 9) 26 // "February" occurs at [9, 18) 27 // <digits> occurs at [18, 20) 28 // "," occurs at [20, 21) 29 // <digits> occurs at [21, 26) 30 // 31 // Multiple overlapping symbols can be passed. 32 // The only constraint on symbol order is that they have to be passed in 33 // left-to-right order, strictly speaking, their "end" positions must be 34 // nondecreasing. This constraint allows a more efficient matching algorithm. 35 // The "begin" positions can be in any order. 36 37 #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_MATCHER_H_ 38 #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_MATCHER_H_ 39 40 #include <array> 41 #include <functional> 42 #include <vector> 43 44 #include "annotator/types.h" 45 #include "utils/base/arena.h" 46 #include "utils/grammar/parsing/chart.h" 47 #include "utils/grammar/parsing/derivation.h" 48 #include "utils/grammar/parsing/parse-tree.h" 49 #include "utils/grammar/rules_generated.h" 50 #include "utils/strings/stringpiece.h" 51 #include "utils/utf8/unilib.h" 52 53 namespace libtextclassifier3::grammar { 54 55 class Matcher { 56 public: Matcher(const UniLib * unilib,const RulesSet * rules,const std::vector<const RulesSet_::Rules * > rules_shards,UnsafeArena * arena)57 explicit Matcher(const UniLib* unilib, const RulesSet* rules, 58 const std::vector<const RulesSet_::Rules*> rules_shards, 59 UnsafeArena* arena) 60 : unilib_(*unilib), 61 arena_(arena), 62 last_end_(std::numeric_limits<int>().lowest()), 63 rules_(rules), 64 rules_shards_(rules_shards), 65 pending_items_(nullptr), 66 pending_exclusion_items_(nullptr) { 67 TC3_CHECK_NE(rules, nullptr); 68 } 69 Matcher(const UniLib * unilib,const RulesSet * rules,UnsafeArena * arena)70 explicit Matcher(const UniLib* unilib, const RulesSet* rules, 71 UnsafeArena* arena) 72 : Matcher(unilib, rules, {}, arena) { 73 rules_shards_.reserve(rules->rules()->size()); 74 rules_shards_.insert(rules_shards_.end(), rules->rules()->begin(), 75 rules->rules()->end()); 76 } 77 78 // Finish the matching. 79 void Finish(); 80 81 // Tells the matcher that the given terminal was found occupying position 82 // range [begin, end) in the input. 83 // The matcher may invoke callback functions before returning, if this 84 // terminal triggers any new matches for rules in the grammar. 85 // Calls to AddTerminal() and AddParseTree() must be in left-to-right order, 86 // that is, the sequence of `end` values must be non-decreasing. 87 void AddTerminal(const CodepointSpan codepoint_span, const int match_offset, 88 StringPiece terminal); AddTerminal(const CodepointIndex begin,const CodepointIndex end,StringPiece terminal)89 void AddTerminal(const CodepointIndex begin, const CodepointIndex end, 90 StringPiece terminal) { 91 AddTerminal(CodepointSpan{begin, end}, begin, terminal); 92 } 93 94 // Adds predefined parse tree. 95 void AddParseTree(ParseTree* parse_tree); 96 chart()97 const Chart<> chart() const { return chart_; } 98 99 private: 100 // Process matches from lhs set. 101 void ExecuteLhsSet(const CodepointSpan codepoint_span, const int match_offset, 102 const int whitespace_gap, 103 const std::function<void(ParseTree*)>& initializer_fn, 104 const RulesSet_::LhsSet* lhs_set); 105 106 // Queues a newly created match item. 107 void QueueForProcessing(ParseTree* item); 108 109 // Queues a match item for later post checking of the exclusion condition. 110 // For exclusions we need to check that the `item->excluded_nonterminal` 111 // doesn't match the same span. As we cannot know which matches have already 112 // been added, we queue the item for later post checking - once all matches 113 // up to `item->codepoint_span.second` have been added. 114 void QueueForPostCheck(ExclusionNode* item); 115 116 // Adds pending items to the chart, possibly generating new matches as a 117 // result. 118 void ProcessPendingSet(); 119 120 // Checks all pending exclusion matches that their exclusion condition is 121 // fulfilled. 122 void ProcessPendingExclusionMatches(); 123 124 UniLib unilib_; 125 126 // Memory arena for match allocation. 127 UnsafeArena* arena_; 128 129 // The end position of the most recent match or terminal, for sanity 130 // checking. 131 int last_end_; 132 133 // Rules. 134 const RulesSet* rules_; 135 // The active rule shards. 136 std::vector<const RulesSet_::Rules*> rules_shards_; 137 138 // The set of items pending to be added to the chart as a singly-linked list. 139 ParseTree* pending_items_; 140 141 // The set of items pending to be post-checked as a singly-linked list. 142 ExclusionNode* pending_exclusion_items_; 143 144 // The chart data structure: a hashtable containing all matches, indexed by 145 // their end positions. 146 Chart<> chart_; 147 }; 148 149 } // namespace libtextclassifier3::grammar 150 151 #endif // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_MATCHER_H_ 152