1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 // Utility functions for pre-processing, creating and testing context free 18 // grammars. 19 20 #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_UTILS_RULES_H_ 21 #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_UTILS_RULES_H_ 22 23 #include <unordered_map> 24 #include <vector> 25 26 #include "utils/grammar/types.h" 27 #include "utils/grammar/utils/ir.h" 28 29 namespace libtextclassifier3::grammar { 30 31 // Special nonterminals. 32 constexpr const char* kFiller = "<filler>"; 33 34 // All rules for a grammar will be collected in a rules object. 35 // 36 // Rules r; 37 // r.Add("<date>", {"<monthname>", "<day>", <year>"}); 38 // r.Add("<monthname>", {"January"}); 39 // ... 40 // r.Add("<monthname>", {"December"}); 41 // r.Add("<day>", {"<string_of_digits>"}); 42 // r.Add("<year>", {"<string_of_digits>"}); 43 // 44 // The Add() method adds a rule with a given lhs, rhs/ 45 // The rhs is just a list of terminals and nonterminals. Anything 46 // surrounded in angle brackets is considered a nonterminal. A "?" can follow 47 // any element of the RHS, like this: 48 // 49 // r.Add("<date>", {"<monthname>", "<day>?", ",?", "<year>"}); 50 // 51 // This indicates that the <day> and "," parts of the rhs are optional. 52 // (This is just notational shorthand for adding a bunch of rules.) 53 // 54 // Once you're done adding rules, r.Finalize() lowers the rule set into an 55 // internal representation. 56 class Rules { 57 public: Rules(const LocaleShardMap & locale_shard_map)58 explicit Rules(const LocaleShardMap& locale_shard_map) 59 : locale_shard_map_(locale_shard_map) {} 60 61 // Represents one item in a right-hand side, a single terminal or nonterminal. 62 struct RhsElement { RhsElementRhsElement63 RhsElement() {} RhsElementRhsElement64 explicit RhsElement(const std::string& terminal, const bool is_optional) 65 : is_terminal(true), 66 terminal(terminal), 67 is_optional(is_optional), 68 is_constituent(false) {} 69 explicit RhsElement(const int nonterminal, const bool is_optional, 70 const bool is_constituent = true) is_terminalRhsElement71 : is_terminal(false), 72 nonterminal(nonterminal), 73 is_optional(is_optional), 74 is_constituent(is_constituent) {} 75 bool is_terminal; 76 std::string terminal; 77 int nonterminal; 78 bool is_optional; 79 80 // Whether the element is a constituent of a rule - these are the explicit 81 // nonterminals, but not terminals or implicitly added anchors. 82 bool is_constituent; 83 }; 84 85 // Represents the right-hand side, and possibly callback, of one rule. 86 struct Rule { 87 std::vector<RhsElement> rhs; 88 CallbackId callback = kNoCallback; 89 int64 callback_param = 0; 90 int8 max_whitespace_gap = -1; 91 bool case_sensitive = false; 92 int shard = 0; 93 }; 94 95 struct NontermInfo { 96 // The name of the non-terminal, if defined. 97 std::string name; 98 99 // Whether the nonterminal is provided via an annotation. 100 bool from_annotation = false; 101 102 // Rules that have this non-terminal as the lhs. 103 std::vector<int> rules; 104 105 // Regex rules that have this non-terminal as the lhs. 106 std::vector<int> regex_rules; 107 }; 108 109 // Adds a rule `lhs ::= rhs` with the given callback id and parameter. 110 // Note: Nonterminal names are in angle brackets and cannot contain 111 // whitespace. The `rhs` is a list of components, each of which is either: 112 // * A nonterminal name (in angle brackets) 113 // * A terminal 114 // optionally followed by a `?` which indicates that the component is 115 // optional. The `rhs` must contain at least one non-optional component. 116 void Add(const std::string& lhs, const std::vector<std::string>& rhs, 117 const CallbackId callback = kNoCallback, 118 const int64 callback_param = 0, int8 max_whitespace_gap = -1, 119 bool case_sensitive = false, int shard = 0); 120 121 // Adds a rule `lhs ::= rhs` with the given callback id and parameter. 122 // The `rhs` must contain at least one non-optional component. 123 void Add(int lhs, const std::vector<RhsElement>& rhs, 124 CallbackId callback = kNoCallback, int64 callback_param = 0, 125 int8 max_whitespace_gap = -1, bool case_sensitive = false, 126 int shard = 0); 127 128 // Adds a rule `lhs ::= rhs` with exclusion. 129 // The rule only matches, if `excluded_nonterminal` doesn't match the same 130 // span. 131 void AddWithExclusion(const std::string& lhs, 132 const std::vector<std::string>& rhs, 133 const std::string& excluded_nonterminal, 134 int8 max_whitespace_gap = -1, 135 bool case_sensitive = false, int shard = 0); 136 137 // Adds an assertion callback. 138 void AddAssertion(const std::string& lhs, const std::vector<std::string>& rhs, 139 bool negative = true, int8 max_whitespace_gap = -1, 140 bool case_sensitive = false, int shard = 0); 141 142 // Adds a mapping callback. 143 void AddValueMapping(const std::string& lhs, 144 const std::vector<std::string>& rhs, int64 value, 145 int8 max_whitespace_gap = -1, 146 bool case_sensitive = false, int shard = 0); 147 void AddValueMapping(int lhs, const std::vector<RhsElement>& rhs, int64 value, 148 int8 max_whitespace_gap = -1, 149 bool case_sensitive = false, int shard = 0); 150 151 // Adds a regex rule. 152 void AddRegex(const std::string& lhs, const std::string& regex_pattern); 153 void AddRegex(int lhs, const std::string& regex_pattern); 154 155 // Creates a nonterminal with the given name, if one doesn't already exist. 156 int AddNonterminal(const std::string& nonterminal_name); 157 158 // Creates a new nonterminal. 159 int AddNewNonterminal(); 160 161 // Defines a nonterminal for an externally provided annotation. 162 int AddAnnotation(const std::string& annotation_name); 163 164 // Defines a nonterminal for an externally provided annotation. 165 void BindAnnotation(const std::string& nonterminal_name, 166 const std::string& annotation_name); 167 168 // Adds an alias for a nonterminal. This is a separate name for the same 169 // nonterminal. 170 void AddAlias(const std::string& nonterminal_name, const std::string& alias); 171 172 // Lowers the rule set into the intermediate representation. 173 // Treats nonterminals given by the argument `predefined_nonterminals` as 174 // defined externally. This allows to define rules that are dependent on 175 // non-terminals produced by e.g. existing text annotations and that will be 176 // fed to the matcher by the lexer. 177 Ir Finalize(const std::set<std::string>& predefined_nonterminals = {}) const; 178 nonterminals()179 const std::vector<NontermInfo>& nonterminals() const { return nonterminals_; } rules()180 const std::vector<Rule>& rules() const { return rules_; } 181 182 private: 183 void ExpandOptionals( 184 int lhs, const std::vector<RhsElement>& rhs, CallbackId callback, 185 int64 callback_param, int8 max_whitespace_gap, bool case_sensitive, 186 int shard, std::vector<int>::const_iterator optional_element_indices, 187 std::vector<int>::const_iterator optional_element_indices_end, 188 std::vector<bool>* omit_these); 189 190 // Applies optimizations to the right hand side of a rule. 191 std::vector<RhsElement> OptimizeRhs(const std::vector<RhsElement>& rhs, 192 int shard = 0); 193 194 // Removes start and end anchors in case they are followed (respectively 195 // preceded) by unbounded filler. 196 std::vector<RhsElement> ResolveAnchors( 197 const std::vector<RhsElement>& rhs) const; 198 199 // Rewrites fillers in a rule. 200 // Fillers in a rule such as `lhs ::= <a> <filler> <b>` could be lowered as 201 // <tokens> ::= <token> 202 // <tokens> ::= <tokens> <token> 203 // This has the disadvantage that it will produce a match for each possible 204 // span in the text, which is quadratic in the number of tokens. 205 // It can be more efficiently written as: 206 // `lhs ::= <a_with_tokens> <b>` with 207 // `<a_with_tokens> ::= <a>` 208 // `<a_with_tokens> ::= <a_with_tokens> <token>` 209 // In this each occurrence of `<a>` can start a sequence of tokens. 210 std::vector<RhsElement> ResolveFillers(const std::vector<RhsElement>& rhs, 211 int shard = 0); 212 213 // Checks whether an element denotes a specific nonterminal. 214 bool IsNonterminalOfName(const RhsElement& element, 215 const std::string& nonterminal) const; 216 217 // Checks whether the fillers are used in any active rule. 218 bool UsesFillers() const; 219 220 const LocaleShardMap& locale_shard_map_; 221 222 // Non-terminal to id map. 223 std::unordered_map<std::string, int> nonterminal_names_; 224 std::vector<NontermInfo> nonterminals_; 225 std::unordered_map<std::string, std::string> nonterminal_alias_; 226 std::unordered_map<std::string, int> annotation_nonterminals_; 227 228 // Rules. 229 std::vector<Rule> rules_; 230 std::vector<std::string> regex_rules_; 231 }; 232 233 } // namespace libtextclassifier3::grammar 234 235 #endif // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_UTILS_RULES_H_ 236