//
// Copyright (C) 2018 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

include "utils/grammar/semantics/expression.fbs";
include "utils/i18n/language-tag.fbs";
include "utils/zlib/buffer.fbs";

// The terminal rules map, stored as a sorted strings table.
// The sorted terminal strings table is represented as offsets into the
// global strings pool, which saves memory across localized rules sets.
namespace libtextclassifier3.grammar.RulesSet_.Rules_;
table TerminalRulesMap {
  // The offsets into the terminals pool.
  terminal_offsets:[uint];

  // The lhs set associated with a terminal rule.
  // This is an offset into the (deduplicated) global `lhs_set` vector.
  // NOTE(review): presumably parallel to `terminal_offsets` (entry i
  // belongs to terminal i) -- confirm against the lookup code.
  lhs_set_index:[uint];

  // Lower bound on the lengths of the terminal strings, used for a quick
  // early lookup abort.
  min_terminal_length:int;

  // Upper bound on the lengths of the terminal strings, used for a quick
  // early lookup abort.
  max_terminal_length:int;
}

// One key, value pair entry in the unary rules map.
// The key is a nonterminal and the value is the index of the lhs set in the
// (deduplicated) global `lhs_set` vector.
namespace libtextclassifier3.grammar.RulesSet_.Rules_;
struct UnaryRulesEntry {
  // The nonterminal. Declared as the FlatBuffers `key` of the entry.
  key:uint (key);

  // The lhs set index.
  value:uint;
}

// One key, value pair entry in the binary rules hash map.
// The key is a pair of nonterminals and the value the index of the lhs set.
namespace libtextclassifier3.grammar.RulesSet_.Rules_;
struct BinaryRule {
  // The two rhs nonterminals.
  rhs_first:uint;

  rhs_second:uint;

  // The lhs set associated with this binary rule.
  // This is an offset into the (deduplicated) global `lhs_set` vector.
  lhs_set_index:uint;
}

// One bucket in the binary rule hash map that contains all entries for a
// given hash value.
namespace libtextclassifier3.grammar.RulesSet_.Rules_;
table BinaryRuleTableBucket {
  rules:[BinaryRule];
}

// The terminal, unary and binary rules that apply to a set of locales.
namespace libtextclassifier3.grammar.RulesSet_;
table Rules {
  // The locale this rule set applies to.
  locale:[LanguageTag];

  // The terminal rules maps.
  // NOTE(review): `lowercase_terminal_rules` presumably holds the terminals
  // that are matched case-insensitively -- confirm against the matcher.
  terminal_rules:Rules_.TerminalRulesMap;
  lowercase_terminal_rules:Rules_.TerminalRulesMap;

  // The unary rules map.
  // This is a map from a nonterminal to an lhs set index into the
  // (deduplicated) global `lhs_set` vector.
  unary_rules:[Rules_.UnaryRulesEntry];

  // The binary rules (hash) map.
  // This is a map from nonterminal pair to an lhs set index into the
  // (deduplicated) global `lhs_set` vector.
  binary_rules:[Rules_.BinaryRuleTableBucket];
}

// A set of lhs nonterminals associated with a rule match.
// Most commonly, that is just the id of the lhs nonterminal of the rule that
// is triggered, in this case `lhs` is set to the id of the nonterminal.
// If a callback needs to be triggered, lhs is the (negated) index into the
// `lhs` vector of `RulesSet` that specifies, in addition to the nonterminal,
// also the callback and parameter to call.
namespace libtextclassifier3.grammar.RulesSet_;
table LhsSet {
  lhs:[int];
}

// A lhs nonterminal together with the callback to trigger on a match.
// Entries are stored in the `lhs` vector of `RulesSet`.
namespace libtextclassifier3.grammar.RulesSet_;
struct Lhs {
  // The lhs nonterminal.
  nonterminal:uint;

  // The id of the callback to trigger.
  callback_id:uint;

  // A parameter to pass when invoking the callback.
  callback_param:ulong;

  // The maximum amount of whitespace allowed between the two parts.
  // A value of -1 allows for unbounded whitespace.
  max_whitespace_gap:byte;
}

// One key, value pair entry in the annotation nonterminals map.
// The key is the annotation/collection name and the value is the
// nonterminal id.
namespace libtextclassifier3.grammar.RulesSet_.Nonterminals_;
table AnnotationNtEntry {
  key:string (key, shared);
  value:int;
}

// Usage of pre-defined non-terminals that the lexer can generate if used by
// the grammar.
namespace libtextclassifier3.grammar.RulesSet_;
table Nonterminals {
  // Id of the nonterminal indicating the start of input.
  start_nt:int;

  // Id of the nonterminal indicating the end of input.
  end_nt:int;

  // Id of the nonterminal indicating a token.
  token_nt:int;

  // Id of the nonterminal indicating a string of digits.
  digits_nt:int;

  // `n_digits_nt[k]` is the id of the nonterminal indicating a string of
  // `k` digits.
  n_digits_nt:[int];

  // Id of the nonterminal indicating a word or token boundary.
  wordbreak_nt:int;

  // Id of the nonterminal indicating an uppercase token.
  uppercase_token_nt:int;

  // Predefined nonterminals for annotations.
  // Maps annotation/collection names to non-terminal ids.
  annotation_nt:[Nonterminals_.AnnotationNtEntry];
}

// One key, value pair entry in the nonterminal names map.
// NOTE(review): the key is presumably the nonterminal id and the value its
// name -- confirm against the code that fills the debug information.
namespace libtextclassifier3.grammar.RulesSet_.DebugInformation_;
table NonterminalNamesEntry {
  key:int (key);
  value:string (shared);
}

// Debug information for e.g. printing parse trees and showing match
// information.
namespace libtextclassifier3.grammar.RulesSet_;
table DebugInformation {
  nonterminal_names:[DebugInformation_.NonterminalNamesEntry];
}

// Regex annotators.
namespace libtextclassifier3.grammar.RulesSet_;
table RegexAnnotator {
  // The pattern to run.
  pattern:string (shared);

  // NOTE(review): presumably the same pattern in compressed form, used as
  // an alternative to `pattern` -- confirm against the loading code.
  compressed_pattern:CompressedBuffer;

  // The nonterminal to trigger.
  nonterminal:uint;
}

// Context free grammar rules representation.
// Rules are represented in (mostly) Chomsky Normal Form, where all rules are
// of the following form, either:
// * <nonterm> ::= term
// * <nonterm> ::= <nonterm>
// * <nonterm> ::= <nonterm> <nonterm>
// The `terminal_rules`, `unary_rules` and `binary_rules` maps in
// `RulesSet_.Rules` represent these sets of rules.
namespace libtextclassifier3.grammar;
table RulesSet {
  rules:[RulesSet_.Rules];
  lhs_set:[RulesSet_.LhsSet];
  lhs:[RulesSet_.Lhs];

  // Terminals string pool.
  // The strings are zero-byte delimited and offset indexed by
  // `terminal_offsets` in the terminals rules map.
  terminals:string (shared);

  nonterminals:RulesSet_.Nonterminals;

  // Deprecated field. It must stay declared at this position so that the
  // ids of the fields that follow do not change.
  reserved_6:int16 (deprecated);

  debug_information:RulesSet_.DebugInformation;
  regex_annotator:[RulesSet_.RegexAnnotator];

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool;

  // The semantic expressions associated with rule matches.
  semantic_expression:[SemanticExpression];

  // The schema defining the semantic results.
  semantic_values_schema:[ubyte];
}
