//
// Copyright (C) 2018 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

include "utils/grammar/semantics/expression.fbs";
include "utils/i18n/language-tag.fbs";
include "utils/zlib/buffer.fbs";

// The terminal rules map, stored as a sorted strings table.
// The sorted terminal strings table is represented as offsets into the
// global strings pool, which allows saving memory between localized
// rules sets.
namespace libtextclassifier3.grammar.RulesSet_.Rules_;
table TerminalRulesMap {
  // The offsets into the terminals pool.
  terminal_offsets:[uint];

  // The lhs set associated with a terminal rule.
  // This is an offset into the (deduplicated) global `lhs_set` vector.
  lhs_set_index:[uint];

  // Bounds on the lengths of the terminal strings, for a quick early
  // lookup abort.
  min_terminal_length:int;

  max_terminal_length:int;
}

// One key, value pair entry in the unary rules map.
// The `unary_rules` field of `Rules` describes this map as going from a
// nonterminal (`key`) to an lhs set index (`value`) into the (deduplicated)
// global `lhs_set` vector.
namespace libtextclassifier3.grammar.RulesSet_.Rules_;
struct UnaryRulesEntry {
  // Declared as the FlatBuffers `key` field of this entry.
  key:uint (key);

  value:uint;
}

// One key, value pair entry in the binary rules hash map.
// The key is a pair of nonterminals and the value the index of the lhs set.
namespace libtextclassifier3.grammar.RulesSet_.Rules_;
struct BinaryRule {
  // The two rhs nonterminals.
  rhs_first:uint;

  rhs_second:uint;

  // The lhs set associated with this binary rule.
  // This is an offset into the (deduplicated) global `lhs_set` vector.
  lhs_set_index:uint;
}

// One bucket in the binary rule hash map that contains all entries for a
// given hash value.
namespace libtextclassifier3.grammar.RulesSet_.Rules_;
table BinaryRuleTableBucket {
  // The binary rule entries stored in this bucket.
  rules:[BinaryRule];
}

namespace libtextclassifier3.grammar.RulesSet_;
table Rules {
  // The locale this rule set applies to.
  locale:[LanguageTag];

  terminal_rules:Rules_.TerminalRulesMap;
  lowercase_terminal_rules:Rules_.TerminalRulesMap;

  // The unary rules map.
  // This is a map from a nonterminal to an lhs set index into the
  // (deduplicated) global `lhs_set` vector.
  unary_rules:[Rules_.UnaryRulesEntry];

  // The binary rules (hash) map.
  // This is a map from nonterminal pair to an lhs set index into the
  // (deduplicated) global `lhs_set` vector.
  binary_rules:[Rules_.BinaryRuleTableBucket];
}

// A set of lhs nonterminals associated with a rule match.
// Most commonly, that is just the id of the lhs nonterminal of the rule that
// is triggered; in this case `lhs` is set to the id of the nonterminal.
// If a callback needs to be triggered, lhs is the (negated) index into the
// `lhs` vector below that specifies, in addition to the nonterminal, also the
// callback and parameter to call.
namespace libtextclassifier3.grammar.RulesSet_;
table LhsSet {
  lhs:[int];
}

namespace libtextclassifier3.grammar.RulesSet_;
struct Lhs {
  // The lhs nonterminal.
  nonterminal:uint;

  // The id of the callback to trigger.
  callback_id:uint;

  // A parameter to pass when invoking the callback.
  callback_param:ulong;

  // The maximum amount of whitespace allowed between the two parts.
  // A value of -1 allows for unbounded whitespace.
  max_whitespace_gap:byte;
}

// One entry of the `annotation_nt` map in `Nonterminals`, which maps
// annotation/collection names (`key`) to non-terminal ids (`value`).
namespace libtextclassifier3.grammar.RulesSet_.Nonterminals_;
table AnnotationNtEntry {
  key:string (key, shared);
  value:int;
}

// Usage of pre-defined non-terminals that the lexer can generate if used by
// the grammar.
namespace libtextclassifier3.grammar.RulesSet_;
table Nonterminals {
  // Id of the nonterminal indicating the start of input.
  start_nt:int;

  // Id of the nonterminal indicating the end of input.
  end_nt:int;

  // Id of the nonterminal indicating a token.
  token_nt:int;

  // Id of the nonterminal indicating a string of digits.
  digits_nt:int;

  // `n_digits_nt[k]` is the id of the nonterminal indicating a string of
  // `k` digits.
  n_digits_nt:[int];

  // Id of the nonterminal indicating a word or token boundary.
  wordbreak_nt:int;

  // Id of the nonterminal indicating an uppercase token.
  uppercase_token_nt:int;

  // Predefined nonterminals for annotations.
  // Maps annotation/collection names to non-terminal ids.
  annotation_nt:[Nonterminals_.AnnotationNtEntry];
}

// One entry of the `nonterminal_names` map in `DebugInformation`.
// NOTE(review): presumably `key` is a nonterminal id and `value` its name;
// confirm against the code that writes this table.
namespace libtextclassifier3.grammar.RulesSet_.DebugInformation_;
table NonterminalNamesEntry {
  key:int (key);
  value:string (shared);
}

// Debug information for e.g. printing parse trees and showing match
// information.
namespace libtextclassifier3.grammar.RulesSet_;
table DebugInformation {
  nonterminal_names:[DebugInformation_.NonterminalNamesEntry];
}

// Regex annotators.
namespace libtextclassifier3.grammar.RulesSet_;
table RegexAnnotator {
  // The pattern to run.
  pattern:string (shared);

  compressed_pattern:CompressedBuffer;

  // The nonterminal to trigger.
  nonterminal:uint;
}

// Context free grammar rules representation.
// Rules are represented in (mostly) Chomsky Normal Form, where all rules are
// of the following form, either:
// * <nonterm> ::= term
// * <nonterm> ::= <nonterm>
// * <nonterm> ::= <nonterm> <nonterm>
// The `terminals`, `unary_rules` and `binary_rules` maps below represent
// these sets of rules.
namespace libtextclassifier3.grammar;
table RulesSet {
  rules:[RulesSet_.Rules];
  lhs_set:[RulesSet_.LhsSet];
  lhs:[RulesSet_.Lhs];

  // Terminals string pool.
  // The strings are zero-byte delimited and offset indexed by
  // `terminal_offsets` in the terminals rules map.
  terminals:string (shared);

  nonterminals:RulesSet_.Nonterminals;
  reserved_6:int16 (deprecated);
  debug_information:RulesSet_.DebugInformation;
  regex_annotator:[RulesSet_.RegexAnnotator];

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool;

  // The semantic expressions associated with rule matches.
  semantic_expression:[SemanticExpression];

  // The schema defining the semantic results.
  semantic_values_schema:[ubyte];
}