//
// Copyright (C) 2018 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

include "utils/codepoint-range.fbs";
include "utils/flatbuffers.fbs";
include "utils/intents/intent-config.fbs";
include "utils/resources.fbs";
include "utils/tokenizer.fbs";
include "utils/zlib/buffer.fbs";

file_identifier "TC2 ";

// The possible model modes, represents a bit field.
namespace libtextclassifier3;
enum ModeFlag : int {
  NONE = 0,
  ANNOTATION = 1,
  CLASSIFICATION = 2,
  ANNOTATION_AND_CLASSIFICATION = 3,
  SELECTION = 4,
  ANNOTATION_AND_SELECTION = 5,
  CLASSIFICATION_AND_SELECTION = 6,
  ALL = 7,
}

// Enum for specifying the annotation usecase.
namespace libtextclassifier3;
enum AnnotationUsecase : int {
  // Results are optimized for Smart{Select,Share,Linkify}.
  ANNOTATION_USECASE_SMART = 0,

  // Results are optimized for using TextClassifier as an infrastructure that
  // annotates as much as possible.
  ANNOTATION_USECASE_RAW = 1,
}

namespace libtextclassifier3;
enum DatetimeExtractorType : int {
  UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0,
  AM = 1,
  PM = 2,
  JANUARY = 3,
  FEBRUARY = 4,
  MARCH = 5,
  APRIL = 6,
  MAY = 7,
  JUNE = 8,
  JULY = 9,
  AUGUST = 10,
  SEPTEMBER = 11,
  OCTOBER = 12,
  NOVEMBER = 13,
  DECEMBER = 14,
  NEXT = 15,
  NEXT_OR_SAME = 16,
  LAST = 17,
  NOW = 18,
  TOMORROW = 19,
  YESTERDAY = 20,
  PAST = 21,
  FUTURE = 22,
  DAY = 23,
  WEEK = 24,
  MONTH = 25,
  YEAR = 26,
  MONDAY = 27,
  TUESDAY = 28,
  WEDNESDAY = 29,
  THURSDAY = 30,
  FRIDAY = 31,
  SATURDAY = 32,
  SUNDAY = 33,
  DAYS = 34,
  WEEKS = 35,
  MONTHS = 36,

  // TODO(zilka): Make the following 3 values singular for consistency.
  HOURS = 37,

  MINUTES = 38,
  SECONDS = 39,
  YEARS = 40,
  DIGITS = 41,
  SIGNEDDIGITS = 42,
  ZERO = 43,
  ONE = 44,
  TWO = 45,
  THREE = 46,
  FOUR = 47,
  FIVE = 48,
  SIX = 49,
  SEVEN = 50,
  EIGHT = 51,
  NINE = 52,
  TEN = 53,
  ELEVEN = 54,
  TWELVE = 55,
  THIRTEEN = 56,
  FOURTEEN = 57,
  FIFTEEN = 58,
  SIXTEEN = 59,
  SEVENTEEN = 60,
  EIGHTEEN = 61,
  NINETEEN = 62,
  TWENTY = 63,
  THIRTY = 64,
  FORTY = 65,
  FIFTY = 66,
  SIXTY = 67,
  SEVENTY = 68,
  EIGHTY = 69,
  NINETY = 70,
  HUNDRED = 71,
  THOUSAND = 72,
}

namespace libtextclassifier3;
enum DatetimeGroupType : int {
  GROUP_UNKNOWN = 0,
  GROUP_UNUSED = 1,
  GROUP_YEAR = 2,
  GROUP_MONTH = 3,
  GROUP_DAY = 4,
  GROUP_HOUR = 5,
  GROUP_MINUTE = 6,
  GROUP_SECOND = 7,
  GROUP_AMPM = 8,
  GROUP_RELATIONDISTANCE = 9,
  GROUP_RELATION = 10,
  GROUP_RELATIONTYPE = 11,

  // Dummy groups serve just as an inflator of the selection. E.g. we might want
  // to select more text than was contained in an envelope of all extractor
  // spans.
  GROUP_DUMMY1 = 12,

  GROUP_DUMMY2 = 13,
}

// Options for the model that predicts text selection.
namespace libtextclassifier3;
table SelectionModelOptions {
  // If true, before the selection is returned, the unpaired brackets contained
  // in the predicted selection are stripped from both selection ends.
  // The bracket codepoints are defined in the Unicode standard:
  // http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
  strip_unpaired_brackets:bool = true;

  // Number of hypothetical click positions on either side of the actual click
  // to consider in order to enforce symmetry.
  symmetry_context_size:int;

  // Number of examples to bundle in one batch for inference.
  batch_size:int = 1024;

  // Whether to always classify a suggested selection or only on demand.
  always_classify_suggested_selection:bool = false;
}

// Options for the model that classifies a text selection.
namespace libtextclassifier3;
table ClassificationModelOptions {
  // Limits for phone numbers.
  phone_min_num_digits:int = 7;

  phone_max_num_digits:int = 15;

  // Limits for addresses.
  address_min_num_tokens:int;

  // Maximum number of tokens to attempt a classification (-1 is unlimited).
  max_num_tokens:int = -1;
}

// Options for post-checks, checksums and verification to apply on a match.
namespace libtextclassifier3;
table VerificationOptions {
  verify_luhn_checksum:bool = false;

  // Lua verifier to use.
  // Index of the lua verifier in the model.
  lua_verifier:int = -1;
}

// Behaviour of capturing groups.
namespace libtextclassifier3.RegexModel_.Pattern_;
table CapturingGroup {
  // If true, the span of the capturing group will be used to
  // extend the selection.
  extend_selection:bool = true;

  // If set, the text of the capturing group will be used to set a field in
  // the classification result entity data.
  entity_field_path:FlatbufferFieldPath;
}

// List of regular expression matchers to check.
namespace libtextclassifier3.RegexModel_;
table Pattern {
  // The name of the collection of a match.
  collection_name:string;

  // The pattern to check.
  pattern:string;

  // The modes for which to apply the patterns.
  enabled_modes:ModeFlag = ALL;

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // If true, will use an approximate matching implementation implemented
  // using Find() instead of the true Match(). This approximate matching will
  // use the first Find() result and then check that it spans the whole input.
  use_approximate_matching:bool = false;

  compressed_pattern:CompressedBuffer;

  // Verification to apply on a match.
  verification_options:VerificationOptions;

  capturing_group:[Pattern_.CapturingGroup];

  // Serialized entity data to set for a match.
  serialized_entity_data:string;
}

namespace libtextclassifier3;
table RegexModel {
  patterns:[RegexModel_.Pattern];

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool = true;

  // Lua scripts for match verification.
  // The verifier can access:
  // * `context`: The context as a string.
  // * `match`: The groups of the regex match as an array, each group gives
  //   * `begin`: span start
  //   * `end`: span end
  //   * `text`: the text
  // The verifier is expected to return a boolean, indicating whether the
  // verification succeeded or not.
  lua_verifier:[string];
}

// List of regex patterns.
namespace libtextclassifier3.DatetimeModelPattern_;
table Regex {
  pattern:string;

  // The ith entry specifies the type of the ith capturing group.
  // This is used to decide how the matched content has to be parsed.
  groups:[DatetimeGroupType];

  compressed_pattern:CompressedBuffer;
}

namespace libtextclassifier3;
table DatetimeModelPattern {
  regexes:[DatetimeModelPattern_.Regex];

  // List of locale indices in DatetimeModel that represent the locales that
  // these patterns should be used for. If empty, can be used for all locales.
  locales:[int];

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes for which to apply the patterns.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to apply the patterns.
  // This is a flag field for values of AnnotationUsecase.
  enabled_annotation_usecases:uint = 4294967295;
}

namespace libtextclassifier3;
table DatetimeModelExtractor {
  extractor:DatetimeExtractorType;
  pattern:string;
  locales:[int];
  compressed_pattern:CompressedBuffer;
}

namespace libtextclassifier3;
table DatetimeModel {
  // List of BCP 47 locale strings representing all locales supported by the
  // model. The individual patterns refer back to them using an index.
  locales:[string];

  patterns:[DatetimeModelPattern];
  extractors:[DatetimeModelExtractor];

  // If true, will use the extractors for determining the match location as
  // opposed to using the location where the global pattern matched.
  use_extractors_for_locating:bool = true;

  // List of locale ids whose rules are always run, after the requested
  // ones.
  default_locales:[int];

  // If true, will generate the alternative interpretations for ambiguous
  // datetime expressions.
  generate_alternative_interpretations_when_ambiguous:bool = false;

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool = true;
}

namespace libtextclassifier3.DatetimeModelLibrary_;
table Item {
  key:string;
  value:DatetimeModel;
}

// A set of named DateTime models.
namespace libtextclassifier3;
table DatetimeModelLibrary {
  models:[DatetimeModelLibrary_.Item];
}

// Options controlling the output of the Tensorflow Lite models.
namespace libtextclassifier3;
table ModelTriggeringOptions {
  // Lower bound threshold for filtering annotation model outputs.
  min_annotate_confidence:float = 0;

  // The modes for which to enable the models.
  enabled_modes:ModeFlag = ALL;

  // Comma-separated list of locales (BCP 47 tags) that dictionary
  // classification supports.
  dictionary_locales:string;

  // Comma-separated list of locales (BCP 47 tags) that the model supports, that
  // are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  locales:string;
}

// Options controlling the output of the classifier.
namespace libtextclassifier3;
table OutputOptions {
  // Lists of collection names that will be filtered out at the output:
  // - For annotation, the spans of given collection are simply dropped.
  // - For classification, the result is mapped to the class "other".
  // - For selection, the spans of given class are returned as
  //   single-selection.
  filtered_collections_annotation:[string];

  filtered_collections_classification:[string];
  filtered_collections_selection:[string];
}

namespace libtextclassifier3.Model_;
table EmbeddingPruningMask {
  // If true, use pruning mask. In this case, we use mask
  // pruning_mask to determine the mapping of hashed-charactergrams.
  enabled:bool;

  // Packing of the binary pruning mask into uint64 values.
  pruning_mask:[ulong] (force_align: 16);

  // Number of buckets before pruning.
  full_num_buckets:int;

  // Index of row of compressed embedding matrix to which all pruned buckets
  // are mapped.
  pruned_row_bucket_id:int;
}

namespace libtextclassifier3;
table Model {
  // Comma-separated list of locales supported by the model as BCP 47 tags.
  locales:string;

  version:int;

  // A name for the model that can be used for e.g. logging.
  name:string;

  selection_feature_options:FeatureProcessorOptions;
  classification_feature_options:FeatureProcessorOptions;

  // Tensorflow Lite models.
  selection_model:[ubyte] (force_align: 16);

  classification_model:[ubyte] (force_align: 16);
  embedding_model:[ubyte] (force_align: 16);

  // Options for the different models.
  selection_options:SelectionModelOptions;

  classification_options:ClassificationModelOptions;
  regex_model:RegexModel;
  datetime_model:DatetimeModel;

  // Options controlling the output of the models.
  triggering_options:ModelTriggeringOptions;

  // Global switch that controls if SuggestSelection(), ClassifyText() and
  // Annotate() will run. If a mode is disabled it returns empty/no-op results.
  enabled_modes:ModeFlag = ALL;

  // If true, will snap the selections that consist only of whitespaces to the
  // containing suggested span. Otherwise, no suggestion is proposed, since the
  // selections are not part of any token.
  snap_whitespace_selections:bool = true;

  // Global configuration for the output of SuggestSelection(), ClassifyText()
  // and Annotate().
  output_options:OutputOptions;

  // Configures how Intents should be generated on Android.
  android_intent_options:AndroidIntentFactoryOptions;

  intent_options:IntentFactoryModel;

  // Model resources.
  resources:ResourcePool;

  // Schema data for handling entity data.
  entity_data_schema:[ubyte];

  number_annotator_options:NumberAnnotatorOptions;
  duration_annotator_options:DurationAnnotatorOptions;

  // Comma-separated list of locales (BCP 47 tags) that the model supports, that
  // are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  triggering_locales:string;

  embedding_pruning_mask:Model_.EmbeddingPruningMask;
}

// Method for selecting the center token.
namespace libtextclassifier3.FeatureProcessorOptions_;
enum CenterTokenSelectionMethod : int {
  DEFAULT_CENTER_TOKEN_METHOD = 0,

  // Use click indices to determine the center token.
  CENTER_TOKEN_FROM_CLICK = 1,

  // Use selection indices to get a token range, and select the middle of it
  // as the center token.
  CENTER_TOKEN_MIDDLE_OF_SELECTION = 2,
}

// Bounds-sensitive feature extraction configuration.
namespace libtextclassifier3.FeatureProcessorOptions_;
table BoundsSensitiveFeatures {
  // Enables the extraction of bounds-sensitive features, instead of the click
  // context features.
  enabled:bool;

  // The numbers of tokens to extract in specific locations relative to the
  // bounds.
  // Immediately before the span.
  num_tokens_before:int;

  // Inside the span, aligned with the beginning.
  num_tokens_inside_left:int;

  // Inside the span, aligned with the end.
  num_tokens_inside_right:int;

  // Immediately after the span.
  num_tokens_after:int;

  // If true, also extracts the tokens of the entire span and adds up their
  // features forming one "token" to include in the extracted features.
  include_inside_bag:bool;

  // If true, includes the selection length (in the number of tokens) as a
  // feature.
  include_inside_length:bool;

  // If true, for selection, single token spans are not run through the model
  // and their score is assumed to be zero.
  score_single_token_spans_as_zero:bool;
}

namespace libtextclassifier3;
table FeatureProcessorOptions {
  // Number of buckets used for hashing charactergrams.
  num_buckets:int = -1;

  // Size of the embedding.
  embedding_size:int = -1;

  // Number of bits for quantization for embeddings.
  embedding_quantization_bits:int = 8;

  // Context size defines the number of words to the left and to the right of
  // the selected word to be used as context. For example, if context size is
  // N, then we take N words to the left and N words to the right of the
  // selected word as its context.
  context_size:int = -1;

  // Maximum number of words of the context to select in total.
  max_selection_span:int = -1;

  // Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
  // character trigrams etc.
  chargram_orders:[int];

  // Maximum length of a word, in codepoints.
  max_word_length:int = 20;

  // If true, will use the unicode-aware functionality for extracting features.
  unicode_aware_features:bool = false;

  // Whether to extract the token case feature.
  extract_case_feature:bool = false;

  // Whether to extract the selection mask feature.
  extract_selection_mask_feature:bool = false;

  // List of regexps to run over each token. For each regexp, if there is a
  // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
  regexp_feature:[string];

  // Whether to remap all digits to a single number.
  remap_digits:bool = false;

  // Whether to lower-case each token before generating hashgrams.
  lowercase_tokens:bool;

  // If true, the selection classifier output will contain only the selections
  // that are feasible (e.g., those that are shorter than max_selection_span),
  // if false, the output will be a complete cross-product of possible
  // selections to the left and possible selections to the right, including the
  // infeasible ones.
  // NOTE: Exists mainly for compatibility with older models that were trained
  // with the non-reduced output space.
  selection_reduced_output_space:bool = true;

  // Collection names.
  collections:[string];

  // An index of collection in collections to be used if a collection name can't
  // be mapped to an id.
  default_collection:int = -1;

  // If true, will split the input by lines, and only use the line that contains
  // the clicked token.
  only_use_line_with_click:bool = false;

  // If true, will split tokens that contain the selection boundary, at the
  // position of the boundary.
  // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
  split_tokens_on_selection_boundaries:bool = false;

  // Codepoint ranges that determine how different codepoints are tokenized.
  // The ranges must not overlap.
  tokenization_codepoint_config:[TokenizationCodepointRange];

  center_token_selection_method:FeatureProcessorOptions_.CenterTokenSelectionMethod;

  // If true, span boundaries will be snapped to containing tokens and not
  // required to exactly match token boundaries.
  snap_label_span_boundaries_to_containing_tokens:bool;

  // A set of codepoint ranges supported by the model.
  supported_codepoint_ranges:[CodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to identify
  // stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[CodepointRange];

  // Minimum ratio of supported codepoints in the input context. If the ratio
  // is lower than this, the feature computation will fail.
  min_supported_codepoint_ratio:float = 0;

  // Used for versioning the format of features the model expects.
  // - feature_version == 0:
  //   For each token the features consist of:
  //    - chargram embeddings
  //    - dense features
  //   Chargram embeddings for tokens are concatenated first together,
  //   and at the end, the dense features for the tokens are concatenated
  //   to it. So the resulting feature vector has two regions.
  feature_version:int = 0;

  tokenization_type:TokenizationType = INTERNAL_TOKENIZER;
  icu_preserve_whitespace_tokens:bool = false;

  // List of codepoints that will be stripped from beginning and end of
  // predicted spans.
  ignored_span_boundary_codepoints:[int];

  bounds_sensitive_features:FeatureProcessorOptions_.BoundsSensitiveFeatures;

  // List of allowed charactergrams. The extracted charactergrams are filtered
  // using this list, and charactergrams that are not present are interpreted as
  // out-of-vocabulary.
  // If no allowed_chargrams are specified, all charactergrams are allowed.
  // The field is typed as bytes type to allow non-UTF8 chargrams.
  allowed_chargrams:[string];

  // If true, tokens will be also split when the codepoint's script_id changes
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = false;
}

namespace libtextclassifier3;
table NumberAnnotatorOptions {
  // If true, number annotations will be produced.
  enabled:bool = false;

  // Score to assign to the annotated numbers from the annotator.
  score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes in which to enable number annotations.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to produce number annotations.
  // This is a flag field for values of AnnotationUsecase.
  enabled_annotation_usecases:uint = 4294967295;

  // A list of codepoints that can form a prefix of a valid number.
  allowed_prefix_codepoints:[int];

  // A list of codepoints that can form a suffix of a valid number.
  allowed_suffix_codepoints:[int];
}

// DurationAnnotator is so far tailored for English only.
namespace libtextclassifier3;
table DurationAnnotatorOptions {
  // If true, duration annotations will be produced.
  enabled:bool = false;

  // Score to assign to the annotated durations from the annotator.
  score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes in which to enable duration annotations.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to produce duration annotations.
  enabled_annotation_usecases:uint = 4294967295;

  // Durations typically look like XX hours and XX minutes etc... The list of
  // strings below enumerate variants of "hours", "minutes", etc. in these
  // expressions. These are verbatim strings that are matched against tokens in
  // the input.
  week_expressions:[string];

  day_expressions:[string];
  hour_expressions:[string];
  minute_expressions:[string];
  second_expressions:[string];

  // List of expressions that don't break a duration expression (they can
  // become a part of it) but have no semantic meaning.
  filler_expressions:[string];

  // List of expressions that mean half of a unit of duration (e.g. "half an
  // hour").
  half_expressions:[string];
}

root_type libtextclassifier3.Model;