//
// Copyright (C) 2018 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

include "annotator/entity-data.fbs";
include "annotator/experimental/experimental.fbs";
include "utils/codepoint-range.fbs";
include "utils/container/bit-vector.fbs";
include "utils/flatbuffers/flatbuffers.fbs";
include "utils/grammar/rules.fbs";
include "utils/intents/intent-config.fbs";
include "utils/normalization.fbs";
include "utils/resources.fbs";
include "utils/tokenizer.fbs";
include "utils/zlib/buffer.fbs";

file_identifier "TC2 ";

// The possible model modes; represents a bit field.
namespace libtextclassifier3;
enum ModeFlag : int {
  NONE = 0,
  ANNOTATION = 1,
  CLASSIFICATION = 2,
  ANNOTATION_AND_CLASSIFICATION = 3,
  SELECTION = 4,
  ANNOTATION_AND_SELECTION = 5,
  CLASSIFICATION_AND_SELECTION = 6,
  ALL = 7,
}

// Enum for specifying the annotation usecase.
namespace libtextclassifier3;
enum AnnotationUsecase : int {
  // Results are optimized for Smart{Select,Share,Linkify}.
  ANNOTATION_USECASE_SMART = 0,

  // Results are optimized for using TextClassifier as an infrastructure that
  // annotates as much as possible.
  ANNOTATION_USECASE_RAW = 1,
}

namespace libtextclassifier3;
enum DatetimeExtractorType : int {
  UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0,
  AM = 1,
  PM = 2,
  JANUARY = 3,
  FEBRUARY = 4,
  MARCH = 5,
  APRIL = 6,
  MAY = 7,
  JUNE = 8,
  JULY = 9,
  AUGUST = 10,
  SEPTEMBER = 11,
  OCTOBER = 12,
  NOVEMBER = 13,
  DECEMBER = 14,
  NEXT = 15,
  NEXT_OR_SAME = 16,
  LAST = 17,
  NOW = 18,
  TOMORROW = 19,
  YESTERDAY = 20,
  PAST = 21,
  FUTURE = 22,
  DAY = 23,
  WEEK = 24,
  MONTH = 25,
  YEAR = 26,
  MONDAY = 27,
  TUESDAY = 28,
  WEDNESDAY = 29,
  THURSDAY = 30,
  FRIDAY = 31,
  SATURDAY = 32,
  SUNDAY = 33,
  DAYS = 34,
  WEEKS = 35,
  MONTHS = 36,

  // TODO(zilka): Make the following 3 values singular for consistency.
  HOURS = 37,
  MINUTES = 38,
  SECONDS = 39,

  YEARS = 40,
  DIGITS = 41,
  SIGNEDDIGITS = 42,
  ZERO = 43,
  ONE = 44,
  TWO = 45,
  THREE = 46,
  FOUR = 47,
  FIVE = 48,
  SIX = 49,
  SEVEN = 50,
  EIGHT = 51,
  NINE = 52,
  TEN = 53,
  ELEVEN = 54,
  TWELVE = 55,
  THIRTEEN = 56,
  FOURTEEN = 57,
  FIFTEEN = 58,
  SIXTEEN = 59,
  SEVENTEEN = 60,
  EIGHTEEN = 61,
  NINETEEN = 62,
  TWENTY = 63,
  THIRTY = 64,
  FORTY = 65,
  FIFTY = 66,
  SIXTY = 67,
  SEVENTY = 68,
  EIGHTY = 69,
  NINETY = 70,
  HUNDRED = 71,
  THOUSAND = 72,
  NOON = 73,
  MIDNIGHT = 74,
}

namespace libtextclassifier3;
enum DatetimeGroupType : int {
  GROUP_UNKNOWN = 0,
  GROUP_UNUSED = 1,
  GROUP_YEAR = 2,
  GROUP_MONTH = 3,
  GROUP_DAY = 4,
  GROUP_HOUR = 5,
  GROUP_MINUTE = 6,
  GROUP_SECOND = 7,
  GROUP_AMPM = 8,
  GROUP_RELATIONDISTANCE = 9,
  GROUP_RELATION = 10,
  GROUP_RELATIONTYPE = 11,

  // Dummy groups serve just as an inflator of the selection. E.g. we might
  // want to select more text than was contained in an envelope of all
  // extractor spans.
  GROUP_DUMMY1 = 12,
  GROUP_DUMMY2 = 13,

  GROUP_ABSOLUTETIME = 14,
}

// Options for the model that predicts text selection.
namespace libtextclassifier3;
table SelectionModelOptions {
  // If true, before the selection is returned, the unpaired brackets contained
  // in the predicted selection are stripped from both selection ends.
  // The bracket codepoints are defined in the Unicode standard:
  // http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
  strip_unpaired_brackets:bool = true;

  // Number of hypothetical click positions on either side of the actual click
  // to consider in order to enforce symmetry.
  symmetry_context_size:int;

  // Number of examples to bundle in one batch for inference.
  batch_size:int = 1024;

  // Whether to always classify a suggested selection or only on demand.
  always_classify_suggested_selection:bool = false;
}

// Options for the model that classifies a text selection.
namespace libtextclassifier3;
table ClassificationModelOptions {
  // Limits for phone numbers.
  phone_min_num_digits:int = 7;

  phone_max_num_digits:int = 15;

  // Limits for addresses.
  address_min_num_tokens:int;

  // Maximum number of tokens to attempt a classification (-1 is unlimited).
  max_num_tokens:int = -1;
}

// Options for post-checks, checksums and verification to apply on a match.
namespace libtextclassifier3;
table VerificationOptions {
  verify_luhn_checksum:bool = false;

  // Lua verifier to use.
  // Index of the lua verifier in the model.
  lua_verifier:int = -1;
}

// Behaviour of rule capturing groups.
// This specifies how the text and span of a capturing group, in a regular
// expression or from a capturing match in a grammar rule, should be handled.
namespace libtextclassifier3;
table CapturingGroup {
  // If true, the span of the capturing group will be used to
  // extend the selection.
  extend_selection:bool = true;

  // If set, the text of the capturing group will be used to set a field in
  // the classification result entity data.
  entity_field_path:FlatbufferFieldPath;

  // If set, the flatbuffer entity data will be merged with the
  // classification result entity data.
  serialized_entity_data:string (shared);

  // If set, normalization to apply before text is used in entity data.
  normalization_options:NormalizationOptions;

  entity_data:EntityData;
}

// List of regular expression matchers to check.
namespace libtextclassifier3.RegexModel_;
table Pattern {
  // The name of the collection of a match.
  collection_name:string (shared);

  // The pattern to check.
  pattern:string (shared);

  // The modes for which to apply the patterns.
  enabled_modes:ModeFlag = ALL;

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // If true, will use an approximate matching implementation implemented
  // using Find() instead of the true Match(). This approximate matching will
  // use the first Find() result and then check that it spans the whole input.
  use_approximate_matching:bool = false;

  compressed_pattern:CompressedBuffer;

  // Verification to apply on a match.
  verification_options:VerificationOptions;

  capturing_group:[CapturingGroup];

  // Entity data to set for a match.
  serialized_entity_data:string (shared);

  entity_data:EntityData;
}

namespace libtextclassifier3;
table RegexModel {
  patterns:[RegexModel_.Pattern];

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool = true;

  // Lua scripts for match verification.
  // The verifier can access:
  //   * `context`: The context as a string.
  //   * `match`: The groups of the regex match as an array, each group gives
  //       * `begin`: span start
  //       * `end`: span end
  //       * `text`: the text
  // The verifier is expected to return a boolean, indicating whether the
  // verification succeeded or not.
  lua_verifier:[string];
}

// List of regex patterns.
namespace libtextclassifier3.DatetimeModelPattern_;
table Regex {
  pattern:string (shared);

  // The ith entry specifies the type of the ith capturing group.
  // This is used to decide how the matched content has to be parsed.
  groups:[DatetimeGroupType];

  compressed_pattern:CompressedBuffer;
}

namespace libtextclassifier3;
table DatetimeModelPattern {
  regexes:[DatetimeModelPattern_.Regex];

  // List of locale indices in DatetimeModel that represent the locales that
  // these patterns should be used for. If empty, can be used for all locales.
  locales:[int];

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes for which to apply the patterns.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to apply the patterns.
  // This is a flag field for values of AnnotationUsecase.
  enabled_annotation_usecases:uint = 4294967295;
}

namespace libtextclassifier3;
table DatetimeModelExtractor {
  extractor:DatetimeExtractorType;
  pattern:string (shared);
  locales:[int];
  compressed_pattern:CompressedBuffer;
}

namespace libtextclassifier3;
table DatetimeModel {
  // List of BCP 47 locale strings representing all locales supported by the
  // model. The individual patterns refer back to them using an index.
  locales:[string];

  patterns:[DatetimeModelPattern];
  extractors:[DatetimeModelExtractor];

  // If true, will use the extractors for determining the match location as
  // opposed to using the location where the global pattern matched.
  use_extractors_for_locating:bool = true;

  // List of locale ids whose rules are always run, after the requested
  // ones.
  default_locales:[int];

  // If true, will generate the alternative interpretations for ambiguous
  // datetime expressions.
  generate_alternative_interpretations_when_ambiguous:bool = false;

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool = true;

  // If true, will give only future dates (when the day is not specified).
  prefer_future_for_unspecified_date:bool = false;
}

// Configuration for the tokenizer.
namespace libtextclassifier3;
table GrammarTokenizerOptions {
  tokenization_type:TokenizationType = ICU;

  // If true, white space tokens will be kept when using the icu tokenizer.
  icu_preserve_whitespace_tokens:bool = false;

  // Codepoint ranges that determine what role the different codepoints play
  // during tokenization. The ranges must not overlap.
  tokenization_codepoint_config:[TokenizationCodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to
  // identify stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[CodepointRange];

  // If true, tokens will be also split when the codepoint's script_id changes
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = false;
}

namespace libtextclassifier3.DatetimeModelLibrary_;
table Item {
  key:string (shared);
  value:DatetimeModel;
}

// A set of named DateTime models.
namespace libtextclassifier3;
table DatetimeModelLibrary {
  models:[DatetimeModelLibrary_.Item];
}

// Classification result to instantiate for a rule match.
namespace libtextclassifier3.GrammarModel_;
table RuleClassificationResult {
  // The name of the collection.
  collection_name:string (shared);

  // The score.
  target_classification_score:float = 1;

  // The priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // Behaviour of capturing matches.
  capturing_group:[CapturingGroup];

  // Entity data to set for a match.
  serialized_entity_data:string (shared);

  // Enabled modes.
  enabled_modes:ModeFlag = ALL;

  entity_data:EntityData;
}

// Configuration for grammar based annotators.
namespace libtextclassifier3;
table GrammarModel {
  // The grammar rules.
  rules:grammar.RulesSet;

  // Deprecated. Used only for the old implementation of the grammar model.
  rule_classification_result:[GrammarModel_.RuleClassificationResult];

  // Number of tokens in the context to use for classification and text
  // selection suggestion.
  // A value -1 uses the full context.
  context_left_num_tokens:int;

  context_right_num_tokens:int;

  // Grammar specific tokenizer options.
  tokenizer_options:GrammarTokenizerOptions;

  // The score.
  target_classification_score:float = 1;

  // The priority score used for conflict resolution with the other models.
  priority_score:float = 1;

  // Global enabled modes. Use this instead of
  // `rule_classification_result.enabled_modes`.
  enabled_modes:ModeFlag = ALL;
}

namespace libtextclassifier3.MoneyParsingOptions_;
table QuantitiesNameToExponentEntry {
  key:string (key, shared);
  value:int;
}

namespace libtextclassifier3;
table MoneyParsingOptions {
  // Separators (codepoints) marking decimal or thousand in the money amount.
  separators:[int];

  // Mapping between a quantity string (e.g. "million") and the power of 10
  // it multiplies the amount with (e.g. 6 in case of "million").
  // NOTE: The entries need to be sorted by key since we use LookupByKey.
  quantities_name_to_exponent:[MoneyParsingOptions_.QuantitiesNameToExponentEntry];
}

namespace libtextclassifier3.ModelTriggeringOptions_;
table CollectionToPriorityEntry {
  key:string (key, shared);
  value:float;
}

// Options controlling the output of the Tensorflow Lite models.
namespace libtextclassifier3;
table ModelTriggeringOptions {
  // Lower bound threshold for filtering annotation model outputs.
  min_annotate_confidence:float = 0;

  // The modes for which to enable the models.
  enabled_modes:ModeFlag = ALL;

  // Comma-separated list of locales (BCP 47 tags) that dictionary
  // classification supports.
  dictionary_locales:string (shared);

  // Comma-separated list of locales (BCP 47 tags) that the model supports,
  // that are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  locales:string (shared);

  // Priority score assigned to the "other" class from ML model.
  other_collection_priority_score:float = -1000;

  // Priority score assigned to knowledge engine annotations.
  knowledge_priority_score:float = 0;

  reserved_7:int16 (deprecated);

  // Apply a factor to the priority score for entities that are added to this
  // map. Key: collection type e.g. "address", "phone"..., Value: float number.
  // NOTE: The entries here need to be sorted since we use LookupByKey.
  collection_to_priority:[ModelTriggeringOptions_.CollectionToPriorityEntry];

  // Enabled modes for the knowledge engine model.
  knowledge_enabled_modes:ModeFlag = ALL;

  // Enabled modes for the experimental model.
  experimental_enabled_modes:ModeFlag = ALL;

  // Enabled modes for the installed app model.
  installed_app_enabled_modes:ModeFlag = ALL;
}

// Options controlling the output of the classifier.
namespace libtextclassifier3;
table OutputOptions {
  // Lists of collection names that will be filtered out at the output:
  //   - For annotation, the spans of given collection are simply dropped.
  //   - For classification, the result is mapped to the class "other".
  //   - For selection, the spans of given class are returned as
  //     single-selection.
  filtered_collections_annotation:[string];

  filtered_collections_classification:[string];
  filtered_collections_selection:[string];
}

namespace libtextclassifier3.Model_;
table EmbeddingPruningMask {
  // If true, use pruning mask. In this case, we use mask
  // pruning_mask to determine the mapping of hashed-charactergrams.
  enabled:bool;

  // Packing of the binary pruning mask into uint64 values.
  pruning_mask:[ulong] (force_align: 16);

  // Number of buckets before pruning.
  full_num_buckets:int;

  // Index of row of compressed embedding matrix to which all pruned buckets
  // are mapped.
  pruned_row_bucket_id:int;
}

namespace libtextclassifier3.Model_;
table ConflictResolutionOptions {
  // If true, will prioritize the longest annotation during conflict
  // resolution.
  prioritize_longest_annotation:bool = false;

  // If true, the annotator will perform conflict resolution between the
  // different sub-annotators also in the RAW mode. If false, no conflict
  // resolution will be performed in RAW mode.
  do_conflict_resolution_in_raw_mode:bool = true;
}

namespace libtextclassifier3;
table Model {
  // Comma-separated list of locales supported by the model as BCP 47 tags.
  locales:string (shared);

  version:int;

  // A name for the model that can be used for e.g. logging.
  name:string (shared);

  selection_feature_options:FeatureProcessorOptions;
  classification_feature_options:FeatureProcessorOptions;

  // Tensorflow Lite models.
  selection_model:[ubyte] (force_align: 16);

  classification_model:[ubyte] (force_align: 16);
  embedding_model:[ubyte] (force_align: 16);

  // Options for the different models.
  selection_options:SelectionModelOptions;

  classification_options:ClassificationModelOptions;
  regex_model:RegexModel;
  datetime_model:DatetimeModel;

  // Options controlling the output of the models.
  triggering_options:ModelTriggeringOptions;

  // Global switch that controls if SuggestSelection(), ClassifyText() and
  // Annotate() will run. If a mode is disabled it returns empty/no-op results.
  enabled_modes:ModeFlag = ALL;

  // If true, will snap the selections that consist only of whitespaces to the
  // containing suggested span. Otherwise, no suggestion is proposed, since the
  // selections are not part of any token.
  snap_whitespace_selections:bool = true;

  // Global configuration for the output of SuggestSelection(), ClassifyText()
  // and Annotate().
  output_options:OutputOptions;

  // Configures how Intents should be generated on Android.
  android_intent_options:AndroidIntentFactoryOptions;

  intent_options:IntentFactoryModel;

  // Model resources.
  resources:ResourcePool;

  // Schema data for handling entity data.
  entity_data_schema:[ubyte];

  number_annotator_options:NumberAnnotatorOptions;
  duration_annotator_options:DurationAnnotatorOptions;

  // Comma-separated list of locales (BCP 47 tags) that the model supports,
  // that are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  triggering_locales:string (shared);

  embedding_pruning_mask:Model_.EmbeddingPruningMask;

  reserved_25:int16 (deprecated);

  contact_annotator_options:ContactAnnotatorOptions;
  money_parsing_options:MoneyParsingOptions;
  translate_annotator_options:TranslateAnnotatorOptions;
  grammar_model:GrammarModel;
  conflict_resolution_options:Model_.ConflictResolutionOptions;
  experimental_model:ExperimentalModel;
  pod_ner_model:PodNerModel;
  vocab_model:VocabModel;
  datetime_grammar_model:GrammarModel;
}

// Method for selecting the center token.
namespace libtextclassifier3.FeatureProcessorOptions_;
enum CenterTokenSelectionMethod : int {
  // Invalid option.
  DEFAULT_CENTER_TOKEN_METHOD = 0,

  // Use click indices to determine the center token.
  CENTER_TOKEN_FROM_CLICK = 1,

  // Use selection indices to get a token range, and select the middle of it
  // as the center token.
  CENTER_TOKEN_MIDDLE_OF_SELECTION = 2,
}

// Bounds-sensitive feature extraction configuration.
namespace libtextclassifier3.FeatureProcessorOptions_;
table BoundsSensitiveFeatures {
  // Enables the extraction of bounds-sensitive features, instead of the click
  // context features.
  enabled:bool;

  // The numbers of tokens to extract in specific locations relative to the
  // bounds.
  // Immediately before the span.
  num_tokens_before:int;

  // Inside the span, aligned with the beginning.
  num_tokens_inside_left:int;

  // Inside the span, aligned with the end.
  num_tokens_inside_right:int;

  // Immediately after the span.
  num_tokens_after:int;

  // If true, also extracts the tokens of the entire span and adds up their
  // features forming one "token" to include in the extracted features.
  include_inside_bag:bool;

  // If true, includes the selection length (in the number of tokens) as a
  // feature.
  include_inside_length:bool;

  // If true, for selection, single token spans are not run through the model
  // and their score is assumed to be zero.
  score_single_token_spans_as_zero:bool;
}

namespace libtextclassifier3;
table FeatureProcessorOptions {
  // Number of buckets used for hashing charactergrams.
  num_buckets:int = -1;

  // Size of the embedding.
  embedding_size:int = -1;

  // Number of bits for quantization for embeddings.
  embedding_quantization_bits:int = 8;

  // Context size defines the number of words to the left and to the right of
  // the selected word to be used as context. For example, if context size is
  // N, then we take N words to the left and N words to the right of the
  // selected word as its context.
  context_size:int = -1;

  // Maximum number of words of the context to select in total.
  max_selection_span:int = -1;

  // Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
  // character trigrams etc.
  chargram_orders:[int];

  // Maximum length of a word, in codepoints.
  max_word_length:int = 20;

  // If true, will use the unicode-aware functionality for extracting features.
  unicode_aware_features:bool = false;

  // Whether to extract the token case feature.
  extract_case_feature:bool = false;

  // Whether to extract the selection mask feature.
  extract_selection_mask_feature:bool = false;

  // List of regexps to run over each token. For each regexp, if there is a
  // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
  regexp_feature:[string];

  // Whether to remap all digits to a single number.
  remap_digits:bool = false;

  // Whether to lower-case each token before generating hashgrams.
  lowercase_tokens:bool;

  // If true, the selection classifier output will contain only the selections
  // that are feasible (e.g., those that are shorter than max_selection_span),
  // if false, the output will be a complete cross-product of possible
  // selections to the left and possible selections to the right, including the
  // infeasible ones.
  // NOTE: Exists mainly for compatibility with older models that were trained
  // with the non-reduced output space.
  selection_reduced_output_space:bool = true;

  // Collection names.
  collections:[string];

  // An index of collection in collections to be used if a collection name
  // can't be mapped to an id.
  default_collection:int = -1;

  // If true, will split the input by lines, and only use the line that
  // contains the clicked token.
  only_use_line_with_click:bool = false;

  // If true, will split tokens that contain the selection boundary, at the
  // position of the boundary.
  // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
  split_tokens_on_selection_boundaries:bool = false;

  // Codepoint ranges that determine how different codepoints are tokenized.
  // The ranges must not overlap.
  tokenization_codepoint_config:[TokenizationCodepointRange];

  center_token_selection_method:FeatureProcessorOptions_.CenterTokenSelectionMethod;

  // If true, span boundaries will be snapped to containing tokens and not
  // required to exactly match token boundaries.
  snap_label_span_boundaries_to_containing_tokens:bool;

  // A set of codepoint ranges supported by the model.
  supported_codepoint_ranges:[CodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to
  // identify stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[CodepointRange];

  // Minimum ratio of supported codepoints in the input context. If the ratio
  // is lower than this, the feature computation will fail.
  min_supported_codepoint_ratio:float = 0;

  // Used for versioning the format of features the model expects.
  //  - feature_version == 0:
  //      For each token the features consist of:
  //       - chargram embeddings
  //       - dense features
  //      Chargram embeddings for tokens are concatenated first together,
  //      and at the end, the dense features for the tokens are concatenated
  //      to it. So the resulting feature vector has two regions.
  feature_version:int = 0;

  tokenization_type:TokenizationType = INTERNAL_TOKENIZER;
  icu_preserve_whitespace_tokens:bool = false;

  // List of codepoints that will be stripped from beginning and end of
  // predicted spans.
  ignored_span_boundary_codepoints:[int];

  bounds_sensitive_features:FeatureProcessorOptions_.BoundsSensitiveFeatures;

  // List of allowed charactergrams. The extracted charactergrams are filtered
  // using this list, and charactergrams that are not present are interpreted
  // as out-of-vocabulary.
  // If no allowed_chargrams are specified, all charactergrams are allowed.
  // The field is typed as bytes type to allow non-UTF8 chargrams.
  allowed_chargrams:[string];

  // If true, tokens will be also split when the codepoint's script_id changes
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = false;

  // If true, the pipe character '|' will be used as a newline character when
  // splitting lines.
  use_pipe_character_for_newline:bool = true;
}

namespace libtextclassifier3;
table NumberAnnotatorOptions {
  // If true, number and percentage annotations will be produced.
  enabled:bool = false;

  // Score to assign to the annotated numbers and percentages in the annotator.
  score:float = 1;

  // Number priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes in which to enable number and percentage annotations.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to produce number annotations.
  // This is a flag field for values of AnnotationUsecase.
  enabled_annotation_usecases:uint = 4294967295;

  // [Deprecated] A list of codepoints that can form a prefix of a valid
  // number.
  allowed_prefix_codepoints:[int];

  // [Deprecated] A list of codepoints that can form a suffix of a valid
  // number.
  allowed_suffix_codepoints:[int];

  // [Deprecated] List of codepoints that will be stripped from beginning of
  // predicted spans.
  ignored_prefix_span_boundary_codepoints:[int];

  // [Deprecated] List of codepoints that will be stripped from end of
  // predicted spans.
  ignored_suffix_span_boundary_codepoints:[int];

  // [Deprecated] If true, percent annotations will be produced.
  enable_percentage:bool = false;

  // Zero separated and ordered list of suffixes that mark a percent.
  percentage_pieces_string:string (shared);

  // [Deprecated] List of suffixes offsets in the percent_pieces_string string.
  percentage_pieces_offsets:[int];

  // Priority score for the percentage annotation.
  percentage_priority_score:float = 1;

  // Float number priority score used for conflict resolution with the other
  // models.
  float_number_priority_score:float = 0;

  // The maximum number of digits an annotated number can have. Requirement:
  // the value should be less or equal to 20.
  max_number_of_digits:int = 20;

  // The annotation usecases for which to produce percentage annotations.
  // This is a flag field for values of AnnotationUsecase.
  percentage_annotation_usecases:uint = 2;
}

// DurationAnnotator is so far tailored for English and Japanese only.
namespace libtextclassifier3;
table DurationAnnotatorOptions {
  // If true, duration annotations will be produced.
  enabled:bool = false;

  // Score to assign to the annotated durations from the annotator.
  score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes in which to enable duration annotations.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to produce duration annotations.
  enabled_annotation_usecases:uint = 4294967295;

  // Durations typically look like XX hours and XX minutes etc... The list of
  // strings below enumerate variants of "hours", "minutes", etc. in these
  // expressions. These are verbatim strings that are matched against tokens in
  // the input.
  week_expressions:[string];

  day_expressions:[string];
  hour_expressions:[string];
  minute_expressions:[string];
  second_expressions:[string];

  // List of expressions that don't break a duration expression (can become
  // a part of it) but have no semantic meaning.
  filler_expressions:[string];

  // List of expressions that mean half of a unit of duration (e.g. "half an
  // hour").
  half_expressions:[string];

  // Set of codepoints that can split the Annotator tokens to sub-tokens for
  // sub-token matching.
  sub_token_separator_codepoints:[int];

  // If this is true, unit must be associated with quantity. For example, a
  // phrase "minute" is not parsed as one minute duration if this is true.
  require_quantity:bool;

  // If this is true, dangling quantity is included in the annotation. For
  // example, "10 minutes 20" is interpreted as 10 minutes and 20 seconds.
  enable_dangling_quantity_interpretation:bool = true;
}

namespace libtextclassifier3;
table ContactAnnotatorOptions {
  // Supported for English genitives only so far.
  enable_declension:bool;

  // For each language there is a customized list of supported declensions.
  language:string (shared);

  // Enabled modes.
  enabled_modes:ModeFlag = ALL;
}

namespace libtextclassifier3.TranslateAnnotatorOptions_;
enum Algorithm : int {
  DEFAULT_ALGORITHM = 0,
  BACKOFF = 1,
}

// Backoff is the algorithm shipped with Android Q.
namespace libtextclassifier3.TranslateAnnotatorOptions_;
table BackoffOptions {
  // The minimum size of text to prefer for detection (in codepoints).
  min_text_size:int = 20;

  // For reducing the score when text is less than the preferred size.
  penalize_ratio:float = 1;

  // Original detection score to surrounding text detection score ratios.
  subject_text_score_ratio:float = 0.4;
}

namespace libtextclassifier3;
table TranslateAnnotatorOptions {
  enabled:bool = false;

  // Score to assign to the classification results.
  score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float;

  algorithm:TranslateAnnotatorOptions_.Algorithm;
  backoff_options:TranslateAnnotatorOptions_.BackoffOptions;

  // Enabled modes.
  enabled_modes:ModeFlag = CLASSIFICATION;
}

namespace libtextclassifier3.PodNerModel_;
table Collection {
  // Collection's name (e.g., "location", "person").
  name:string (shared);

  // Priority scores used for conflict resolution with the other annotators
  // when the annotation is made over a single/multi token text.
  single_token_priority_score:float;

  multi_token_priority_score:float;
}

namespace libtextclassifier3.PodNerModel_.Label_;
enum BoiseType : int {
  NONE = 0,
  BEGIN = 1,
  // No label.
  O = 2,
  INTERMEDIATE = 3,
  SINGLE = 4,
  END = 5,
}

namespace libtextclassifier3.PodNerModel_.Label_;
enum MentionType : int {
  UNDEFINED = 0,
  NAM = 1,
  NOM = 2,
}

namespace libtextclassifier3.PodNerModel_;
table Label {
  boise_type:Label_.BoiseType;
  mention_type:Label_.MentionType;
  // Points to the collections array above.
  collection_id:int;
}

namespace libtextclassifier3;
table PodNerModel {
  tflite_model:[ubyte];
  word_piece_vocab:[ubyte];
  lowercase_input:bool = true;

  // Index of mention_logits tensor in the output of the tflite model. Can
  // be found in the textproto output after model is converted to tflite.
  logits_index_in_output_tensor:int = 0;

  // Whether to append a period at the end of an input that doesn't already
  // end in punctuation.
  append_final_period:bool = false;

  // Priority score used for conflict resolution with the other models. Used
  // only if collections_array is empty.
  priority_score:float = 0;

  // Maximum number of wordpieces supported by the model.
  max_num_wordpieces:int = 128;

  // In case of long text (number of wordpieces greater than the max) we use
  // sliding window approach, this determines the number of overlapping
  // wordpieces between two consecutive windows. This overlap enables context
  // for each word NER annotates.
  sliding_window_num_wordpieces_overlap:int = 20;

  reserved_9:int16 (deprecated);

  // The possible labels the ner model can output. If empty the default labels
  // will be used.
  labels:[PodNerModel_.Label];

  // If the ratio of unknown wordpieces in the input text is greater than this
  // maximum, the text won't be annotated.
  max_ratio_unknown_wordpieces:float = 0.1;

  // Possible collections for labeled entities.
  collections:[PodNerModel_.Collection];

  // Minimum word-length and wordpieces-length required for the text to be
  // annotated.
  min_number_of_tokens:int = 1;

  min_number_of_wordpieces:int = 1;

  // Enabled modes.
  enabled_modes:ModeFlag = ALL;
}

namespace libtextclassifier3;
table VocabModel {
  // A trie that stores a list of vocabs that triggers "Define". An id is
  // returned when looking up a vocab from the trie and the id can be used
  // to access more information about that vocab. The marisa trie library
  // requires 8-byte alignment because the first thing in a marisa trie is a
  // 64-bit integer.
  vocab_trie:[ubyte] (force_align: 8);

  // A bit vector that tells if the vocab should trigger "Define" for users of
  // beginner proficiency only. To look up the bit vector, use the id returned
  // by the trie.
  beginner_level:BitVectorData;

  // A sorted list of indices of vocabs that should not trigger "Define" if
  // its leading character is in upper case. The indices are those returned by
  // trie. You may perform binary search to look up an index.
  do_not_trigger_in_upper_case:BitVectorData;

  // Comma-separated list of locales (BCP 47 tags) that the model supports,
  // that are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  triggering_locales:string (shared);

  // The final score to assign to the results of the vocab model.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
1068 priority_score:float = 0; 1069 1070 // Enabled modes. 1071 enabled_modes:ModeFlag = ANNOTATION_AND_CLASSIFICATION; 1072} 1073 1074root_type libtextclassifier3.Model; 1075