//
// Copyright (C) 2018 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

include "annotator/entity-data.fbs";
include "annotator/experimental/experimental.fbs";
include "utils/codepoint-range.fbs";
include "utils/container/bit-vector.fbs";
include "utils/flatbuffers/flatbuffers.fbs";
include "utils/grammar/rules.fbs";
include "utils/intents/intent-config.fbs";
include "utils/normalization.fbs";
include "utils/resources.fbs";
include "utils/tokenizer.fbs";
include "utils/zlib/buffer.fbs";

file_identifier "TC2 ";

// The possible model modes, represents a bit field.
namespace libtextclassifier3;
enum ModeFlag : int {
  NONE = 0,
  ANNOTATION = 1,
  CLASSIFICATION = 2,
  ANNOTATION_AND_CLASSIFICATION = 3,
  SELECTION = 4,
  ANNOTATION_AND_SELECTION = 5,
  CLASSIFICATION_AND_SELECTION = 6,
  ALL = 7,
}

// Enum for specifying the annotation usecase.
namespace libtextclassifier3;
enum AnnotationUsecase : int {
  // Results are optimized for Smart{Select,Share,Linkify}.
  ANNOTATION_USECASE_SMART = 0,
  // Smart{Select,Share,Linkify}

  // Results are optimized for using TextClassifier as an infrastructure that
  // annotates as much as possible.
  ANNOTATION_USECASE_RAW = 1,
}

namespace libtextclassifier3;
enum DatetimeExtractorType : int {
  UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0,
  AM = 1,
  PM = 2,
  JANUARY = 3,
  FEBRUARY = 4,
  MARCH = 5,
  APRIL = 6,
  MAY = 7,
  JUNE = 8,
  JULY = 9,
  AUGUST = 10,
  SEPTEMBER = 11,
  OCTOBER = 12,
  NOVEMBER = 13,
  DECEMBER = 14,
  NEXT = 15,
  NEXT_OR_SAME = 16,
  LAST = 17,
  NOW = 18,
  TOMORROW = 19,
  YESTERDAY = 20,
  PAST = 21,
  FUTURE = 22,
  DAY = 23,
  WEEK = 24,
  MONTH = 25,
  YEAR = 26,
  MONDAY = 27,
  TUESDAY = 28,
  WEDNESDAY = 29,
  THURSDAY = 30,
  FRIDAY = 31,
  SATURDAY = 32,
  SUNDAY = 33,
  DAYS = 34,
  WEEKS = 35,
  MONTHS = 36,

  // TODO(zilka): Make the following 3 values singular for consistency.
  HOURS = 37,

  MINUTES = 38,
  SECONDS = 39,
  YEARS = 40,
  DIGITS = 41,
  SIGNEDDIGITS = 42,
  ZERO = 43,
  ONE = 44,
  TWO = 45,
  THREE = 46,
  FOUR = 47,
  FIVE = 48,
  SIX = 49,
  SEVEN = 50,
  EIGHT = 51,
  NINE = 52,
  TEN = 53,
  ELEVEN = 54,
  TWELVE = 55,
  THIRTEEN = 56,
  FOURTEEN = 57,
  FIFTEEN = 58,
  SIXTEEN = 59,
  SEVENTEEN = 60,
  EIGHTEEN = 61,
  NINETEEN = 62,
  TWENTY = 63,
  THIRTY = 64,
  FORTY = 65,
  FIFTY = 66,
  SIXTY = 67,
  SEVENTY = 68,
  EIGHTY = 69,
  NINETY = 70,
  HUNDRED = 71,
  THOUSAND = 72,
  NOON = 73,
  MIDNIGHT = 74,
}

namespace libtextclassifier3;
enum DatetimeGroupType : int {
  GROUP_UNKNOWN = 0,
  GROUP_UNUSED = 1,
  GROUP_YEAR = 2,
  GROUP_MONTH = 3,
  GROUP_DAY = 4,
  GROUP_HOUR = 5,
  GROUP_MINUTE = 6,
  GROUP_SECOND = 7,
  GROUP_AMPM = 8,
  GROUP_RELATIONDISTANCE = 9,
  GROUP_RELATION = 10,
  GROUP_RELATIONTYPE = 11,

  // Dummy groups serve just as an inflator of the selection. E.g. we might want
  // to select more text than was contained in an envelope of all extractor
  // spans.
  GROUP_DUMMY1 = 12,

  GROUP_DUMMY2 = 13,
  GROUP_ABSOLUTETIME = 14,
}

// Options for the model that predicts text selection.
namespace libtextclassifier3;
table SelectionModelOptions {
  // If true, before the selection is returned, the unpaired brackets contained
  // in the predicted selection are stripped from the both selection ends.
  // The bracket codepoints are defined in the Unicode standard:
  // http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
  strip_unpaired_brackets:bool = true;

  // Number of hypothetical click positions on either side of the actual click
  // to consider in order to enforce symmetry.
  symmetry_context_size:int;

  // Number of examples to bundle in one batch for inference.
  batch_size:int = 1024;

  // Whether to always classify a suggested selection or only on demand.
  always_classify_suggested_selection:bool = false;
}

// Options for the model that classifies a text selection.
namespace libtextclassifier3;
table ClassificationModelOptions {
  // Limits for phone numbers.
  phone_min_num_digits:int = 7;

  phone_max_num_digits:int = 15;

  // Limits for addresses.
  address_min_num_tokens:int;

  // Maximum number of tokens to attempt a classification (-1 is unlimited).
  max_num_tokens:int = -1;
}

// Options for post-checks, checksums and verification to apply on a match.
namespace libtextclassifier3;
table VerificationOptions {
  verify_luhn_checksum:bool = false;

  // Lua verifier to use.
  // Index of the lua verifier in the model.
  lua_verifier:int = -1;
}

// Behaviour of rule capturing groups.
// This specifies how the text and span of a capturing group, in a regular
// expression or from a capturing match in a grammar rule, should be handled.
namespace libtextclassifier3;
table CapturingGroup {
  // If true, the span of the capturing group will be used to
  // extend the selection.
  extend_selection:bool = true;

  // If set, the text of the capturing group will be used to set a field in
  // the classification result entity data.
  entity_field_path:FlatbufferFieldPath;

  // If set, the flatbuffer entity data will be merged with the
  // classification result entity data.
  serialized_entity_data:string (shared);

  // If set, normalization to apply before text is used in entity data.
  normalization_options:NormalizationOptions;

  entity_data:EntityData;
}

// List of regular expression matchers to check.
namespace libtextclassifier3.RegexModel_;
table Pattern {
  // The name of the collection of a match.
  collection_name:string (shared);

  // The pattern to check.
  pattern:string (shared);

  // The modes for which to apply the patterns.
  enabled_modes:ModeFlag = ALL;

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // If true, will use an approximate matching implementation implemented
  // using Find() instead of the true Match(). This approximate matching will
  // use the first Find() result and then check that it spans the whole input.
  use_approximate_matching:bool = false;

  compressed_pattern:CompressedBuffer;

  // Verification to apply on a match.
  verification_options:VerificationOptions;

  capturing_group:[CapturingGroup];

  // Entity data to set for a match.
  serialized_entity_data:string (shared);

  entity_data:EntityData;
}

namespace libtextclassifier3;
table RegexModel {
  patterns:[RegexModel_.Pattern];

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool = true;

  // Lua scripts for match verification.
  // The verifier can access:
  // * `context`: The context as a string.
  // * `match`: The groups of the regex match as an array, each group gives
  //   * `begin`: span start
  //   * `end`: span end
  //   * `text`: the text
  // The verifier is expected to return a boolean, indicating whether the
  // verification succeeded or not.
  lua_verifier:[string];
}

// List of regex patterns.
namespace libtextclassifier3.DatetimeModelPattern_;
table Regex {
  pattern:string (shared);

  // The ith entry specifies the type of the ith capturing group.
  // This is used to decide how the matched content has to be parsed.
  groups:[DatetimeGroupType];

  compressed_pattern:CompressedBuffer;
}

namespace libtextclassifier3;
table DatetimeModelPattern {
  regexes:[DatetimeModelPattern_.Regex];

  // List of locale indices in DatetimeModel that represent the locales that
  // these patterns should be used for. If empty, can be used for all locales.
  locales:[int];

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes for which to apply the patterns.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to apply the patterns.
  // This is a flag field for values of AnnotationUsecase.
  enabled_annotation_usecases:uint = 4294967295;
}

namespace libtextclassifier3;
table DatetimeModelExtractor {
  extractor:DatetimeExtractorType;
  pattern:string (shared);
  locales:[int];
  compressed_pattern:CompressedBuffer;
}

namespace libtextclassifier3;
table DatetimeModel {
  // List of BCP 47 locale strings representing all locales supported by the
  // model. The individual patterns refer back to them using an index.
  locales:[string];

  patterns:[DatetimeModelPattern];
  extractors:[DatetimeModelExtractor];

  // If true, will use the extractors for determining the match location as
  // opposed to using the location where the global pattern matched.
  use_extractors_for_locating:bool = true;

  // List of locale ids whose rules are always run, after the requested
  // ones.
  default_locales:[int];

  // If true, will generate the alternative interpretations for ambiguous
  // datetime expressions.
  generate_alternative_interpretations_when_ambiguous:bool = false;

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool = true;

  // If true, will give only future dates (when the day is not specified).
  prefer_future_for_unspecified_date:bool = false;
}

// Configuration for the tokenizer.
namespace libtextclassifier3;
table GrammarTokenizerOptions {
  tokenization_type:TokenizationType = ICU;

  // If true, white space tokens will be kept when using the icu tokenizer.
  icu_preserve_whitespace_tokens:bool = false;

  // Codepoint ranges that determine what role the different codepoints play
  // during tokenization. The ranges must not overlap.
  tokenization_codepoint_config:[TokenizationCodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to identify
  // stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[CodepointRange];

  // If true, tokens will be also split when the codepoint's script_id changes
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = false;
}

namespace libtextclassifier3.DatetimeModelLibrary_;
table Item {
  key:string (shared);
  value:DatetimeModel;
}

// A set of named DateTime models.
namespace libtextclassifier3;
table DatetimeModelLibrary {
  models:[DatetimeModelLibrary_.Item];
}

// Classification result to instantiate for a rule match.
namespace libtextclassifier3.GrammarModel_;
table RuleClassificationResult {
  // The name of the collection.
  collection_name:string (shared);

  // The score.
  target_classification_score:float = 1;

  // The priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // Behaviour of capturing matches.
  capturing_group:[CapturingGroup];

  // Entity data to set for a match.
  serialized_entity_data:string (shared);

  // Enabled modes.
  enabled_modes:ModeFlag = ALL;

  entity_data:EntityData;
}

// Configuration for grammar based annotators.
namespace libtextclassifier3;
table GrammarModel {
  // The grammar rules.
  rules:grammar.RulesSet;

  rule_classification_result:[GrammarModel_.RuleClassificationResult];

  // Number of tokens in the context to use for classification and text
  // selection suggestion.
  // A value -1 uses the full context.
  context_left_num_tokens:int;

  context_right_num_tokens:int;

  // Grammar specific tokenizer options.
  tokenizer_options:GrammarTokenizerOptions;

  // The score.
  target_classification_score:float = 1;

  // The priority score used for conflict resolution with the other models.
  priority_score:float = 1;
}

namespace libtextclassifier3.MoneyParsingOptions_;
table QuantitiesNameToExponentEntry {
  key:string (key, shared);
  value:int;
}

namespace libtextclassifier3;
table MoneyParsingOptions {
  // Separators (codepoints) marking decimal or thousand in the money amount.
  separators:[int];

  // Mapping between a quantity string (e.g. "million") and the power of 10
  // it multiplies the amount with (e.g. 6 in case of "million").
  // NOTE: The entries need to be sorted by key since we use LookupByKey.
  quantities_name_to_exponent:[MoneyParsingOptions_.QuantitiesNameToExponentEntry];
}

namespace libtextclassifier3.ModelTriggeringOptions_;
table CollectionToPriorityEntry {
  key:string (key, shared);
  value:float;
}

// Options controlling the output of the Tensorflow Lite models.
namespace libtextclassifier3;
table ModelTriggeringOptions {
  // Lower bound threshold for filtering annotation model outputs.
  min_annotate_confidence:float = 0;

  // The modes for which to enable the models.
  enabled_modes:ModeFlag = ALL;

  // Comma-separated list of locales (BCP 47 tags) that dictionary
  // classification supports.
  dictionary_locales:string (shared);

  // Comma-separated list of locales (BCP 47 tags) that the model supports, that
  // are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  locales:string (shared);

  // Priority score assigned to the "other" class from ML model.
  other_collection_priority_score:float = -1000;

  // Priority score assigned to knowledge engine annotations.
  knowledge_priority_score:float = 0;
  reserved_7:int16 (deprecated);

  // Apply a factor to the priority score for entities that are added to this
  // map. Key: collection type e.g. "address", "phone"..., Value: float number.
  // NOTE: The entries here need to be sorted since we use LookupByKey.
  collection_to_priority:[ModelTriggeringOptions_.CollectionToPriorityEntry];
}

// Options controlling the output of the classifier.
namespace libtextclassifier3;
table OutputOptions {
  // Lists of collection names that will be filtered out at the output:
  // - For annotation, the spans of given collection are simply dropped.
  // - For classification, the result is mapped to the class "other".
  // - For selection, the spans of given class are returned as
  //   single-selection.
  filtered_collections_annotation:[string];

  filtered_collections_classification:[string];
  filtered_collections_selection:[string];
}

namespace libtextclassifier3.Model_;
table EmbeddingPruningMask {
  // If true, use pruning mask. In this case, we use mask
  // pruning_mask to determine the mapping of hashed-charactergrams.
  enabled:bool;

  // Packing of the binary pruning mask into uint64 values.
  pruning_mask:[ulong] (force_align: 16);

  // Number of buckets before pruning.
  full_num_buckets:int;

  // Index of row of compressed embedding matrix to which all pruned buckets
  // are mapped.
  pruned_row_bucket_id:int;
}

namespace libtextclassifier3.Model_;
table ConflictResolutionOptions {
  // If true, will prioritize the longest annotation during conflict
  // resolution.
  prioritize_longest_annotation:bool = false;

  // If true, the annotator will perform conflict resolution between the
  // different sub-annotators also in the RAW mode. If false, no conflict
  // resolution will be performed in RAW mode.
  do_conflict_resolution_in_raw_mode:bool = true;
}

namespace libtextclassifier3;
table Model {
  // Comma-separated list of locales supported by the model as BCP 47 tags.
  locales:string (shared);

  version:int;

  // A name for the model that can be used for e.g. logging.
  name:string (shared);

  selection_feature_options:FeatureProcessorOptions;
  classification_feature_options:FeatureProcessorOptions;

  // Tensorflow Lite models.
  selection_model:[ubyte] (force_align: 16);

  classification_model:[ubyte] (force_align: 16);
  embedding_model:[ubyte] (force_align: 16);

  // Options for the different models.
  selection_options:SelectionModelOptions;

  classification_options:ClassificationModelOptions;
  regex_model:RegexModel;
  datetime_model:DatetimeModel;

  // Options controlling the output of the models.
  triggering_options:ModelTriggeringOptions;

  // Global switch that controls if SuggestSelection(), ClassifyText() and
  // Annotate() will run. If a mode is disabled it returns empty/no-op results.
  enabled_modes:ModeFlag = ALL;

  // If true, will snap the selections that consist only of whitespaces to the
  // containing suggested span. Otherwise, no suggestion is proposed, since the
  // selections are not part of any token.
  snap_whitespace_selections:bool = true;

  // Global configuration for the output of SuggestSelection(), ClassifyText()
  // and Annotate().
  output_options:OutputOptions;

  // Configures how Intents should be generated on Android.
  android_intent_options:AndroidIntentFactoryOptions;

  intent_options:IntentFactoryModel;

  // Model resources.
  resources:ResourcePool;

  // Schema data for handling entity data.
  entity_data_schema:[ubyte];

  number_annotator_options:NumberAnnotatorOptions;
  duration_annotator_options:DurationAnnotatorOptions;

  // Comma-separated list of locales (BCP 47 tags) that the model supports, that
  // are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  triggering_locales:string (shared);

  embedding_pruning_mask:Model_.EmbeddingPruningMask;
  reserved_25:int16 (deprecated);
  contact_annotator_options:ContactAnnotatorOptions;
  money_parsing_options:MoneyParsingOptions;
  translate_annotator_options:TranslateAnnotatorOptions;
  grammar_model:GrammarModel;
  conflict_resolution_options:Model_.ConflictResolutionOptions;
  experimental_model:ExperimentalModel;
  pod_ner_model:PodNerModel;
  vocab_model:VocabModel;
  datetime_grammar_model:GrammarModel;
}

// Method for selecting the center token.
namespace libtextclassifier3.FeatureProcessorOptions_;
enum CenterTokenSelectionMethod : int {
  DEFAULT_CENTER_TOKEN_METHOD = 0,
  // Invalid option.

  // Use click indices to determine the center token.
  CENTER_TOKEN_FROM_CLICK = 1,

  // Use selection indices to get a token range, and select the middle of it
  // as the center token.
  CENTER_TOKEN_MIDDLE_OF_SELECTION = 2,
}

// Bounds-sensitive feature extraction configuration.
namespace libtextclassifier3.FeatureProcessorOptions_;
table BoundsSensitiveFeatures {
  // Enables the extraction of bounds-sensitive features, instead of the click
  // context features.
  enabled:bool;

  // The numbers of tokens to extract in specific locations relative to the
  // bounds.
  // Immediately before the span.
  num_tokens_before:int;

  // Inside the span, aligned with the beginning.
  num_tokens_inside_left:int;

  // Inside the span, aligned with the end.
  num_tokens_inside_right:int;

  // Immediately after the span.
  num_tokens_after:int;

  // If true, also extracts the tokens of the entire span and adds up their
  // features forming one "token" to include in the extracted features.
  include_inside_bag:bool;

  // If true, includes the selection length (in the number of tokens) as a
  // feature.
  include_inside_length:bool;

  // If true, for selection, single token spans are not run through the model
  // and their score is assumed to be zero.
  score_single_token_spans_as_zero:bool;
}

namespace libtextclassifier3;
table FeatureProcessorOptions {
  // Number of buckets used for hashing charactergrams.
  num_buckets:int = -1;

  // Size of the embedding.
  embedding_size:int = -1;

  // Number of bits for quantization for embeddings.
  embedding_quantization_bits:int = 8;

  // Context size defines the number of words to the left and to the right of
  // the selected word to be used as context. For example, if context size is
  // N, then we take N words to the left and N words to the right of the
  // selected word as its context.
  context_size:int = -1;

  // Maximum number of words of the context to select in total.
  max_selection_span:int = -1;

  // Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
  // character trigrams etc.
  chargram_orders:[int];

  // Maximum length of a word, in codepoints.
  max_word_length:int = 20;

  // If true, will use the unicode-aware functionality for extracting features.
  unicode_aware_features:bool = false;

  // Whether to extract the token case feature.
  extract_case_feature:bool = false;

  // Whether to extract the selection mask feature.
  extract_selection_mask_feature:bool = false;

  // List of regexps to run over each token. For each regexp, if there is a
  // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
  regexp_feature:[string];

  // Whether to remap all digits to a single number.
  remap_digits:bool = false;

  // Whether to lower-case each token before generating hashgrams.
  lowercase_tokens:bool;

  // If true, the selection classifier output will contain only the selections
  // that are feasible (e.g., those that are shorter than max_selection_span),
  // if false, the output will be a complete cross-product of possible
  // selections to the left and possible selections to the right, including the
  // infeasible ones.
  // NOTE: Exists mainly for compatibility with older models that were trained
  // with the non-reduced output space.
  selection_reduced_output_space:bool = true;

  // Collection names.
  collections:[string];

  // An index of collection in collections to be used if a collection name can't
  // be mapped to an id.
  default_collection:int = -1;

  // If true, will split the input by lines, and only use the line that contains
  // the clicked token.
  only_use_line_with_click:bool = false;

  // If true, will split tokens that contain the selection boundary, at the
  // position of the boundary.
  // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
  split_tokens_on_selection_boundaries:bool = false;

  // Codepoint ranges that determine how different codepoints are tokenized.
  // The ranges must not overlap.
  tokenization_codepoint_config:[TokenizationCodepointRange];

  center_token_selection_method:FeatureProcessorOptions_.CenterTokenSelectionMethod;

  // If true, span boundaries will be snapped to containing tokens and not
  // required to exactly match token boundaries.
  snap_label_span_boundaries_to_containing_tokens:bool;

  // A set of codepoint ranges supported by the model.
  supported_codepoint_ranges:[CodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to identify
  // stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[CodepointRange];

  // Minimum ratio of supported codepoints in the input context. If the ratio
  // is lower than this, the feature computation will fail.
  min_supported_codepoint_ratio:float = 0;

  // Used for versioning the format of features the model expects.
  // - feature_version == 0:
  //   For each token the features consist of:
  //    - chargram embeddings
  //    - dense features
  //   Chargram embeddings for tokens are concatenated first together,
  //   and at the end, the dense features for the tokens are concatenated
  //   to it. So the resulting feature vector has two regions.
  feature_version:int = 0;

  tokenization_type:TokenizationType = INTERNAL_TOKENIZER;
  icu_preserve_whitespace_tokens:bool = false;

  // List of codepoints that will be stripped from beginning and end of
  // predicted spans.
  ignored_span_boundary_codepoints:[int];

  bounds_sensitive_features:FeatureProcessorOptions_.BoundsSensitiveFeatures;

  // List of allowed charactergrams. The extracted charactergrams are filtered
  // using this list, and charactergrams that are not present are interpreted as
  // out-of-vocabulary.
  // If no allowed_chargrams are specified, all charactergrams are allowed.
  // The field is typed as bytes type to allow non-UTF8 chargrams.
  allowed_chargrams:[string];

  // If true, tokens will be also split when the codepoint's script_id changes
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = false;

  // If true, the pipe character '|' will be used as a newline character when
  // splitting lines.
  use_pipe_character_for_newline:bool = true;
}

namespace libtextclassifier3;
table NumberAnnotatorOptions {
  // If true, number and percentage annotations will be produced.
  enabled:bool = false;

  // Score to assign to the annotated numbers and percentages in the annotator.
  score:float = 1;

  // Number priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes in which to enable number and percentage annotations.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to produce number annotations.
  // This is a flag field for values of AnnotationUsecase.
  enabled_annotation_usecases:uint = 4294967295;

  // [Deprecated] A list of codepoints that can form a prefix of a valid number.
  allowed_prefix_codepoints:[int];

  // [Deprecated] A list of codepoints that can form a suffix of a valid number.
  allowed_suffix_codepoints:[int];

  // [Deprecated] List of codepoints that will be stripped from beginning of
  // predicted spans.
  ignored_prefix_span_boundary_codepoints:[int];

  // [Deprecated] List of codepoints that will be stripped from end of predicted
  // spans.
  ignored_suffix_span_boundary_codepoints:[int];

  // [Deprecated] If true, percent annotations will be produced.
  enable_percentage:bool = false;

  // Zero separated and ordered list of suffixes that mark a percent.
  percentage_pieces_string:string (shared);

  // [Deprecated] List of suffixes offsets in the percent_pieces_string string.
  percentage_pieces_offsets:[int];

  // Priority score for the percentage annotation.
  percentage_priority_score:float = 1;

  // Float number priority score used for conflict resolution with the other
  // models.
  float_number_priority_score:float = 0;

  // The maximum number of digits an annotated number can have. Requirement:
  // the value should be less or equal to 20.
  max_number_of_digits:int = 20;

  // The annotation usecases for which to produce percentage annotations.
  // This is a flag field for values of AnnotationUsecase.
  percentage_annotation_usecases:uint = 2;
}

// DurationAnnotator is so far tailored for English and Japanese only.
namespace libtextclassifier3;
table DurationAnnotatorOptions {
  // If true, duration annotations will be produced.
  enabled:bool = false;

  // Score to assign to the annotated durations from the annotator.
  score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes in which to enable duration annotations.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to produce duration annotations.
  enabled_annotation_usecases:uint = 4294967295;

  // Durations typically look like XX hours and XX minutes etc... The list of
  // strings below enumerate variants of "hours", "minutes", etc. in these
  // expressions. These are verbatim strings that are matched against tokens in
  // the input.
  week_expressions:[string];

  day_expressions:[string];
  hour_expressions:[string];
  minute_expressions:[string];
  second_expressions:[string];

  // List of expressions that don't break a duration expression (can become
  // a part of it) but have no semantic meaning.
  filler_expressions:[string];

  // List of expressions that mean half of a unit of duration (e.g. "half an
  // hour").
  half_expressions:[string];

  // Set of codepoints that can split the Annotator tokens to sub-tokens for
  // sub-token matching.
  sub_token_separator_codepoints:[int];

  // If this is true, unit must be associated with quantity. For example, a
  // phrase "minute" is not parsed as one minute duration if this is true.
  require_quantity:bool;

  // If this is true, dangling quantity is included in the annotation. For
  // example, "10 minutes 20" is interpreted as 10 minutes and 20 seconds.
  enable_dangling_quantity_interpretation:bool = true;
}

namespace libtextclassifier3;
table ContactAnnotatorOptions {
  // Supported for English genitives only so far.
  enable_declension:bool;

  // For each language there is a customized list of supported declensions.
  language:string (shared);
}

namespace libtextclassifier3.TranslateAnnotatorOptions_;
enum Algorithm : int {
  DEFAULT_ALGORITHM = 0,
  BACKOFF = 1,
}

// Backoff is the algorithm shipped with Android Q.
namespace libtextclassifier3.TranslateAnnotatorOptions_;
table BackoffOptions {
  // The minimum size of text to prefer for detection (in codepoints).
  min_text_size:int = 20;

  // For reducing the score when text is less than the preferred size.
  penalize_ratio:float = 1;

  // Original detection score to surrounding text detection score ratios.
  subject_text_score_ratio:float = 0.4;
}

namespace libtextclassifier3;
table TranslateAnnotatorOptions {
  enabled:bool = false;

  // Score to assign to the classification results.
  score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float;

  algorithm:TranslateAnnotatorOptions_.Algorithm;
  backoff_options:TranslateAnnotatorOptions_.BackoffOptions;
}

namespace libtextclassifier3.PodNerModel_;
table Collection {
  // Collection's name (e.g., "location", "person").
  name:string (shared);

  // Priority scores used for conflict resolution with the other annotators
  // when the annotation is made over a single/multi token text.
  single_token_priority_score:float;

  multi_token_priority_score:float;
}

namespace libtextclassifier3.PodNerModel_.Label_;
enum BoiseType : int {
  NONE = 0,
  BEGIN = 1,
  O = 2,
  // No label.

  INTERMEDIATE = 3,
  SINGLE = 4,
  END = 5,
}

namespace libtextclassifier3.PodNerModel_.Label_;
enum MentionType : int {
  UNDEFINED = 0,
  NAM = 1,
  NOM = 2,
}

namespace libtextclassifier3.PodNerModel_;
table Label {
  boise_type:Label_.BoiseType;
  mention_type:Label_.MentionType;
  collection_id:int;
  // points to the collections array above.
}

namespace libtextclassifier3;
table PodNerModel {
  tflite_model:[ubyte];
  word_piece_vocab:[ubyte];
  lowercase_input:bool = true;

  // Index of mention_logits tensor in the output of the tflite model. Can
  // be found in the textproto output after model is converted to tflite.
  logits_index_in_output_tensor:int = 0;

  // Whether to append a period at the end of an input that doesn't already
  // end in punctuation.
  append_final_period:bool = false;

  // Priority score used for conflict resolution with the other models. Used
  // only if collections_array is empty.
  priority_score:float = 0;

  // Maximum number of wordpieces supported by the model.
  max_num_wordpieces:int = 128;

  // In case of long text (number of wordpieces greater than the max) we use
  // sliding window approach, this determines the number of overlapping
  // wordpieces between two consecutive windows. This overlap enables context
  // for each word NER annotates.
  sliding_window_num_wordpieces_overlap:int = 20;
  reserved_9:int16 (deprecated);

  // The possible labels the ner model can output. If empty the default labels
  // will be used.
  labels:[PodNerModel_.Label];

  // If the ratio of unknown wordpieces in the input text is greater than this
  // maximum, the text won't be annotated.
  max_ratio_unknown_wordpieces:float = 0.1;

  // Possible collections for labeled entities.
  collections:[PodNerModel_.Collection];

  // Minimum word-length and wordpieces-length required for the text to be
  // annotated.
  min_number_of_tokens:int = 1;

  min_number_of_wordpieces:int = 1;
}

namespace libtextclassifier3;
table VocabModel {
  // A trie that stores a list of vocabs that triggers "Define". A id is
  // returned when looking up a vocab from the trie and the id can be used
  // to access more information about that vocab. The marisa trie library
  // requires 8-byte alignment because the first thing in a marisa trie is a
  // 64-bit integer.
  vocab_trie:[ubyte] (force_align: 8);

  // A bit vector that tells if the vocab should trigger "Define" for users of
  // beginner proficiency only. To look up the bit vector, use the id returned
  // by the trie.
  beginner_level:BitVectorData;

  // A sorted list of indices of vocabs that should not trigger "Define" if
  // its leading character is in upper case. The indices are those returned by
  // trie. You may perform binary search to look up an index.
  do_not_trigger_in_upper_case:BitVectorData;

  // Comma-separated list of locales (BCP 47 tags) that the model supports, that
  // are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  triggering_locales:string (shared);

  // The final score to assign to the results of the vocab model
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;
}

root_type libtextclassifier3.Model;