//
// Copyright (C) 2018 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

include "annotator/entity-data.fbs";
include "annotator/experimental/experimental.fbs";
include "annotator/grammar/dates/dates.fbs";
include "utils/codepoint-range.fbs";
include "utils/flatbuffers.fbs";
include "utils/grammar/rules.fbs";
include "utils/intents/intent-config.fbs";
include "utils/normalization.fbs";
include "utils/resources.fbs";
include "utils/tokenizer.fbs";
include "utils/zlib/buffer.fbs";

file_identifier "TC2 ";

// The possible model modes, represented as a bit field.
namespace libtextclassifier3;
enum ModeFlag : int {
  NONE = 0,
  ANNOTATION = 1,
  CLASSIFICATION = 2,
  ANNOTATION_AND_CLASSIFICATION = 3,
  SELECTION = 4,
  ANNOTATION_AND_SELECTION = 5,
  CLASSIFICATION_AND_SELECTION = 6,
  ALL = 7,
}

// Enum for specifying the annotation usecase.
namespace libtextclassifier3;
enum AnnotationUsecase : int {
  // Results are optimized for Smart{Select,Share,Linkify}.
  ANNOTATION_USECASE_SMART = 0,

  // Results are optimized for using TextClassifier as an infrastructure that
  // annotates as much as possible.
  ANNOTATION_USECASE_RAW = 1,
}

namespace libtextclassifier3;
enum DatetimeExtractorType : int {
  UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0,
  AM = 1,
  PM = 2,
  JANUARY = 3,
  FEBRUARY = 4,
  MARCH = 5,
  APRIL = 6,
  MAY = 7,
  JUNE = 8,
  JULY = 9,
  AUGUST = 10,
  SEPTEMBER = 11,
  OCTOBER = 12,
  NOVEMBER = 13,
  DECEMBER = 14,
  NEXT = 15,
  NEXT_OR_SAME = 16,
  LAST = 17,
  NOW = 18,
  TOMORROW = 19,
  YESTERDAY = 20,
  PAST = 21,
  FUTURE = 22,
  DAY = 23,
  WEEK = 24,
  MONTH = 25,
  YEAR = 26,
  MONDAY = 27,
  TUESDAY = 28,
  WEDNESDAY = 29,
  THURSDAY = 30,
  FRIDAY = 31,
  SATURDAY = 32,
  SUNDAY = 33,
  DAYS = 34,
  WEEKS = 35,
  MONTHS = 36,

  // TODO(zilka): Make the following 3 values singular for consistency.
  HOURS = 37,

  MINUTES = 38,
  SECONDS = 39,
  YEARS = 40,
  DIGITS = 41,
  SIGNEDDIGITS = 42,
  ZERO = 43,
  ONE = 44,
  TWO = 45,
  THREE = 46,
  FOUR = 47,
  FIVE = 48,
  SIX = 49,
  SEVEN = 50,
  EIGHT = 51,
  NINE = 52,
  TEN = 53,
  ELEVEN = 54,
  TWELVE = 55,
  THIRTEEN = 56,
  FOURTEEN = 57,
  FIFTEEN = 58,
  SIXTEEN = 59,
  SEVENTEEN = 60,
  EIGHTEEN = 61,
  NINETEEN = 62,
  TWENTY = 63,
  THIRTY = 64,
  FORTY = 65,
  FIFTY = 66,
  SIXTY = 67,
  SEVENTY = 68,
  EIGHTY = 69,
  NINETY = 70,
  HUNDRED = 71,
  THOUSAND = 72,
}

namespace libtextclassifier3;
enum DatetimeGroupType : int {
  GROUP_UNKNOWN = 0,
  GROUP_UNUSED = 1,
  GROUP_YEAR = 2,
  GROUP_MONTH = 3,
  GROUP_DAY = 4,
  GROUP_HOUR = 5,
  GROUP_MINUTE = 6,
  GROUP_SECOND = 7,
  GROUP_AMPM = 8,
  GROUP_RELATIONDISTANCE = 9,
  GROUP_RELATION = 10,
  GROUP_RELATIONTYPE = 11,

  // Dummy groups serve just as an inflator of the selection. E.g. we might
  // want to select more text than was contained in an envelope of all
  // extractor spans.
  GROUP_DUMMY1 = 12,

  GROUP_DUMMY2 = 13,
}

// Options for the model that predicts text selection.
namespace libtextclassifier3;
table SelectionModelOptions {
  // If true, before the selection is returned, the unpaired brackets contained
  // in the predicted selection are stripped from both selection ends.
  // The bracket codepoints are defined in the Unicode standard:
  // http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
  strip_unpaired_brackets:bool = true;

  // Number of hypothetical click positions on either side of the actual click
  // to consider in order to enforce symmetry.
  symmetry_context_size:int;

  // Number of examples to bundle in one batch for inference.
  batch_size:int = 1024;

  // Whether to always classify a suggested selection or only on demand.
  always_classify_suggested_selection:bool = false;
}

// Options for the model that classifies a text selection.
namespace libtextclassifier3;
table ClassificationModelOptions {
  // Limits for phone numbers.
  phone_min_num_digits:int = 7;

  phone_max_num_digits:int = 15;

  // Limits for addresses.
  address_min_num_tokens:int;

  // Maximum number of tokens to attempt a classification (-1 is unlimited).
  max_num_tokens:int = -1;
}

// Options for post-checks, checksums and verification to apply on a match.
namespace libtextclassifier3;
table VerificationOptions {
  verify_luhn_checksum:bool = false;

  // Lua verifier to use.
  // Index of the lua verifier in the model.
  lua_verifier:int = -1;
}

// Behaviour of rule capturing groups.
// This specifies how the text and span of a capturing group, in a regular
// expression or from a capturing match in a grammar rule, should be handled.
namespace libtextclassifier3;
table CapturingGroup {
  // If true, the span of the capturing group will be used to
  // extend the selection.
  extend_selection:bool = true;

  // If set, the text of the capturing group will be used to set a field in
  // the classification result entity data.
  entity_field_path:FlatbufferFieldPath;

  // If set, the flatbuffer entity data will be merged with the
  // classification result entity data.
  serialized_entity_data:string (shared);

  // If set, normalization to apply before text is used in entity data.
  normalization_options:NormalizationOptions;

  entity_data:EntityData;
}

// List of regular expression matchers to check.
namespace libtextclassifier3.RegexModel_;
table Pattern {
  // The name of the collection of a match.
  collection_name:string (shared);

  // The pattern to check.
  pattern:string (shared);

  // The modes for which to apply the patterns.
  enabled_modes:ModeFlag = ALL;

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // If true, will use an approximate matching implementation implemented
  // using Find() instead of the true Match(). This approximate matching will
  // use the first Find() result and then check that it spans the whole input.
  use_approximate_matching:bool = false;

  compressed_pattern:CompressedBuffer;

  // Verification to apply on a match.
  verification_options:VerificationOptions;

  capturing_group:[CapturingGroup];

  // Entity data to set for a match.
  serialized_entity_data:string (shared);

  entity_data:EntityData;
}

namespace libtextclassifier3;
table RegexModel {
  patterns:[RegexModel_.Pattern];

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool = true;

  // Lua scripts for match verification.
  // The verifier can access:
  // * `context`: The context as a string.
  // * `match`: The groups of the regex match as an array, each group gives
  //   * `begin`: span start
  //   * `end`: span end
  //   * `text`: the text
  // The verifier is expected to return a boolean, indicating whether the
  // verification succeeded or not.
  lua_verifier:[string];
}

// List of regex patterns.
namespace libtextclassifier3.DatetimeModelPattern_;
table Regex {
  pattern:string (shared);

  // The ith entry specifies the type of the ith capturing group.
  // This is used to decide how the matched content has to be parsed.
  groups:[DatetimeGroupType];

  compressed_pattern:CompressedBuffer;
}

namespace libtextclassifier3;
table DatetimeModelPattern {
  regexes:[DatetimeModelPattern_.Regex];

  // List of locale indices in DatetimeModel that represent the locales that
  // these patterns should be used for. If empty, can be used for all locales.
  locales:[int];

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes for which to apply the patterns.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to apply the patterns.
  // This is a flag field for values of AnnotationUsecase.
  enabled_annotation_usecases:uint = 4294967295;
}

namespace libtextclassifier3;
table DatetimeModelExtractor {
  extractor:DatetimeExtractorType;
  pattern:string (shared);
  locales:[int];
  compressed_pattern:CompressedBuffer;
}

namespace libtextclassifier3;
table DatetimeModel {
  // List of BCP 47 locale strings representing all locales supported by the
  // model. The individual patterns refer back to them using an index.
  locales:[string];

  patterns:[DatetimeModelPattern];
  extractors:[DatetimeModelExtractor];

  // If true, will use the extractors for determining the match location as
  // opposed to using the location where the global pattern matched.
  use_extractors_for_locating:bool = true;

  // List of locale ids whose rules are always run, after the requested
  // ones.
  default_locales:[int];

  // If true, will generate the alternative interpretations for ambiguous
  // datetime expressions.
  generate_alternative_interpretations_when_ambiguous:bool = false;

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool = true;

  // If true, will give only future dates (when the day is not specified).
  prefer_future_for_unspecified_date:bool = false;
}

// Configuration for the tokenizer.
namespace libtextclassifier3;
table GrammarTokenizerOptions {
  tokenization_type:TokenizationType = ICU;

  // If true, white space tokens will be kept when using the icu tokenizer.
  icu_preserve_whitespace_tokens:bool = false;

  // Codepoint ranges that determine what role the different codepoints play
  // during tokenization. The ranges must not overlap.
  tokenization_codepoint_config:[TokenizationCodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to
  // identify stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[CodepointRange];

  // If true, tokens will be also split when the codepoint's script_id changes
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = false;
}

// Options for grammar date/datetime/date range annotations.
namespace libtextclassifier3.GrammarDatetimeModel_;
table AnnotationOptions {
  // If enabled, extract special day offset like today, yesterday, etc.
  enable_special_day_offset:bool = true;

  // If true, merge the adjacent day of week, time and date. e.g.
  // "20/2/2016 at 8pm" is extracted as a single instance instead of two
  // instances: "20/2/2016" and "8pm".
  merge_adjacent_components:bool = true;

  // List the extra ids of requested dates.
  extra_requested_dates:[string];

  // If true, try to include the preposition in the extracted annotation, e.g.
  // "at 6pm". If it's false, only "6pm" is included. offline-actions has
  // special requirements to include the preposition.
  include_preposition:bool = true;

  // If enabled, extract range in date annotator.
  //   input: Monday, 5-6pm
  // If the flag is true, the extracted annotation only contains 1 range
  // instance which is from Monday 5pm to 6pm.
  // If the flag is false, the extracted annotation contains two date
  // instances: "Monday" and "6pm".
  enable_date_range:bool = true;
  reserved_6:int16 (deprecated);

  // If enabled, the rule priority score is used to set the priority score of
  // the annotation.
  // In case of false the annotation priority score is set from
  // GrammarDatetimeModel's priority_score.
  use_rule_priority_score:bool = false;

  // If enabled, annotator will try to resolve the ambiguity by generating
  // possible alternative interpretations of the input text,
  // e.g. '9:45' will be resolved to '9:45 AM' and '9:45 PM'.
  generate_alternative_interpretations_when_ambiguous:bool;

  // List of spans which grammar will ignore during the match e.g. if
  // “@” is in the allowed span list and input is “12 March @ 12PM” then “@”
  // will be ignored and 12 March @ 12PM will be translated to
  // {Day:12 Month: March Hour: 12 MERIDIAN: PM}.
  // This can also be achieved by adding additional rules e.g.
  //   <Digit_Day> <Month> <Time>
  //   <Digit_Day> <Month> @ <Time>
  // Though this is doable in the grammar, it requires multiple rules; this
  // list enables one rule to represent multiple rules.
  ignored_spans:[string];
}

namespace libtextclassifier3;
table GrammarDatetimeModel {
  // List of BCP 47 locale strings representing all locales supported by the
  // model.
  locales:[string];

  // If true, will give only future dates (when the day is not specified).
  prefer_future_for_unspecified_date:bool = false;

  // Grammar specific tokenizer options.
  grammar_tokenizer_options:GrammarTokenizerOptions;

  // The modes for which to apply the grammars.
  enabled_modes:ModeFlag = ALL;

  // The datetime grammar rules.
  datetime_rules:dates.DatetimeRules;

  // The final score to assign to the results of grammar model.
  target_classification_score:float = 1;

  // The priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // Options for grammar annotations.
  annotation_options:GrammarDatetimeModel_.AnnotationOptions;
}

namespace libtextclassifier3.DatetimeModelLibrary_;
table Item {
  key:string (shared);
  value:DatetimeModel;
}

// A set of named DateTime models.
namespace libtextclassifier3;
table DatetimeModelLibrary {
  models:[DatetimeModelLibrary_.Item];
}

// Classification result to instantiate for a rule match.
namespace libtextclassifier3.GrammarModel_;
table RuleClassificationResult {
  // The name of the collection.
  collection_name:string (shared);

  // The score.
  target_classification_score:float = 1;

  // The priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // Behaviour of capturing matches.
  capturing_group:[CapturingGroup];

  // Entity data to set for a match.
  serialized_entity_data:string (shared);

  // Enabled modes.
  enabled_modes:ModeFlag = ALL;

  entity_data:EntityData;
}

// Configuration for grammar based annotators.
namespace libtextclassifier3;
table GrammarModel {
  // The grammar rules.
  rules:grammar.RulesSet;

  rule_classification_result:[GrammarModel_.RuleClassificationResult];

  // Number of tokens in the context to use for classification and text
  // selection suggestion.
  // A value -1 uses the full context.
  context_left_num_tokens:int;

  context_right_num_tokens:int;

  // Grammar specific tokenizer options.
  tokenizer_options:GrammarTokenizerOptions;
}

namespace libtextclassifier3;
table MoneyParsingOptions {
  // Separators (codepoints) marking decimal or thousand in the money amount.
  separators:[int];
}

namespace libtextclassifier3.ModelTriggeringOptions_;
table CollectionToPriorityEntry {
  key:string (key, shared);
  value:float;
}

// Options controlling the output of the Tensorflow Lite models.
namespace libtextclassifier3;
table ModelTriggeringOptions {
  // Lower bound threshold for filtering annotation model outputs.
  min_annotate_confidence:float = 0;

  // The modes for which to enable the models.
  enabled_modes:ModeFlag = ALL;

  // Comma-separated list of locales (BCP 47 tags) that dictionary
  // classification supports.
  dictionary_locales:string (shared);

  // Comma-separated list of locales (BCP 47 tags) that the model supports,
  // that are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  locales:string (shared);

  // Priority score assigned to the "other" class from ML model.
  other_collection_priority_score:float = -1000;

  // Priority score assigned to knowledge engine annotations.
  knowledge_priority_score:float = 0;
  reserved_7:int16 (deprecated);

  // Apply a factor to the priority score for entities that are added to this
  // map. Key: collection type e.g. "address", "phone"..., Value: float number.
  // NOTE: The entries here need to be sorted since we use LookupByKey.
  collection_to_priority:[ModelTriggeringOptions_.CollectionToPriorityEntry];
}

// Options controlling the output of the classifier.
namespace libtextclassifier3;
table OutputOptions {
  // Lists of collection names that will be filtered out at the output:
  // - For annotation, the spans of given collection are simply dropped.
  // - For classification, the result is mapped to the class "other".
  // - For selection, the spans of given class are returned as
  //   single-selection.
  filtered_collections_annotation:[string];

  filtered_collections_classification:[string];
  filtered_collections_selection:[string];
}

namespace libtextclassifier3.Model_;
table EmbeddingPruningMask {
  // If true, use pruning mask. In this case, we use mask
  // pruning_mask to determine the mapping of hashed-charactergrams.
  enabled:bool;

  // Packing of the binary pruning mask into uint64 values.
  pruning_mask:[ulong] (force_align: 16);

  // Number of buckets before pruning.
  full_num_buckets:int;

  // Index of row of compressed embedding matrix to which all pruned buckets
  // are mapped.
  pruned_row_bucket_id:int;
}

namespace libtextclassifier3.Model_;
table ConflictResolutionOptions {
  // If true, will prioritize the longest annotation during conflict
  // resolution.
  prioritize_longest_annotation:bool = false;

  // If true, the annotator will perform conflict resolution between the
  // different sub-annotators also in the RAW mode. If false, no conflict
  // resolution will be performed in RAW mode.
  do_conflict_resolution_in_raw_mode:bool = true;
}

namespace libtextclassifier3;
table Model {
  // Comma-separated list of locales supported by the model as BCP 47 tags.
  locales:string (shared);

  version:int;

  // A name for the model that can be used for e.g. logging.
  name:string (shared);

  selection_feature_options:FeatureProcessorOptions;
  classification_feature_options:FeatureProcessorOptions;

  // Tensorflow Lite models.
  selection_model:[ubyte] (force_align: 16);

  classification_model:[ubyte] (force_align: 16);
  embedding_model:[ubyte] (force_align: 16);

  // Options for the different models.
  selection_options:SelectionModelOptions;

  classification_options:ClassificationModelOptions;
  regex_model:RegexModel;
  datetime_model:DatetimeModel;

  // Options controlling the output of the models.
  triggering_options:ModelTriggeringOptions;

  // Global switch that controls if SuggestSelection(), ClassifyText() and
  // Annotate() will run. If a mode is disabled it returns empty/no-op results.
  enabled_modes:ModeFlag = ALL;

  // If true, will snap the selections that consist only of whitespaces to the
  // containing suggested span. Otherwise, no suggestion is proposed, since the
  // selections are not part of any token.
  snap_whitespace_selections:bool = true;

  // Global configuration for the output of SuggestSelection(), ClassifyText()
  // and Annotate().
  output_options:OutputOptions;

  // Configures how Intents should be generated on Android.
  android_intent_options:AndroidIntentFactoryOptions;

  intent_options:IntentFactoryModel;

  // Model resources.
  resources:ResourcePool;

  // Schema data for handling entity data.
  entity_data_schema:[ubyte];

  number_annotator_options:NumberAnnotatorOptions;
  duration_annotator_options:DurationAnnotatorOptions;

  // Comma-separated list of locales (BCP 47 tags) that the model supports,
  // that are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  triggering_locales:string (shared);

  embedding_pruning_mask:Model_.EmbeddingPruningMask;
  grammar_datetime_model:GrammarDatetimeModel;
  contact_annotator_options:ContactAnnotatorOptions;
  money_parsing_options:MoneyParsingOptions;
  translate_annotator_options:TranslateAnnotatorOptions;
  grammar_model:GrammarModel;
  conflict_resolution_options:Model_.ConflictResolutionOptions;
  experimental_model:ExperimentalModel;
}

// Method for selecting the center token.
namespace libtextclassifier3.FeatureProcessorOptions_;
enum CenterTokenSelectionMethod : int {
  // Invalid option.
  DEFAULT_CENTER_TOKEN_METHOD = 0,

  // Use click indices to determine the center token.
  CENTER_TOKEN_FROM_CLICK = 1,

  // Use selection indices to get a token range, and select the middle of it
  // as the center token.
  CENTER_TOKEN_MIDDLE_OF_SELECTION = 2,
}

// Bounds-sensitive feature extraction configuration.
namespace libtextclassifier3.FeatureProcessorOptions_;
table BoundsSensitiveFeatures {
  // Enables the extraction of bounds-sensitive features, instead of the click
  // context features.
  enabled:bool;

  // The numbers of tokens to extract in specific locations relative to the
  // bounds.
  // Immediately before the span.
  num_tokens_before:int;

  // Inside the span, aligned with the beginning.
  num_tokens_inside_left:int;

  // Inside the span, aligned with the end.
  num_tokens_inside_right:int;

  // Immediately after the span.
  num_tokens_after:int;

  // If true, also extracts the tokens of the entire span and adds up their
  // features forming one "token" to include in the extracted features.
  include_inside_bag:bool;

  // If true, includes the selection length (in the number of tokens) as a
  // feature.
  include_inside_length:bool;

  // If true, for selection, single token spans are not run through the model
  // and their score is assumed to be zero.
  score_single_token_spans_as_zero:bool;
}

namespace libtextclassifier3;
table FeatureProcessorOptions {
  // Number of buckets used for hashing charactergrams.
  num_buckets:int = -1;

  // Size of the embedding.
  embedding_size:int = -1;

  // Number of bits for quantization for embeddings.
  embedding_quantization_bits:int = 8;

  // Context size defines the number of words to the left and to the right of
  // the selected word to be used as context. For example, if context size is
  // N, then we take N words to the left and N words to the right of the
  // selected word as its context.
  context_size:int = -1;

  // Maximum number of words of the context to select in total.
  max_selection_span:int = -1;

  // Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
  // character trigrams etc.
  chargram_orders:[int];

  // Maximum length of a word, in codepoints.
  max_word_length:int = 20;

  // If true, will use the unicode-aware functionality for extracting features.
  unicode_aware_features:bool = false;

  // Whether to extract the token case feature.
  extract_case_feature:bool = false;

  // Whether to extract the selection mask feature.
  extract_selection_mask_feature:bool = false;

  // List of regexps to run over each token. For each regexp, if there is a
  // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
  regexp_feature:[string];

  // Whether to remap all digits to a single number.
  remap_digits:bool = false;

  // Whether to lower-case each token before generating hashgrams.
  lowercase_tokens:bool;

  // If true, the selection classifier output will contain only the selections
  // that are feasible (e.g., those that are shorter than max_selection_span),
  // if false, the output will be a complete cross-product of possible
  // selections to the left and possible selections to the right, including the
  // infeasible ones.
  // NOTE: Exists mainly for compatibility with older models that were trained
  // with the non-reduced output space.
  selection_reduced_output_space:bool = true;

  // Collection names.
  collections:[string];

  // An index of collection in collections to be used if a collection name
  // can't be mapped to an id.
  default_collection:int = -1;

  // If true, will split the input by lines, and only use the line that
  // contains the clicked token.
  only_use_line_with_click:bool = false;

  // If true, will split tokens that contain the selection boundary, at the
  // position of the boundary.
  // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
  split_tokens_on_selection_boundaries:bool = false;

  // Codepoint ranges that determine how different codepoints are tokenized.
  // The ranges must not overlap.
  tokenization_codepoint_config:[TokenizationCodepointRange];

  center_token_selection_method:FeatureProcessorOptions_.CenterTokenSelectionMethod;

  // If true, span boundaries will be snapped to containing tokens and not
  // required to exactly match token boundaries.
  snap_label_span_boundaries_to_containing_tokens:bool;

  // A set of codepoint ranges supported by the model.
  supported_codepoint_ranges:[CodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to
  // identify stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[CodepointRange];

  // Minimum ratio of supported codepoints in the input context. If the ratio
  // is lower than this, the feature computation will fail.
  min_supported_codepoint_ratio:float = 0;

  // Used for versioning the format of features the model expects.
  // - feature_version == 0:
  //   For each token the features consist of:
  //    - chargram embeddings
  //    - dense features
  //   Chargram embeddings for tokens are concatenated first together,
  //   and at the end, the dense features for the tokens are concatenated
  //   to it. So the resulting feature vector has two regions.
  feature_version:int = 0;

  tokenization_type:TokenizationType = INTERNAL_TOKENIZER;
  icu_preserve_whitespace_tokens:bool = false;

  // List of codepoints that will be stripped from beginning and end of
  // predicted spans.
  ignored_span_boundary_codepoints:[int];

  bounds_sensitive_features:FeatureProcessorOptions_.BoundsSensitiveFeatures;

  // List of allowed charactergrams. The extracted charactergrams are filtered
  // using this list, and charactergrams that are not present are interpreted
  // as out-of-vocabulary.
  // If no allowed_chargrams are specified, all charactergrams are allowed.
  // The field is typed as bytes type to allow non-UTF8 chargrams.
  allowed_chargrams:[string];

  // If true, tokens will be also split when the codepoint's script_id changes
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = false;

  // If true, the pipe character '|' will be used as a newline character when
  // splitting lines.
  use_pipe_character_for_newline:bool = true;
}

namespace libtextclassifier3;
table NumberAnnotatorOptions {
  // If true, number and percentage annotations will be produced.
  enabled:bool = false;

  // Score to assign to the annotated numbers and percentages in the annotator.
  score:float = 1;

  // Number priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes in which to enable number and percentage annotations.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to produce number annotations.
  // This is a flag field for values of AnnotationUsecase.
  enabled_annotation_usecases:uint = 4294967295;

  // [Deprecated] A list of codepoints that can form a prefix of a valid
  // number.
  allowed_prefix_codepoints:[int];

  // [Deprecated] A list of codepoints that can form a suffix of a valid
  // number.
  allowed_suffix_codepoints:[int];

  // [Deprecated] List of codepoints that will be stripped from beginning of
  // predicted spans.
  ignored_prefix_span_boundary_codepoints:[int];

  // [Deprecated] List of codepoints that will be stripped from end of
  // predicted spans.
  ignored_suffix_span_boundary_codepoints:[int];

  // [Deprecated] If true, percent annotations will be produced.
  enable_percentage:bool = false;

  // Zero separated and ordered list of suffixes that mark a percent.
  percentage_pieces_string:string (shared);

  // [Deprecated] List of suffixes offsets in the percent_pieces_string string.
  percentage_pieces_offsets:[int];

  // Priority score for the percentage annotation.
  percentage_priority_score:float = 1;

  // Float number priority score used for conflict resolution with the other
  // models.
  float_number_priority_score:float = 0;

  // The maximum number of digits an annotated number can have. Requirement:
  // the value should be less than or equal to 20.
  max_number_of_digits:int = 20;

  // The annotation usecases for which to produce percentage annotations.
  // This is a flag field for values of AnnotationUsecase.
  percentage_annotation_usecases:uint = 2;
}

// DurationAnnotator is so far tailored for English and Japanese only.
namespace libtextclassifier3;
table DurationAnnotatorOptions {
  // If true, duration annotations will be produced.
  enabled:bool = false;

  // Score to assign to the annotated durations from the annotator.
  score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes in which to enable duration annotations.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to produce duration annotations.
  enabled_annotation_usecases:uint = 4294967295;

  // Durations typically look like XX hours and XX minutes etc... The list of
  // strings below enumerate variants of "hours", "minutes", etc. in these
  // expressions. These are verbatim strings that are matched against tokens in
  // the input.
  week_expressions:[string];

  day_expressions:[string];
  hour_expressions:[string];
  minute_expressions:[string];
  second_expressions:[string];

  // List of expressions that don't break a duration expression (can become
  // a part of it) but have no semantic meaning.
  filler_expressions:[string];

  // List of expressions that mean half of a unit of duration (e.g. "half an
  // hour").
  half_expressions:[string];

  // Set of codepoints that can split the Annotator tokens to sub-tokens for
  // sub-token matching.
  sub_token_separator_codepoints:[int];

  // If this is true, unit must be associated with quantity. For example, a
  // phrase "minute" is not parsed as one minute duration if this is true.
  require_quantity:bool;

  // If this is true, dangling quantity is included in the annotation. For
  // example, "10 minutes 20" is interpreted as 10 minutes and 20 seconds.
  enable_dangling_quantity_interpretation:bool = true;
}

namespace libtextclassifier3;
table ContactAnnotatorOptions {
  // Supported for English genitives only so far.
  enable_declension:bool;

  // For each language there is a customized list of supported declensions.
  language:string (shared);
}

namespace libtextclassifier3.TranslateAnnotatorOptions_;
enum Algorithm : int {
  DEFAULT_ALGORITHM = 0,
  BACKOFF = 1,
}

// Backoff is the algorithm shipped with Android Q.
namespace libtextclassifier3.TranslateAnnotatorOptions_;
table BackoffOptions {
  // The minimum size of text to prefer for detection (in codepoints).
  min_text_size:int = 20;

  // For reducing the score when text is less than the preferred size.
  penalize_ratio:float = 1;

  // Original detection score to surrounding text detection score ratios.
  subject_text_score_ratio:float = 0.4;
}

namespace libtextclassifier3;
table TranslateAnnotatorOptions {
  enabled:bool = false;

  // Score to assign to the classification results.
  score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float;

  algorithm:TranslateAnnotatorOptions_.Algorithm;
  backoff_options:TranslateAnnotatorOptions_.BackoffOptions;
}

root_type libtextclassifier3.Model;