//
// Copyright (C) 2018 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

include "annotator/entity-data.fbs";
include "annotator/experimental/experimental.fbs";
include "annotator/grammar/dates/dates.fbs";
include "utils/codepoint-range.fbs";
include "utils/flatbuffers.fbs";
include "utils/grammar/rules.fbs";
include "utils/intents/intent-config.fbs";
include "utils/normalization.fbs";
include "utils/resources.fbs";
include "utils/tokenizer.fbs";
include "utils/zlib/buffer.fbs";

file_identifier "TC2 ";
30
// The possible model modes, represents a bit field.
namespace libtextclassifier3;
enum ModeFlag : int {
  NONE = 0,
  ANNOTATION = 1,
  CLASSIFICATION = 2,
  ANNOTATION_AND_CLASSIFICATION = 3,
  SELECTION = 4,
  ANNOTATION_AND_SELECTION = 5,
  CLASSIFICATION_AND_SELECTION = 6,
  ALL = 7,
}
43
// Enum for specifying the annotation usecase.
namespace libtextclassifier3;
enum AnnotationUsecase : int {
  // Results are optimized for Smart{Select,Share,Linkify}.
  ANNOTATION_USECASE_SMART = 0,

  // Results are optimized for using TextClassifier as an infrastructure that
  // annotates as much as possible.
  ANNOTATION_USECASE_RAW = 1,
}
55
// Token types emitted by the rule-based datetime extractor patterns.
namespace libtextclassifier3;
enum DatetimeExtractorType : int {
  UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0,
  AM = 1,
  PM = 2,
  JANUARY = 3,
  FEBRUARY = 4,
  MARCH = 5,
  APRIL = 6,
  MAY = 7,
  JUNE = 8,
  JULY = 9,
  AUGUST = 10,
  SEPTEMBER = 11,
  OCTOBER = 12,
  NOVEMBER = 13,
  DECEMBER = 14,
  NEXT = 15,
  NEXT_OR_SAME = 16,
  LAST = 17,
  NOW = 18,
  TOMORROW = 19,
  YESTERDAY = 20,
  PAST = 21,
  FUTURE = 22,
  DAY = 23,
  WEEK = 24,
  MONTH = 25,
  YEAR = 26,
  MONDAY = 27,
  TUESDAY = 28,
  WEDNESDAY = 29,
  THURSDAY = 30,
  FRIDAY = 31,
  SATURDAY = 32,
  SUNDAY = 33,
  DAYS = 34,
  WEEKS = 35,
  MONTHS = 36,

  // TODO(zilka): Make the following 3 values singular for consistency.
  HOURS = 37,

  MINUTES = 38,
  SECONDS = 39,
  YEARS = 40,
  DIGITS = 41,
  SIGNEDDIGITS = 42,
  ZERO = 43,
  ONE = 44,
  TWO = 45,
  THREE = 46,
  FOUR = 47,
  FIVE = 48,
  SIX = 49,
  SEVEN = 50,
  EIGHT = 51,
  NINE = 52,
  TEN = 53,
  ELEVEN = 54,
  TWELVE = 55,
  THIRTEEN = 56,
  FOURTEEN = 57,
  FIFTEEN = 58,
  SIXTEEN = 59,
  SEVENTEEN = 60,
  EIGHTEEN = 61,
  NINETEEN = 62,
  TWENTY = 63,
  THIRTY = 64,
  FORTY = 65,
  FIFTY = 66,
  SIXTY = 67,
  SEVENTY = 68,
  EIGHTY = 69,
  NINETY = 70,
  HUNDRED = 71,
  THOUSAND = 72,
}
135
// Role of a capturing group within a datetime regex pattern.
namespace libtextclassifier3;
enum DatetimeGroupType : int {
  GROUP_UNKNOWN = 0,
  GROUP_UNUSED = 1,
  GROUP_YEAR = 2,
  GROUP_MONTH = 3,
  GROUP_DAY = 4,
  GROUP_HOUR = 5,
  GROUP_MINUTE = 6,
  GROUP_SECOND = 7,
  GROUP_AMPM = 8,
  GROUP_RELATIONDISTANCE = 9,
  GROUP_RELATION = 10,
  GROUP_RELATIONTYPE = 11,

  // Dummy groups serve just as an inflator of the selection. E.g. we might want
  // to select more text than was contained in an envelope of all extractor
  // spans.
  GROUP_DUMMY1 = 12,

  GROUP_DUMMY2 = 13,
}
158
// Options for the model that predicts text selection.
namespace libtextclassifier3;
table SelectionModelOptions {
  // If true, before the selection is returned, the unpaired brackets contained
  // in the predicted selection are stripped from the both selection ends.
  // The bracket codepoints are defined in the Unicode standard:
  // http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
  strip_unpaired_brackets:bool = true;

  // Number of hypothetical click positions on either side of the actual click
  // to consider in order to enforce symmetry.
  symmetry_context_size:int;

  // Number of examples to bundle in one batch for inference.
  batch_size:int = 1024;

  // Whether to always classify a suggested selection or only on demand.
  always_classify_suggested_selection:bool = false;
}
178
// Options for the model that classifies a text selection.
namespace libtextclassifier3;
table ClassificationModelOptions {
  // Limits for phone numbers.
  phone_min_num_digits:int = 7;

  phone_max_num_digits:int = 15;

  // Limits for addresses.
  address_min_num_tokens:int;

  // Maximum number of tokens to attempt a classification (-1 is unlimited).
  max_num_tokens:int = -1;
}
193
// Options for post-checks, checksums and verification to apply on a match.
namespace libtextclassifier3;
table VerificationOptions {
  verify_luhn_checksum:bool = false;

  // Lua verifier to use.
  // Index of the lua verifier in the model.
  lua_verifier:int = -1;
}
203
// Behaviour of rule capturing groups.
// This specifies how the text and span of a capturing group, in a regular
// expression or from a capturing match in a grammar rule, should be handled.
namespace libtextclassifier3;
table CapturingGroup {
  // If true, the span of the capturing group will be used to
  // extend the selection.
  extend_selection:bool = true;

  // If set, the text of the capturing group will be used to set a field in
  // the classification result entity data.
  entity_field_path:FlatbufferFieldPath;

  // If set, the flatbuffer entity data will be merged with the
  // classification result entity data.
  serialized_entity_data:string (shared);

  // If set, normalization to apply before text is used in entity data.
  normalization_options:NormalizationOptions;

  entity_data:EntityData;
}
226
// List of regular expression matchers to check.
namespace libtextclassifier3.RegexModel_;
table Pattern {
  // The name of the collection of a match.
  collection_name:string (shared);

  // The pattern to check.
  pattern:string (shared);

  // The modes for which to apply the patterns.
  enabled_modes:ModeFlag = ALL;

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // If true, will use an approximate matching implementation implemented
  // using Find() instead of the true Match(). This approximate matching will
  // use the first Find() result and then check that it spans the whole input.
  use_approximate_matching:bool = false;

  compressed_pattern:CompressedBuffer;

  // Verification to apply on a match.
  verification_options:VerificationOptions;

  capturing_group:[CapturingGroup];

  // Entity data to set for a match.
  serialized_entity_data:string (shared);

  entity_data:EntityData;
}
262
// Collection of regex patterns plus optional Lua verification scripts.
namespace libtextclassifier3;
table RegexModel {
  patterns:[RegexModel_.Pattern];

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool = true;

  // Lua scripts for match verification.
  // The verifier can access:
  // * `context`: The context as a string.
  // * `match`: The groups of the regex match as an array, each group gives
  //   * `begin`: span start
  //   * `end`: span end
  //   * `text`: the text
  // The verifier is expected to return a boolean, indicating whether the
  // verification succeeded or not.
  lua_verifier:[string];
}
281
// List of regex patterns.
namespace libtextclassifier3.DatetimeModelPattern_;
table Regex {
  pattern:string (shared);

  // The ith entry specifies the type of the ith capturing group.
  // This is used to decide how the matched content has to be parsed.
  groups:[DatetimeGroupType];

  compressed_pattern:CompressedBuffer;
}
293
// A group of datetime regexes together with their scoring and scoping options.
namespace libtextclassifier3;
table DatetimeModelPattern {
  regexes:[DatetimeModelPattern_.Regex];

  // List of locale indices in DatetimeModel that represent the locales that
  // these patterns should be used for. If empty, can be used for all locales.
  locales:[int];

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes for which to apply the patterns.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to apply the patterns.
  // This is a flag field for values of AnnotationUsecase.
  enabled_annotation_usecases:uint = 4294967295;
}
315
// Per-type extractor pattern used to parse matched datetime sub-strings.
namespace libtextclassifier3;
table DatetimeModelExtractor {
  extractor:DatetimeExtractorType;
  pattern:string (shared);
  locales:[int];
  compressed_pattern:CompressedBuffer;
}
323
// Regex-based datetime annotation model.
namespace libtextclassifier3;
table DatetimeModel {
  // List of BCP 47 locale strings representing all locales supported by the
  // model. The individual patterns refer back to them using an index.
  locales:[string];

  patterns:[DatetimeModelPattern];
  extractors:[DatetimeModelExtractor];

  // If true, will use the extractors for determining the match location as
  // opposed to using the location where the global pattern matched.
  use_extractors_for_locating:bool = true;

  // List of locale ids, rules of whose are always run, after the requested
  // ones.
  default_locales:[int];

  // If true, will generate the alternative interpretations for ambiguous
  // datetime expressions.
  generate_alternative_interpretations_when_ambiguous:bool = false;

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool = true;

  // If true, will give only future dates (when the day is not specified).
  prefer_future_for_unspecified_date:bool = false;
}
351
// Configuration for the tokenizer.
namespace libtextclassifier3;
table GrammarTokenizerOptions {
  tokenization_type:TokenizationType = ICU;

  // If true, white space tokens will be kept when using the icu tokenizer.
  icu_preserve_whitespace_tokens:bool = false;

  // Codepoint ranges that determine what role the different codepoints play
  // during tokenization. The ranges must not overlap.
  tokenization_codepoint_config:[TokenizationCodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to identify
  // stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[CodepointRange];

  // If true, tokens will be also split when the codepoint's script_id changes
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = false;
}
372
// Options for grammar date/datetime/date range annotations.
namespace libtextclassifier3.GrammarDatetimeModel_;
table AnnotationOptions {
  // If enabled, extract special day offset like today, yesterday, etc.
  enable_special_day_offset:bool = true;

  // If true, merge the adjacent day of week, time and date. e.g.
  // "20/2/2016 at 8pm" is extracted as a single instance instead of two
  // instance: "20/2/2016" and "8pm".
  merge_adjacent_components:bool = true;

  // List the extra id of requested dates.
  extra_requested_dates:[string];

  // If true, try to include preposition to the extracted annotation. e.g.
  // "at 6pm". if it's false, only 6pm is included. offline-actions has
  // special requirements to include preposition.
  include_preposition:bool = true;

  // If enabled, extract range in date annotator.
  // input: Monday, 5-6pm
  // If the flag is true, The extracted annotation only contains 1 range
  // instance which is from Monday 5pm to 6pm.
  // If the flag is false, The extracted annotation contains two date
  // instance: "Monday" and "6pm".
  enable_date_range:bool = true;
  reserved_6:int16 (deprecated);

  // If enabled, the rule priority score is used to set the priority score of
  // the annotation.
  // In case of false the annotation priority score is set from
  // GrammarDatetimeModel's priority_score
  use_rule_priority_score:bool = false;

  // If enabled, annotator will try to resolve the ambiguity by generating
  // possible alternative interpretations of the input text
  // e.g. '9:45' will be resolved to '9:45 AM' and '9:45 PM'.
  generate_alternative_interpretations_when_ambiguous:bool;

  // List of spans which grammar will ignore during the match e.g. if
  // "@" is in the allowed span list and input is "12 March @ 12PM" then "@"
  // will be ignored and 12 March @ 12PM will be translate to
  // {Day:12 Month: March Hour: 12 MERIDIAN: PM}.
  // This can also be achieved by adding additional rules e.g.
  // <Digit_Day> <Month> <Time>
  // <Digit_Day> <Month> @ <Time>
  // Though this is doable in the grammar but requires multiple rules, this
  // list enables the rule to represent multiple rules.
  ignored_spans:[string];
}
423
// Grammar-based datetime annotation model.
namespace libtextclassifier3;
table GrammarDatetimeModel {
  // List of BCP 47 locale strings representing all locales supported by the
  // model.
  locales:[string];

  // If true, will give only future dates (when the day is not specified).
  prefer_future_for_unspecified_date:bool = false;

  // Grammar specific tokenizer options.
  grammar_tokenizer_options:GrammarTokenizerOptions;

  // The modes for which to apply the grammars.
  enabled_modes:ModeFlag = ALL;

  // The datetime grammar rules.
  datetime_rules:dates.DatetimeRules;

  // The final score to assign to the results of grammar model
  target_classification_score:float = 1;

  // The priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // Options for grammar annotations.
  annotation_options:GrammarDatetimeModel_.AnnotationOptions;
}
451
// A single named entry in a DatetimeModelLibrary.
namespace libtextclassifier3.DatetimeModelLibrary_;
table Item {
  key:string (shared);
  value:DatetimeModel;
}
457
// A set of named DateTime models.
namespace libtextclassifier3;
table DatetimeModelLibrary {
  models:[DatetimeModelLibrary_.Item];
}
463
// Classification result to instantiate for a rule match.
namespace libtextclassifier3.GrammarModel_;
table RuleClassificationResult {
  // The name of the collection.
  collection_name:string (shared);

  // The score.
  target_classification_score:float = 1;

  // The priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // Behaviour of capturing matches.
  capturing_group:[CapturingGroup];

  // Entity data to set for a match.
  serialized_entity_data:string (shared);

  // Enabled modes.
  enabled_modes:ModeFlag = ALL;

  entity_data:EntityData;
}
487
// Configuration for grammar based annotators.
namespace libtextclassifier3;
table GrammarModel {
  // The grammar rules.
  rules:grammar.RulesSet;

  rule_classification_result:[GrammarModel_.RuleClassificationResult];

  // Number of tokens in the context to use for classification and text
  // selection suggestion.
  // A value -1 uses the full context.
  context_left_num_tokens:int;

  context_right_num_tokens:int;

  // Grammar specific tokenizer options.
  tokenizer_options:GrammarTokenizerOptions;
}
506
// Options for parsing money amounts.
namespace libtextclassifier3;
table MoneyParsingOptions {
  // Separators (codepoints) marking decimal or thousand in the money amount.
  separators:[int];
}
512
// Sorted map entry: collection name -> priority score factor.
namespace libtextclassifier3.ModelTriggeringOptions_;
table CollectionToPriorityEntry {
  key:string (key, shared);
  value:float;
}
518
// Options controlling the output of the Tensorflow Lite models.
namespace libtextclassifier3;
table ModelTriggeringOptions {
  // Lower bound threshold for filtering annotation model outputs.
  min_annotate_confidence:float = 0;

  // The modes for which to enable the models.
  enabled_modes:ModeFlag = ALL;

  // Comma-separated list of locales (BCP 47 tags) that dictionary
  // classification supports.
  dictionary_locales:string (shared);

  // Comma-separated list of locales (BCP 47 tags) that the model supports, that
  // are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  locales:string (shared);

  // Priority score assigned to the "other" class from ML model.
  other_collection_priority_score:float = -1000;

  // Priority score assigned to knowledge engine annotations.
  knowledge_priority_score:float = 0;
  reserved_7:int16 (deprecated);

  // Apply a factor to the priority score for entities that are added to this
  // map. Key: collection type e.g. "address", "phone"..., Value: float number.
  // NOTE: The entries here need to be sorted since we use LookupByKey.
  collection_to_priority:[ModelTriggeringOptions_.CollectionToPriorityEntry];
}
549
// Options controlling the output of the classifier.
namespace libtextclassifier3;
table OutputOptions {
  // Lists of collection names that will be filtered out at the output:
  // - For annotation, the spans of given collection are simply dropped.
  // - For classification, the result is mapped to the class "other".
  // - For selection, the spans of given class are returned as
  //   single-selection.
  filtered_collections_annotation:[string];

  filtered_collections_classification:[string];
  filtered_collections_selection:[string];
}
563
// Pruning mask for the hashed-charactergram embedding matrix.
namespace libtextclassifier3.Model_;
table EmbeddingPruningMask {
  // If true, use pruning mask. In this case, we use mask
  // pruning_mask to determine the mapping of hashed-charactergrams.
  enabled:bool;

  // Packing of the binary pruning mask into uint64 values.
  pruning_mask:[ulong] (force_align: 16);

  // Number of buckets before pruning.
  full_num_buckets:int;

  // Index of row of compressed embedding matrix to which all pruned buckets
  // are mapped.
  pruned_row_bucket_id:int;
}
580
// Options for resolving conflicts between overlapping annotations.
namespace libtextclassifier3.Model_;
table ConflictResolutionOptions {
  // If true, will prioritize the longest annotation during conflict
  // resolution.
  prioritize_longest_annotation:bool = false;

  // If true, the annotator will perform conflict resolution between the
  // different sub-annotators also in the RAW mode. If false, no conflict
  // resolution will be performed in RAW mode.
  do_conflict_resolution_in_raw_mode:bool = true;
}
592
// Top-level TextClassifier model container (root type of this schema).
namespace libtextclassifier3;
table Model {
  // Comma-separated list of locales supported by the model as BCP 47 tags.
  locales:string (shared);

  version:int;

  // A name for the model that can be used for e.g. logging.
  name:string (shared);

  selection_feature_options:FeatureProcessorOptions;
  classification_feature_options:FeatureProcessorOptions;

  // Tensorflow Lite models.
  selection_model:[ubyte] (force_align: 16);

  classification_model:[ubyte] (force_align: 16);
  embedding_model:[ubyte] (force_align: 16);

  // Options for the different models.
  selection_options:SelectionModelOptions;

  classification_options:ClassificationModelOptions;
  regex_model:RegexModel;
  datetime_model:DatetimeModel;

  // Options controlling the output of the models.
  triggering_options:ModelTriggeringOptions;

  // Global switch that controls if SuggestSelection(), ClassifyText() and
  // Annotate() will run. If a mode is disabled it returns empty/no-op results.
  enabled_modes:ModeFlag = ALL;

  // If true, will snap the selections that consist only of whitespaces to the
  // containing suggested span. Otherwise, no suggestion is proposed, since the
  // selections are not part of any token.
  snap_whitespace_selections:bool = true;

  // Global configuration for the output of SuggestSelection(), ClassifyText()
  // and Annotate().
  output_options:OutputOptions;

  // Configures how Intents should be generated on Android.
  android_intent_options:AndroidIntentFactoryOptions;

  intent_options:IntentFactoryModel;

  // Model resources.
  resources:ResourcePool;

  // Schema data for handling entity data.
  entity_data_schema:[ubyte];

  number_annotator_options:NumberAnnotatorOptions;
  duration_annotator_options:DurationAnnotatorOptions;

  // Comma-separated list of locales (BCP 47 tags) that the model supports, that
  // are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  triggering_locales:string (shared);

  embedding_pruning_mask:Model_.EmbeddingPruningMask;
  grammar_datetime_model:GrammarDatetimeModel;
  contact_annotator_options:ContactAnnotatorOptions;
  money_parsing_options:MoneyParsingOptions;
  translate_annotator_options:TranslateAnnotatorOptions;
  grammar_model:GrammarModel;
  conflict_resolution_options:Model_.ConflictResolutionOptions;
  experimental_model:ExperimentalModel;
}
663
// Method for selecting the center token.
namespace libtextclassifier3.FeatureProcessorOptions_;
enum CenterTokenSelectionMethod : int {
  DEFAULT_CENTER_TOKEN_METHOD = 0,  // Invalid option.

  // Use click indices to determine the center token.
  CENTER_TOKEN_FROM_CLICK = 1,

  // Use selection indices to get a token range, and select the middle of it
  // as the center token.
  CENTER_TOKEN_MIDDLE_OF_SELECTION = 2,
}
677
// Bounds-sensitive feature extraction configuration.
namespace libtextclassifier3.FeatureProcessorOptions_;
table BoundsSensitiveFeatures {
  // Enables the extraction of bounds-sensitive features, instead of the click
  // context features.
  enabled:bool;

  // The numbers of tokens to extract in specific locations relative to the
  // bounds.
  // Immediately before the span.
  num_tokens_before:int;

  // Inside the span, aligned with the beginning.
  num_tokens_inside_left:int;

  // Inside the span, aligned with the end.
  num_tokens_inside_right:int;

  // Immediately after the span.
  num_tokens_after:int;

  // If true, also extracts the tokens of the entire span and adds up their
  // features forming one "token" to include in the extracted features.
  include_inside_bag:bool;

  // If true, includes the selection length (in the number of tokens) as a
  // feature.
  include_inside_length:bool;

  // If true, for selection, single token spans are not run through the model
  // and their score is assumed to be zero.
  score_single_token_spans_as_zero:bool;
}
711
// Configuration of the feature extraction pipeline shared by the selection and
// classification models.
namespace libtextclassifier3;
table FeatureProcessorOptions {
  // Number of buckets used for hashing charactergrams.
  num_buckets:int = -1;

  // Size of the embedding.
  embedding_size:int = -1;

  // Number of bits for quantization for embeddings.
  embedding_quantization_bits:int = 8;

  // Context size defines the number of words to the left and to the right of
  // the selected word to be used as context. For example, if context size is
  // N, then we take N words to the left and N words to the right of the
  // selected word as its context.
  context_size:int = -1;

  // Maximum number of words of the context to select in total.
  max_selection_span:int = -1;

  // Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
  // character trigrams etc.
  chargram_orders:[int];

  // Maximum length of a word, in codepoints.
  max_word_length:int = 20;

  // If true, will use the unicode-aware functionality for extracting features.
  unicode_aware_features:bool = false;

  // Whether to extract the token case feature.
  extract_case_feature:bool = false;

  // Whether to extract the selection mask feature.
  extract_selection_mask_feature:bool = false;

  // List of regexps to run over each token. For each regexp, if there is a
  // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
  regexp_feature:[string];

  // Whether to remap all digits to a single number.
  remap_digits:bool = false;

  // Whether to lower-case each token before generating hashgrams.
  lowercase_tokens:bool;

  // If true, the selection classifier output will contain only the selections
  // that are feasible (e.g., those that are shorter than max_selection_span),
  // if false, the output will be a complete cross-product of possible
  // selections to the left and possible selections to the right, including the
  // infeasible ones.
  // NOTE: Exists mainly for compatibility with older models that were trained
  // with the non-reduced output space.
  selection_reduced_output_space:bool = true;

  // Collection names.
  collections:[string];

  // An index of collection in collections to be used if a collection name can't
  // be mapped to an id.
  default_collection:int = -1;

  // If true, will split the input by lines, and only use the line that contains
  // the clicked token.
  only_use_line_with_click:bool = false;

  // If true, will split tokens that contain the selection boundary, at the
  // position of the boundary.
  // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
  split_tokens_on_selection_boundaries:bool = false;

  // Codepoint ranges that determine how different codepoints are tokenized.
  // The ranges must not overlap.
  tokenization_codepoint_config:[TokenizationCodepointRange];

  center_token_selection_method:FeatureProcessorOptions_.CenterTokenSelectionMethod;

  // If true, span boundaries will be snapped to containing tokens and not
  // required to exactly match token boundaries.
  snap_label_span_boundaries_to_containing_tokens:bool;

  // A set of codepoint ranges supported by the model.
  supported_codepoint_ranges:[CodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to identify
  // stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[CodepointRange];

  // Minimum ratio of supported codepoints in the input context. If the ratio
  // is lower than this, the feature computation will fail.
  min_supported_codepoint_ratio:float = 0;

  // Used for versioning the format of features the model expects.
  // - feature_version == 0:
  //   For each token the features consist of:
  //    - chargram embeddings
  //    - dense features
  //   Chargram embeddings for tokens are concatenated first together,
  //   and at the end, the dense features for the tokens are concatenated
  //   to it. So the resulting feature vector has two regions.
  feature_version:int = 0;

  tokenization_type:TokenizationType = INTERNAL_TOKENIZER;
  icu_preserve_whitespace_tokens:bool = false;

  // List of codepoints that will be stripped from beginning and end of
  // predicted spans.
  ignored_span_boundary_codepoints:[int];

  bounds_sensitive_features:FeatureProcessorOptions_.BoundsSensitiveFeatures;

  // List of allowed charactergrams. The extracted charactergrams are filtered
  // using this list, and charactergrams that are not present are interpreted as
  // out-of-vocabulary.
  // If no allowed_chargrams are specified, all charactergrams are allowed.
  // The field is typed as bytes type to allow non-UTF8 chargrams.
  allowed_chargrams:[string];

  // If true, tokens will be also split when the codepoint's script_id changes
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = false;

  // If true, the pipe character '|' will be used as a newline character when
  // splitting lines.
  use_pipe_character_for_newline:bool = true;
}
838
// Options for the number/percentage annotator.
namespace libtextclassifier3;
table NumberAnnotatorOptions {
  // If true, number and percentage annotations will be produced.
  enabled:bool = false;

  // Score to assign to the annotated numbers and percentages in the annotator.
  score:float = 1;

  // Number priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes in which to enable number and percentage annotations.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to produce number annotations.
  // This is a flag field for values of AnnotationUsecase.
  enabled_annotation_usecases:uint = 4294967295;

  // [Deprecated] A list of codepoints that can form a prefix of a valid number.
  allowed_prefix_codepoints:[int];

  // [Deprecated] A list of codepoints that can form a suffix of a valid number.
  allowed_suffix_codepoints:[int];

  // [Deprecated] List of codepoints that will be stripped from beginning of
  // predicted spans.
  ignored_prefix_span_boundary_codepoints:[int];

  // [Deprecated] List of codepoints that will be stripped from end of predicted
  // spans.
  ignored_suffix_span_boundary_codepoints:[int];

  // [Deprecated] If true, percent annotations will be produced.
  enable_percentage:bool = false;

  // Zero separated and ordered list of suffixes that mark a percent.
  percentage_pieces_string:string (shared);

  // [Deprecated] List of suffixes offsets in the percent_pieces_string string.
  percentage_pieces_offsets:[int];

  // Priority score for the percentage annotation.
  percentage_priority_score:float = 1;

  // Float number priority score used for conflict resolution with the other
  // models.
  float_number_priority_score:float = 0;

  // The maximum number of digits an annotated number can have. Requirement:
  // the value should be less or equal to 20.
  max_number_of_digits:int = 20;

  // The annotation usecases for which to produce percentage annotations.
  // This is a flag field for values of AnnotationUsecase.
  percentage_annotation_usecases:uint = 2;
}
895
// DurationAnnotator is so far tailored for English and Japanese only.
namespace libtextclassifier3;
table DurationAnnotatorOptions {
  // If true, duration annotations will be produced.
  enabled:bool = false;

  // Score to assign to the annotated durations from the annotator.
  score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes in which to enable duration annotations.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to produce duration annotations.
  enabled_annotation_usecases:uint = 4294967295;

  // Durations typically look like XX hours and XX minutes etc... The list of
  // strings below enumerate variants of "hours", "minutes", etc. in these
  // expressions. These are verbatim strings that are matched against tokens in
  // the input.
  week_expressions:[string];

  day_expressions:[string];
  hour_expressions:[string];
  minute_expressions:[string];
  second_expressions:[string];

  // List of expressions that don't break a duration expression (can become
  // a part of it) but have no semantic meaning.
  filler_expressions:[string];

  // List of expressions that mean half of a unit of duration (e.g. "half an
  // hour").
  half_expressions:[string];

  // Set of codepoints that can split the Annotator tokens to sub-tokens for
  // sub-token matching.
  sub_token_separator_codepoints:[int];

  // If this is true, unit must be associated with quantity. For example, a
  // phrase "minute" is not parsed as one minute duration if this is true.
  require_quantity:bool;

  // If this is true, dangling quantity is included in the annotation. For
  // example, "10 minutes 20" is interpreted as 10 minutes and 20 seconds.
  enable_dangling_quantity_interpretation:bool = true;
}
945
// Options for the contact annotator.
namespace libtextclassifier3;
table ContactAnnotatorOptions {
  // Supported for English genitives only so far.
  enable_declension:bool;

  // For each language there is a customized list of supported declensions.
  language:string (shared);
}
954
// Language-detection algorithm used by the translate annotator.
namespace libtextclassifier3.TranslateAnnotatorOptions_;
enum Algorithm : int {
  DEFAULT_ALGORITHM = 0,
  BACKOFF = 1,
}
960
// Backoff is the algorithm shipped with Android Q.
namespace libtextclassifier3.TranslateAnnotatorOptions_;
table BackoffOptions {
  // The minimum size of text to prefer for detection (in codepoints).
  min_text_size:int = 20;

  // For reducing the score when text is less than the preferred size.
  penalize_ratio:float = 1;

  // Original detection score to surrounding text detection score ratios.
  subject_text_score_ratio:float = 0.4;
}
973
// Options for the translate annotator.
namespace libtextclassifier3;
table TranslateAnnotatorOptions {
  enabled:bool = false;

  // Score to assign to the classification results.
  score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float;

  algorithm:TranslateAnnotatorOptions_.Algorithm;
  backoff_options:TranslateAnnotatorOptions_.BackoffOptions;
}
987
root_type libtextclassifier3.Model;