• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1//
2// Copyright (C) 2018 The Android Open Source Project
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8//      http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15//
16
17include "annotator/entity-data.fbs";
18include "annotator/experimental/experimental.fbs";
19include "utils/codepoint-range.fbs";
20include "utils/container/bit-vector.fbs";
21include "utils/flatbuffers/flatbuffers.fbs";
22include "utils/grammar/rules.fbs";
23include "utils/intents/intent-config.fbs";
24include "utils/normalization.fbs";
25include "utils/resources.fbs";
26include "utils/tokenizer.fbs";
27include "utils/zlib/buffer.fbs";
28
29file_identifier "TC2 ";
30
// The possible model modes, represents a bit field.
namespace libtextclassifier3;
enum ModeFlag : int {
  NONE = 0,
  // Bit 0: annotation is enabled.
  ANNOTATION = 1,
  // Bit 1: classification is enabled.
  CLASSIFICATION = 2,
  ANNOTATION_AND_CLASSIFICATION = 3,
  // Bit 2: selection is enabled.
  SELECTION = 4,
  ANNOTATION_AND_SELECTION = 5,
  CLASSIFICATION_AND_SELECTION = 6,
  // All three modes enabled.
  ALL = 7,
}
43
// Enum for specifying the annotation usecase.
namespace libtextclassifier3;
enum AnnotationUsecase : int {
  // Results are optimized for Smart{Select,Share,Linkify}.
  ANNOTATION_USECASE_SMART = 0,

  // Results are optimized for using TextClassifier as an infrastructure that
  // annotates as much as possible.
  ANNOTATION_USECASE_RAW = 1,
}
55
// Token types recognized by the datetime extractor patterns: meridiems,
// month/weekday names, relative terms, units, and number words.
namespace libtextclassifier3;
enum DatetimeExtractorType : int {
  UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0,
  AM = 1,
  PM = 2,
  JANUARY = 3,
  FEBRUARY = 4,
  MARCH = 5,
  APRIL = 6,
  MAY = 7,
  JUNE = 8,
  JULY = 9,
  AUGUST = 10,
  SEPTEMBER = 11,
  OCTOBER = 12,
  NOVEMBER = 13,
  DECEMBER = 14,
  NEXT = 15,
  NEXT_OR_SAME = 16,
  LAST = 17,
  NOW = 18,
  TOMORROW = 19,
  YESTERDAY = 20,
  PAST = 21,
  FUTURE = 22,
  DAY = 23,
  WEEK = 24,
  MONTH = 25,
  YEAR = 26,
  MONDAY = 27,
  TUESDAY = 28,
  WEDNESDAY = 29,
  THURSDAY = 30,
  FRIDAY = 31,
  SATURDAY = 32,
  SUNDAY = 33,
  DAYS = 34,
  WEEKS = 35,
  MONTHS = 36,

  // TODO(zilka): Make the following 3 values singular for consistency.
  HOURS = 37,
  MINUTES = 38,
  SECONDS = 39,

  YEARS = 40,
  DIGITS = 41,
  SIGNEDDIGITS = 42,
  ZERO = 43,
  ONE = 44,
  TWO = 45,
  THREE = 46,
  FOUR = 47,
  FIVE = 48,
  SIX = 49,
  SEVEN = 50,
  EIGHT = 51,
  NINE = 52,
  TEN = 53,
  ELEVEN = 54,
  TWELVE = 55,
  THIRTEEN = 56,
  FOURTEEN = 57,
  FIFTEEN = 58,
  SIXTEEN = 59,
  SEVENTEEN = 60,
  EIGHTEEN = 61,
  NINETEEN = 62,
  TWENTY = 63,
  THIRTY = 64,
  FORTY = 65,
  FIFTY = 66,
  SIXTY = 67,
  SEVENTY = 68,
  EIGHTY = 69,
  NINETY = 70,
  HUNDRED = 71,
  THOUSAND = 72,
  NOON = 73,
  MIDNIGHT = 74,
}
137
// Semantic role of a capturing group in a datetime regex pattern; used to
// decide how the matched content is parsed (see DatetimeModelPattern_.Regex).
namespace libtextclassifier3;
enum DatetimeGroupType : int {
  GROUP_UNKNOWN = 0,
  GROUP_UNUSED = 1,
  GROUP_YEAR = 2,
  GROUP_MONTH = 3,
  GROUP_DAY = 4,
  GROUP_HOUR = 5,
  GROUP_MINUTE = 6,
  GROUP_SECOND = 7,
  GROUP_AMPM = 8,
  GROUP_RELATIONDISTANCE = 9,
  GROUP_RELATION = 10,
  GROUP_RELATIONTYPE = 11,

  // Dummy groups serve just as an inflator of the selection. E.g. we might want
  // to select more text than was contained in an envelope of all extractor
  // spans.
  GROUP_DUMMY1 = 12,
  GROUP_DUMMY2 = 13,

  GROUP_ABSOLUTETIME = 14,
}
161
// Options for the model that predicts text selection.
namespace libtextclassifier3;
table SelectionModelOptions {
  // If true, before the selection is returned, the unpaired brackets contained
  // in the predicted selection are stripped from the both selection ends.
  // The bracket codepoints are defined in the Unicode standard:
  // http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
  strip_unpaired_brackets:bool = true;

  // Number of hypothetical click positions on either side of the actual click
  // to consider in order to enforce symmetry.
  symmetry_context_size:int;

  // Number of examples to bundle in one batch for inference.
  batch_size:int = 1024;

  // Whether to always classify a suggested selection or only on demand.
  always_classify_suggested_selection:bool = false;
}
181
// Options for the model that classifies a text selection.
namespace libtextclassifier3;
table ClassificationModelOptions {
  // Limits for phone numbers: minimum number of digits required.
  phone_min_num_digits:int = 7;

  // Maximum number of digits allowed for a phone number.
  phone_max_num_digits:int = 15;

  // Limits for addresses: minimum number of tokens required.
  address_min_num_tokens:int;

  // Maximum number of tokens to attempt a classification (-1 is unlimited).
  max_num_tokens:int = -1;
}
196
// Options for post-checks, checksums and verification to apply on a match.
namespace libtextclassifier3;
table VerificationOptions {
  // If true, the match must pass a Luhn checksum check.
  verify_luhn_checksum:bool = false;

  // Lua verifier to use.
  // Index of the lua verifier in the model (-1 means no lua verification).
  lua_verifier:int = -1;
}
206
// Behaviour of rule capturing groups.
// This specifies how the text and span of a capturing group, in a regular
// expression or from a capturing match in a grammar rule, should be handled.
namespace libtextclassifier3;
table CapturingGroup {
  // If true, the span of the capturing group will be used to
  // extend the selection.
  extend_selection:bool = true;

  // If set, the text of the capturing group will be used to set a field in
  // the classification result entity data.
  entity_field_path:FlatbufferFieldPath;

  // If set, the flatbuffer entity data will be merged with the
  // classification result entity data.
  serialized_entity_data:string (shared);

  // If set, normalization to apply before text is used in entity data.
  normalization_options:NormalizationOptions;

  // Entity data to merge for this group, in (unserialized) table form.
  entity_data:EntityData;
}
229
// List of regular expression matchers to check.
namespace libtextclassifier3.RegexModel_;
table Pattern {
  // The name of the collection of a match.
  collection_name:string (shared);

  // The pattern to check.
  pattern:string (shared);

  // The modes for which to apply the patterns.
  enabled_modes:ModeFlag = ALL;

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // If true, will use an approximate matching implementation implemented
  // using Find() instead of the true Match(). This approximate matching will
  // use the first Find() result and then check that it spans the whole input.
  use_approximate_matching:bool = false;

  // Zlib-compressed form of the pattern; used instead of `pattern` when set.
  compressed_pattern:CompressedBuffer;

  // Verification to apply on a match.
  verification_options:VerificationOptions;

  // Behaviour of capturing groups in the pattern.
  capturing_group:[CapturingGroup];

  // Entity data to set for a match.
  serialized_entity_data:string (shared);

  // Entity data in (unserialized) table form.
  entity_data:EntityData;
}
265
// The regular-expression based model: a set of patterns plus shared options.
namespace libtextclassifier3;
table RegexModel {
  patterns:[RegexModel_.Pattern];

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool = true;

  // Lua scripts for match verification.
  // The verifier can access:
  // * `context`: The context as a string.
  // * `match`: The groups of the regex match as an array; each group gives:
  //   * `begin`: span start
  //   * `end`: span end
  //   * `text`: the text
  // The verifier is expected to return a boolean, indicating whether the
  // verification succeeded or not.
  lua_verifier:[string];
}
284
// List of regex patterns.
namespace libtextclassifier3.DatetimeModelPattern_;
table Regex {
  pattern:string (shared);

  // The ith entry specifies the type of the ith capturing group.
  // This is used to decide how the matched content has to be parsed.
  groups:[DatetimeGroupType];

  // Zlib-compressed form of the pattern; used instead of `pattern` when set.
  compressed_pattern:CompressedBuffer;
}
296
// A group of datetime regexes together with scoring and applicability options.
namespace libtextclassifier3;
table DatetimeModelPattern {
  regexes:[DatetimeModelPattern_.Regex];

  // List of locale indices in DatetimeModel that represent the locales that
  // these patterns should be used for. If empty, can be used for all locales.
  locales:[int];

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes for which to apply the patterns.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to apply the patterns.
  // This is a flag field for values of AnnotationUsecase.
  enabled_annotation_usecases:uint = 4294967295;
}
318
// A single extractor rule: a pattern that recognizes one token type
// (see DatetimeExtractorType) for a set of locales.
namespace libtextclassifier3;
table DatetimeModelExtractor {
  // The token type this extractor recognizes.
  extractor:DatetimeExtractorType;
  // The regex pattern for this extractor.
  pattern:string (shared);
  // Indices into DatetimeModel.locales this extractor applies to.
  locales:[int];
  // Zlib-compressed form of the pattern; used instead of `pattern` when set.
  compressed_pattern:CompressedBuffer;
}
326
// The regex-based datetime annotation model.
namespace libtextclassifier3;
table DatetimeModel {
  // List of BCP 47 locale strings representing all locales supported by the
  // model. The individual patterns refer back to them using an index.
  locales:[string];

  patterns:[DatetimeModelPattern];
  extractors:[DatetimeModelExtractor];

  // If true, will use the extractors for determining the match location as
  // opposed to using the location where the global pattern matched.
  use_extractors_for_locating:bool = true;

  // List of locale ids, rules of whose are always run, after the requested
  // ones.
  default_locales:[int];

  // If true, will generate the alternative interpretations for ambiguous
  // datetime expressions.
  generate_alternative_interpretations_when_ambiguous:bool = false;

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool = true;

  // If true, will give only future dates (when the day is not specified).
  prefer_future_for_unspecified_date:bool = false;
}
354
// Configuration for the tokenizer.
namespace libtextclassifier3;
table GrammarTokenizerOptions {
  tokenization_type:TokenizationType = ICU;

  // If true, white space tokens will be kept when using the icu tokenizer.
  icu_preserve_whitespace_tokens:bool = false;

  // Codepoint ranges that determine what role the different codepoints play
  // during tokenization. The ranges must not overlap.
  tokenization_codepoint_config:[TokenizationCodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to identify
  // stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[CodepointRange];

  // If true, tokens will be also split when the codepoint's script_id changes
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = false;
}
375
// A single named entry of a DatetimeModelLibrary: maps a string key to a
// datetime model.
namespace libtextclassifier3.DatetimeModelLibrary_;
table Item {
  key:string (shared);
  value:DatetimeModel;
}
381
// A set of named DateTime models.
namespace libtextclassifier3;
table DatetimeModelLibrary {
  models:[DatetimeModelLibrary_.Item];
}
387
// Classification result to instantiate for a rule match.
namespace libtextclassifier3.GrammarModel_;
table RuleClassificationResult {
  // The name of the collection.
  collection_name:string (shared);

  // The score.
  target_classification_score:float = 1;

  // The priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // Behaviour of capturing matches.
  capturing_group:[CapturingGroup];

  // Entity data to set for a match.
  serialized_entity_data:string (shared);

  // Enabled modes.
  enabled_modes:ModeFlag = ALL;

  // Entity data in (unserialized) table form.
  entity_data:EntityData;
}
411
// Configuration for grammar based annotators.
namespace libtextclassifier3;
table GrammarModel {
  // The grammar rules.
  rules:grammar.RulesSet;

  // Deprecated. Used only for the old implementation of the grammar model.
  rule_classification_result:[GrammarModel_.RuleClassificationResult];

  // Number of tokens in the context to use for classification and text
  // selection suggestion.
  // A value -1 uses the full context.
  context_left_num_tokens:int;

  context_right_num_tokens:int;

  // Grammar specific tokenizer options.
  tokenizer_options:GrammarTokenizerOptions;

  // The score.
  target_classification_score:float = 1;

  // The priority score used for conflict resolution with the other models.
  priority_score:float = 1;

  // Global enabled modes. Use this instead of
  // `rule_classification_result.enabled_modes`.
  enabled_modes:ModeFlag = ALL;
}
441
// One (quantity string -> power-of-ten exponent) map entry; see
// MoneyParsingOptions.quantities_name_to_exponent.
namespace libtextclassifier3.MoneyParsingOptions_;
table QuantitiesNameToExponentEntry {
  key:string (key, shared);
  value:int;
}
447
// Options for parsing money amounts.
namespace libtextclassifier3;
table MoneyParsingOptions {
  // Separators (codepoints) marking decimal or thousand in the money amount.
  separators:[int];

  // Mapping between a quantity string (e.g. "million") and the power of 10
  // it multiplies the amount with (e.g. 6 in case of "million").
  // NOTE: The entries need to be sorted by key since we use LookupByKey.
  quantities_name_to_exponent:[MoneyParsingOptions_.QuantitiesNameToExponentEntry];
}
458
// One (collection name -> priority factor) map entry; see
// ModelTriggeringOptions.collection_to_priority.
namespace libtextclassifier3.ModelTriggeringOptions_;
table CollectionToPriorityEntry {
  key:string (key, shared);
  value:float;
}
464
// Options controlling the output of the Tensorflow Lite models.
namespace libtextclassifier3;
table ModelTriggeringOptions {
  // Lower bound threshold for filtering annotation model outputs.
  min_annotate_confidence:float = 0;

  // The modes for which to enable the models.
  enabled_modes:ModeFlag = ALL;

  // Comma-separated list of locales (BCP 47 tags) that dictionary
  // classification supports.
  dictionary_locales:string (shared);

  // Comma-separated list of locales (BCP 47 tags) that the model supports, that
  // are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  locales:string (shared);

  // Priority score assigned to the "other" class from ML model.
  other_collection_priority_score:float = -1000;

  // Priority score assigned to knowledge engine annotations.
  knowledge_priority_score:float = 0;

  // Deprecated field; kept to preserve the flatbuffer field id.
  reserved_7:int16 (deprecated);

  // Apply a factor to the priority score for entities that are added to this
  // map. Key: collection type e.g. "address", "phone"..., Value: float number.
  // NOTE: The entries here need to be sorted since we use LookupByKey.
  collection_to_priority:[ModelTriggeringOptions_.CollectionToPriorityEntry];

  // Enabled modes for the knowledge engine model.
  knowledge_enabled_modes:ModeFlag = ALL;

  // Enabled modes for the experimental model.
  experimental_enabled_modes:ModeFlag = ALL;

  // Enabled modes for the installed app model.
  installed_app_enabled_modes:ModeFlag = ALL;
}
504
// Options controlling the output of the classifier.
namespace libtextclassifier3;
table OutputOptions {
  // Lists of collection names that will be filtered out at the output:
  // - For annotation, the spans of given collection are simply dropped.
  // - For classification, the result is mapped to the class "other".
  // - For selection, the spans of given class are returned as
  // single-selection.
  filtered_collections_annotation:[string];

  filtered_collections_classification:[string];
  filtered_collections_selection:[string];
}
518
// Pruning mask for the embedding matrix: maps hashed chargram buckets that
// were pruned away to a single shared row.
namespace libtextclassifier3.Model_;
table EmbeddingPruningMask {
  // If true, use pruning mask. In this case, we use mask
  // pruning_mask to determine the mapping of hashed-charactergrams.
  enabled:bool;

  // Packing of the binary pruning mask into uint64 values.
  pruning_mask:[ulong] (force_align: 16);

  // Number of buckets before pruning.
  full_num_buckets:int;

  // Index of row of compressed embedding matrix to which all pruned buckets
  // are mapped.
  pruned_row_bucket_id:int;
}
535
// Options for resolving conflicts between overlapping annotations.
namespace libtextclassifier3.Model_;
table ConflictResolutionOptions {
  // If true, will prioritize the longest annotation during conflict
  // resolution.
  prioritize_longest_annotation:bool = false;

  // If true, the annotator will perform conflict resolution between the
  // different sub-annotators also in the RAW mode. If false, no conflict
  // resolution will be performed in RAW mode.
  do_conflict_resolution_in_raw_mode:bool = true;
}
547
// The top-level TextClassifier model: bundles the TFLite models, the regex,
// datetime, grammar and sub-annotator configurations, and global options.
namespace libtextclassifier3;
table Model {
  // Comma-separated list of locales supported by the model as BCP 47 tags.
  locales:string (shared);

  // Version of the model.
  version:int;

  // A name for the model that can be used for e.g. logging.
  name:string (shared);

  // Feature extraction options for the selection and classification models.
  selection_feature_options:FeatureProcessorOptions;
  classification_feature_options:FeatureProcessorOptions;

  // Tensorflow Lite models.
  selection_model:[ubyte] (force_align: 16);

  classification_model:[ubyte] (force_align: 16);
  embedding_model:[ubyte] (force_align: 16);

  // Options for the different models.
  selection_options:SelectionModelOptions;

  classification_options:ClassificationModelOptions;
  regex_model:RegexModel;
  datetime_model:DatetimeModel;

  // Options controlling the output of the models.
  triggering_options:ModelTriggeringOptions;

  // Global switch that controls if SuggestSelection(), ClassifyText() and
  // Annotate() will run. If a mode is disabled it returns empty/no-op results.
  enabled_modes:ModeFlag = ALL;

  // If true, will snap the selections that consist only of whitespaces to the
  // containing suggested span. Otherwise, no suggestion is proposed, since the
  // selections are not part of any token.
  snap_whitespace_selections:bool = true;

  // Global configuration for the output of SuggestSelection(), ClassifyText()
  // and Annotate().
  output_options:OutputOptions;

  // Configures how Intents should be generated on Android.
  android_intent_options:AndroidIntentFactoryOptions;

  intent_options:IntentFactoryModel;

  // Model resources.
  resources:ResourcePool;

  // Schema data for handling entity data.
  entity_data_schema:[ubyte];

  // Configurations of the various sub-annotators.
  number_annotator_options:NumberAnnotatorOptions;
  duration_annotator_options:DurationAnnotatorOptions;

  // Comma-separated list of locales (BCP 47 tags) that the model supports, that
  // are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  triggering_locales:string (shared);

  embedding_pruning_mask:Model_.EmbeddingPruningMask;

  // Deprecated field; kept to preserve the flatbuffer field id.
  reserved_25:int16 (deprecated);

  contact_annotator_options:ContactAnnotatorOptions;
  money_parsing_options:MoneyParsingOptions;
  translate_annotator_options:TranslateAnnotatorOptions;
  grammar_model:GrammarModel;
  conflict_resolution_options:Model_.ConflictResolutionOptions;
  experimental_model:ExperimentalModel;
  pod_ner_model:PodNerModel;
  vocab_model:VocabModel;
  // Grammar rules used for datetime parsing.
  datetime_grammar_model:GrammarModel;
}
621
// Method for selecting the center token.
namespace libtextclassifier3.FeatureProcessorOptions_;
enum CenterTokenSelectionMethod : int {
  // Invalid option.
  DEFAULT_CENTER_TOKEN_METHOD = 0,

  // Use click indices to determine the center token.
  CENTER_TOKEN_FROM_CLICK = 1,

  // Use selection indices to get a token range, and select the middle of it
  // as the center token.
  CENTER_TOKEN_MIDDLE_OF_SELECTION = 2,
}
635
// Bounds-sensitive feature extraction configuration.
namespace libtextclassifier3.FeatureProcessorOptions_;
table BoundsSensitiveFeatures {
  // Enables the extraction of bounds-sensitive features, instead of the click
  // context features.
  enabled:bool;

  // The numbers of tokens to extract in specific locations relative to the
  // bounds.
  // Immediately before the span.
  num_tokens_before:int;

  // Inside the span, aligned with the beginning.
  num_tokens_inside_left:int;

  // Inside the span, aligned with the end.
  num_tokens_inside_right:int;

  // Immediately after the span.
  num_tokens_after:int;

  // If true, also extracts the tokens of the entire span and adds up their
  // features forming one "token" to include in the extracted features.
  include_inside_bag:bool;

  // If true, includes the selection length (in the number of tokens) as a
  // feature.
  include_inside_length:bool;

  // If true, for selection, single token spans are not run through the model
  // and their score is assumed to be zero.
  score_single_token_spans_as_zero:bool;
}
669
// Options for the feature processor that turns text into model features
// (chargram hashing, tokenization, context windows, etc.).
namespace libtextclassifier3;
table FeatureProcessorOptions {
  // Number of buckets used for hashing charactergrams.
  num_buckets:int = -1;

  // Size of the embedding.
  embedding_size:int = -1;

  // Number of bits for quantization for embeddings.
  embedding_quantization_bits:int = 8;

  // Context size defines the number of words to the left and to the right of
  // the selected word to be used as context. For example, if context size is
  // N, then we take N words to the left and N words to the right of the
  // selected word as its context.
  context_size:int = -1;

  // Maximum number of words of the context to select in total.
  max_selection_span:int = -1;

  // Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
  // character trigrams etc.
  chargram_orders:[int];

  // Maximum length of a word, in codepoints.
  max_word_length:int = 20;

  // If true, will use the unicode-aware functionality for extracting features.
  unicode_aware_features:bool = false;

  // Whether to extract the token case feature.
  extract_case_feature:bool = false;

  // Whether to extract the selection mask feature.
  extract_selection_mask_feature:bool = false;

  // List of regexps to run over each token. For each regexp, if there is a
  // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
  regexp_feature:[string];

  // Whether to remap all digits to a single number.
  remap_digits:bool = false;

  // Whether to lower-case each token before generating hashgrams.
  lowercase_tokens:bool;

  // If true, the selection classifier output will contain only the selections
  // that are feasible (e.g., those that are shorter than max_selection_span),
  // if false, the output will be a complete cross-product of possible
  // selections to the left and possible selections to the right, including the
  // infeasible ones.
  // NOTE: Exists mainly for compatibility with older models that were trained
  // with the non-reduced output space.
  selection_reduced_output_space:bool = true;

  // Collection names.
  collections:[string];

  // An index of collection in collections to be used if a collection name can't
  // be mapped to an id.
  default_collection:int = -1;

  // If true, will split the input by lines, and only use the line that contains
  // the clicked token.
  only_use_line_with_click:bool = false;

  // If true, will split tokens that contain the selection boundary, at the
  // position of the boundary.
  // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
  split_tokens_on_selection_boundaries:bool = false;

  // Codepoint ranges that determine how different codepoints are tokenized.
  // The ranges must not overlap.
  tokenization_codepoint_config:[TokenizationCodepointRange];

  center_token_selection_method:FeatureProcessorOptions_.CenterTokenSelectionMethod;

  // If true, span boundaries will be snapped to containing tokens and not
  // required to exactly match token boundaries.
  snap_label_span_boundaries_to_containing_tokens:bool;

  // A set of codepoint ranges supported by the model.
  supported_codepoint_ranges:[CodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to identify
  // stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[CodepointRange];

  // Minimum ratio of supported codepoints in the input context. If the ratio
  // is lower than this, the feature computation will fail.
  min_supported_codepoint_ratio:float = 0;

  // Used for versioning the format of features the model expects.
  // - feature_version == 0:
  // For each token the features consist of:
  // - chargram embeddings
  // - dense features
  // Chargram embeddings for tokens are concatenated first together,
  // and at the end, the dense features for the tokens are concatenated
  // to it. So the resulting feature vector has two regions.
  feature_version:int = 0;

  tokenization_type:TokenizationType = INTERNAL_TOKENIZER;
  icu_preserve_whitespace_tokens:bool = false;

  // List of codepoints that will be stripped from beginning and end of
  // predicted spans.
  ignored_span_boundary_codepoints:[int];

  bounds_sensitive_features:FeatureProcessorOptions_.BoundsSensitiveFeatures;

  // List of allowed charactergrams. The extracted charactergrams are filtered
  // using this list, and charactergrams that are not present are interpreted as
  // out-of-vocabulary.
  // If no allowed_chargrams are specified, all charactergrams are allowed.
  // The field is typed as bytes type to allow non-UTF8 chargrams.
  allowed_chargrams:[string];

  // If true, tokens will be also split when the codepoint's script_id changes
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = false;

  // If true, the pipe character '|' will be used as a newline character when
  // splitting lines.
  use_pipe_character_for_newline:bool = true;
}
796
// Options for the number/percentage annotator.
namespace libtextclassifier3;
table NumberAnnotatorOptions {
  // If true, number and percentage annotations will be produced.
  enabled:bool = false;

  // Score to assign to the annotated numbers and percentages in the annotator.
  score:float = 1;

  // Number priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes in which to enable number and percentage annotations.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to produce number annotations.
  // This is a flag field for values of AnnotationUsecase.
  enabled_annotation_usecases:uint = 4294967295;

  // [Deprecated] A list of codepoints that can form a prefix of a valid number.
  allowed_prefix_codepoints:[int];

  // [Deprecated] A list of codepoints that can form a suffix of a valid number.
  allowed_suffix_codepoints:[int];

  // [Deprecated] List of codepoints that will be stripped from beginning of
  // predicted spans.
  ignored_prefix_span_boundary_codepoints:[int];

  // [Deprecated] List of codepoints that will be stripped from end of predicted
  // spans.
  ignored_suffix_span_boundary_codepoints:[int];

  // [Deprecated] If true, percent annotations will be produced.
  enable_percentage:bool = false;

  // Zero separated and ordered list of suffixes that mark a percent.
  percentage_pieces_string:string (shared);

  // [Deprecated] List of suffixes offsets in the percent_pieces_string string.
  percentage_pieces_offsets:[int];

  // Priority score for the percentage annotation.
  percentage_priority_score:float = 1;

  // Float number priority score used for conflict resolution with the other
  // models.
  float_number_priority_score:float = 0;

  // The maximum number of digits an annotated number can have. Requirement:
  // the value should be less or equal to 20.
  max_number_of_digits:int = 20;

  // The annotation usecases for which to produce percentage annotations.
  // This is a flag field for values of AnnotationUsecase.
  percentage_annotation_usecases:uint = 2;
}
853
// DurationAnnotator is so far tailored for English and Japanese only.
namespace libtextclassifier3;
table DurationAnnotatorOptions {
  // If true, duration annotations will be produced.
  enabled:bool = false;

  // Score to assign to the annotated durations from the annotator.
  score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes in which to enable duration annotations.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to produce duration annotations.
  enabled_annotation_usecases:uint = 4294967295;

  // Durations typically look like XX hours and XX minutes etc... The list of
  // strings below enumerate variants of "hours", "minutes", etc. in these
  // expressions. These are verbatim strings that are matched against tokens in
  // the input.
  week_expressions:[string];

  day_expressions:[string];
  hour_expressions:[string];
  minute_expressions:[string];
  second_expressions:[string];

  // List of expressions that don't break a duration expression (can become
  // a part of it) but have no semantic meaning.
  filler_expressions:[string];

  // List of expressions that mean half of a unit of duration (e.g. "half an
  // hour").
  half_expressions:[string];

  // Set of codepoints that can split the Annotator tokens to sub-tokens for
  // sub-token matching.
  sub_token_separator_codepoints:[int];

  // If this is true, unit must be associated with quantity. For example, a
  // phrase "minute" is not parsed as one minute duration if this is true.
  require_quantity:bool;

  // If this is true, dangling quantity is included in the annotation. For
  // example, "10 minutes 20" is interpreted as 10 minutes and 20 seconds.
  enable_dangling_quantity_interpretation:bool = true;
}
903
// Options for the contact annotator.
namespace libtextclassifier3;
table ContactAnnotatorOptions {
  // Supported for English genitives only so far.
  enable_declension:bool;

  // For each language there is a customized list of supported declensions.
  language:string (shared);

  // Enabled modes.
  enabled_modes:ModeFlag = ALL;
}
915
// Language-detection algorithm to use for the translate annotator.
namespace libtextclassifier3.TranslateAnnotatorOptions_;
enum Algorithm : int {
  DEFAULT_ALGORITHM = 0,
  BACKOFF = 1,
}
921
922// Backoff is the algorithm shipped with Android Q.
923namespace libtextclassifier3.TranslateAnnotatorOptions_;
924table BackoffOptions {
925  // The minimum size of text to prefer for detection (in codepoints).
926  min_text_size:int = 20;
927
928  // For reducing the score when text is less than the preferred size.
929  penalize_ratio:float = 1;
930
931  // Original detection score to surrounding text detection score ratios.
932  subject_text_score_ratio:float = 0.4;
933}
934
935namespace libtextclassifier3;
936table TranslateAnnotatorOptions {
937  enabled:bool = false;
938
939  // Score to assign to the classification results.
940  score:float = 1;
941
942  // Priority score used for conflict resolution with the other models.
943  priority_score:float;
944
945  algorithm:TranslateAnnotatorOptions_.Algorithm;
946  backoff_options:TranslateAnnotatorOptions_.BackoffOptions;
947
948  // Enabled modes.
949  enabled_modes:ModeFlag = CLASSIFICATION;
950}
951
952namespace libtextclassifier3.PodNerModel_;
953table Collection {
954  // Collection's name (e.g., "location", "person").
955  name:string (shared);
956
957  // Priority scores used for conflict resolution with the other annotators
958  // when the annotation is made over a single/multi token text.
959  single_token_priority_score:float;
960
961  multi_token_priority_score:float;
962}
963
964namespace libtextclassifier3.PodNerModel_.Label_;
965enum BoiseType : int {
966  NONE = 0,
967  BEGIN = 1,
968  O = 2,
969  // No label.
970
971  INTERMEDIATE = 3,
972  SINGLE = 4,
973  END = 5,
974}
975
976namespace libtextclassifier3.PodNerModel_.Label_;
977enum MentionType : int {
978  UNDEFINED = 0,
979  NAM = 1,
980  NOM = 2,
981}
982
983namespace libtextclassifier3.PodNerModel_;
984table Label {
985  boise_type:Label_.BoiseType;
986  mention_type:Label_.MentionType;
987  collection_id:int;
988  // points to the collections array above.
989}
990
991namespace libtextclassifier3;
992table PodNerModel {
993  tflite_model:[ubyte];
994  word_piece_vocab:[ubyte];
995  lowercase_input:bool = true;
996
997  // Index of mention_logits tensor in the output of the tflite model. Can
998  // be found in the textproto output after model is converted to tflite.
999  logits_index_in_output_tensor:int = 0;
1000
1001  // Whether to append a period at the end of an input that doesn't already
1002  // end in punctuation.
1003  append_final_period:bool = false;
1004
1005  // Priority score used for conflict resolution with the other models. Used
1006  // only if collections_array is empty.
1007  priority_score:float = 0;
1008
1009  // Maximum number of wordpieces supported by the model.
1010  max_num_wordpieces:int = 128;
1011
1012  // In case of long text (number of wordpieces greater than the max) we use
1013  // sliding window approach, this determines the number of overlapping
1014  // wordpieces between two consecutive windows. This overlap enables context
1015  // for each word NER annotates.
1016  sliding_window_num_wordpieces_overlap:int = 20;
1017  reserved_9:int16 (deprecated);
1018
1019  // The possible labels the ner model can output. If empty the default labels
1020  // will be used.
1021  labels:[PodNerModel_.Label];
1022
1023  // If the ratio of unknown wordpieces in the input text is greater than this
1024  // maximum, the text won't be annotated.
1025  max_ratio_unknown_wordpieces:float = 0.1;
1026
1027  // Possible collections for labeled entities.
1028  collections:[PodNerModel_.Collection];
1029
1030  // Minimum word-length and wordpieces-length required for the text to be
1031  // annotated.
1032  min_number_of_tokens:int = 1;
1033
1034  min_number_of_wordpieces:int = 1;
1035
1036  // Enabled modes.
1037  enabled_modes:ModeFlag = ALL;
1038}
1039
1040namespace libtextclassifier3;
1041table VocabModel {
1042  // A trie that stores a list of vocabs that triggers "Define". A id is
1043  // returned when looking up a vocab from the trie and the id can be used
1044  // to access more information about that vocab. The marisa trie library
1045  // requires 8-byte alignment because the first thing in a marisa trie is a
1046  // 64-bit integer.
1047  vocab_trie:[ubyte] (force_align: 8);
1048
1049  // A bit vector that tells if the vocab should trigger "Define" for users of
1050  // beginner proficiency only. To look up the bit vector, use the id returned
1051  // by the trie.
1052  beginner_level:BitVectorData;
1053
1054  // A sorted list of indices of vocabs that should not trigger "Define" if
1055  // its leading character is in upper case. The indices are those returned by
1056  // trie. You may perform binary search to look up an index.
1057  do_not_trigger_in_upper_case:BitVectorData;
1058
1059  // Comma-separated list of locales (BCP 47 tags) that the model supports, that
1060  // are used to prevent  triggering on input in unsupported languages. If
1061  // empty, the model will trigger on all inputs.
1062  triggering_locales:string (shared);
1063
1064  // The final score to assign to the results of the vocab model
1065  target_classification_score:float = 1;
1066
1067  // Priority score used for conflict resolution with the other models.
1068  priority_score:float = 0;
1069
1070  // Enabled modes.
1071  enabled_modes:ModeFlag = ANNOTATION_AND_CLASSIFICATION;
1072}
1073
1074root_type libtextclassifier3.Model;
1075