//
// Copyright (C) 2018 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

include "annotator/entity-data.fbs";
include "annotator/experimental/experimental.fbs";
include "utils/codepoint-range.fbs";
include "utils/container/bit-vector.fbs";
include "utils/flatbuffers/flatbuffers.fbs";
include "utils/grammar/rules.fbs";
include "utils/intents/intent-config.fbs";
include "utils/normalization.fbs";
include "utils/resources.fbs";
include "utils/tokenizer.fbs";
include "utils/zlib/buffer.fbs";

file_identifier "TC2 ";

// The possible model modes; represents a bit field.
namespace libtextclassifier3;
enum ModeFlag : int {
  NONE = 0,
  ANNOTATION = 1,
  CLASSIFICATION = 2,
  ANNOTATION_AND_CLASSIFICATION = 3,
  SELECTION = 4,
  ANNOTATION_AND_SELECTION = 5,
  CLASSIFICATION_AND_SELECTION = 6,
  ALL = 7,
}
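
// For example, the values compose by bitwise OR:
//   ANNOTATION | CLASSIFICATION == ANNOTATION_AND_CLASSIFICATION  (1 | 2 == 3)
//   ANNOTATION | SELECTION      == ANNOTATION_AND_SELECTION       (1 | 4 == 5)
//   ANNOTATION | CLASSIFICATION | SELECTION == ALL                (1 | 2 | 4 == 7)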

// Enum for specifying the annotation usecase.
namespace libtextclassifier3;
enum AnnotationUsecase : int {
  // Results are optimized for Smart{Select,Share,Linkify}.
  ANNOTATION_USECASE_SMART = 0,

  // Results are optimized for using TextClassifier as an infrastructure that
  // annotates as much as possible.
  ANNOTATION_USECASE_RAW = 1,
}
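
// Flag fields such as enabled_annotation_usecases below hold a bitmask over
// these values, presumably with bit (1 << usecase) per usecase:
//   (1 << ANNOTATION_USECASE_SMART) == 1 enables only SMART,
//   (1 << ANNOTATION_USECASE_RAW)   == 2 enables only RAW,
//   3 enables both; the default 4294967295 (0xffffffff) enables everything.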

namespace libtextclassifier3;
enum DatetimeExtractorType : int {
  UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0,
  AM = 1,
  PM = 2,
  JANUARY = 3,
  FEBRUARY = 4,
  MARCH = 5,
  APRIL = 6,
  MAY = 7,
  JUNE = 8,
  JULY = 9,
  AUGUST = 10,
  SEPTEMBER = 11,
  OCTOBER = 12,
  NOVEMBER = 13,
  DECEMBER = 14,
  NEXT = 15,
  NEXT_OR_SAME = 16,
  LAST = 17,
  NOW = 18,
  TOMORROW = 19,
  YESTERDAY = 20,
  PAST = 21,
  FUTURE = 22,
  DAY = 23,
  WEEK = 24,
  MONTH = 25,
  YEAR = 26,
  MONDAY = 27,
  TUESDAY = 28,
  WEDNESDAY = 29,
  THURSDAY = 30,
  FRIDAY = 31,
  SATURDAY = 32,
  SUNDAY = 33,
  DAYS = 34,
  WEEKS = 35,
  MONTHS = 36,

  // TODO(zilka): Make the following 3 values singular for consistency.
  HOURS = 37,
  MINUTES = 38,
  SECONDS = 39,
  YEARS = 40,
  DIGITS = 41,
  SIGNEDDIGITS = 42,
  ZERO = 43,
  ONE = 44,
  TWO = 45,
  THREE = 46,
  FOUR = 47,
  FIVE = 48,
  SIX = 49,
  SEVEN = 50,
  EIGHT = 51,
  NINE = 52,
  TEN = 53,
  ELEVEN = 54,
  TWELVE = 55,
  THIRTEEN = 56,
  FOURTEEN = 57,
  FIFTEEN = 58,
  SIXTEEN = 59,
  SEVENTEEN = 60,
  EIGHTEEN = 61,
  NINETEEN = 62,
  TWENTY = 63,
  THIRTY = 64,
  FORTY = 65,
  FIFTY = 66,
  SIXTY = 67,
  SEVENTY = 68,
  EIGHTY = 69,
  NINETY = 70,
  HUNDRED = 71,
  THOUSAND = 72,
  NOON = 73,
  MIDNIGHT = 74,
}

namespace libtextclassifier3;
enum DatetimeGroupType : int {
  GROUP_UNKNOWN = 0,
  GROUP_UNUSED = 1,
  GROUP_YEAR = 2,
  GROUP_MONTH = 3,
  GROUP_DAY = 4,
  GROUP_HOUR = 5,
  GROUP_MINUTE = 6,
  GROUP_SECOND = 7,
  GROUP_AMPM = 8,
  GROUP_RELATIONDISTANCE = 9,
  GROUP_RELATION = 10,
  GROUP_RELATIONTYPE = 11,

  // Dummy groups serve just as an inflator of the selection. E.g. we might
  // want to select more text than was contained in an envelope of all
  // extractor spans.
  GROUP_DUMMY1 = 12,
  GROUP_DUMMY2 = 13,
  GROUP_ABSOLUTETIME = 14,
}

// Options for the model that predicts text selection.
namespace libtextclassifier3;
table SelectionModelOptions {
  // If true, before the selection is returned, unpaired brackets contained
  // in the predicted selection are stripped from both selection ends.
  // The bracket codepoints are defined in the Unicode standard:
  // http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
  strip_unpaired_brackets:bool = true;

  // Number of hypothetical click positions on either side of the actual click
  // to consider in order to enforce symmetry.
  symmetry_context_size:int;

  // Number of examples to bundle in one batch for inference.
  batch_size:int = 1024;

  // Whether to always classify a suggested selection or only on demand.
  always_classify_suggested_selection:bool = false;
}

// Options for the model that classifies a text selection.
namespace libtextclassifier3;
table ClassificationModelOptions {
  // Limits for phone numbers.
  phone_min_num_digits:int = 7;

  phone_max_num_digits:int = 15;

  // Limits for addresses.
  address_min_num_tokens:int;

  // Maximum number of tokens to attempt a classification (-1 is unlimited).
  max_num_tokens:int = -1;
}

// Options for post-checks, checksums and verification to apply on a match.
namespace libtextclassifier3;
table VerificationOptions {
  verify_luhn_checksum:bool = false;

  // Index of the Lua verifier in the model to use; -1 means no verifier.
  lua_verifier:int = -1;
}
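
// The Luhn checksum referenced above is the standard mod-10 scheme used for
// payment card numbers: walking right to left, every second digit is doubled
// (subtracting 9 when the doubled digit exceeds 9) and the resulting digit
// sum must be divisible by 10. E.g. "79927398713" yields a weighted sum of
// 70 and therefore passes.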

// Behaviour of rule capturing groups.
// This specifies how the text and span of a capturing group, in a regular
// expression or from a capturing match in a grammar rule, should be handled.
namespace libtextclassifier3;
table CapturingGroup {
  // If true, the span of the capturing group will be used to
  // extend the selection.
  extend_selection:bool = true;

  // If set, the text of the capturing group will be used to set a field in
  // the classification result entity data.
  entity_field_path:FlatbufferFieldPath;

  // If set, the flatbuffer entity data will be merged with the
  // classification result entity data.
  serialized_entity_data:string (shared);

  // If set, normalization to apply before the text is used in entity data.
  normalization_options:NormalizationOptions;

  entity_data:EntityData;
}

// List of regular expression matchers to check.
namespace libtextclassifier3.RegexModel_;
table Pattern {
  // The name of the collection of a match.
  collection_name:string (shared);

  // The pattern to check.
  pattern:string (shared);

  // The modes for which to apply the patterns.
  enabled_modes:ModeFlag = ALL;

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // If true, an approximate matching implementation based on Find() will be
  // used instead of the true Match(). The approximation uses the first Find()
  // result and then checks that it spans the whole input.
  use_approximate_matching:bool = false;

  compressed_pattern:CompressedBuffer;

  // Verification to apply on a match.
  verification_options:VerificationOptions;

  capturing_group:[CapturingGroup];

  // Entity data to set for a match.
  serialized_entity_data:string (shared);

  entity_data:EntityData;
}

namespace libtextclassifier3;
table RegexModel {
  patterns:[RegexModel_.Pattern];

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool = true;

  // Lua scripts for match verification.
  // The verifier can access:
  // * `context`: The context as a string.
  // * `match`: The groups of the regex match as an array; each group gives
  //   * `begin`: span start
  //   * `end`: span end
  //   * `text`: the text
  // The verifier is expected to return a boolean, indicating whether the
  // verification succeeded or not (see the sketch after this table).
  lua_verifier:[string];
}
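
// A minimal sketch of such a verifier in Lua (the 1-based indexing into
// `match` and the presence of the full match as its first entry are
// assumptions):
//   -- succeed only when the matched span is longer than six codepoints
//   local m = match[1]
//   return m["end"] - m["begin"] > 6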

// List of regex patterns.
namespace libtextclassifier3.DatetimeModelPattern_;
table Regex {
  pattern:string (shared);

  // The ith entry specifies the type of the ith capturing group.
  // This is used to decide how the matched content has to be parsed.
  groups:[DatetimeGroupType];

  compressed_pattern:CompressedBuffer;
}

namespace libtextclassifier3;
table DatetimeModelPattern {
  regexes:[DatetimeModelPattern_.Regex];

  // List of locale indices in DatetimeModel that represent the locales that
  // these patterns should be used for; see the example after this table. If
  // empty, can be used for all locales.
  locales:[int];

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes for which to apply the patterns.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to apply the patterns.
  // This is a flag field for values of AnnotationUsecase.
  enabled_annotation_usecases:uint = 4294967295;
}
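
// For example (assuming 0-based indices): if DatetimeModel.locales is
// ["en-US", "de-DE", "ja-JP"], then locales:[1, 2] restricts the patterns
// above to German and Japanese input.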

namespace libtextclassifier3;
table DatetimeModelExtractor {
  extractor:DatetimeExtractorType;
  pattern:string (shared);
  locales:[int];
  compressed_pattern:CompressedBuffer;
}

namespace libtextclassifier3;
table DatetimeModel {
  // List of BCP 47 locale strings representing all locales supported by the
  // model. The individual patterns refer back to them using an index.
  locales:[string];

  patterns:[DatetimeModelPattern];
  extractors:[DatetimeModelExtractor];

  // If true, will use the extractors for determining the match location as
  // opposed to using the location where the global pattern matched.
  use_extractors_for_locating:bool = true;

  // List of locale ids whose rules are always run, after the requested ones.
  default_locales:[int];

  // If true, will generate alternative interpretations for ambiguous datetime
  // expressions.
  generate_alternative_interpretations_when_ambiguous:bool = false;

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool = true;

  // If true, will give only future dates (when the day is not specified).
  prefer_future_for_unspecified_date:bool = false;
}

// Configuration for the tokenizer.
namespace libtextclassifier3;
table GrammarTokenizerOptions {
  tokenization_type:TokenizationType = ICU;

  // If true, white space tokens will be kept when using the icu tokenizer.
  icu_preserve_whitespace_tokens:bool = false;

  // Codepoint ranges that determine what role the different codepoints play
  // during tokenization. The ranges must not overlap.
  tokenization_codepoint_config:[TokenizationCodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to
  // identify stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[CodepointRange];

  // If true, tokens will also be split when the codepoint's script_id changes
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = false;
}

namespace libtextclassifier3.DatetimeModelLibrary_;
table Item {
  key:string (shared);
  value:DatetimeModel;
}

// A set of named DateTime models.
namespace libtextclassifier3;
table DatetimeModelLibrary {
  models:[DatetimeModelLibrary_.Item];
}

// Classification result to instantiate for a rule match.
namespace libtextclassifier3.GrammarModel_;
table RuleClassificationResult {
  // The name of the collection.
  collection_name:string (shared);

  // The score.
  target_classification_score:float = 1;

  // The priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // Behaviour of capturing matches.
  capturing_group:[CapturingGroup];

  // Entity data to set for a match.
  serialized_entity_data:string (shared);

  // Enabled modes.
  enabled_modes:ModeFlag = ALL;

  entity_data:EntityData;
}

// Configuration for grammar based annotators.
namespace libtextclassifier3;
table GrammarModel {
  // The grammar rules.
  rules:grammar.RulesSet;

  rule_classification_result:[GrammarModel_.RuleClassificationResult];

  // Number of tokens in the context to use for classification and text
  // selection suggestion.
  // A value of -1 uses the full context.
  context_left_num_tokens:int;

  context_right_num_tokens:int;

  // Grammar specific tokenizer options.
  tokenizer_options:GrammarTokenizerOptions;

  // The score.
  target_classification_score:float = 1;

  // The priority score used for conflict resolution with the other models.
  priority_score:float = 1;
}

namespace libtextclassifier3.MoneyParsingOptions_;
table QuantitiesNameToExponentEntry {
  key:string (key, shared);
  value:int;
}

namespace libtextclassifier3;
table MoneyParsingOptions {
  // Separators (codepoints) marking the decimal or thousand positions in the
  // money amount.
  separators:[int];

  // Mapping between a quantity string (e.g. "million") and the power of 10 it
  // multiplies the amount with (e.g. 6 in case of "million"); see the example
  // after this table.
  // NOTE: The entries need to be sorted by key since we use LookupByKey.
  quantities_name_to_exponent:[MoneyParsingOptions_.QuantitiesNameToExponentEntry];
}
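
// For example, given a (hypothetical) entry {key: "million", value: 6}, an
// amount like "2.5 million" resolves to 2.5 * 10^6 = 2500000.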

namespace libtextclassifier3.ModelTriggeringOptions_;
table CollectionToPriorityEntry {
  key:string (key, shared);
  value:float;
}

// Options controlling the output of the Tensorflow Lite models.
namespace libtextclassifier3;
table ModelTriggeringOptions {
  // Lower bound threshold for filtering annotation model outputs.
  min_annotate_confidence:float = 0;

  // The modes for which to enable the models.
  enabled_modes:ModeFlag = ALL;

  // Comma-separated list of locales (BCP 47 tags) that dictionary
  // classification supports.
  dictionary_locales:string (shared);

  // Comma-separated list of locales (BCP 47 tags) that the model supports,
  // used to prevent triggering on input in unsupported languages. If empty,
  // the model will trigger on all inputs.
  locales:string (shared);

  // Priority score assigned to the "other" class from the ML model.
  other_collection_priority_score:float = -1000;

  // Priority score assigned to knowledge engine annotations.
  knowledge_priority_score:float = 0;
  reserved_7:int16 (deprecated);

  // Apply a factor to the priority score of entities whose collection is in
  // this map. Key: collection type, e.g. "address" or "phone"; value: the
  // factor to apply.
  // NOTE: The entries here need to be sorted by key since we use LookupByKey.
  collection_to_priority:[ModelTriggeringOptions_.CollectionToPriorityEntry];
}

// Options controlling the output of the classifier.
namespace libtextclassifier3;
table OutputOptions {
  // Lists of collection names that will be filtered out at the output:
  // - For annotation, the spans of the given collection are simply dropped.
  // - For classification, the result is mapped to the class "other".
  // - For selection, the spans of the given class are returned as
  //   single-selection.
  filtered_collections_annotation:[string];

  filtered_collections_classification:[string];
  filtered_collections_selection:[string];
}

namespace libtextclassifier3.Model_;
table EmbeddingPruningMask {
  // If true, use the pruning mask. In this case, pruning_mask determines the
  // mapping of hashed charactergrams.
  enabled:bool;

  // Packing of the binary pruning mask into uint64 values; see the note after
  // this table.
  pruning_mask:[ulong] (force_align: 16);

  // Number of buckets before pruning.
  full_num_buckets:int;

  // Index of the row of the compressed embedding matrix to which all pruned
  // buckets are mapped.
  pruned_row_bucket_id:int;
}
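
// Illustrative reading of the mask (the exact bit order within a word is an
// assumption): bucket b corresponds to word pruning_mask[b / 64], bit b % 64;
// a set bit keeps the bucket's own embedding row, while a cleared bit
// redirects the bucket to pruned_row_bucket_id.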

namespace libtextclassifier3.Model_;
table ConflictResolutionOptions {
  // If true, will prioritize the longest annotation during conflict
  // resolution.
  prioritize_longest_annotation:bool = false;

  // If true, the annotator will perform conflict resolution between the
  // different sub-annotators also in the RAW mode. If false, no conflict
  // resolution will be performed in RAW mode.
  do_conflict_resolution_in_raw_mode:bool = true;
}

namespace libtextclassifier3;
table Model {
  // Comma-separated list of locales supported by the model as BCP 47 tags.
  locales:string (shared);

  version:int;

  // A name for the model that can be used, e.g., for logging.
  name:string (shared);

  selection_feature_options:FeatureProcessorOptions;
  classification_feature_options:FeatureProcessorOptions;

  // Tensorflow Lite models.
  selection_model:[ubyte] (force_align: 16);

  classification_model:[ubyte] (force_align: 16);
  embedding_model:[ubyte] (force_align: 16);

  // Options for the different models.
  selection_options:SelectionModelOptions;

  classification_options:ClassificationModelOptions;
  regex_model:RegexModel;
  datetime_model:DatetimeModel;

  // Options controlling the output of the models.
  triggering_options:ModelTriggeringOptions;

  // Global switch that controls whether SuggestSelection(), ClassifyText() and
  // Annotate() will run. If a mode is disabled it returns empty/no-op results.
  enabled_modes:ModeFlag = ALL;

  // If true, will snap the selections that consist only of whitespaces to the
  // containing suggested span. Otherwise, no suggestion is proposed, since the
  // selections are not part of any token.
  snap_whitespace_selections:bool = true;

  // Global configuration for the output of SuggestSelection(), ClassifyText()
  // and Annotate().
  output_options:OutputOptions;

  // Configures how Intents should be generated on Android.
  android_intent_options:AndroidIntentFactoryOptions;

  intent_options:IntentFactoryModel;

  // Model resources.
  resources:ResourcePool;

  // Schema data for handling entity data.
  entity_data_schema:[ubyte];

  number_annotator_options:NumberAnnotatorOptions;
  duration_annotator_options:DurationAnnotatorOptions;

  // Comma-separated list of locales (BCP 47 tags) that the model supports,
  // used to prevent triggering on input in unsupported languages. If empty,
  // the model will trigger on all inputs.
  triggering_locales:string (shared);

  embedding_pruning_mask:Model_.EmbeddingPruningMask;
  reserved_25:int16 (deprecated);
  contact_annotator_options:ContactAnnotatorOptions;
  money_parsing_options:MoneyParsingOptions;
  translate_annotator_options:TranslateAnnotatorOptions;
  grammar_model:GrammarModel;
  conflict_resolution_options:Model_.ConflictResolutionOptions;
  experimental_model:ExperimentalModel;
  pod_ner_model:PodNerModel;
  vocab_model:VocabModel;
  datetime_grammar_model:GrammarModel;
}

// Method for selecting the center token.
namespace libtextclassifier3.FeatureProcessorOptions_;
enum CenterTokenSelectionMethod : int {
  // Invalid option.
  DEFAULT_CENTER_TOKEN_METHOD = 0,

  // Use click indices to determine the center token.
  CENTER_TOKEN_FROM_CLICK = 1,

  // Use selection indices to get a token range, and select the middle of it
  // as the center token.
  CENTER_TOKEN_MIDDLE_OF_SELECTION = 2,
}

// Bounds-sensitive feature extraction configuration.
namespace libtextclassifier3.FeatureProcessorOptions_;
table BoundsSensitiveFeatures {
  // Enables the extraction of bounds-sensitive features, instead of the click
  // context features.
  enabled:bool;

  // The numbers of tokens to extract in specific locations relative to the
  // bounds.
  // Immediately before the span.
  num_tokens_before:int;

  // Inside the span, aligned with the beginning.
  num_tokens_inside_left:int;

  // Inside the span, aligned with the end.
  num_tokens_inside_right:int;

  // Immediately after the span.
  num_tokens_after:int;

  // If true, also extracts the tokens of the entire span and adds up their
  // features, forming one "token" to include in the extracted features.
  include_inside_bag:bool;

  // If true, includes the selection length (in the number of tokens) as a
  // feature.
  include_inside_length:bool;

  // If true, for selection, single token spans are not run through the model
  // and their score is assumed to be zero.
  score_single_token_spans_as_zero:bool;
}

namespace libtextclassifier3;
table FeatureProcessorOptions {
  // Number of buckets used for hashing charactergrams.
  num_buckets:int = -1;

  // Size of the embedding.
  embedding_size:int = -1;

  // Number of bits for quantization for embeddings.
  embedding_quantization_bits:int = 8;

  // Context size defines the number of words to the left and to the right of
  // the selected word to be used as context. For example, if context size is
  // N, then we take N words to the left and N words to the right of the
  // selected word as its context.
  context_size:int = -1;

  // Maximum number of words of the context to select in total.
  max_selection_span:int = -1;

  // Orders of charactergrams to extract, e.g. 2 means character bigrams, 3
  // character trigrams, etc.
  chargram_orders:[int];

  // Maximum length of a word, in codepoints.
  max_word_length:int = 20;

  // If true, will use the unicode-aware functionality for extracting features.
  unicode_aware_features:bool = false;

  // Whether to extract the token case feature.
  extract_case_feature:bool = false;

  // Whether to extract the selection mask feature.
  extract_selection_mask_feature:bool = false;

  // List of regexps to run over each token. For each regexp, if there is a
  // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
  regexp_feature:[string];

  // Whether to remap all digits to a single number.
  remap_digits:bool = false;

  // Whether to lower-case each token before generating hashgrams.
  lowercase_tokens:bool;

  // If true, the selection classifier output will contain only the selections
  // that are feasible (e.g., those that are shorter than max_selection_span);
  // if false, the output will be a complete cross-product of possible
  // selections to the left and possible selections to the right, including the
  // infeasible ones.
  // NOTE: Exists mainly for compatibility with older models that were trained
  // with the non-reduced output space.
  selection_reduced_output_space:bool = true;

  // Collection names.
  collections:[string];

  // Index into collections of the collection to use when a collection name
  // can't be mapped to an id.
  default_collection:int = -1;

  // If true, will split the input by lines, and only use the line that
  // contains the clicked token.
  only_use_line_with_click:bool = false;

  // If true, will split tokens that contain the selection boundary, at the
  // position of the boundary.
  // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
  split_tokens_on_selection_boundaries:bool = false;

  // Codepoint ranges that determine how different codepoints are tokenized.
  // The ranges must not overlap.
  tokenization_codepoint_config:[TokenizationCodepointRange];

  center_token_selection_method:FeatureProcessorOptions_.CenterTokenSelectionMethod;

  // If true, span boundaries will be snapped to containing tokens and not
  // required to exactly match token boundaries.
  snap_label_span_boundaries_to_containing_tokens:bool;

  // A set of codepoint ranges supported by the model.
  supported_codepoint_ranges:[CodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to
  // identify stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[CodepointRange];

  // Minimum ratio of supported codepoints in the input context. If the ratio
  // is lower than this, the feature computation will fail.
  min_supported_codepoint_ratio:float = 0;

  // Used for versioning the format of features the model expects.
  // - feature_version == 0:
  //   For each token the features consist of:
  //   - chargram embeddings
  //   - dense features
  //   Chargram embeddings for the tokens are concatenated together first,
  //   and at the end, the dense features for the tokens are concatenated
  //   to them. So the resulting feature vector has two regions.
  feature_version:int = 0;

  tokenization_type:TokenizationType = INTERNAL_TOKENIZER;
  icu_preserve_whitespace_tokens:bool = false;

  // List of codepoints that will be stripped from the beginning and end of
  // predicted spans.
  ignored_span_boundary_codepoints:[int];

  bounds_sensitive_features:FeatureProcessorOptions_.BoundsSensitiveFeatures;

  // List of allowed charactergrams. The extracted charactergrams are filtered
  // using this list, and charactergrams that are not present are interpreted
  // as out-of-vocabulary.
  // If no allowed_chargrams are specified, all charactergrams are allowed.
  // The field is typed as bytes type to allow non-UTF8 chargrams.
  allowed_chargrams:[string];

  // If true, tokens will also be split when the codepoint's script_id changes
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = false;

  // If true, the pipe character '|' will be used as a newline character when
  // splitting lines.
  use_pipe_character_for_newline:bool = true;
}

namespace libtextclassifier3;
table NumberAnnotatorOptions {
  // If true, number and percentage annotations will be produced.
  enabled:bool = false;

  // Score to assign to the annotated numbers and percentages in the annotator.
  score:float = 1;

  // Number priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes in which to enable number and percentage annotations.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to produce number annotations.
  // This is a flag field for values of AnnotationUsecase.
  enabled_annotation_usecases:uint = 4294967295;

  // [Deprecated] A list of codepoints that can form a prefix of a valid number.
  allowed_prefix_codepoints:[int];

  // [Deprecated] A list of codepoints that can form a suffix of a valid number.
  allowed_suffix_codepoints:[int];

  // [Deprecated] List of codepoints that will be stripped from the beginning
  // of predicted spans.
  ignored_prefix_span_boundary_codepoints:[int];

  // [Deprecated] List of codepoints that will be stripped from the end of
  // predicted spans.
  ignored_suffix_span_boundary_codepoints:[int];

  // [Deprecated] If true, percent annotations will be produced.
  enable_percentage:bool = false;

  // Zero-separated and ordered list of suffixes that mark a percent; see the
  // example after this table.
  percentage_pieces_string:string (shared);

  // [Deprecated] List of suffix offsets in the percentage_pieces_string
  // string.
  percentage_pieces_offsets:[int];

  // Priority score for the percentage annotation.
  percentage_priority_score:float = 1;

  // Float number priority score used for conflict resolution with the other
  // models.
  float_number_priority_score:float = 0;

  // The maximum number of digits an annotated number can have. Requirement:
  // the value should be less than or equal to 20.
  max_number_of_digits:int = 20;

  // The annotation usecases for which to produce percentage annotations.
  // This is a flag field for values of AnnotationUsecase.
  percentage_annotation_usecases:uint = 2;
}
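
// For example, a percentage_pieces_string of "%\0 percent\0 pct" encodes the
// ordered suffix list {"%", " percent", " pct"} (the concrete suffix values
// here are illustrative, not taken from a shipped model).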

// DurationAnnotator is so far tailored for English and Japanese only.
namespace libtextclassifier3;
table DurationAnnotatorOptions {
  // If true, duration annotations will be produced.
  enabled:bool = false;

  // Score to assign to the annotated durations from the annotator.
  score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes in which to enable duration annotations.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to produce duration annotations.
  enabled_annotation_usecases:uint = 4294967295;

  // Durations typically look like "XX hours and XX minutes" etc. The lists of
  // strings below enumerate variants of "hours", "minutes", etc. used in these
  // expressions. These are verbatim strings that are matched against tokens in
  // the input.
  week_expressions:[string];

  day_expressions:[string];
  hour_expressions:[string];
  minute_expressions:[string];
  second_expressions:[string];

  // List of expressions that don't break a duration expression (they can
  // become part of it) but carry no semantic meaning.
  filler_expressions:[string];

  // List of expressions that mean half of a unit of duration (e.g. "half an
  // hour").
  half_expressions:[string];

  // Set of codepoints that can split the Annotator tokens to sub-tokens for
  // sub-token matching.
  sub_token_separator_codepoints:[int];

  // If this is true, a unit must be associated with a quantity; for example,
  // the phrase "minute" is not parsed as a one-minute duration if this is
  // true.
  require_quantity:bool;

  // If this is true, a dangling quantity is included in the annotation; for
  // example, "10 minutes 20" is interpreted as 10 minutes and 20 seconds.
  enable_dangling_quantity_interpretation:bool = true;
}

namespace libtextclassifier3;
table ContactAnnotatorOptions {
  // Supported for English genitives only so far.
  enable_declension:bool;

  // For each language there is a customized list of supported declensions.
  language:string (shared);
}

namespace libtextclassifier3.TranslateAnnotatorOptions_;
enum Algorithm : int {
  DEFAULT_ALGORITHM = 0,
  BACKOFF = 1,
}

// Backoff is the algorithm shipped with Android Q.
namespace libtextclassifier3.TranslateAnnotatorOptions_;
table BackoffOptions {
  // The minimum size of text to prefer for detection (in codepoints).
  min_text_size:int = 20;

  // Factor for reducing the score when the text is shorter than the preferred
  // size.
  penalize_ratio:float = 1;

  // Ratio of the original detection score to the surrounding-text detection
  // score.
  subject_text_score_ratio:float = 0.4;
}

namespace libtextclassifier3;
table TranslateAnnotatorOptions {
  enabled:bool = false;

  // Score to assign to the classification results.
  score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float;

  algorithm:TranslateAnnotatorOptions_.Algorithm;
  backoff_options:TranslateAnnotatorOptions_.BackoffOptions;
}

namespace libtextclassifier3.PodNerModel_;
table Collection {
  // The collection's name (e.g., "location", "person").
  name:string (shared);

  // Priority scores used for conflict resolution with the other annotators
  // when the annotation spans a single-token or multi-token text,
  // respectively.
  single_token_priority_score:float;

  multi_token_priority_score:float;
}

namespace libtextclassifier3.PodNerModel_.Label_;
enum BoiseType : int {
  NONE = 0,
  BEGIN = 1,
  // No label.
  O = 2,

  INTERMEDIATE = 3,
  SINGLE = 4,
  END = 5,
}

namespace libtextclassifier3.PodNerModel_.Label_;
enum MentionType : int {
  UNDEFINED = 0,
  NAM = 1,
  NOM = 2,
}

namespace libtextclassifier3.PodNerModel_;
table Label {
  boise_type:Label_.BoiseType;
  mention_type:Label_.MentionType;

  // Index into the collections array of PodNerModel.
  collection_id:int;
}

namespace libtextclassifier3;
table PodNerModel {
  tflite_model:[ubyte];
  word_piece_vocab:[ubyte];
  lowercase_input:bool = true;

  // Index of the mention_logits tensor in the output of the tflite model. Can
  // be found in the textproto output after the model is converted to tflite.
  logits_index_in_output_tensor:int = 0;

  // Whether to append a period at the end of an input that doesn't already
  // end in punctuation.
  append_final_period:bool = false;

  // Priority score used for conflict resolution with the other models. Used
  // only if the collections array is empty.
  priority_score:float = 0;

  // Maximum number of wordpieces supported by the model.
  max_num_wordpieces:int = 128;

  // In case of long text (more wordpieces than the maximum) we use a sliding
  // window approach; this field determines the number of overlapping
  // wordpieces between two consecutive windows. The overlap provides context
  // for each word that NER annotates (see the worked example after this
  // table).
  sliding_window_num_wordpieces_overlap:int = 20;
  reserved_9:int16 (deprecated);

  // The possible labels the NER model can output. If empty, the default labels
  // will be used.
  labels:[PodNerModel_.Label];

  // If the ratio of unknown wordpieces in the input text is greater than this
  // maximum, the text won't be annotated.
  max_ratio_unknown_wordpieces:float = 0.1;

  // Possible collections for labeled entities.
  collections:[PodNerModel_.Collection];

  // Minimum numbers of words and wordpieces required for the text to be
  // annotated.
  min_number_of_tokens:int = 1;

  min_number_of_wordpieces:int = 1;
}
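
// Worked example for the sliding window (using the defaults above, and
// assuming each window advances by the maximum minus the overlap): with
// max_num_wordpieces = 128 and an overlap of 20, consecutive windows start
// 128 - 20 = 108 wordpieces apart, so a 300-wordpiece input is covered by
// three windows starting at wordpieces 0, 108 and 216.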

namespace libtextclassifier3;
table VocabModel {
  // A trie that stores a list of vocabs that trigger "Define". An id is
  // returned when looking up a vocab from the trie, and the id can be used
  // to access more information about that vocab (see the lookup flow after
  // this table). The marisa trie library requires 8-byte alignment because
  // the first thing in a marisa trie is a 64-bit integer.
  vocab_trie:[ubyte] (force_align: 8);

  // A bit vector that tells if the vocab should trigger "Define" for users of
  // beginner proficiency only. To look up the bit vector, use the id returned
  // by the trie.
  beginner_level:BitVectorData;

  // A sorted list of indices of vocabs that should not trigger "Define" if
  // their leading character is in upper case. The indices are those returned
  // by the trie. You may perform binary search to look up an index.
  do_not_trigger_in_upper_case:BitVectorData;

  // Comma-separated list of locales (BCP 47 tags) that the model supports,
  // used to prevent triggering on input in unsupported languages. If empty,
  // the model will trigger on all inputs.
  triggering_locales:string (shared);

  // The final score to assign to the results of the vocab model.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;
}
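
// Putting the pieces together, a lookup presumably proceeds as follows:
//   1. Look the candidate span up in vocab_trie; if no id is returned, the
//      span does not trigger "Define".
//   2. Use the returned id to consult beginner_level and
//      do_not_trigger_in_upper_case and decide whether to suppress the
//      result.
//   3. Check triggering_locales against the input language, then emit a
//      result scored with target_classification_score and priority_score.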

root_type libtextclassifier3.Model;