//
// Copyright (C) 2018 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

include "utils/codepoint-range.fbs";
include "utils/flatbuffers.fbs";
include "utils/intents/intent-config.fbs";
include "utils/resources.fbs";
include "utils/tokenizer.fbs";
include "utils/zlib/buffer.fbs";

file_identifier "TC2 ";

// The possible model modes, represents a bit field.
namespace libtextclassifier3;
enum ModeFlag : int {
  NONE = 0,
  ANNOTATION = 1,
  CLASSIFICATION = 2,
  ANNOTATION_AND_CLASSIFICATION = 3,
  SELECTION = 4,
  ANNOTATION_AND_SELECTION = 5,
  CLASSIFICATION_AND_SELECTION = 6,
  ALL = 7,
}

// Enum for specifying the annotation usecase.
namespace libtextclassifier3;
enum AnnotationUsecase : int {
  // Results are optimized for Smart{Select,Share,Linkify}.
  ANNOTATION_USECASE_SMART = 0,

  // Results are optimized for using TextClassifier as an infrastructure that
  // annotates as much as possible.
  ANNOTATION_USECASE_RAW = 1,
}

// Types of matched groups that the datetime rule extractors can produce.
namespace libtextclassifier3;
enum DatetimeExtractorType : int {
  UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0,
  AM = 1,
  PM = 2,
  JANUARY = 3,
  FEBRUARY = 4,
  MARCH = 5,
  APRIL = 6,
  MAY = 7,
  JUNE = 8,
  JULY = 9,
  AUGUST = 10,
  SEPTEMBER = 11,
  OCTOBER = 12,
  NOVEMBER = 13,
  DECEMBER = 14,
  NEXT = 15,
  NEXT_OR_SAME = 16,
  LAST = 17,
  NOW = 18,
  TOMORROW = 19,
  YESTERDAY = 20,
  PAST = 21,
  FUTURE = 22,
  DAY = 23,
  WEEK = 24,
  MONTH = 25,
  YEAR = 26,
  MONDAY = 27,
  TUESDAY = 28,
  WEDNESDAY = 29,
  THURSDAY = 30,
  FRIDAY = 31,
  SATURDAY = 32,
  SUNDAY = 33,
  DAYS = 34,
  WEEKS = 35,
  MONTHS = 36,

  // TODO(zilka): Make the following 3 values singular for consistency.
  HOURS = 37,

  MINUTES = 38,
  SECONDS = 39,
  YEARS = 40,
  DIGITS = 41,
  SIGNEDDIGITS = 42,
  ZERO = 43,
  ONE = 44,
  TWO = 45,
  THREE = 46,
  FOUR = 47,
  FIVE = 48,
  SIX = 49,
  SEVEN = 50,
  EIGHT = 51,
  NINE = 52,
  TEN = 53,
  ELEVEN = 54,
  TWELVE = 55,
  THIRTEEN = 56,
  FOURTEEN = 57,
  FIFTEEN = 58,
  SIXTEEN = 59,
  SEVENTEEN = 60,
  EIGHTEEN = 61,
  NINETEEN = 62,
  TWENTY = 63,
  THIRTY = 64,
  FORTY = 65,
  FIFTY = 66,
  SIXTY = 67,
  SEVENTY = 68,
  EIGHTY = 69,
  NINETY = 70,
  HUNDRED = 71,
  THOUSAND = 72,
}

// Semantic meaning of a capturing group in a datetime regex pattern.
namespace libtextclassifier3;
enum DatetimeGroupType : int {
  GROUP_UNKNOWN = 0,
  GROUP_UNUSED = 1,
  GROUP_YEAR = 2,
  GROUP_MONTH = 3,
  GROUP_DAY = 4,
  GROUP_HOUR = 5,
  GROUP_MINUTE = 6,
  GROUP_SECOND = 7,
  GROUP_AMPM = 8,
  GROUP_RELATIONDISTANCE = 9,
  GROUP_RELATION = 10,
  GROUP_RELATIONTYPE = 11,

  // Dummy groups serve just as an inflator of the selection. E.g. we might want
  // to select more text than was contained in an envelope of all extractor
  // spans.
  GROUP_DUMMY1 = 12,

  GROUP_DUMMY2 = 13,
}

// Options for the model that predicts text selection.
namespace libtextclassifier3;
table SelectionModelOptions {
  // If true, before the selection is returned, the unpaired brackets contained
  // in the predicted selection are stripped from both selection ends.
  // The bracket codepoints are defined in the Unicode standard:
  // http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
  strip_unpaired_brackets:bool = true;

  // Number of hypothetical click positions on either side of the actual click
  // to consider in order to enforce symmetry.
  symmetry_context_size:int;

  // Number of examples to bundle in one batch for inference.
  batch_size:int = 1024;

  // Whether to always classify a suggested selection or only on demand.
  always_classify_suggested_selection:bool = false;
}

// Options for the model that classifies a text selection.
namespace libtextclassifier3;
table ClassificationModelOptions {
  // Limits for phone numbers.
  phone_min_num_digits:int = 7;

  phone_max_num_digits:int = 15;

  // Limits for addresses.
  address_min_num_tokens:int;

  // Maximum number of tokens to attempt a classification (-1 is unlimited).
  max_num_tokens:int = -1;
}

// Options for post-checks, checksums and verification to apply on a match.
namespace libtextclassifier3;
table VerificationOptions {
  verify_luhn_checksum:bool = false;

  // Lua verifier to use.
  // Index of the lua verifier in the model.
  lua_verifier:int = -1;
}

// Behaviour of capturing groups.
namespace libtextclassifier3.RegexModel_.Pattern_;
table CapturingGroup {
  // If true, the span of the capturing group will be used to
  // extend the selection.
  extend_selection:bool = true;

  // If set, the text of the capturing group will be used to set a field in
  // the classification result entity data.
  entity_field_path:FlatbufferFieldPath;
}

// List of regular expression matchers to check.
namespace libtextclassifier3.RegexModel_;
table Pattern {
  // The name of the collection of a match.
  collection_name:string;

  // The pattern to check.
  pattern:string;

  // The modes for which to apply the patterns.
  enabled_modes:ModeFlag = ALL;

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // If true, will use an approximate matching implementation implemented
  // using Find() instead of the true Match(). This approximate matching will
  // use the first Find() result and then check that it spans the whole input.
  use_approximate_matching:bool = false;

  compressed_pattern:CompressedBuffer;

  // Verification to apply on a match.
  verification_options:VerificationOptions;

  capturing_group:[Pattern_.CapturingGroup];

  // Serialized entity data to set for a match.
  serialized_entity_data:string;
}

namespace libtextclassifier3;
table RegexModel {
  patterns:[RegexModel_.Pattern];

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool = true;

  // Lua scripts for match verification.
  // The verifier can access:
  // * `context`: The context as a string.
  // * `match`: The groups of the regex match as an array; each group gives:
  //   * `begin`: span start
  //   * `end`: span end
  //   * `text`: the text
  // The verifier is expected to return a boolean, indicating whether the
  // verification succeeded or not.
  lua_verifier:[string];
}

// List of regex patterns.
namespace libtextclassifier3.DatetimeModelPattern_;
table Regex {
  pattern:string;

  // The ith entry specifies the type of the ith capturing group.
  // This is used to decide how the matched content has to be parsed.
  groups:[DatetimeGroupType];

  compressed_pattern:CompressedBuffer;
}

namespace libtextclassifier3;
table DatetimeModelPattern {
  regexes:[DatetimeModelPattern_.Regex];

  // List of locale indices in DatetimeModel that represent the locales that
  // these patterns should be used for. If empty, can be used for all locales.
  locales:[int];

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes for which to apply the patterns.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to apply the patterns.
  // This is a flag field for values of AnnotationUsecase.
  enabled_annotation_usecases:uint = 4294967295;
}

namespace libtextclassifier3;
table DatetimeModelExtractor {
  extractor:DatetimeExtractorType;
  pattern:string;
  // Indices into DatetimeModel.locales that this extractor applies to.
  locales:[int];
  compressed_pattern:CompressedBuffer;
}

namespace libtextclassifier3;
table DatetimeModel {
  // List of BCP 47 locale strings representing all locales supported by the
  // model. The individual patterns refer back to them using an index.
  locales:[string];

  patterns:[DatetimeModelPattern];
  extractors:[DatetimeModelExtractor];

  // If true, will use the extractors for determining the match location as
  // opposed to using the location where the global pattern matched.
  use_extractors_for_locating:bool = true;

  // List of locale ids whose rules are always run, after the requested ones.
  default_locales:[int];

  // If true, will generate the alternative interpretations for ambiguous
  // datetime expressions.
  generate_alternative_interpretations_when_ambiguous:bool = false;

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool = true;
}

// A single named entry of a DatetimeModelLibrary.
namespace libtextclassifier3.DatetimeModelLibrary_;
table Item {
  key:string;
  value:DatetimeModel;
}

// A set of named DateTime models.
namespace libtextclassifier3;
table DatetimeModelLibrary {
  models:[DatetimeModelLibrary_.Item];
}

// Options controlling the output of the Tensorflow Lite models.
namespace libtextclassifier3;
table ModelTriggeringOptions {
  // Lower bound threshold for filtering annotation model outputs.
  min_annotate_confidence:float = 0;

  // The modes for which to enable the models.
  enabled_modes:ModeFlag = ALL;

  // Comma-separated list of locales (BCP 47 tags) that dictionary
  // classification supports.
  dictionary_locales:string;

  // Comma-separated list of locales (BCP 47 tags) that the model supports, that
  // are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  locales:string;
}

// Options controlling the output of the classifier.
namespace libtextclassifier3;
table OutputOptions {
  // Lists of collection names that will be filtered out at the output:
  // - For annotation, the spans of given collection are simply dropped.
  // - For classification, the result is mapped to the class "other".
  // - For selection, the spans of given class are returned as
  //   single-selection.
  filtered_collections_annotation:[string];

  filtered_collections_classification:[string];
  filtered_collections_selection:[string];
}

namespace libtextclassifier3.Model_;
table EmbeddingPruningMask {
  // If true, use pruning mask. In this case, we use mask
  // pruning_mask to determine the mapping of hashed-charactergrams.
  enabled:bool;

  // Packing of the binary pruning mask into uint64 values.
  pruning_mask:[ulong] (force_align: 16);

  // Number of buckets before pruning.
  full_num_buckets:int;

  // Index of row of compressed embedding matrix to which all pruned buckets
  // are mapped.
  pruned_row_bucket_id:int;
}

namespace libtextclassifier3;
table Model {
  // Comma-separated list of locales supported by the model as BCP 47 tags.
  locales:string;

  version:int;

  // A name for the model that can be used for e.g. logging.
  name:string;

  selection_feature_options:FeatureProcessorOptions;
  classification_feature_options:FeatureProcessorOptions;

  // Tensorflow Lite models.
  selection_model:[ubyte] (force_align: 16);

  classification_model:[ubyte] (force_align: 16);
  embedding_model:[ubyte] (force_align: 16);

  // Options for the different models.
  selection_options:SelectionModelOptions;

  classification_options:ClassificationModelOptions;
  regex_model:RegexModel;
  datetime_model:DatetimeModel;

  // Options controlling the output of the models.
  triggering_options:ModelTriggeringOptions;

  // Global switch that controls if SuggestSelection(), ClassifyText() and
  // Annotate() will run. If a mode is disabled it returns empty/no-op results.
  enabled_modes:ModeFlag = ALL;

  // If true, will snap the selections that consist only of whitespaces to the
  // containing suggested span. Otherwise, no suggestion is proposed, since the
  // selections are not part of any token.
  snap_whitespace_selections:bool = true;

  // Global configuration for the output of SuggestSelection(), ClassifyText()
  // and Annotate().
  output_options:OutputOptions;

  // Configures how Intents should be generated on Android.
  android_intent_options:AndroidIntentFactoryOptions;

  intent_options:IntentFactoryModel;

  // Model resources.
  resources:ResourcePool;

  // Schema data for handling entity data.
  entity_data_schema:[ubyte];

  number_annotator_options:NumberAnnotatorOptions;
  duration_annotator_options:DurationAnnotatorOptions;

  // Comma-separated list of locales (BCP 47 tags) that the model supports, that
  // are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  triggering_locales:string;

  embedding_pruning_mask:Model_.EmbeddingPruningMask;
}

// Method for selecting the center token.
namespace libtextclassifier3.FeatureProcessorOptions_;
enum CenterTokenSelectionMethod : int {
  DEFAULT_CENTER_TOKEN_METHOD = 0,

  // Use click indices to determine the center token.
  CENTER_TOKEN_FROM_CLICK = 1,

  // Use selection indices to get a token range, and select the middle of it
  // as the center token.
  CENTER_TOKEN_MIDDLE_OF_SELECTION = 2,
}

// Bounds-sensitive feature extraction configuration.
namespace libtextclassifier3.FeatureProcessorOptions_;
table BoundsSensitiveFeatures {
  // Enables the extraction of bounds-sensitive features, instead of the click
  // context features.
  enabled:bool;

  // The numbers of tokens to extract in specific locations relative to the
  // bounds.
  // Immediately before the span.
  num_tokens_before:int;

  // Inside the span, aligned with the beginning.
  num_tokens_inside_left:int;

  // Inside the span, aligned with the end.
  num_tokens_inside_right:int;

  // Immediately after the span.
  num_tokens_after:int;

  // If true, also extracts the tokens of the entire span and adds up their
  // features forming one "token" to include in the extracted features.
  include_inside_bag:bool;

  // If true, includes the selection length (in the number of tokens) as a
  // feature.
  include_inside_length:bool;

  // If true, for selection, single token spans are not run through the model
  // and their score is assumed to be zero.
  score_single_token_spans_as_zero:bool;
}

namespace libtextclassifier3;
table FeatureProcessorOptions {
  // Number of buckets used for hashing charactergrams.
  num_buckets:int = -1;

  // Size of the embedding.
  embedding_size:int = -1;

  // Number of bits for quantization for embeddings.
  embedding_quantization_bits:int = 8;

  // Context size defines the number of words to the left and to the right of
  // the selected word to be used as context. For example, if context size is
  // N, then we take N words to the left and N words to the right of the
  // selected word as its context.
  context_size:int = -1;

  // Maximum number of words of the context to select in total.
  max_selection_span:int = -1;

  // Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
  // character trigrams etc.
  chargram_orders:[int];

  // Maximum length of a word, in codepoints.
  max_word_length:int = 20;

  // If true, will use the unicode-aware functionality for extracting features.
  unicode_aware_features:bool = false;

  // Whether to extract the token case feature.
  extract_case_feature:bool = false;

  // Whether to extract the selection mask feature.
  extract_selection_mask_feature:bool = false;

  // List of regexps to run over each token. For each regexp, if there is a
  // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
  regexp_feature:[string];

  // Whether to remap all digits to a single number.
  remap_digits:bool = false;

  // Whether to lower-case each token before generating hashgrams.
  lowercase_tokens:bool;

  // If true, the selection classifier output will contain only the selections
  // that are feasible (e.g., those that are shorter than max_selection_span),
  // if false, the output will be a complete cross-product of possible
  // selections to the left and possible selections to the right, including the
  // infeasible ones.
  // NOTE: Exists mainly for compatibility with older models that were trained
  // with the non-reduced output space.
  selection_reduced_output_space:bool = true;

  // Collection names.
  collections:[string];

  // An index of collection in collections to be used if a collection name can't
  // be mapped to an id.
  default_collection:int = -1;

  // If true, will split the input by lines, and only use the line that contains
  // the clicked token.
  only_use_line_with_click:bool = false;

  // If true, will split tokens that contain the selection boundary, at the
  // position of the boundary.
  // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
  split_tokens_on_selection_boundaries:bool = false;

  // Codepoint ranges that determine how different codepoints are tokenized.
  // The ranges must not overlap.
  tokenization_codepoint_config:[TokenizationCodepointRange];

  center_token_selection_method:FeatureProcessorOptions_.CenterTokenSelectionMethod;

  // If true, span boundaries will be snapped to containing tokens and not
  // required to exactly match token boundaries.
  snap_label_span_boundaries_to_containing_tokens:bool;

  // A set of codepoint ranges supported by the model.
  supported_codepoint_ranges:[CodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to identify
  // stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[CodepointRange];

  // Minimum ratio of supported codepoints in the input context. If the ratio
  // is lower than this, the feature computation will fail.
  min_supported_codepoint_ratio:float = 0;

  // Used for versioning the format of features the model expects.
  // - feature_version == 0:
  //   For each token the features consist of:
  //   - chargram embeddings
  //   - dense features
  //   Chargram embeddings for tokens are concatenated first together,
  //   and at the end, the dense features for the tokens are concatenated
  //   to it. So the resulting feature vector has two regions.
  feature_version:int = 0;

  tokenization_type:TokenizationType = INTERNAL_TOKENIZER;
  icu_preserve_whitespace_tokens:bool = false;

  // List of codepoints that will be stripped from beginning and end of
  // predicted spans.
  ignored_span_boundary_codepoints:[int];

  bounds_sensitive_features:FeatureProcessorOptions_.BoundsSensitiveFeatures;

  // List of allowed charactergrams. The extracted charactergrams are filtered
  // using this list, and charactergrams that are not present are interpreted as
  // out-of-vocabulary.
  // If no allowed_chargrams are specified, all charactergrams are allowed.
  // The field is typed as bytes type to allow non-UTF8 chargrams.
  allowed_chargrams:[string];

  // If true, tokens will be also split when the codepoint's script_id changes
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = false;
}

namespace libtextclassifier3;
table NumberAnnotatorOptions {
  // If true, number annotations will be produced.
  enabled:bool = false;

  // Score to assign to the annotated numbers from the annotator.
  score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes in which to enable number annotations.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to produce number annotations.
  // This is a flag field for values of AnnotationUsecase.
  enabled_annotation_usecases:uint = 4294967295;

  // A list of codepoints that can form a prefix of a valid number.
  allowed_prefix_codepoints:[int];

  // A list of codepoints that can form a suffix of a valid number.
  allowed_suffix_codepoints:[int];
}

// DurationAnnotator is so far tailored for English only.
namespace libtextclassifier3;
table DurationAnnotatorOptions {
  // If true, duration annotations will be produced.
  enabled:bool = false;

  // Score to assign to the annotated durations from the annotator.
  score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes in which to enable duration annotations.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to produce duration annotations.
  enabled_annotation_usecases:uint = 4294967295;

  // Durations typically look like XX hours and XX minutes etc... The list of
  // strings below enumerate variants of "hours", "minutes", etc. in these
  // expressions. These are verbatim strings that are matched against tokens in
  // the input.
  week_expressions:[string];

  day_expressions:[string];
  hour_expressions:[string];
  minute_expressions:[string];
  second_expressions:[string];

  // List of expressions that don't break a duration expression (they can
  // become a part of it) but have no semantic meaning.
  filler_expressions:[string];

  // List of expressions that mean half of a unit of duration (e.g. "half an
  // hour").
  half_expressions:[string];
}

root_type libtextclassifier3.Model;