• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1//
2// Copyright (C) 2017 The Android Open Source Project
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8//      http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15//
16
// File identifier for serialized buffers of this schema. FlatBuffers requires
// the identifier to be exactly four characters, hence the trailing space.
17file_identifier "TC2 ";
18
19// The possible model modes. The values form a bit field: ANNOTATION (1),
// CLASSIFICATION (2) and SELECTION (4) are the single bits, and each of the
// remaining values is the bitwise OR of the modes it names (ALL = 7 has all
// three bits set).
20namespace libtextclassifier2;
21enum ModeFlag : int {
22  NONE = 0,
23  ANNOTATION = 1,
24  CLASSIFICATION = 2,
25  ANNOTATION_AND_CLASSIFICATION = 3,
26  SELECTION = 4,
27  ANNOTATION_AND_SELECTION = 5,
28  CLASSIFICATION_AND_SELECTION = 6,
29  ALL = 7,
30}
31
// Kinds of datetime extractor; this is the type of
// DatetimeModelExtractor.extractor. Judging by their names, the values cover
// AM/PM markers, month names, relative expressions (NEXT, LAST, NOW, ...),
// time units, weekday names, digit groups and spelled-out numbers.
32namespace libtextclassifier2;
33enum DatetimeExtractorType : int {
34  UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0,
35  AM = 1,
36  PM = 2,
37  JANUARY = 3,
38  FEBRUARY = 4,
39  MARCH = 5,
40  APRIL = 6,
41  MAY = 7,
42  JUNE = 8,
43  JULY = 9,
44  AUGUST = 10,
45  SEPTEMBER = 11,
46  OCTOBER = 12,
47  NOVEMBER = 13,
48  DECEMBER = 14,
49  NEXT = 15,
50  NEXT_OR_SAME = 16,
51  LAST = 17,
52  NOW = 18,
53  TOMORROW = 19,
54  YESTERDAY = 20,
55  PAST = 21,
56  FUTURE = 22,
57  DAY = 23,
58  WEEK = 24,
59  MONTH = 25,
60  YEAR = 26,
61  MONDAY = 27,
62  TUESDAY = 28,
63  WEDNESDAY = 29,
64  THURSDAY = 30,
65  FRIDAY = 31,
66  SATURDAY = 32,
67  SUNDAY = 33,
68  DAYS = 34,
69  WEEKS = 35,
70  MONTHS = 36,
71  HOURS = 37,
72  MINUTES = 38,
73  SECONDS = 39,
74  YEARS = 40,
75  DIGITS = 41,
76  SIGNEDDIGITS = 42,
77  ZERO = 43,
78  ONE = 44,
79  TWO = 45,
80  THREE = 46,
81  FOUR = 47,
82  FIVE = 48,
83  SIX = 49,
84  SEVEN = 50,
85  EIGHT = 51,
86  NINE = 52,
87  TEN = 53,
88  ELEVEN = 54,
89  TWELVE = 55,
90  THIRTEEN = 56,
91  FOURTEEN = 57,
92  FIFTEEN = 58,
93  SIXTEEN = 59,
94  SEVENTEEN = 60,
95  EIGHTEEN = 61,
96  NINETEEN = 62,
97  TWENTY = 63,
98  THIRTY = 64,
99  FORTY = 65,
100  FIFTY = 66,
101  SIXTY = 67,
102  SEVENTY = 68,
103  EIGHTY = 69,
104  NINETY = 70,
105  HUNDRED = 71,
106  THOUSAND = 72,
107}
108
// Type of a capturing group in a datetime regex. The `groups` vector of
// DatetimeModelPattern_.Regex holds one of these values per capturing group
// and is used to decide how the matched content has to be parsed.
109namespace libtextclassifier2;
110enum DatetimeGroupType : int {
111  GROUP_UNKNOWN = 0,
112  GROUP_UNUSED = 1,
113  GROUP_YEAR = 2,
114  GROUP_MONTH = 3,
115  GROUP_DAY = 4,
116  GROUP_HOUR = 5,
117  GROUP_MINUTE = 6,
118  GROUP_SECOND = 7,
119  GROUP_AMPM = 8,
120  GROUP_RELATIONDISTANCE = 9,
121  GROUP_RELATION = 10,
122  GROUP_RELATIONTYPE = 11,
123
124  // Dummy groups serve just as an inflator of the selection. E.g. we might want
125  // to select more text than was contained in an envelope of all extractor
126  // spans.
127  GROUP_DUMMY1 = 12,
128
  // Second dummy group; see the comment on GROUP_DUMMY1.
129  GROUP_DUMMY2 = 13,
130}
131
// A byte buffer together with an uncompressed size. Used as the type of the
// `compressed_pattern` fields of the regex and datetime tables.
// NOTE(review): the compression scheme is not specified in this schema, and
// uncompressed_size is presumably the size in bytes after decompression —
// confirm against the code that reads it.
132namespace libtextclassifier2;
133table CompressedBuffer {
134  buffer:[ubyte];
135  uncompressed_size:int;
136}
137
138// Options for the model that predicts text selection.
139namespace libtextclassifier2;
140table SelectionModelOptions {
141  // If true, before the selection is returned, the unpaired brackets contained
142  // in the predicted selection are stripped from both ends of the selection.
143  // The bracket codepoints are defined in the Unicode standard:
144  // http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
145  strip_unpaired_brackets:bool = 1;
146
147  // Number of hypothetical click positions on either side of the actual click
148  // to consider in order to enforce symmetry.
  // No explicit default is given, so the field defaults to 0.
149  symmetry_context_size:int;
150
151  // Number of examples to bundle in one batch for inference.
152  batch_size:int = 1024;
153
154  // Whether to always classify a suggested selection or only on demand.
155  always_classify_suggested_selection:bool = 0;
156}
157
158// Options for the model that classifies a text selection.
159namespace libtextclassifier2;
160table ClassificationModelOptions {
161  // Limits for phone numbers: minimum (default 7) and maximum (default 15)
  // number of digits.
162  phone_min_num_digits:int = 7;
163
164  phone_max_num_digits:int = 15;
165
166  // Limits for addresses.
  // No explicit default is given, so the field defaults to 0.
167  address_min_num_tokens:int;
168
169  // Maximum number of tokens to attempt a classification (-1 is unlimited).
170  max_num_tokens:int = -1;
171}
172
173// A single regular expression matcher to check; RegexModel.patterns holds the
// list of these.
174namespace libtextclassifier2.RegexModel_;
175table Pattern {
176  // The name of the collection of a match.
177  collection_name:string;
178
179  // The pattern to check.
180  // Can specify a single capturing group used as match boundaries.
181  pattern:string;
182
183  // The modes for which to apply the patterns.
184  enabled_modes:libtextclassifier2.ModeFlag = ALL;
185
186  // The final score to assign to the results of this pattern.
187  target_classification_score:float = 1;
188
189  // Priority score used for conflict resolution with the other models.
190  priority_score:float = 0;
191
192  // If true, will use an approximate matching implementation implemented
193  // using Find() instead of the true Match(). This approximate matching will
194  // use the first Find() result and then check that it spans the whole input.
195  use_approximate_matching:bool = 0;
196
  // NOTE(review): presumably a compressed form of `pattern`; which of the two
  // takes precedence is not stated in this schema — confirm with the reader.
197  compressed_pattern:libtextclassifier2.CompressedBuffer;
198}
199
// The regex model: a list of regular expression patterns.
200namespace libtextclassifier2;
201table RegexModel {
  // List of regular expression matchers to check.
202  patterns:[libtextclassifier2.RegexModel_.Pattern];
203}
204
205// A single regex pattern of a DatetimeModelPattern, which holds a list of
// these in its `regexes` field.
206namespace libtextclassifier2.DatetimeModelPattern_;
207table Regex {
208  pattern:string;
209
210  // The ith entry specifies the type of the ith capturing group.
211  // This is used to decide how the matched content has to be parsed.
212  groups:[libtextclassifier2.DatetimeGroupType];
213
  // NOTE(review): presumably a compressed form of `pattern`; which of the two
  // takes precedence is not stated in this schema — confirm with the reader.
214  compressed_pattern:libtextclassifier2.CompressedBuffer;
215}
216
// A group of datetime regexes that share the same locales, scores and
// enabled modes.
217namespace libtextclassifier2;
218table DatetimeModelPattern {
  // List of regex patterns.
219  regexes:[libtextclassifier2.DatetimeModelPattern_.Regex];
220
221  // List of locale indices in DatetimeModel that represent the locales that
222  // these patterns should be used for. If empty, can be used for all locales.
223  locales:[int];
224
225  // The final score to assign to the results of this pattern.
226  target_classification_score:float = 1;
227
228  // Priority score used for conflict resolution with the other models.
229  priority_score:float = 0;
230
231  // The modes for which to apply the patterns.
232  enabled_modes:libtextclassifier2.ModeFlag = ALL;
233}
234
// Associates a DatetimeExtractorType with a pattern and a list of locales.
235namespace libtextclassifier2;
236table DatetimeModelExtractor {
  // The kind of extractor this entry defines.
237  extractor:libtextclassifier2.DatetimeExtractorType;
238  pattern:string;
  // NOTE(review): presumably indices into DatetimeModel.locales, as for
  // DatetimeModelPattern.locales — confirm.
239  locales:[int];
  // NOTE(review): presumably a compressed form of `pattern` — confirm.
240  compressed_pattern:libtextclassifier2.CompressedBuffer;
241}
242
// The datetime model: the supported locales together with the patterns and
// extractors that refer to those locales by index.
243namespace libtextclassifier2;
244table DatetimeModel {
245  // List of BCP 47 locale strings representing all locales supported by the
246  // model. The individual patterns refer back to them using an index.
247  locales:[string];
248
249  patterns:[libtextclassifier2.DatetimeModelPattern];
250  extractors:[libtextclassifier2.DatetimeModelExtractor];
251
252  // If true, will use the extractors for determining the match location as
253  // opposed to using the location where the global pattern matched.
254  use_extractors_for_locating:bool = 1;
255
256  // List of locale ids whose rules are always run, after the requested
257  // ones.
258  default_locales:[int];
259}
260
// Key/value entry of DatetimeModelLibrary.models: a string key and the
// DatetimeModel stored under it.
261namespace libtextclassifier2.DatetimeModelLibrary_;
262table Item {
263  key:string;
264  value:libtextclassifier2.DatetimeModel;
265}
266
267// A set of named DateTime models.
268namespace libtextclassifier2;
269table DatetimeModelLibrary {
  // The models, each stored as a key/value Item.
270  models:[libtextclassifier2.DatetimeModelLibrary_.Item];
271}
272
273// Options controlling the output of the Tensorflow Lite models.
274namespace libtextclassifier2;
275table ModelTriggeringOptions {
276  // Lower bound threshold for filtering annotation model outputs.
  // Defaults to 0.
277  min_annotate_confidence:float = 0;
278
279  // The modes for which to enable the models. Defaults to all modes.
280  enabled_modes:libtextclassifier2.ModeFlag = ALL;
281}
282
283// Options controlling the output of the classifier.
284namespace libtextclassifier2;
285table OutputOptions {
286  // Lists of collection names that will be filtered out at the output:
287  // - For annotation, the spans of given collection are simply dropped.
288  // - For classification, the result is mapped to the class "other".
289  // - For selection, the spans of given class are returned as
290  // single-selection.
291  filtered_collections_annotation:[string];
292
  // Collections filtered for classification (see the list above).
293  filtered_collections_classification:[string];
  // Collections filtered for selection (see the list above).
294  filtered_collections_selection:[string];
295}
296
// Top-level model description, declared as the root type of this schema.
// Holds the supported locales, version and name, the feature processing
// options, the embedded Tensorflow Lite models and the options of the
// individual sub-models.
297namespace libtextclassifier2;
298table Model {
299  // Comma-separated list of locales supported by the model as BCP 47 tags.
300  locales:string;
301
302  version:int;
303
304  // A name for the model that can be used for e.g. logging.
305  name:string;
306
  // Feature processing options, given separately for the selection and the
  // classification model.
307  selection_feature_options:libtextclassifier2.FeatureProcessorOptions;
308  classification_feature_options:libtextclassifier2.FeatureProcessorOptions;
309
310  // Tensorflow Lite models.
  // Each is stored as a raw byte vector; force_align requests 16-byte
  // alignment of the vector data within the serialized buffer.
311  selection_model:[ubyte] (force_align: 16);
312
313  classification_model:[ubyte] (force_align: 16);
314  embedding_model:[ubyte] (force_align: 16);
315
316  // Options for the different models.
317  selection_options:libtextclassifier2.SelectionModelOptions;
318
319  classification_options:libtextclassifier2.ClassificationModelOptions;
320  regex_model:libtextclassifier2.RegexModel;
321  datetime_model:libtextclassifier2.DatetimeModel;
322
323  // Options controlling the output of the models.
324  triggering_options:libtextclassifier2.ModelTriggeringOptions;
325
326  // Global switch that controls if SuggestSelection(), ClassifyText() and
327  // Annotate() will run. If a mode is disabled it returns empty/no-op results.
328  enabled_modes:libtextclassifier2.ModeFlag = ALL;
329
330  // If true, will snap the selections that consist only of whitespaces to the
331  // containing suggested span. Otherwise, no suggestion is proposed, since the
332  // selections are not part of any token.
333  snap_whitespace_selections:bool = 1;
334
335  // Global configuration for the output of SuggestSelection(), ClassifyText()
336  // and Annotate().
337  output_options:libtextclassifier2.OutputOptions;
338}
339
340// Role of the codepoints in the range.
// The numeric values combine like bits: TOKEN_SEPARATOR (3) equals
// SPLIT_BEFORE | SPLIT_AFTER, and WHITESPACE_SEPARATOR (7) equals
// SPLIT_BEFORE | SPLIT_AFTER | DISCARD_CODEPOINT.
341namespace libtextclassifier2.TokenizationCodepointRange_;
342enum Role : int {
343  // Concatenates the codepoint to the current run of codepoints.
344  DEFAULT_ROLE = 0,
345
346  // Splits a run of codepoints before the current codepoint.
347  SPLIT_BEFORE = 1,
348
349  // Splits a run of codepoints after the current codepoint.
350  SPLIT_AFTER = 2,
351
352  // Each codepoint will be a separate token. Good e.g. for Chinese
353  // characters.
354  TOKEN_SEPARATOR = 3,
355
356  // Discards the codepoint.
357  DISCARD_CODEPOINT = 4,
358
359  // Common values:
360  // Splits on the characters and discards them. Good e.g. for the space
361  // character.
362  WHITESPACE_SEPARATOR = 7,
363}
364
365// Represents a codepoint range [start, end) with its role for tokenization.
// Used as the element type of
// FeatureProcessorOptions.tokenization_codepoint_config.
366namespace libtextclassifier2;
367table TokenizationCodepointRange {
  // First codepoint of the range (inclusive).
368  start:int;
  // End of the range (exclusive).
369  end:int;
  // How codepoints in this range affect tokenization.
370  role:libtextclassifier2.TokenizationCodepointRange_.Role;
371
372  // Integer identifier of the script this range denotes. Negative values are
373  // reserved for Tokenizer's internal use.
374  script_id:int;
375}
376
377// Method for selecting the center token.
// Type of FeatureProcessorOptions.center_token_selection_method.
378namespace libtextclassifier2.FeatureProcessorOptions_;
379enum CenterTokenSelectionMethod : int {
  // Value 0, i.e. what the field holds when it is not set explicitly.
  // NOTE(review): which concrete method this resolves to is decided by the
  // consuming code, not by this schema.
380  DEFAULT_CENTER_TOKEN_METHOD = 0,
381
382  // Use click indices to determine the center token.
383  CENTER_TOKEN_FROM_CLICK = 1,
384
385  // Use selection indices to get a token range, and select the middle of it
386  // as the center token.
387  CENTER_TOKEN_MIDDLE_OF_SELECTION = 2,
388}
389
390// Controls the type of tokenization the model will use for the input text.
// Type of FeatureProcessorOptions.tokenization_type, whose default is
// INTERNAL_TOKENIZER.
391namespace libtextclassifier2.FeatureProcessorOptions_;
392enum TokenizationType : int {
  // Value 0; marks a tokenization type that is not valid.
393  INVALID_TOKENIZATION_TYPE = 0,
394
395  // Use the internal tokenizer for tokenization.
396  INTERNAL_TOKENIZER = 1,
397
398  // Use ICU for tokenization.
399  ICU = 2,
400
401  // First apply ICU tokenization. Then identify stretches of tokens
402  // consisting only of codepoints in internal_tokenizer_codepoint_ranges
403  // and re-tokenize them using the internal tokenizer.
404  MIXED = 3,
405}
406
407// Range of codepoints start - end, where end is exclusive.
// Element type of FeatureProcessorOptions.supported_codepoint_ranges and
// FeatureProcessorOptions.internal_tokenizer_codepoint_ranges.
408namespace libtextclassifier2.FeatureProcessorOptions_;
409table CodepointRange {
  // First codepoint of the range.
410  start:int;
  // End of the range (exclusive).
411  end:int;
412}
413
414// Bounds-sensitive feature extraction configuration.
// Type of FeatureProcessorOptions.bounds_sensitive_features. None of the
// fields declares an explicit default, so bools default to false and ints
// to 0.
415namespace libtextclassifier2.FeatureProcessorOptions_;
416table BoundsSensitiveFeatures {
417  // Enables the extraction of bounds-sensitive features, instead of the click
418  // context features.
419  enabled:bool;
420
421  // The numbers of tokens to extract in specific locations relative to the
422  // bounds.
423  // Immediately before the span.
424  num_tokens_before:int;
425
426  // Inside the span, aligned with the beginning.
427  num_tokens_inside_left:int;
428
429  // Inside the span, aligned with the end.
430  num_tokens_inside_right:int;
431
432  // Immediately after the span.
433  num_tokens_after:int;
434
435  // If true, also extracts the tokens of the entire span and adds up their
436  // features forming one "token" to include in the extracted features.
437  include_inside_bag:bool;
438
439  // If true, includes the selection length (in the number of tokens) as a
440  // feature.
441  include_inside_length:bool;
442
443  // If true, for selection, single token spans are not run through the model
444  // and their score is assumed to be zero.
445  score_single_token_spans_as_zero:bool;
446}
447
// A string key/value pair.
// NOTE(review): no other definition visible in this schema references this
// table; presumably it maps a collection name to an alternative name —
// confirm before relying on it.
448namespace libtextclassifier2.FeatureProcessorOptions_;
449table AlternativeCollectionMapEntry {
450  key:string;
451  value:string;
452}
453
// Options for feature processing: hashing and embedding sizes, context
// extraction, tokenization and supported codepoints. Model uses this table
// for both selection_feature_options and classification_feature_options.
454namespace libtextclassifier2;
455table FeatureProcessorOptions {
456  // Number of buckets used for hashing charactergrams.
457  num_buckets:int = -1;
458
459  // Size of the embedding.
460  embedding_size:int = -1;
461
462  // Number of bits for quantization for embeddings.
463  embedding_quantization_bits:int = 8;
464
465  // Context size defines the number of words to the left and to the right of
466  // the selected word to be used as context. For example, if context size is
467  // N, then we take N words to the left and N words to the right of the
468  // selected word as its context.
469  context_size:int = -1;
470
471  // Maximum number of words of the context to select in total.
472  max_selection_span:int = -1;
473
474  // Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
475  // character trigrams etc.
476  chargram_orders:[int];
477
478  // Maximum length of a word, in codepoints.
479  max_word_length:int = 20;
480
481  // If true, will use the unicode-aware functionality for extracting features.
482  unicode_aware_features:bool = 0;
483
484  // Whether to extract the token case feature.
485  extract_case_feature:bool = 0;
486
487  // Whether to extract the selection mask feature.
488  extract_selection_mask_feature:bool = 0;
489
490  // List of regexps to run over each token. For each regexp, if there is a
491  // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
492  regexp_feature:[string];
493
494  // Whether to remap all digits to a single number.
495  remap_digits:bool = 0;
496
497  // Whether to lower-case each token before generating hashgrams.
  // No explicit default is given, so the field defaults to false.
498  lowercase_tokens:bool;
499
500  // If true, the selection classifier output will contain only the selections
501  // that are feasible (e.g., those that are shorter than max_selection_span),
502  // if false, the output will be a complete cross-product of possible
503  // selections to the left and possible selections to the right, including the
504  // infeasible ones.
505  // NOTE: Exists mainly for compatibility with older models that were trained
506  // with the non-reduced output space.
507  selection_reduced_output_space:bool = 1;
508
509  // Collection names.
510  collections:[string];
511
512  // An index of collection in collections to be used if a collection name can't
513  // be mapped to an id.
514  default_collection:int = -1;
515
516  // If true, will split the input by lines, and only use the line that contains
517  // the clicked token.
518  only_use_line_with_click:bool = 0;
519
520  // If true, will split tokens that contain the selection boundary, at the
521  // position of the boundary.
522  // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
523  split_tokens_on_selection_boundaries:bool = 0;
524
525  // Codepoint ranges that determine how different codepoints are tokenized.
526  // The ranges must not overlap.
527  tokenization_codepoint_config:[libtextclassifier2.TokenizationCodepointRange];
528
  // Method for selecting the center token. No explicit default is given, so
  // the field defaults to the enum's zero value, DEFAULT_CENTER_TOKEN_METHOD.
529  center_token_selection_method:libtextclassifier2.FeatureProcessorOptions_.CenterTokenSelectionMethod;
530
531  // If true, span boundaries will be snapped to containing tokens and not
532  // required to exactly match token boundaries.
533  snap_label_span_boundaries_to_containing_tokens:bool;
534
535  // A set of codepoint ranges supported by the model.
536  supported_codepoint_ranges:[libtextclassifier2.FeatureProcessorOptions_.CodepointRange];
537
538  // A set of codepoint ranges to use in the mixed tokenization mode to identify
539  // stretches of tokens to re-tokenize using the internal tokenizer.
540  internal_tokenizer_codepoint_ranges:[libtextclassifier2.FeatureProcessorOptions_.CodepointRange];
541
542  // Minimum ratio of supported codepoints in the input context. If the ratio
543  // is lower than this, the feature computation will fail.
544  min_supported_codepoint_ratio:float = 0;
545
546  // Used for versioning the format of features the model expects.
547  // - feature_version == 0:
548  // For each token the features consist of:
549  // - chargram embeddings
550  // - dense features
551  // Chargram embeddings for tokens are concatenated first together,
552  // and at the end, the dense features for the tokens are concatenated
553  // to it. So the resulting feature vector has two regions.
554  feature_version:int = 0;
555
  // The type of tokenization to use for the input text; defaults to the
  // internal tokenizer.
556  tokenization_type:libtextclassifier2.FeatureProcessorOptions_.TokenizationType = INTERNAL_TOKENIZER;
  // NOTE(review): presumably controls whether whitespace tokens produced by
  // ICU tokenization are kept — confirm with the tokenizer code.
557  icu_preserve_whitespace_tokens:bool = 0;
558
559  // List of codepoints that will be stripped from beginning and end of
560  // predicted spans.
561  ignored_span_boundary_codepoints:[int];
562
  // Bounds-sensitive feature extraction configuration.
563  bounds_sensitive_features:libtextclassifier2.FeatureProcessorOptions_.BoundsSensitiveFeatures;
564
565  // List of allowed charactergrams. The extracted charactergrams are filtered
566  // using this list, and charactergrams that are not present are interpreted as
567  // out-of-vocabulary.
568  // If no allowed_chargrams are specified, all charactergrams are allowed.
569  // The field is typed as bytes type to allow non-UTF8 chargrams.
  // NOTE(review): in this schema the field is declared as [string], not as a
  // byte vector, which does not match the sentence above — confirm that
  // non-UTF8 content is really expected here.
570  allowed_chargrams:[string];
571
572  // If true, tokens will be also split when the codepoint's script_id changes
573  // as defined in TokenizationCodepointRange.
574  tokenize_on_script_change:bool = 0;
575}
576
// Model is the root table of buffers serialized with this schema.
577root_type libtextclassifier2.Model;
578