• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
//
// Copyright (C) 2018 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

// Selects the tokenization strategy the model uses for the input text.
namespace libtextclassifier3;
enum TokenizationType : int {
  // Zero value; does not name any tokenizer.
  INVALID_TOKENIZATION_TYPE = 0,

  // Tokenize with the internal tokenizer.
  INTERNAL_TOKENIZER = 1,

  // Tokenize with ICU.
  ICU = 2,

  // Two-pass tokenization: run ICU tokenization first, then find stretches
  // of tokens made up solely of codepoints in
  // internal_tokenizer_codepoint_ranges and re-tokenize those stretches with
  // the internal tokenizer.
  MIXED = 3,

  // Tokenizer that parses out numbers, words and separators.
  LETTER_DIGIT = 4,
}

// Role that the codepoints of a range play during tokenization.
//
// NOTE(review): the numeric values look like bit flags (1, 2, 4) plus named
// combinations (3 == 1 | 2, 7 == 1 | 2 | 4) -- confirm against the tokenizer
// implementation before adding or renumbering values.
namespace libtextclassifier3.TokenizationCodepointRange_;
enum Role : int {
  // Appends the codepoint to the current run of codepoints.
  DEFAULT_ROLE = 0,

  // Ends the current run of codepoints just before this codepoint.
  SPLIT_BEFORE = 1,

  // Ends the current run of codepoints just after this codepoint.
  SPLIT_AFTER = 2,

  // Every codepoint becomes its own token. Useful e.g. for Chinese
  // characters. Numerically equal to SPLIT_BEFORE | SPLIT_AFTER.
  TOKEN_SEPARATOR = 3,

  // Drops the codepoint.
  DISCARD_CODEPOINT = 4,

  // Common values:
  // Splits on the codepoint and drops it. Useful e.g. for the space
  // character. Numerically equal to
  // SPLIT_BEFORE | SPLIT_AFTER | DISCARD_CODEPOINT.
  WHITESPACE_SEPARATOR = 7,
}

// Describes one codepoint range [start, end) together with the role its
// codepoints play during tokenization.
//
// No field carries an explicit `id` attribute, so FlatBuffers assigns field
// ids by declaration order: append new fields at the end and do not reorder
// or remove the existing ones.
namespace libtextclassifier3;
table TokenizationCodepointRange {
  // First codepoint of the range (inclusive).
  start:int;

  // End of the range (exclusive).
  end:int;

  // Tokenization role applied to the codepoints in the range.
  role:TokenizationCodepointRange_.Role;

  // Integer identifier of the script this range denotes. Negative values are
  // reserved for Tokenizer's internal use.
  script_id:int;
}