//
// Copyright (C) 2018 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

// Selects which tokenizer the model applies to the input text.
// The enum is stored as an `int`.
namespace libtextclassifier3;
enum TokenizationType : int {
  // Marks an invalid tokenization type.
  INVALID_TOKENIZATION_TYPE = 0,

  // Tokenize with the library's internal tokenizer.
  INTERNAL_TOKENIZER = 1,

  // Tokenize with ICU.
  ICU = 2,

  // Two-pass tokenization: run ICU first, then find stretches of tokens
  // made up only of codepoints in internal_tokenizer_codepoint_ranges
  // (declared outside this part of the schema) and re-tokenize those
  // stretches with the internal tokenizer.
  MIXED = 3,
}

// Role that the codepoints of a range play during tokenization.
// The enum is stored as an `int`.
// NOTE(review): the numeric values look like bit combinations
// (TOKEN_SEPARATOR = 3 = 1 | 2, WHITESPACE_SEPARATOR = 7 = 1 | 2 | 4), which
// matches the per-value descriptions below -- confirm against the tokenizer
// before relying on bitwise tests.
namespace libtextclassifier3.TokenizationCodepointRange_;
enum Role : int {
  // Appends the codepoint to the current run of codepoints.
  DEFAULT_ROLE = 0,

  // Ends the current run of codepoints before this codepoint.
  SPLIT_BEFORE = 1,

  // Ends the current run of codepoints after this codepoint.
  SPLIT_AFTER = 2,

  // Every codepoint becomes its own token. Good e.g. for Chinese
  // characters.
  TOKEN_SEPARATOR = 3,

  // Drops the codepoint.
  DISCARD_CODEPOINT = 4,

  // Common values:
  // Splits on the characters and discards them. Good e.g. for the space
  // character.
  WHITESPACE_SEPARATOR = 7,
}

// Describes a half-open codepoint range [start, end) together with the role
// its codepoints play during tokenization. No field declares an explicit
// default value.
namespace libtextclassifier3;
table TokenizationCodepointRange {
  // First codepoint of the range (inclusive).
  start:int;

  // One past the last codepoint of the range (exclusive).
  end:int;

  // How codepoints in [start, end) affect token boundaries.
  role:TokenizationCodepointRange_.Role;

  // Integer identifier of the script this range denotes. Negative values are
  // reserved for Tokenizer's internal use.
  script_id:int;
}