//
// Copyright (C) 2018 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

// Selects which tokenization strategy the model applies to its input text.
namespace libtextclassifier3;
enum TokenizationType : int {
  // Zero value; does not name a usable tokenization strategy.
  INVALID_TOKENIZATION_TYPE = 0,

  // Tokenize with the library's internal tokenizer.
  INTERNAL_TOKENIZER = 1,

  // Tokenize with ICU.
  ICU = 2,

  // Two-stage tokenization: run ICU first, then find runs of tokens made up
  // exclusively of codepoints covered by internal_tokenizer_codepoint_ranges
  // and tokenize those runs again with the internal tokenizer.
  MIXED = 3,

  // Tokenizer that separates the text into numbers, words and separators.
  LETTER_DIGIT = 4,
}

// How the codepoints of a range take part in tokenization.
// NOTE(review): numerically 3 == SPLIT_BEFORE | SPLIT_AFTER and
// 7 == SPLIT_BEFORE | SPLIT_AFTER | DISCARD_CODEPOINT, which suggests the
// values are intended as bit flags -- confirm against the tokenizer code.
namespace libtextclassifier3.TokenizationCodepointRange_;
enum Role : int {
  // The codepoint is appended to the run of codepoints currently being built.
  DEFAULT_ROLE = 0,

  // The current run is split just before this codepoint.
  SPLIT_BEFORE = 1,

  // The current run is split just after this codepoint.
  SPLIT_AFTER = 2,

  // Every codepoint becomes a token of its own. Useful, for example, for
  // Chinese characters.
  TOKEN_SEPARATOR = 3,

  // The codepoint is dropped.
  DISCARD_CODEPOINT = 4,

  // Commonly used value:
  // Split on the codepoint and drop it. Useful, for example, for the space
  // character.
  WHITESPACE_SEPARATOR = 7,
}

// A half-open codepoint range [start, end) together with the role its
// codepoints play during tokenization.
namespace libtextclassifier3;
table TokenizationCodepointRange {
  // First codepoint of the range (inclusive).
  start:int;

  // End of the range (exclusive).
  end:int;

  // Tokenization role applied to the codepoints in the range.
  role:TokenizationCodepointRange_.Role;

  // Integer id of the script that this range stands for. Negative values are
  // reserved for the Tokenizer's internal use.
  script_id:int;
}