//
// Copyright (C) 2018 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

namespace libtextclassifier3;

// Selects which tokenization strategy the model applies to its input text.
enum TokenizationType : int {
  // Value 0; as the name says, not a valid tokenization type.
  INVALID_TOKENIZATION_TYPE = 0,

  // Tokenize with the internal tokenizer.
  INTERNAL_TOKENIZER = 1,

  // Tokenize with ICU.
  ICU = 2,

  // Two-pass scheme: run ICU tokenization first, then locate stretches of
  // tokens made up solely of codepoints in
  // internal_tokenizer_codepoint_ranges and re-tokenize those stretches with
  // the internal tokenizer.
  MIXED = 3,
}

namespace libtextclassifier3.TokenizationCodepointRange_;

// Role that the codepoints of a range play during tokenization.
enum Role : int {
  // Appends the codepoint to the current run of codepoints.
  DEFAULT_ROLE = 0,

  // Ends the current run of codepoints just before this codepoint.
  SPLIT_BEFORE = 1,

  // Ends the current run of codepoints just after this codepoint.
  SPLIT_AFTER = 2,

  // Makes every codepoint its own token. Good e.g. for Chinese characters.
  // Numerically 3 == SPLIT_BEFORE | SPLIT_AFTER.
  TOKEN_SEPARATOR = 3,

  // Drops the codepoint.
  DISCARD_CODEPOINT = 4,

  // Common values:
  // Splits on the characters and drops them. Good e.g. for the space
  // character. Numerically 7 == SPLIT_BEFORE | SPLIT_AFTER | DISCARD_CODEPOINT.
  WHITESPACE_SEPARATOR = 7,
}

// Represents a codepoint range [start, end) with its role for tokenization.
namespace libtextclassifier3;
table TokenizationCodepointRange {
  // Bounds of the half-open codepoint range [start, end): `start` is the
  // first codepoint included, `end` is the first codepoint excluded.
  start:int;
  end:int;

  // Role the codepoints of this range play during tokenization.
  role:TokenizationCodepointRange_.Role;

  // Integer id of the script this range denotes. Negative values are
  // reserved for the Tokenizer's internal use.
  script_id:int;
}