/*
 * Copyright 2022 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
syntax = "proto2";

package fcp.dictionary;

// A dictionary description: a mapping from string tokens to integer ids,
// where the ids are usually (but not necessarily) consecutive.
message DictionaryDescription {
  // Field number 4 is reserved and must not be assigned to a new field.
  reserved 4;

  // Ids within the vocabulary that carry a special meaning.
  //
  // Each field only designates an id when it is set to a non-negative value;
  // every field defaults to -1, which is negative and so designates no id.
  message SpecialIds {
    // Id standing for an unknown token (honoured when set and >= 0).
    optional int32 unk = 1 [default = -1];

    // Id marking the beginning of a sequence (honoured when set and >= 0).
    optional int32 bos = 2 [default = -1];

    // Id marking the end of a sequence (honoured when set and >= 0).
    optional int32 eos = 3 [default = -1];
  }

  // Ids that should be filtered out of the predictions, for example
  // punctuation or bad words.
  message OutputBlocklistIds {
    // The blocked vocabulary ids, stored with packed encoding.
    repeated int32 id = 1 [packed = true];
  }

  // An optional, persistent storage format for the token-to-id map.
  message Vocabulary {
    // Field number 1 is reserved and must not be assigned to a new field.
    reserved 1;

    // A plain list of tokens.
    message TokenIndex {
      // The tokens, kept in order; positions are counted from 0.
      repeated string token = 1;
    }

    // The concrete representation of the vocabulary. Only one alternative
    // is currently defined.
    oneof vocabulary {
      // The vocabulary as repeated strings stored in order (index begins
      // at 0).
      TokenIndex index = 2;
    }
  }

  // The special-purpose ids (unknown token, sequence begin/end), if any.
  optional SpecialIds special_ids = 1;

  // The stored token-to-id map, if one is included.
  optional Vocabulary vocabulary = 2;

  // The ids to filter from the predictions, if any.
  optional OutputBlocklistIds output_blocklist_ids = 3;
}