• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright 2022 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
// proto2 syntax: the messages in this file use explicit [default = ...]
// values on optional fields.
syntax = "proto2";

package fcp.dictionary;

// Describes a mapping of strings to (usually consecutive) integer ids.
//
// All three top-level fields are optional message fields, so a reader can
// test each one for presence independently.
message DictionaryDescription {
  // Vocabulary ids with special meaning.
  //
  // Every field defaults to -1; since only non-negative values are treated
  // as ids, an unset field means "no such special id".
  message SpecialIds {
    // If set and non-negative, id used for an unknown token.
    optional int32 unk = 1 [default = -1];

    // If set and non-negative, id used for the beginning of a sequence.
    optional int32 bos = 2 [default = -1];

    // If set and non-negative, id used for the end of a sequence.
    optional int32 eos = 3 [default = -1];
  }

  // Vocabulary ids that should be filtered from the predictions (e.g.,
  // punctuation, bad words etc.).
  message OutputBlocklistIds {
    // Ids to filter out. Serialized with packed encoding.
    repeated int32 id = 1 [packed = true];
  }

  // Optional persistent storage format for the token to id map.
  message Vocabulary {
    // A list of tokens.
    message TokenIndex {
      // The tokens, in order.
      repeated string token = 1;
    }

    // Field number 1 must not be reused in this message.
    // NOTE(review): presumably a removed storage format -- confirm.
    reserved 1;

    // The stored representation of the vocabulary. At most one member may be
    // set; `index` is currently the only member.
    oneof vocabulary {
      // Repeated strings stored in-order (index begins at 0).
      TokenIndex index = 2;
    }
  }

  // Ids with special meaning (unknown token, sequence begin/end).
  optional SpecialIds special_ids = 1;

  // Stored token to id map.
  optional Vocabulary vocabulary = 2;

  // Ids to be filtered from the predictions.
  optional OutputBlocklistIds output_blocklist_ids = 3;

  // Field number 4 must not be reused in this message.
  reserved 4;
}