// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.discoveryengine.v1alpha;

// NOTE(review): no `google.api.field_behavior` annotation is visible in the
// reviewed portion of this file; confirm whether this import is still needed.
import "google/api/field_behavior.proto";
// Needed for the `google.api.resource` option set on
// `DocumentProcessingConfig`.
import "google/api/resource.proto";

// Per-language code generation options. These are part of the published API
// surface; changing any value moves or renames the generated code.
option csharp_namespace = "Google.Cloud.DiscoveryEngine.V1Alpha";
option go_package = "cloud.google.com/go/discoveryengine/apiv1alpha/discoveryenginepb;discoveryenginepb";
option java_multiple_files = true;
option java_outer_classname = "DocumentProcessingConfigProto";
option java_package = "com.google.cloud.discoveryengine.v1alpha";
option objc_class_prefix = "DISCOVERYENGINE";
option php_namespace = "Google\\Cloud\\DiscoveryEngine\\V1alpha";
option ruby_package = "Google::Cloud::DiscoveryEngine::V1alpha";

// A singleton resource of
// [DataStore][google.cloud.discoveryengine.v1alpha.DataStore]. It is empty
// when the [DataStore][google.cloud.discoveryengine.v1alpha.DataStore] is
// created, in which case parsing defaults to the digital parser. The first
// call to the
// [DataStoreService.UpdateDocumentProcessingConfig][google.cloud.discoveryengine.v1alpha.DataStoreService.UpdateDocumentProcessingConfig]
// method initializes the config.
//
// NOTE(review): field number 2 is not used by this message. If it belonged to
// a removed field, add `reserved 2;` so it cannot be reused — confirm against
// the schema history.
message DocumentProcessingConfig {
  option (google.api.resource) = {
    type: "discoveryengine.googleapis.com/DocumentProcessingConfig"
    pattern: "projects/{project}/locations/{location}/dataStores/{data_store}/documentProcessingConfig"
    pattern: "projects/{project}/locations/{location}/collections/{collection}/dataStores/{data_store}/documentProcessingConfig"
  };

  // Configuration for document chunking.
  message ChunkingConfig {
    // Configuration for layout-based chunking.
    message LayoutBasedChunkingConfig {
      // The token size limit for each chunk.
      //
      // Supported values: 100-500 (inclusive).
      // Default value: 500.
      int32 chunk_size = 1;

      // Whether to append the different levels of ancestor headings to chunks
      // taken from the middle of the document, to prevent context loss.
      //
      // Default value: False.
      bool include_ancestor_headings = 2;
    }

    // Additional configs that define the behavior of the chunking. At most one
    // member of this oneof can be set.
    oneof chunk_mode {
      // Configuration for layout-based chunking.
      LayoutBasedChunkingConfig layout_based_chunking_config = 1;
    }
  }

  // Related configurations applied to a specific type of document parser.
  message ParsingConfig {
    // The digital parsing configurations for documents. Currently carries no
    // fields.
    message DigitalParsingConfig {}

    // The OCR parsing configurations for documents.
    message OcrParsingConfig {
      // [DEPRECATED] This field is deprecated. To use the additional enhanced
      // document elements processing, please switch to `layout_parsing_config`.
      repeated string enhanced_document_elements = 1 [deprecated = true];

      // If true, will use native text instead of OCR text on pages containing
      // native text.
      bool use_native_text = 2;
    }

    // The layout parsing configurations for documents. Currently carries no
    // fields.
    message LayoutParsingConfig {}

    // Configs for document processing types. At most one parser configuration
    // can be set.
    oneof type_dedicated_config {
      // Configurations applied to digital parser.
      DigitalParsingConfig digital_parsing_config = 1;

      // Configurations applied to OCR parser. Currently it only applies to
      // PDFs.
      OcrParsingConfig ocr_parsing_config = 2;

      // Configurations applied to layout parser.
      LayoutParsingConfig layout_parsing_config = 3;
    }
  }

  // The full resource name of the Document Processing Config.
  // Formats (matching the resource patterns declared on this message):
  // `projects/*/locations/*/dataStores/*/documentProcessingConfig`
  // `projects/*/locations/*/collections/*/dataStores/*/documentProcessingConfig`.
  string name = 1;

  // Configuration for chunking.
  //
  // NOTE(review): this is a message, not a flag; presumably setting it is what
  // enables chunking mode — confirm against the service behavior.
  ChunkingConfig chunking_config = 3;

  // Configurations for default Document parser.
  // If not specified, we will configure it as default DigitalParsingConfig, and
  // the default parsing config will be applied to all file types for Document
  // parsing.
  ParsingConfig default_parsing_config = 4;

  // Map from file type to override the default parsing configuration based on
  // the file type. Supported keys:
  // * `pdf`: Override parsing config for PDF files; digital parsing, OCR
  //   parsing, or layout parsing is supported.
  // * `html`: Override parsing config for HTML files; only digital parsing
  //   and layout parsing are supported.
  // * `docx`: Override parsing config for DOCX files; only digital parsing
  //   and layout parsing are supported.
  map<string, ParsingConfig> parsing_config_overrides = 5;
}