// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.discoveryengine.v1alpha;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";

option csharp_namespace = "Google.Cloud.DiscoveryEngine.V1Alpha";
option go_package = "cloud.google.com/go/discoveryengine/apiv1alpha/discoveryenginepb;discoveryenginepb";
option java_multiple_files = true;
option java_outer_classname = "DocumentProcessingConfigProto";
option java_package = "com.google.cloud.discoveryengine.v1alpha";
option objc_class_prefix = "DISCOVERYENGINE";
option php_namespace = "Google\\Cloud\\DiscoveryEngine\\V1alpha";
option ruby_package = "Google::Cloud::DiscoveryEngine::V1alpha";

// A singleton resource of
// [DataStore][google.cloud.discoveryengine.v1alpha.DataStore]. It's empty when
// [DataStore][google.cloud.discoveryengine.v1alpha.DataStore] is created, which
// defaults to digital parser. The first call to
// [DataStoreService.UpdateDocumentProcessingConfig][google.cloud.discoveryengine.v1alpha.DataStoreService.UpdateDocumentProcessingConfig]
// method will initialize the config.
message DocumentProcessingConfig {
  option (google.api.resource) = {
    type: "discoveryengine.googleapis.com/DocumentProcessingConfig"
    pattern: "projects/{project}/locations/{location}/dataStores/{data_store}/documentProcessingConfig"
    pattern: "projects/{project}/locations/{location}/collections/{collection}/dataStores/{data_store}/documentProcessingConfig"
  };

  // Configuration for document chunking.
  message ChunkingConfig {
    // Configuration for the layout based chunking.
    message LayoutBasedChunkingConfig {
      // The token size limit for each chunk.
      //
      // Supported values: 100-500 (inclusive).
      // Default value: 500.
      int32 chunk_size = 1;

      // Whether to append the different levels of headings to chunks taken
      // from the middle of the document, to prevent context loss.
      //
      // Default value: False.
      bool include_ancestor_headings = 2;
    }

    // Additional configs that define the behavior of the chunking.
    oneof chunk_mode {
      // Configuration for the layout based chunking.
      LayoutBasedChunkingConfig layout_based_chunking_config = 1;
    }
  }

  // Related configurations applied to a specific type of document parser.
  message ParsingConfig {
    // The digital parsing configurations for documents.
    message DigitalParsingConfig {}

    // The OCR parsing configurations for documents.
    message OcrParsingConfig {
      // [DEPRECATED] This field is deprecated. To use the additional enhanced
      // document elements processing, please switch to `layout_parsing_config`.
      repeated string enhanced_document_elements = 1 [deprecated = true];

      // If true, will use native text instead of OCR text on pages containing
      // native text.
      bool use_native_text = 2;
    }

    // The layout parsing configurations for documents.
    message LayoutParsingConfig {}

    // Configs for document processing types. At most one parser-specific
    // configuration can be set at a time.
    oneof type_dedicated_config {
      // Configurations applied to digital parser.
      DigitalParsingConfig digital_parsing_config = 1;

      // Configurations applied to OCR parser. Currently it only applies to
      // PDFs.
      OcrParsingConfig ocr_parsing_config = 2;

      // Configurations applied to layout parser.
      LayoutParsingConfig layout_parsing_config = 3;
    }
  }

  // The full resource name of the Document Processing Config.
  // Format:
  // `projects/*/locations/*/collections/*/dataStores/*/documentProcessingConfig`.
  // The resource annotation on this message also declares the same pattern
  // without the `collections/*` segment.
  string name = 1;

  // NOTE(review): field number 2 is not used by this message. If it belonged
  // to a field that was removed, add `reserved 2;` so that the number cannot
  // be reused - confirm against the API history.

  // The chunking configuration.
  // NOTE(review): this comment previously read "Whether chunking mode is
  // enabled", which suggests that setting this field is what turns chunking
  // on - confirm against the service behavior.
  ChunkingConfig chunking_config = 3;

  // Configurations for default Document parser.
  // If not specified, we will configure it as default DigitalParsingConfig, and
  // the default parsing config will be applied to all file types for Document
  // parsing.
  ParsingConfig default_parsing_config = 4;

  // Map from file type to override the default parsing configuration based on
  // the file type. Supported keys:
  // * `pdf`: Override parsing config for PDF files; digital parsing, OCR
  // parsing, or layout parsing is supported.
  // * `html`: Override parsing config for HTML files; only digital parsing and
  // layout parsing are supported.
  // * `docx`: Override parsing config for DOCX files; only digital parsing and
  // layout parsing are supported.
  map<string, ParsingConfig> parsing_config_overrides = 5;
}