// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.bigquery.storage.v1beta1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/bigquery/storage/v1beta1/arrow.proto";
import "google/cloud/bigquery/storage/v1beta1/avro.proto";
import "google/cloud/bigquery/storage/v1beta1/read_options.proto";
import "google/cloud/bigquery/storage/v1beta1/table_reference.proto";
import "google/protobuf/empty.proto";
import "google/protobuf/timestamp.proto";

option go_package = "cloud.google.com/go/bigquery/storage/apiv1beta1/storagepb;storagepb";
option java_package = "com.google.cloud.bigquery.storage.v1beta1";

// BigQuery storage API.
//
// The BigQuery storage API can be used to read data stored in BigQuery.
//
// The v1beta1 API is not yet officially deprecated, and will go through a full
// deprecation cycle (https://cloud.google.com/products#product-launch-stages)
// before the service is turned down. However, new code should use the v1 API
// going forward.
service BigQueryStorage {
  option (google.api.default_host) = "bigquerystorage.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/bigquery,"
      "https://www.googleapis.com/auth/cloud-platform";

  // Creates a new read session. A read session divides the contents of a
  // BigQuery table into one or more streams, which can then be used to read
  // data from the table. The read session also specifies properties of the
  // data to be read, such as a list of columns or a push-down filter describing
  // the rows to be returned.
  //
  // A particular row can be read by at most one stream. When the caller has
  // reached the end of each stream in the session, then all the data in the
  // table has been read.
  //
  // Read sessions automatically expire 6 hours after they are created and do
  // not require manual clean-up by the caller.
  rpc CreateReadSession(CreateReadSessionRequest) returns (ReadSession) {
    option (google.api.http) = {
      post: "/v1beta1/{table_reference.project_id=projects/*}"
      body: "*"
      additional_bindings {
        post: "/v1beta1/{table_reference.dataset_id=projects/*/datasets/*}"
        body: "*"
      }
    };
    option (google.api.method_signature) =
        "table_reference,parent,requested_streams";
  }
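
  // Example: a minimal sketch of creating a read session with the generated
  // Go client (cloud.google.com/go/bigquery/storage/apiv1beta1). This is not
  // part of the API definition; it assumes imports of context, log, the
  // storage client package, and its storagepb types, and the project and
  // table identifiers below are placeholders:
  //
  //   ctx := context.Background()
  //   client, err := storage.NewBigQueryStorageClient(ctx)
  //   if err != nil {
  //     log.Fatal(err)
  //   }
  //   defer client.Close()
  //
  //   session, err := client.CreateReadSession(ctx, &storagepb.CreateReadSessionRequest{
  //     TableReference: &storagepb.TableReference{
  //       ProjectId: "my-project", // placeholder
  //       DatasetId: "my_dataset", // placeholder
  //       TableId:   "my_table",   // placeholder
  //     },
  //     Parent:           "projects/my-project",
  //     RequestedStreams: 4,
  //   })
  //   if err != nil {
  //     log.Fatal(err)
  //   }
  //   log.Printf("session %s has %d streams", session.GetName(), len(session.GetStreams()))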

  // Reads rows from the table in the format prescribed by the read session.
  // Each response contains one or more table rows, up to a maximum of 10 MiB
  // per response; read requests which attempt to read individual rows larger
  // than this will fail.
  //
  // Each request also returns a set of stream statistics reflecting the
  // estimated total number of rows in the read stream. This number is computed
  // based on the total table size and the number of active streams in the read
  // session, and may change as other streams continue to read data.
  rpc ReadRows(ReadRowsRequest) returns (stream ReadRowsResponse) {
    option (google.api.http) = {
      get: "/v1beta1/{read_position.stream.name=projects/*/streams/*}"
    };
    option (google.api.method_signature) = "read_position";
  }
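
  // Example: a hedged Go sketch of draining one stream of the session from
  // the previous example. `ctx`, `client`, and `session` carry over from
  // there, and io is the standard library package:
  //
  //   rows, err := client.ReadRows(ctx, &storagepb.ReadRowsRequest{
  //     ReadPosition: &storagepb.StreamPosition{
  //       Stream: session.Streams[0],
  //       Offset: 0, // streams must be read starting from offset 0
  //     },
  //   })
  //   if err != nil {
  //     log.Fatal(err)
  //   }
  //   for {
  //     resp, err := rows.Recv()
  //     if err == io.EOF {
  //       break // end of stream: all rows assigned to it have been read
  //     }
  //     if err != nil {
  //       log.Fatal(err)
  //     }
  //     _ = resp.GetRowCount() // format-independent progress tracking
  //   }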

  // Creates additional streams for a ReadSession. This API can be used to
  // dynamically adjust the parallelism of a batch processing task upwards by
  // adding additional workers.
  rpc BatchCreateReadSessionStreams(BatchCreateReadSessionStreamsRequest)
      returns (BatchCreateReadSessionStreamsResponse) {
    option (google.api.http) = {
      post: "/v1beta1/{session.name=projects/*/sessions/*}"
      body: "*"
    };
    option (google.api.method_signature) = "session,requested_streams";
  }

  // Causes a single stream in a ReadSession to gracefully stop. This
  // API can be used to dynamically adjust the parallelism of a batch processing
  // task downwards without losing data.
  //
  // This API does not delete the stream -- it remains visible in the
  // ReadSession, and any data processed by the stream is not released to other
  // streams. However, no additional data will be assigned to the stream once
  // this call completes. Callers must continue reading data on the stream until
  // the end of the stream is reached so that data which has already been
  // assigned to the stream will be processed.
  //
  // This method will return an error if there are no other live streams
  // in the Session, or if SplitReadStream() has been called on the given
  // Stream.
  rpc FinalizeStream(FinalizeStreamRequest) returns (google.protobuf.Empty) {
    option (google.api.http) = {
      post: "/v1beta1/{stream.name=projects/*/streams/*}"
      body: "*"
    };
    option (google.api.method_signature) = "stream";
  }

  // Splits a given read stream into two Streams. These streams are referred to
  // as the primary and the residual of the split. The original stream can still
  // be read from in the same manner as before. Both of the returned streams can
  // also be read from, and the total rows returned by both child streams will
  // be the same as the rows read from the original stream.
  //
  // Moreover, the two child streams will be allocated back to back in the
  // original Stream. Concretely, it is guaranteed that for streams Original,
  // Primary, and Residual, that Original[0-j] = Primary[0-j] and
  // Original[j-n] = Residual[0-m] once the streams have been read to
  // completion.
  //
  // This method is guaranteed to be idempotent.
  rpc SplitReadStream(SplitReadStreamRequest)
      returns (SplitReadStreamResponse) {
    option (google.api.http) = {
      get: "/v1beta1/{original_stream.name=projects/*/streams/*}"
    };
    option (google.api.method_signature) = "original_stream";
  }
}
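
// Example: a hedged Go sketch of adjusting parallelism at runtime. Adding
// streams scales a job up; finalizing a stream scales it down without data
// loss, provided the caller drains the stream to EOF afterwards. `ctx`,
// `client`, `session`, and a `stream` being read are assumed from the
// earlier examples:
//
//   // Scale up: request two more streams for the session. Per the request
//   // message below, only the session name needs to be set.
//   added, err := client.BatchCreateReadSessionStreams(ctx,
//     &storagepb.BatchCreateReadSessionStreamsRequest{
//       Session:          &storagepb.ReadSession{Name: session.GetName()},
//       RequestedStreams: 2,
//     })
//   if err != nil {
//     log.Fatal(err)
//   }
//   log.Printf("added %d streams", len(added.GetStreams()))
//
//   // Scale down: stop assigning new data to one stream, then keep reading
//   // it to EOF so rows already assigned to it are still processed.
//   if err := client.FinalizeStream(ctx, &storagepb.FinalizeStreamRequest{
//     Stream: stream,
//   }); err != nil {
//     log.Fatal(err)
//   }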

// Information about a single data stream within a read session.
message Stream {
  option (google.api.resource) = {
    type: "bigquerystorage.googleapis.com/Stream"
    pattern: "projects/{project}/locations/{location}/streams/{stream}"
  };

  // Name of the stream, in the form
  // `projects/{project_id}/locations/{location}/streams/{stream_id}`.
  string name = 1;
}

// Expresses a point within a given stream using an offset position.
message StreamPosition {
  // Identifier for a given Stream.
  Stream stream = 1;

  // Position in the stream.
  int64 offset = 2;
}

// Information returned from a `CreateReadSession` request.
message ReadSession {
  option (google.api.resource) = {
    type: "bigquerystorage.googleapis.com/ReadSession"
    pattern: "projects/{project}/locations/{location}/sessions/{session}"
  };

  // Unique identifier for the session, in the form
  // `projects/{project_id}/locations/{location}/sessions/{session_id}`.
  string name = 1;

  // Time at which the session becomes invalid. After this time, subsequent
  // requests to read this Session will return errors.
  google.protobuf.Timestamp expire_time = 2;

  // The schema for the read. If read_options.selected_fields is set, the
  // schema may be different from the table schema as it will only contain
  // the selected fields.
  oneof schema {
    // Avro schema.
    AvroSchema avro_schema = 5;

    // Arrow schema.
    ArrowSchema arrow_schema = 6;
  }

  // Streams associated with this session.
  repeated Stream streams = 4;

  // Table that this ReadSession is reading from.
  TableReference table_reference = 7;

  // Any modifiers which are applied when reading from the specified table.
  TableModifiers table_modifiers = 8;

  // The strategy to use for distributing data among the streams.
  ShardingStrategy sharding_strategy = 9;
}

// Data format for input or output data.
enum DataFormat {
  // Data format is unspecified.
  DATA_FORMAT_UNSPECIFIED = 0;

  // Avro is a standard open-source row-based file format.
  // See https://avro.apache.org/ for more details.
  AVRO = 1;

  // Arrow is a standard open-source column-based message format.
  // See https://arrow.apache.org/ for more details.
  ARROW = 3;
}

// Strategy for distributing data among multiple streams in a read session.
enum ShardingStrategy {
  // Same as LIQUID.
  SHARDING_STRATEGY_UNSPECIFIED = 0;

  // Assigns data to each stream based on the client's read rate. The faster the
  // client reads from a stream, the more data is assigned to the stream. In
  // this strategy, it's possible to read all data from a single stream even if
  // there are other streams present.
  LIQUID = 1;

  // Assigns data to each stream such that roughly the same number of rows can
  // be read from each stream. Because the server-side unit for assigning data
  // is collections of rows, the API does not guarantee that each stream will
  // return the same number of rows. Additionally, the limits are enforced based
  // on the number of pre-filtered rows, so some filters can lead to lopsided
  // assignments.
  BALANCED = 2;
}

// Creates a new read session, which may include additional options such as
// requested parallelism, projection filters and constraints.
message CreateReadSessionRequest {
  // Required. Reference to the table to read.
  TableReference table_reference = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. String of the form `projects/{project_id}` indicating the
  // project this ReadSession is associated with. This is the project that will
  // be billed for usage.
  string parent = 6 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "cloudresourcemanager.googleapis.com/Project"
    }
  ];

  // Any modifiers to the Table (e.g. snapshot timestamp).
  TableModifiers table_modifiers = 2;

  // Initial number of streams. If unset or 0, we will
  // provide a value of streams so as to produce reasonable throughput. Must be
  // non-negative. The number of streams may be lower than the requested number,
  // depending on the amount of parallelism that is reasonable for the table and
  // the maximum amount of parallelism allowed by the system.
  //
  // Streams must be read starting from offset 0.
  int32 requested_streams = 3;

  // Read options for this session (e.g. column selection, filters).
  TableReadOptions read_options = 4;

  // Data output format. Currently defaults to Avro.
  // DATA_FORMAT_UNSPECIFIED is not supported.
  DataFormat format = 5;

  // The strategy to use for distributing data among multiple streams. Currently
  // defaults to liquid sharding.
  ShardingStrategy sharding_strategy = 7;
}
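
// Example: a hedged Go sketch of a fully populated CreateReadSessionRequest,
// combining column projection, a push-down row filter, and balanced sharding.
// Field and enum names follow the messages above; the table, column, and
// filter values are placeholders:
//
//   req := &storagepb.CreateReadSessionRequest{
//     TableReference: &storagepb.TableReference{
//       ProjectId: "my-project",
//       DatasetId: "my_dataset",
//       TableId:   "my_table",
//     },
//     Parent: "projects/my-project",
//     ReadOptions: &storagepb.TableReadOptions{
//       SelectedFields: []string{"name", "age"}, // column projection
//       RowRestriction: "age > 18",              // push-down filter
//     },
//     Format:           storagepb.DataFormat_ARROW,
//     ShardingStrategy: storagepb.ShardingStrategy_BALANCED,
//     RequestedStreams: 4,
//   }
//   session, err := client.CreateReadSession(ctx, req)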

// Requesting row data via `ReadRows` must provide Stream position information.
message ReadRowsRequest {
  // Required. Identifier of the position in the stream to start reading from.
  // The offset requested must be less than the last row read from ReadRows.
  // Requesting a larger offset is undefined.
  StreamPosition read_position = 1 [(google.api.field_behavior) = REQUIRED];
}

// Progress information for a given Stream.
message StreamStatus {
  // Number of estimated rows in the current stream. May change over time as
  // different readers in the stream progress at rates which are relatively fast
  // or slow.
  int64 estimated_row_count = 1;

  // A value in the range [0.0, 1.0] that represents the fraction of rows
  // assigned to this stream that have been processed by the server. In the
  // presence of read filters, the server may process more rows than it returns,
  // so this value reflects progress through the pre-filtered rows.
  //
  // This value is only populated for sessions created through the BALANCED
  // sharding strategy.
  float fraction_consumed = 2;

  // Represents the progress of the current stream.
  Progress progress = 4;

  // Whether this stream can be split. For sessions that use the LIQUID sharding
  // strategy, this value is always false. For BALANCED sessions, this value is
  // false when enough data has been read such that no more splits are possible
  // at that point or beyond. For small tables or streams that are the result of
  // a chain of splits, this value may never be true.
  bool is_splittable = 3;
}

message Progress {
  // The fraction of rows assigned to the stream that have been processed by the
  // server so far, not including the rows in the current response message.
  //
  // This value, along with `at_response_end`, can be used to interpolate the
  // progress made as the rows in the message are being processed using the
  // following formula: `at_response_start + (at_response_end -
  // at_response_start) * rows_processed_from_response / rows_in_response`.
  //
  // Note that if a filter is provided, the `at_response_end` value of the
  // previous response may not necessarily be equal to the `at_response_start`
  // value of the current response.
  float at_response_start = 1;

  // Similar to `at_response_start`, except that this value includes the rows in
  // the current response.
  float at_response_end = 2;
}
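
// Example: a Go sketch of the interpolation formula documented above, applied
// while iterating over the rows of a single ReadRowsResponse. The function
// name and the row counters are assumptions supplied by the caller, not part
// of this API:
//
//   func interpolatedProgress(progress *storagepb.Progress,
//     rowsProcessedFromResponse, rowsInResponse int64) float32 {
//     // at_response_start + (at_response_end - at_response_start) *
//     //     rows_processed_from_response / rows_in_response
//     start := progress.GetAtResponseStart()
//     end := progress.GetAtResponseEnd()
//     return start + (end-start)*
//       float32(rowsProcessedFromResponse)/float32(rowsInResponse)
//   }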

// Information on whether the current connection is being throttled.
message ThrottleStatus {
  // How much this connection is being throttled.
  // 0 is no throttling, 100 is completely throttled.
  int32 throttle_percent = 1;
}

// Response from calling `ReadRows` may include row data, progress and
// throttling information.
message ReadRowsResponse {
  // Row data is returned in the format specified during session creation.
  oneof rows {
    // Serialized row data in AVRO format.
    AvroRows avro_rows = 3;

    // Serialized row data in Arrow RecordBatch format.
    ArrowRecordBatch arrow_record_batch = 4;
  }

  // Number of serialized rows in the rows block. This value is recorded here,
  // in addition to the row_count values in the output-specific messages in
  // `rows`, so that code which needs to record progress through the stream can
  // do so in an output format-independent way.
  int64 row_count = 6;

  // Estimated stream statistics.
  StreamStatus status = 2;

  // Throttling status. If unset, the latest response still describes
  // the current throttling status.
  ThrottleStatus throttle_status = 5;

  // The schema for the read. If read_options.selected_fields is set, the
  // schema may be different from the table schema as it will only contain
  // the selected fields. This schema is equivalent to the one returned by
  // CreateSession. This field is only populated in the first ReadRowsResponse
  // RPC.
  oneof schema {
    // Output only. Avro schema.
    AvroSchema avro_schema = 7 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. Arrow schema.
    ArrowSchema arrow_schema = 8 [(google.api.field_behavior) = OUTPUT_ONLY];
  }
}

// Information needed to request additional streams for an established read
// session.
message BatchCreateReadSessionStreamsRequest {
  // Required. Must be a non-expired session obtained from a call to
  // CreateReadSession. Only the name field needs to be set.
  ReadSession session = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Number of new streams requested. Must be positive.
  // Number of added streams may be less than this, see CreateReadSessionRequest
  // for more information.
  int32 requested_streams = 2 [(google.api.field_behavior) = REQUIRED];
}

// The response from `BatchCreateReadSessionStreams` returns the stream
// identifiers for the newly created streams.
message BatchCreateReadSessionStreamsResponse {
  // Newly added streams.
  repeated Stream streams = 1;
}

// Request information for invoking `FinalizeStream`.
message FinalizeStreamRequest {
  // Required. Stream to finalize.
  Stream stream = 2 [(google.api.field_behavior) = REQUIRED];
}

// Request information for `SplitReadStream`.
message SplitReadStreamRequest {
  // Required. Stream to split.
  Stream original_stream = 1 [(google.api.field_behavior) = REQUIRED];

  // A value in the range (0.0, 1.0) that specifies the fractional point at
  // which the original stream should be split. The actual split point is
  // evaluated on pre-filtered rows, so if a filter is provided, then there is
  // no guarantee that the division of the rows between the new child streams
  // will be proportional to this fractional value. Additionally, because the
  // server-side unit for assigning data is collections of rows, this fraction
  // will always map to a data storage boundary on the server side.
  float fraction = 2;
}

// Response from `SplitReadStream`.
message SplitReadStreamResponse {
  // Primary stream, which contains the beginning portion of
  // |original_stream|. An empty value indicates that the original stream can no
  // longer be split.
  Stream primary_stream = 1;

  // Remainder stream, which contains the tail of |original_stream|. An empty
  // value indicates that the original stream can no longer be split.
  Stream remainder_stream = 2;
}
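
// Example: a hedged Go sketch of splitting a stream near its midpoint and
// handling the case where no further split is possible. `ctx`, `client`, and
// `stream` are assumed from the earlier examples:
//
//   split, err := client.SplitReadStream(ctx, &storagepb.SplitReadStreamRequest{
//     OriginalStream: stream,
//     Fraction:       0.5, // request a split near the midpoint
//   })
//   if err != nil {
//     log.Fatal(err)
//   }
//   if split.GetPrimaryStream() == nil {
//     // Empty value: the stream can no longer be split; keep reading it as-is.
//   }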