// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.bigquery.storage.v1beta1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/bigquery/storage/v1beta1/arrow.proto";
import "google/cloud/bigquery/storage/v1beta1/avro.proto";
import "google/cloud/bigquery/storage/v1beta1/read_options.proto";
import "google/cloud/bigquery/storage/v1beta1/table_reference.proto";
import "google/protobuf/empty.proto";
import "google/protobuf/timestamp.proto";

option go_package = "cloud.google.com/go/bigquery/storage/apiv1beta1/storagepb;storagepb";
option java_package = "com.google.cloud.bigquery.storage.v1beta1";

// BigQuery storage API.
//
// The BigQuery storage API can be used to read data stored in BigQuery.
//
// The v1beta1 API is not yet officially deprecated, and will go through a full
// deprecation cycle (https://cloud.google.com/products#product-launch-stages)
// before the service is turned down. However, new code should use the v1 API
// going forward.
service BigQueryStorage {
  option (google.api.default_host) = "bigquerystorage.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/bigquery,"
      "https://www.googleapis.com/auth/cloud-platform";

  // Creates a new read session. A read session divides the contents of a
  // BigQuery table into one or more streams, which can then be used to read
  // data from the table. The read session also specifies properties of the
  // data to be read, such as a list of columns or a push-down filter describing
  // the rows to be returned.
  //
  // A particular row can be read by at most one stream. When the caller has
  // reached the end of each stream in the session, then all the data in the
  // table has been read.
  //
  // Read sessions automatically expire 6 hours after they are created and do
  // not require manual clean-up by the caller.
  rpc CreateReadSession(CreateReadSessionRequest) returns (ReadSession) {
    option (google.api.http) = {
      post: "/v1beta1/{table_reference.project_id=projects/*}"
      body: "*"
      additional_bindings {
        post: "/v1beta1/{table_reference.dataset_id=projects/*/datasets/*}"
        body: "*"
      }
    };
    option (google.api.method_signature) =
        "table_reference,parent,requested_streams";
  }
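
  // For illustration only: a hypothetical `CreateReadSessionRequest`, shown in
  // protobuf text format (the project, dataset, and table names are made up):
  //
  //     table_reference {
  //       project_id: "my-data-project"
  //       dataset_id: "my_dataset"
  //       table_id: "my_table"
  //     }
  //     parent: "projects/my-billing-project"
  //     requested_streams: 4
  //     format: AVRO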

  // Reads rows from the table in the format prescribed by the read session.
  // Each response contains one or more table rows, up to a maximum of 10 MiB
  // per response; read requests which attempt to read individual rows larger
  // than this will fail.
  //
  // Each request also returns a set of stream statistics reflecting the
  // estimated total number of rows in the read stream. This number is computed
  // based on the total table size and the number of active streams in the read
  // session, and may change as other streams continue to read data.
  rpc ReadRows(ReadRowsRequest) returns (stream ReadRowsResponse) {
    option (google.api.http) = {
      get: "/v1beta1/{read_position.stream.name=projects/*/streams/*}"
    };
    option (google.api.method_signature) = "read_position";
  }
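
  // For illustration only: a hypothetical `ReadRowsRequest` in protobuf text
  // format, starting from the beginning of a stream (the stream name is made
  // up):
  //
  //     read_position {
  //       stream {
  //         name: "projects/my-data-project/locations/us/streams/s1"
  //       }
  //       offset: 0
  //     }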

  // Creates additional streams for a ReadSession. This API can be used to
  // dynamically adjust the parallelism of a batch processing task upwards by
  // adding additional workers.
  rpc BatchCreateReadSessionStreams(BatchCreateReadSessionStreamsRequest)
      returns (BatchCreateReadSessionStreamsResponse) {
    option (google.api.http) = {
      post: "/v1beta1/{session.name=projects/*/sessions/*}"
      body: "*"
    };
    option (google.api.method_signature) = "session,requested_streams";
  }
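
  // For illustration only: a hypothetical
  // `BatchCreateReadSessionStreamsRequest` in protobuf text format (the
  // session name is made up); only the session name needs to be set:
  //
  //     session {
  //       name: "projects/my-data-project/locations/us/sessions/abc"
  //     }
  //     requested_streams: 2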

  // Causes a single stream in a ReadSession to gracefully stop. This
  // API can be used to dynamically adjust the parallelism of a batch processing
  // task downwards without losing data.
  //
  // This API does not delete the stream -- it remains visible in the
  // ReadSession, and any data processed by the stream is not released to other
  // streams. However, no additional data will be assigned to the stream once
  // this call completes. Callers must continue reading data on the stream until
  // the end of the stream is reached so that data which has already been
  // assigned to the stream will be processed.
  //
  // This method will return an error if there are no other live streams
  // in the Session, or if SplitReadStream() has been called on the given
  // Stream.
  rpc FinalizeStream(FinalizeStreamRequest) returns (google.protobuf.Empty) {
    option (google.api.http) = {
      post: "/v1beta1/{stream.name=projects/*/streams/*}"
      body: "*"
    };
    option (google.api.method_signature) = "stream";
  }

  // Splits a given read stream into two Streams. These streams are referred to
  // as the primary and the residual of the split. The original stream can still
  // be read from in the same manner as before. Both of the returned streams can
  // also be read from, and the total rows returned by both child streams will
  // be the same as the rows read from the original stream.
  //
  // Moreover, the two child streams will be allocated back to back in the
  // original Stream. Concretely, it is guaranteed that for streams Original,
  // Primary, and Residual, that Original[0-j] = Primary[0-j] and
  // Original[j-n] = Residual[0-m] once the streams have been read to
  // completion.
  //
  // This method is guaranteed to be idempotent.
  rpc SplitReadStream(SplitReadStreamRequest)
      returns (SplitReadStreamResponse) {
    option (google.api.http) = {
      get: "/v1beta1/{original_stream.name=projects/*/streams/*}"
    };
    option (google.api.method_signature) = "original_stream";
  }
}

// Information about a single data stream within a read session.
message Stream {
  option (google.api.resource) = {
    type: "bigquerystorage.googleapis.com/Stream"
    pattern: "projects/{project}/locations/{location}/streams/{stream}"
  };

  // Name of the stream, in the form
  // `projects/{project_id}/locations/{location}/streams/{stream_id}`.
  string name = 1;
}

// Expresses a point within a given stream using an offset position.
message StreamPosition {
  // Identifier for a given Stream.
  Stream stream = 1;

  // Position in the stream.
  int64 offset = 2;
}

// Information returned from a `CreateReadSession` request.
message ReadSession {
  option (google.api.resource) = {
    type: "bigquerystorage.googleapis.com/ReadSession"
    pattern: "projects/{project}/locations/{location}/sessions/{session}"
  };

  // Unique identifier for the session, in the form
  // `projects/{project_id}/locations/{location}/sessions/{session_id}`.
  string name = 1;

  // Time at which the session becomes invalid. After this time, subsequent
  // requests to read this Session will return errors.
  google.protobuf.Timestamp expire_time = 2;

  // The schema for the read. If read_options.selected_fields is set, the
  // schema may be different from the table schema as it will only contain
  // the selected fields.
  oneof schema {
    // Avro schema.
    AvroSchema avro_schema = 5;

    // Arrow schema.
    ArrowSchema arrow_schema = 6;
  }

  // Streams associated with this session.
  repeated Stream streams = 4;

  // Table that this ReadSession is reading from.
  TableReference table_reference = 7;

  // Any modifiers which are applied when reading from the specified table.
  TableModifiers table_modifiers = 8;

  // The strategy to use for distributing data among the streams.
  ShardingStrategy sharding_strategy = 9;
}
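
// For illustration only: an abridged, hypothetical `ReadSession` as it might
// be returned by `CreateReadSession`, in protobuf text format (all names,
// timestamps, and the elided schema payload are made up):
//
//     name: "projects/my-data-project/locations/us/sessions/abc"
//     expire_time { seconds: 1700000000 }
//     avro_schema { schema: "..." }
//     streams { name: "projects/my-data-project/locations/us/streams/s1" }
//     streams { name: "projects/my-data-project/locations/us/streams/s2" }
//     sharding_strategy: LIQUID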

// Data format for input or output data.
enum DataFormat {
  // Data format is unspecified.
  DATA_FORMAT_UNSPECIFIED = 0;

  // Avro is a standard open source row-based file format.
  // See https://avro.apache.org/ for more details.
  AVRO = 1;

  // Arrow is a standard open source column-based message format.
  // See https://arrow.apache.org/ for more details.
  ARROW = 3;
}

// Strategy for distributing data among multiple streams in a read session.
enum ShardingStrategy {
  // Same as LIQUID.
  SHARDING_STRATEGY_UNSPECIFIED = 0;

  // Assigns data to each stream based on the client's read rate. The faster the
  // client reads from a stream, the more data is assigned to the stream. In
  // this strategy, it's possible to read all data from a single stream even if
  // there are other streams present.
  LIQUID = 1;

  // Assigns data to each stream such that roughly the same number of rows can
  // be read from each stream. Because the server-side unit for assigning data
  // is collections of rows, the API does not guarantee that each stream will
  // return the same number of rows. Additionally, the limits are enforced based
  // on the number of pre-filtered rows, so some filters can lead to lopsided
  // assignments.
  BALANCED = 2;
}

// Request message for creating a new read session, which may include
// additional options such as requested parallelism, projection filters, and
// constraints.
message CreateReadSessionRequest {
  // Required. Reference to the table to read.
  TableReference table_reference = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. String of the form `projects/{project_id}` indicating the
  // project this ReadSession is associated with. This is the project that will
  // be billed for usage.
  string parent = 6 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "cloudresourcemanager.googleapis.com/Project"
    }
  ];

  // Any modifiers to the Table (e.g. snapshot timestamp).
  TableModifiers table_modifiers = 2;

  // Initial number of streams. If unset or zero, the server will provide a
  // value of streams so as to produce reasonable throughput. Must be
  // non-negative. The number of streams may be lower than the requested number,
  // depending on the amount of parallelism that is reasonable for the table and
  // the maximum amount of parallelism allowed by the system.
  //
  // Streams must be read starting from offset 0.
  int32 requested_streams = 3;

  // Read options for this session (e.g. column selection, filters).
  TableReadOptions read_options = 4;

  // Data output format. Currently defaults to Avro.
  // DATA_FORMAT_UNSPECIFIED is not supported.
  DataFormat format = 5;

  // The strategy to use for distributing data among multiple streams. Currently
  // defaults to liquid sharding.
  ShardingStrategy sharding_strategy = 7;
}

// A request to read rows via `ReadRows` must provide Stream position
// information.
message ReadRowsRequest {
  // Required. Identifier of the position in the stream to start reading from.
  // The offset requested must be less than the last row read from ReadRows.
  // Requesting a larger offset is undefined.
  StreamPosition read_position = 1 [(google.api.field_behavior) = REQUIRED];
}

// Progress information for a given Stream.
message StreamStatus {
  // Number of estimated rows in the current stream. May change over time as
  // different readers in the stream progress at rates which are relatively fast
  // or slow.
  int64 estimated_row_count = 1;

  // A value in the range [0.0, 1.0] that represents the fraction of rows
  // assigned to this stream that have been processed by the server. In the
  // presence of read filters, the server may process more rows than it returns,
  // so this value reflects progress through the pre-filtered rows.
  //
  // This value is only populated for sessions created through the BALANCED
  // sharding strategy.
  float fraction_consumed = 2;

  // Represents the progress of the current stream.
  Progress progress = 4;

  // Whether this stream can be split. For sessions that use the LIQUID sharding
  // strategy, this value is always false. For BALANCED sessions, this value is
  // false when enough data has been read such that no more splits are possible
  // at that point or beyond. For small tables or streams that are the result of
  // a chain of splits, this value may never be true.
  bool is_splittable = 3;
}
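
// For illustration only: a hypothetical `StreamStatus` for a BALANCED session
// that is roughly a quarter of the way through its assigned rows (all numbers
// are made up):
//
//     estimated_row_count: 1000000
//     fraction_consumed: 0.25
//     progress { at_response_start: 0.24 at_response_end: 0.26 }
//     is_splittable: true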

// Progress of a stream through the rows assigned to it, as reported in
// `StreamStatus`.
message Progress {
  // The fraction of rows assigned to the stream that have been processed by the
  // server so far, not including the rows in the current response message.
  //
  // This value, along with `at_response_end`, can be used to interpolate the
  // progress made as the rows in the message are being processed using the
  // following formula: `at_response_start + (at_response_end -
  // at_response_start) * rows_processed_from_response / rows_in_response`.
  //
  // Note that if a filter is provided, the `at_response_end` value of the
  // previous response may not necessarily be equal to the `at_response_start`
  // value of the current response.
  float at_response_start = 1;

  // Similar to `at_response_start`, except that this value includes the rows in
  // the current response.
  float at_response_end = 2;
}
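
// For illustration only: a worked example of the interpolation formula above,
// with made-up numbers. If `at_response_start = 0.2`, `at_response_end = 0.3`,
// and the current response contains 100 rows, then a reader that has processed
// 50 of those rows can estimate its overall progress as
// 0.2 + (0.3 - 0.2) * 50 / 100 = 0.25.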

// Information on whether the current connection is being throttled.
message ThrottleStatus {
  // How much this connection is being throttled.
  // 0 is no throttling, 100 is completely throttled.
  int32 throttle_percent = 1;
}

// Response from calling `ReadRows`. May include row data, progress, and
// throttling information.
message ReadRowsResponse {
  // Row data is returned in the format specified during session creation.
  oneof rows {
    // Serialized row data in AVRO format.
    AvroRows avro_rows = 3;

    // Serialized row data in Arrow RecordBatch format.
    ArrowRecordBatch arrow_record_batch = 4;
  }

  // Number of serialized rows in the rows block. This value is recorded here,
  // in addition to the row_count values in the output-specific messages in
  // `rows`, so that code which needs to record progress through the stream can
  // do so in an output format-independent way.
  int64 row_count = 6;

  // Estimated stream statistics.
  StreamStatus status = 2;

  // Throttling status. If unset, the latest response still describes
  // the current throttling status.
  ThrottleStatus throttle_status = 5;

  // The schema for the read. If read_options.selected_fields is set, the
  // schema may be different from the table schema as it will only contain
  // the selected fields. This schema is equivalent to the one returned by
  // CreateReadSession. This field is only populated in the first
  // ReadRowsResponse RPC.
  oneof schema {
    // Output only. Avro schema.
    AvroSchema avro_schema = 7 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. Arrow schema.
    ArrowSchema arrow_schema = 8 [(google.api.field_behavior) = OUTPUT_ONLY];
  }
}
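
// For illustration only: a hypothetical first `ReadRowsResponse` for an
// Avro-format session, in protobuf text format (the payload and all counts are
// made up; `serialized_binary_rows` would carry Avro-encoded bytes):
//
//     avro_rows { serialized_binary_rows: "..." row_count: 500 }
//     row_count: 500
//     status { estimated_row_count: 1000000 fraction_consumed: 0.05 }
//     avro_schema { schema: "..." }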

// Information needed to request additional streams for an established read
// session.
message BatchCreateReadSessionStreamsRequest {
  // Required. Must be a non-expired session obtained from a call to
  // CreateReadSession. Only the name field needs to be set.
  ReadSession session = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Number of new streams requested. Must be positive.
  // Number of added streams may be less than this; see CreateReadSessionRequest
  // for more information.
  int32 requested_streams = 2 [(google.api.field_behavior) = REQUIRED];
}

// The response from `BatchCreateReadSessionStreams` returns the stream
// identifiers for the newly created streams.
message BatchCreateReadSessionStreamsResponse {
  // Newly added streams.
  repeated Stream streams = 1;
}

// Request information for invoking `FinalizeStream`.
message FinalizeStreamRequest {
  // Required. Stream to finalize.
  Stream stream = 2 [(google.api.field_behavior) = REQUIRED];
}

// Request information for `SplitReadStream`.
message SplitReadStreamRequest {
  // Required. Stream to split.
  Stream original_stream = 1 [(google.api.field_behavior) = REQUIRED];

  // A value in the range (0.0, 1.0) that specifies the fractional point at
  // which the original stream should be split. The actual split point is
  // evaluated on pre-filtered rows, so if a filter is provided, then there is
  // no guarantee that the division of the rows between the new child streams
  // will be proportional to this fractional value. Additionally, because the
  // server-side unit for assigning data is collections of rows, this fraction
  // will always map to a data storage boundary on the server side.
  float fraction = 2;
}
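
// For illustration only: a hypothetical `SplitReadStreamRequest` that asks for
// an even split of a stream (the stream name is made up):
//
//     original_stream {
//       name: "projects/my-data-project/locations/us/streams/s1"
//     }
//     fraction: 0.5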

// Response from `SplitReadStream`.
message SplitReadStreamResponse {
  // Primary stream, which contains the beginning portion of
  // |original_stream|. An empty value indicates that the original stream can no
  // longer be split.
  Stream primary_stream = 1;

  // Remainder stream, which contains the tail of |original_stream|. An empty
  // value indicates that the original stream can no longer be split.
  Stream remainder_stream = 2;
}
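
// For illustration only: a hypothetical `SplitReadStreamResponse` for the
// request above (stream names are made up). The primary stream covers the
// head of the original stream and the remainder stream covers its tail:
//
//     primary_stream { name: "projects/my-data-project/locations/us/streams/s1p" }
//     remainder_stream { name: "projects/my-data-project/locations/us/streams/s1r" }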