// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.bigquery.storage.v1beta2;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/bigquery/storage/v1beta2/arrow.proto";
import "google/cloud/bigquery/storage/v1beta2/avro.proto";
import "google/cloud/bigquery/storage/v1beta2/protobuf.proto";
import "google/cloud/bigquery/storage/v1beta2/stream.proto";
import "google/cloud/bigquery/storage/v1beta2/table.proto";
import "google/protobuf/timestamp.proto";
import "google/protobuf/wrappers.proto";
import "google/rpc/status.proto";

option go_package = "cloud.google.com/go/bigquery/storage/apiv1beta2/storagepb;storagepb";
option java_multiple_files = true;
option java_outer_classname = "StorageProto";
option java_package = "com.google.cloud.bigquery.storage.v1beta2";

// BigQuery Read API.
//
// The Read API can be used to read data from BigQuery.
//
// New code should use the v1 Read API going forward, unless it also needs to
// use the Write API at the same time.
service BigQueryRead {
  option (google.api.default_host) = "bigquerystorage.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/bigquery,"
      "https://www.googleapis.com/auth/cloud-platform";

  // Creates a new read session. A read session divides the contents of a
  // BigQuery table into one or more streams, which can then be used to read
  // data from the table. The read session also specifies properties of the
  // data to be read, such as a list of columns or a push-down filter
  // describing the rows to be returned.
  //
  // A particular row can be read by at most one stream. When the caller has
  // reached the end of each stream in the session, then all the data in the
  // table has been read.
  //
  // Data is assigned to each stream such that roughly the same number of
  // rows can be read from each stream. Because the server-side unit for
  // assigning data is collections of rows, the API does not guarantee that
  // each stream will return the same number of rows. Additionally, the
  // limits are enforced based on the number of pre-filtered rows, so some
  // filters can lead to lopsided assignments.
  //
  // Read sessions automatically expire 6 hours after they are created and do
  // not require manual clean-up by the caller.
  rpc CreateReadSession(CreateReadSessionRequest) returns (ReadSession) {
    option (google.api.http) = {
      post: "/v1beta2/{read_session.table=projects/*/datasets/*/tables/*}"
      body: "*"
    };
    option (google.api.method_signature) =
        "parent,read_session,max_stream_count";
  }
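
  // Illustrative `CreateReadSessionRequest` for the RPC above, in proto text
  // format. This is a minimal sketch: the project, dataset, and table names
  // are placeholders, and the `table` and `data_format` fields are assumed to
  // be defined on `ReadSession` in stream.proto.
  //
  //   parent: "projects/my-project"
  //   read_session {
  //     table: "projects/my-project/datasets/my_dataset/tables/my_table"
  //     data_format: AVRO
  //   }
  //   max_stream_count: 4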

  // Reads rows from the stream in the format prescribed by the ReadSession.
  // Each response contains one or more table rows, up to a maximum of 100 MiB
  // per response; read requests which attempt to read individual rows larger
  // than 100 MiB will fail.
  //
  // Each request also returns a set of stream statistics reflecting the
  // current state of the stream.
  rpc ReadRows(ReadRowsRequest) returns (stream ReadRowsResponse) {
    option (google.api.http) = {
      get: "/v1beta2/{read_stream=projects/*/locations/*/sessions/*/streams/*}"
    };
    option (google.api.method_signature) = "read_stream,offset";
  }

  // Splits a given `ReadStream` into two `ReadStream` objects. These
  // `ReadStream` objects are referred to as the primary and the residual
  // streams of the split. The original `ReadStream` can still be read from in
  // the same manner as before. Both of the returned `ReadStream` objects can
  // also be read from, and the rows returned by both child streams will be
  // the same as the rows read from the original stream.
  //
  // Moreover, the two child streams will be allocated back-to-back in the
  // original `ReadStream`. Concretely, it is guaranteed that for streams
  // original, primary, and residual, that original[0-j] = primary[0-j] and
  // original[j-n] = residual[0-m] once the streams have been read to
  // completion.
  rpc SplitReadStream(SplitReadStreamRequest)
      returns (SplitReadStreamResponse) {
    option (google.api.http) = {
      get: "/v1beta2/{name=projects/*/locations/*/sessions/*/streams/*}"
    };
  }
}

// BigQuery Write API.
//
// The Write API can be used to write data to BigQuery.
//
// The [google.cloud.bigquery.storage.v1
// API](/bigquery/docs/reference/storage/rpc/google.cloud.bigquery.storage.v1)
// should be used instead of the v1beta2 API for BigQueryWrite operations.
service BigQueryWrite {
  option deprecated = true;
  option (google.api.default_host) = "bigquerystorage.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/bigquery,"
      "https://www.googleapis.com/auth/bigquery.insertdata,"
      "https://www.googleapis.com/auth/cloud-platform";

  // Creates a write stream to the given table.
  // Additionally, every table has a special COMMITTED stream named '_default'
  // to which data can be written. This stream doesn't need to be created using
  // CreateWriteStream. It is a stream that can be used simultaneously by any
  // number of clients. Data written to this stream is considered committed as
  // soon as an acknowledgement is received.
  rpc CreateWriteStream(CreateWriteStreamRequest) returns (WriteStream) {
    option deprecated = true;
    option (google.api.http) = {
      post: "/v1beta2/{parent=projects/*/datasets/*/tables/*}"
      body: "write_stream"
    };
    option (google.api.method_signature) = "parent,write_stream";
  }
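
  // Illustrative `CreateWriteStreamRequest` for the RPC above, in proto text
  // format. This is a sketch with placeholder names; the `type` field is
  // assumed to be defined on `WriteStream` in stream.proto.
  //
  //   parent: "projects/my-project/datasets/my_dataset/tables/my_table"
  //   write_stream {
  //     type: PENDING
  //   }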

  // Appends data to the given stream.
  //
  // If `offset` is specified, the `offset` is checked against the end of
  // stream. The server returns `OUT_OF_RANGE` in `AppendRowsResponse` if an
  // attempt is made to append to an offset beyond the current end of the
  // stream, or `ALREADY_EXISTS` if the user provides an `offset` that has
  // already been written to. The user can retry with an adjusted offset within
  // the same RPC stream. If `offset` is not specified, the append happens at
  // the end of the stream. (An illustrative offset example appears after this
  // service definition.)
  //
  // The response contains the offset at which the append happened. Responses
  // are received in the same order in which requests are sent. There will be
  // one response for each successful request. If the `offset` is not set in
  // the response, it means the append did not happen due to an error. If one
  // request fails, all subsequent requests will also fail until a successful
  // request is made again.
  //
  // If the stream is of `PENDING` type, data will only be available for read
  // operations after the stream is committed.
  rpc AppendRows(stream AppendRowsRequest) returns (stream AppendRowsResponse) {
    option deprecated = true;
    option (google.api.http) = {
      post: "/v1beta2/{write_stream=projects/*/datasets/*/tables/*/streams/*}"
      body: "*"
    };
    option (google.api.method_signature) = "write_stream";
  }

  // Gets a write stream.
  rpc GetWriteStream(GetWriteStreamRequest) returns (WriteStream) {
    option deprecated = true;
    option (google.api.http) = {
      post: "/v1beta2/{name=projects/*/datasets/*/tables/*/streams/*}"
      body: "*"
    };
    option (google.api.method_signature) = "name";
  }

  // Finalizes a write stream so that no new data can be appended to the
  // stream. Finalize is not supported on the '_default' stream.
  rpc FinalizeWriteStream(FinalizeWriteStreamRequest)
      returns (FinalizeWriteStreamResponse) {
    option deprecated = true;
    option (google.api.http) = {
      post: "/v1beta2/{name=projects/*/datasets/*/tables/*/streams/*}"
      body: "*"
    };
    option (google.api.method_signature) = "name";
  }

  // Atomically commits a group of `PENDING` streams that belong to the same
  // `parent` table.
  // Streams must be finalized before commit and cannot be committed multiple
  // times. Once a stream is committed, data in the stream becomes available
  // for read operations.
  rpc BatchCommitWriteStreams(BatchCommitWriteStreamsRequest)
      returns (BatchCommitWriteStreamsResponse) {
    option deprecated = true;
    option (google.api.http) = {
      get: "/v1beta2/{parent=projects/*/datasets/*/tables/*}"
    };
    option (google.api.method_signature) = "parent";
  }

  // Flushes rows to a BUFFERED stream.
  // If users are appending rows to a BUFFERED stream, a flush operation is
  // required in order for the rows to become available for reading. A
  // flush operation flushes up to any previously flushed offset in a BUFFERED
  // stream, to the offset specified in the request.
  // Flush is not supported on the _default stream, since it is not BUFFERED.
  rpc FlushRows(FlushRowsRequest) returns (FlushRowsResponse) {
    option deprecated = true;
    option (google.api.http) = {
      post: "/v1beta2/{write_stream=projects/*/datasets/*/tables/*/streams/*}"
      body: "*"
    };
    option (google.api.method_signature) = "write_stream";
  }
}
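
// Illustrative `AppendRows` offset behavior (hypothetical numbers): if the
// current end of a stream is at offset 100 and a request sets `offset` to 100,
// the append succeeds and the response reports offset 100. A request with
// `offset` 90 returns `ALREADY_EXISTS` (those rows were already written), and
// a request with `offset` 110 returns `OUT_OF_RANGE` (beyond the current end
// of the stream). In both error cases the user can retry with an adjusted
// offset on the same RPC stream.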

// Request message for `CreateReadSession`.
message CreateReadSessionRequest {
  // Required. The request project that owns the session, in the form of
  // `projects/{project_id}`.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "cloudresourcemanager.googleapis.com/Project"
    }
  ];

  // Required. Session to be created.
  ReadSession read_session = 2 [(google.api.field_behavior) = REQUIRED];

  // Max initial number of streams. If unset or zero, the server will
  // provide a value of streams so as to produce reasonable throughput. Must be
  // non-negative. The number of streams may be lower than the requested
  // number, depending on the amount of parallelism that is reasonable for the
  // table. An error will be returned if the max count is greater than the
  // current system max limit of 1,000.
  //
  // Streams must be read starting from offset 0.
  int32 max_stream_count = 3;
}

// Request message for `ReadRows`.
message ReadRowsRequest {
  // Required. Stream to read rows from.
  string read_stream = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "bigquerystorage.googleapis.com/ReadStream"
    }
  ];

  // The offset requested must be less than the last row read from Read.
  // Requesting a larger offset is undefined. If not specified, start reading
  // from offset zero.
  int64 offset = 2;
}

// Information about whether the current connection is being throttled.
message ThrottleState {
  // How much this connection is being throttled. Zero means no throttling,
  // 100 means fully throttled.
  int32 throttle_percent = 1;
}

// Estimated stream statistics for a given Stream.
message StreamStats {
  message Progress {
    // The fraction of rows assigned to the stream that have been processed by
    // the server so far, not including the rows in the current response
    // message.
    //
    // This value, along with `at_response_end`, can be used to interpolate
    // the progress made as the rows in the message are being processed using
    // the following formula: `at_response_start + (at_response_end -
    // at_response_start) * rows_processed_from_response / rows_in_response`.
    // (A worked example with illustrative numbers follows the
    // `ReadRowsResponse` message below.)
    //
    // Note that if a filter is provided, the `at_response_end` value of the
    // previous response may not necessarily be equal to the
    // `at_response_start` value of the current response.
    double at_response_start = 1;

    // Similar to `at_response_start`, except that this value includes the
    // rows in the current response.
    double at_response_end = 2;
  }

  // Represents the progress of the current stream.
  Progress progress = 2;
}

// A response from calling `ReadRows` may include row data, progress, and
// throttling information.
message ReadRowsResponse {
  // Row data is returned in the format specified during session creation.
  oneof rows {
    // Serialized row data in AVRO format.
    AvroRows avro_rows = 3;

    // Serialized row data in Arrow RecordBatch format.
    ArrowRecordBatch arrow_record_batch = 4;
  }

  // Number of serialized rows in the rows block.
  int64 row_count = 6;

  // Statistics for the stream.
  StreamStats stats = 2;

  // Throttling state. If unset, the latest response still describes
  // the current throttling status.
  ThrottleState throttle_state = 5;

  // The schema for the read. If read_options.selected_fields is set, the
  // schema may be different from the table schema as it will only contain
  // the selected fields. This schema is equivalent to the one returned by
  // CreateSession. This field is only populated in the first ReadRowsResponse
  // RPC.
  oneof schema {
    // Output only. Avro schema.
    AvroSchema avro_schema = 7 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. Arrow schema.
    ArrowSchema arrow_schema = 8 [(google.api.field_behavior) = OUTPUT_ONLY];
  }
}
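
// Worked example for `StreamStats.Progress` (illustrative numbers): if a
// `ReadRowsResponse` carries `stats.progress` with `at_response_start` = 0.25
// and `at_response_end` = 0.35, and the response contains 100 rows, then after
// the client has processed 40 of those rows the interpolated progress is
// 0.25 + (0.35 - 0.25) * 40 / 100 = 0.29, i.e. roughly 29% of the rows
// assigned to this stream.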

// Request message for `SplitReadStream`.
message SplitReadStreamRequest {
  // Required. Name of the stream to split.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "bigquerystorage.googleapis.com/ReadStream"
    }
  ];

  // A value in the range (0.0, 1.0) that specifies the fractional point at
  // which the original stream should be split. The actual split point is
  // evaluated on pre-filtered rows, so if a filter is provided, then there is
  // no guarantee that the division of the rows between the new child streams
  // will be proportional to this fractional value. Additionally, because the
  // server-side unit for assigning data is collections of rows, this fraction
  // will always map to a data storage boundary on the server side.
  double fraction = 2;
}

// Response message for `SplitReadStream`.
message SplitReadStreamResponse {
  // Primary stream, which contains the beginning portion of
  // |original_stream|. An empty value indicates that the original stream can
  // no longer be split.
  ReadStream primary_stream = 1;

  // Remainder stream, which contains the tail of |original_stream|. An empty
  // value indicates that the original stream can no longer be split.
  ReadStream remainder_stream = 2;
}

// Request message for `CreateWriteStream`.
message CreateWriteStreamRequest {
  // Required. Reference to the table to which the stream belongs, in the
  // format of `projects/{project}/datasets/{dataset}/tables/{table}`.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = { type: "bigquery.googleapis.com/Table" }
  ];

  // Required. Stream to be created.
  WriteStream write_stream = 2 [(google.api.field_behavior) = REQUIRED];
}

// Request message for `AppendRows`.
message AppendRowsRequest {
  // Proto schema and data.
  message ProtoData {
    // Proto schema used to serialize the data.
    ProtoSchema writer_schema = 1;

    // Serialized row data in protobuf message format.
    ProtoRows rows = 2;
  }

  // Required. The stream that is the target of the append operation. This
  // value must be specified for the initial request. If subsequent requests
  // specify the stream name, it must equal the value provided in the first
  // request. To write to the _default stream, populate this field with a
  // string in the format
  // `projects/{project}/datasets/{dataset}/tables/{table}/_default`.
  string write_stream = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "bigquerystorage.googleapis.com/WriteStream"
    }
  ];

  // If present, the write is only performed if the next append offset is the
  // same as the provided value. If not present, the write is performed at the
  // current end of stream. Specifying a value for this field is not allowed
  // when calling AppendRows for the '_default' stream.
  google.protobuf.Int64Value offset = 2;

  // Input rows. The `writer_schema` field must be specified in the initial
  // request and, currently, it will be ignored if specified in following
  // requests. Following requests must have data in the same format as the
  // initial request.
  oneof rows {
    // Rows in proto format.
    ProtoData proto_rows = 4;
  }

  // ID set by the client to annotate its identity. Only the setting on the
  // initial request is respected.
  string trace_id = 6;
}
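
// Illustrative `AppendRowsRequest` in proto text format. This is a minimal
// sketch with placeholder names; the `proto_descriptor` and `serialized_rows`
// fields are assumed to be the contents of `ProtoSchema` and `ProtoRows` in
// protobuf.proto, and the serialized row bytes are elided.
//
//   write_stream: "projects/my-project/datasets/my_dataset/tables/my_table/streams/my_stream"
//   offset { value: 0 }
//   proto_rows {
//     writer_schema {
//       proto_descriptor { name: "MyRow" }
//     }
//     rows {
//       serialized_rows: "..."  # serialized MyRow messages, one per row
//     }
//   }
//   trace_id: "example-writer"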

// Response message for `AppendRows`.
message AppendRowsResponse {
  // AppendResult is returned for successful append requests.
  message AppendResult {
    // The row offset at which the last append occurred. The offset will not be
    // set if appending using default streams.
    google.protobuf.Int64Value offset = 1;
  }

  oneof response {
    // Result if the append is successful.
    AppendResult append_result = 1;

    // Error returned when problems were encountered. If present,
    // it indicates rows were not accepted into the system.
    // Users can retry or continue with other append requests within the
    // same connection.
    //
    // Additional information about error signalling:
    //
    // ALREADY_EXISTS: Happens when an append specified an offset, and the
    // backend already has received data at this offset. Typically encountered
    // in retry scenarios, and can be ignored.
    //
    // OUT_OF_RANGE: Returned when the specified offset in the stream is beyond
    // the current end of the stream.
    //
    // INVALID_ARGUMENT: Indicates a malformed request or data.
    //
    // ABORTED: Request processing is aborted because of prior failures. The
    // request can be retried if the previous failure is addressed.
    //
    // INTERNAL: Indicates server side error(s) that can be retried.
    google.rpc.Status error = 2;
  }

  // If the backend detects a schema update, it is passed to the user so that
  // the user can use it to input messages of the new type. It will be empty
  // when no schema updates have occurred.
  TableSchema updated_schema = 3;
}

// Request message for `GetWriteStream`.
message GetWriteStreamRequest {
  // Required. Name of the stream to get, in the form of
  // `projects/{project}/datasets/{dataset}/tables/{table}/streams/{stream}`.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "bigquerystorage.googleapis.com/WriteStream"
    }
  ];
}

// Request message for `BatchCommitWriteStreams`.
message BatchCommitWriteStreamsRequest {
  // Required. Parent table that all the streams should belong to, in the form
  // of `projects/{project}/datasets/{dataset}/tables/{table}`.
  string parent = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. The group of streams that will be committed atomically.
  repeated string write_streams = 2 [(google.api.field_behavior) = REQUIRED];
}

// Response message for `BatchCommitWriteStreams`.
message BatchCommitWriteStreamsResponse {
  // The time at which streams were committed, with microsecond granularity.
  // This field will only exist when there are no stream errors.
  // **Note** if this field is not set, it means the commit was not successful.
  google.protobuf.Timestamp commit_time = 1;

  // Stream-level error if commit failed. Only streams with an error will be in
  // the list.
  // If empty, there is no error and all streams are committed successfully.
  // If non-empty, certain streams have errors and ZERO streams are committed
  // due to the atomicity guarantee.
  repeated StorageError stream_errors = 2;
}
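
// Illustrative `BatchCommitWriteStreamsRequest` in proto text format (a sketch
// with placeholder names; both streams must already be finalized `PENDING`
// streams on the same parent table):
//
//   parent: "projects/my-project/datasets/my_dataset/tables/my_table"
//   write_streams: "projects/my-project/datasets/my_dataset/tables/my_table/streams/stream-1"
//   write_streams: "projects/my-project/datasets/my_dataset/tables/my_table/streams/stream-2"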

// Request message for invoking `FinalizeWriteStream`.
message FinalizeWriteStreamRequest {
  // Required. Name of the stream to finalize, in the form of
  // `projects/{project}/datasets/{dataset}/tables/{table}/streams/{stream}`.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "bigquerystorage.googleapis.com/WriteStream"
    }
  ];
}

// Response message for `FinalizeWriteStream`.
message FinalizeWriteStreamResponse {
  // Number of rows in the finalized stream.
  int64 row_count = 1;
}

// Request message for `FlushRows`.
message FlushRowsRequest {
  // Required. The stream that is the target of the flush operation.
  string write_stream = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "bigquerystorage.googleapis.com/WriteStream"
    }
  ];

  // Ending offset of the flush operation. Rows before this offset (including
  // this offset) will be flushed.
  google.protobuf.Int64Value offset = 2;
}

// Response message for `FlushRows`.
message FlushRowsResponse {
  // The rows before this offset (including this offset) are flushed.
  int64 offset = 1;
}

// Structured custom BigQuery Storage error message. The error can be attached
// as error details in the returned rpc Status. In particular, the use of error
// codes allows more structured error handling and reduces the need to evaluate
// unstructured error text strings.
message StorageError {
  // Error code for `StorageError`.
  enum StorageErrorCode {
    // Default error.
    STORAGE_ERROR_CODE_UNSPECIFIED = 0;

    // Table is not found in the system.
    TABLE_NOT_FOUND = 1;

    // Stream is already committed.
    STREAM_ALREADY_COMMITTED = 2;

    // Stream is not found.
    STREAM_NOT_FOUND = 3;

    // Invalid Stream type.
    // For example, you try to commit a stream that is not pending.
    INVALID_STREAM_TYPE = 4;

    // Invalid Stream state.
    // For example, you try to commit a stream that is not finalized or has
    // been garbage collected.
    INVALID_STREAM_STATE = 5;

    // Stream is finalized.
    STREAM_FINALIZED = 6;
  }

  // BigQuery Storage specific error code.
  StorageErrorCode code = 1;

  // Name of the failed entity.
  string entity = 2;

  // Message that describes the error.
  string error_message = 3;
}
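
// Illustrative `StorageError` in proto text format (a sketch with a
// placeholder stream name and message), as it might appear in `stream_errors`
// when a commit is attempted on an already committed stream:
//
//   code: STREAM_ALREADY_COMMITTED
//   entity: "projects/my-project/datasets/my_dataset/tables/my_table/streams/stream-1"
//   error_message: "Stream has already been committed."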