• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Copyright 2023 Google LLC
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
syntax = "proto3";

package google.cloud.aiplatform.v1beta1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/aiplatform/v1beta1/encryption_spec.proto";
import "google/cloud/aiplatform/v1beta1/io.proto";
import "google/cloud/aiplatform/v1beta1/saved_query.proto";
import "google/protobuf/struct.proto";
import "google/protobuf/timestamp.proto";

// Per-language code generation options.
option csharp_namespace = "Google.Cloud.AIPlatform.V1Beta1";
option go_package = "cloud.google.com/go/aiplatform/apiv1beta1/aiplatformpb;aiplatformpb";
option java_multiple_files = true;
option java_outer_classname = "DatasetProto";
option java_package = "com.google.cloud.aiplatform.v1beta1";
option php_namespace = "Google\\Cloud\\AIPlatform\\V1beta1";
option ruby_package = "Google::Cloud::AIPlatform::V1beta1";

// A collection of DataItems and Annotations on them.
message Dataset {
  option (google.api.resource) = {
    type: "aiplatform.googleapis.com/Dataset"
    pattern: "projects/{project}/locations/{location}/datasets/{dataset}"
  };

  // Output only. The resource name of the Dataset.
  string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Required. The user-defined name of the Dataset.
  // The name can be up to 128 characters long and can consist of any UTF-8
  // characters.
  string display_name = 2 [(google.api.field_behavior) = REQUIRED];

  // The description of the Dataset.
  string description = 16;

  // Required. Points to a YAML file stored on Google Cloud Storage describing
  // additional information about the Dataset. The schema is defined as an
  // OpenAPI 3.0.2 Schema Object. The schema files that can be used here are
  // found in gs://google-cloud-aiplatform/schema/dataset/metadata/.
  string metadata_schema_uri = 3 [(google.api.field_behavior) = REQUIRED];

  // Required. Additional information about the Dataset.
  google.protobuf.Value metadata = 8 [(google.api.field_behavior) = REQUIRED];

  // Output only. The number of DataItems in this Dataset. Only applies to
  // non-structured Datasets.
  int64 data_item_count = 10 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Timestamp when this Dataset was created.
  google.protobuf.Timestamp create_time = 4
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Timestamp when this Dataset was last updated.
  google.protobuf.Timestamp update_time = 5
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Used to perform consistent read-modify-write updates. If not set, a blind
  // "overwrite" update happens.
  string etag = 6;

  // The labels with user-defined metadata to organize your Datasets.
  //
  // Label keys and values can be no longer than 64 characters
  // (Unicode codepoints), can only contain lowercase letters, numeric
  // characters, underscores and dashes. International characters are allowed.
  // No more than 64 user labels can be associated with one Dataset (System
  // labels are excluded).
  //
  // See https://goo.gl/xmQnxf for more information and examples of labels.
  // System reserved label keys are prefixed with "aiplatform.googleapis.com/"
  // and are immutable. The following system labels exist for each Dataset:
  //
  // * "aiplatform.googleapis.com/dataset_metadata_schema": output only, its
  //   value is the
  //   [metadata_schema's][google.cloud.aiplatform.v1beta1.Dataset.metadata_schema_uri]
  //   title.
  map<string, string> labels = 7;

  // All SavedQueries that belong to the Dataset are returned in List/Get
  // Dataset responses. The annotation_specs field
  // will not be populated except for UI cases which will only use
  // [annotation_spec_count][google.cloud.aiplatform.v1beta1.SavedQuery.annotation_spec_count].
  // In a CreateDataset request, a SavedQuery is created together with the
  // Dataset if this field is set; at most one SavedQuery can be set in
  // CreateDatasetRequest.
  // The SavedQuery should not contain any AnnotationSpec.
  repeated SavedQuery saved_queries = 9;

  // Customer-managed encryption key spec for a Dataset. If set, this Dataset
  // and all sub-resources of this Dataset will be secured by this key.
  EncryptionSpec encryption_spec = 11;

  // Output only. The resource name of the Artifact that was created in
  // MetadataStore when creating the Dataset. The Artifact resource name pattern
  // is
  // `projects/{project}/locations/{location}/metadataStores/{metadata_store}/artifacts/{artifact}`.
  string metadata_artifact = 17 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Describes the location from where we import data into a Dataset, together
// with the labels that will be applied to the DataItems and the Annotations.
message ImportDataConfig {
  // The source of the input.
  oneof source {
    // The Google Cloud Storage location for the input content.
    GcsSource gcs_source = 1;
  }

  // Labels that will be applied to newly imported DataItems. If a DataItem
  // identical to the one being imported already exists in the Dataset, then
  // these labels will be appended to those of the already existing one, and if
  // a label with an identical key was imported before, the old label value
  // will be overwritten. If two DataItems are identical in the same import data
  // operation, the labels will be combined and if key collision happens in this
  // case, one of the values will be picked randomly. Two DataItems are
  // considered identical if their content bytes are identical (e.g. image bytes
  // or pdf bytes).
  // These labels will be overridden by Annotation labels specified inside index
  // file referenced by
  // [import_schema_uri][google.cloud.aiplatform.v1beta1.ImportDataConfig.import_schema_uri],
  // e.g. jsonl file.
  map<string, string> data_item_labels = 2;

  // Labels that will be applied to newly imported Annotations. If two
  // Annotations are identical, one of them will be deduped. Two Annotations are
  // considered identical if their
  // [payload][google.cloud.aiplatform.v1beta1.Annotation.payload],
  // [payload_schema_uri][google.cloud.aiplatform.v1beta1.Annotation.payload_schema_uri]
  // and all of their
  // [labels][google.cloud.aiplatform.v1beta1.Annotation.labels] are the same.
  // These labels will be overridden by Annotation labels specified inside index
  // file referenced by
  // [import_schema_uri][google.cloud.aiplatform.v1beta1.ImportDataConfig.import_schema_uri],
  // e.g. jsonl file.
  map<string, string> annotation_labels = 3;

  // Required. Points to a YAML file stored on Google Cloud Storage describing
  // the import format. Validation will be done against the schema. The schema
  // is defined as an [OpenAPI 3.0.2 Schema
  // Object](https://github.com/OAI/OpenAPI-Specification/blob/main/versions/3.0.2.md#schemaObject).
  string import_schema_uri = 4 [(google.api.field_behavior) = REQUIRED];
}

// Describes what part of the Dataset is to be exported, the destination of
// the export and how to export.
message ExportDataConfig {
  // The destination of the output.
  oneof destination {
    // The Google Cloud Storage location where the output is to be written to.
    // In the given directory a new directory will be created with name:
    // `export-data-<dataset-display-name>-<timestamp-of-export-call>` where
    // timestamp is in YYYY-MM-DDThh:mm:ss.sssZ ISO-8601 format. All export
    // output will be written into that directory. Inside that directory,
    // annotations with the same schema will be grouped into sub directories
    // which are named with the corresponding annotations' schema title. Inside
    // these sub directories, a schema.yaml will be created to describe the
    // output format.
    GcsDestination gcs_destination = 1;
  }

  // Instructions for how the exported data should be split between the
  // training, validation and test sets.
  oneof split {
    // Split based on fractions defining the size of each set.
    ExportFractionSplit fraction_split = 5;
  }

  // An expression for filtering what part of the Dataset is to be exported.
  // Only Annotations that match this filter will be exported. The filter syntax
  // is the same as in
  // [ListAnnotations][google.cloud.aiplatform.v1beta1.DatasetService.ListAnnotations].
  string annotations_filter = 2;
}

// Assigns the input data to training, validation, and test sets as per the
// given fractions. Any of `training_fraction`, `validation_fraction` and
// `test_fraction` may optionally be provided; together they must sum to at
// most 1. If the provided ones sum to less than 1, the remainder is assigned
// to sets as decided by Vertex AI. If none of the fractions are set, by
// default roughly 80% of data is used for training, 10% for validation, and
// 10% for test.
message ExportFractionSplit {
  // The fraction of the input data that is to be used to train the Model.
  double training_fraction = 1;

  // The fraction of the input data that is to be used to validate the Model.
  double validation_fraction = 2;

  // The fraction of the input data that is to be used to evaluate the Model.
  double test_fraction = 3;
}