1 /**
2 * Copyright 2019-2022 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "minddata/dataset/engine/datasetops/source/random_data_op.h"
18
19 #include <algorithm>
20 #include <iomanip>
21 #include <random>
22 #include "minddata/dataset/engine/execution_tree.h"
23 #include "minddata/dataset/core/config_manager.h"
24 #include "minddata/dataset/util/random.h"
25 #include "minddata/dataset/util/wait_post.h"
26 #include "minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h"
27
28 namespace mindspore {
29 namespace dataset {
30 // Constructor for RandomDataOp
RandomDataOp(int32_t num_workers,int32_t op_connector_size,int64_t total_rows,std::unique_ptr<DataSchema> data_schema)31 RandomDataOp::RandomDataOp(int32_t num_workers, int32_t op_connector_size, int64_t total_rows,
32 std::unique_ptr<DataSchema> data_schema)
33 : MappableLeafOp(num_workers, op_connector_size, std::make_shared<SequentialSamplerRT>(0, 0)),
34 total_rows_(total_rows),
35 data_schema_(std::move(data_schema)) {
36 rand_gen_.seed(GetSeed()); // seed the random generator
37 // If total rows was not given, then randomly pick a number
38 if (total_rows_ == 0) {
39 total_rows_ = GenRandomInt(1, kMaxTotalRows);
40 }
41 // If the user did not provide a schema, then we will ask the op to generate a pseudo-random schema.
42 // See details of generateSchema function to learn what type of schema it will create.
43 if (data_schema_ == nullptr) {
44 GenerateSchema();
45 }
46 }
47
48 // A print method typically used for debugging
Print(std::ostream & out,bool show_all) const49 void RandomDataOp::Print(std::ostream &out, bool show_all) const {
50 if (!show_all) {
51 // Call the super class for displaying any common 1-liner info
52 ParallelOp::Print(out, show_all);
53 // Then show any custom derived-internal 1-liner info for this op
54 out << " [total rows: " << num_rows_ << "]\n";
55 } else {
56 // Call the super class for displaying any common detailed info
57 ParallelOp::Print(out, show_all);
58 // Then show any custom derived-internal stuff
59 out << "\nTotal_rows: " << num_rows_ << " \nSchema:\n" << *data_schema_ << "\n\n";
60 }
61 }
62
63 // Helper function to produce a default/random schema if one didn't exist
GenerateSchema()64 void RandomDataOp::GenerateSchema() {
65 // To randomly create a schema, we need to choose:
66 // a) how many columns
67 // b) the type of each column
68 // c) the shape of each column (number of dimensions i.e. rank)
69 // d) the shape of each column (dimension values)
70 data_schema_ = std::make_unique<DataSchema>();
71 std::unique_ptr<TensorShape> new_shape;
72 std::unique_ptr<ColDescriptor> new_col;
73
74 // Loop over the number of chosen columns
75 int32_t numColumns = GenRandomInt(1, kMaxNumColumns);
76 for (int32_t i = 0; i < numColumns; i++) {
77 // For each column:
78 // - choose a datatype
79 // - generate a shape that randomly chooses the number of dimensions and the dimension values.
80 auto newType = static_cast<DataType::Type>(GenRandomInt(1, DataType::DE_STRING - 1));
81 int32_t rank = GenRandomInt(1, kMaxRank);
82 std::vector<dsize_t> dims;
83 for (int32_t d = 0; d < rank; d++) {
84 // 0 is not a valid dimension value. however, we can support "*" or unknown, so map the random
85 // 0 value to the unknown attribute if 0 is chosen
86 auto dim_value = static_cast<dsize_t>(GenRandomInt(0, kMaxDimValue));
87 if (dim_value == 0) {
88 dim_value = TensorShape::kDimUnknown;
89 }
90 dims.push_back(dim_value);
91 }
92 new_shape = std::make_unique<TensorShape>(dims);
93
94 // Create the column descriptor
95 std::string col_name = "c" + std::to_string(i);
96 new_col =
97 std::make_unique<ColDescriptor>(col_name, DataType(newType), TensorImpl::kFlexible, rank, new_shape.get());
98
99 Status rc = data_schema_->AddColumn(*new_col);
100 if (rc.IsError()) {
101 MS_LOG(ERROR) << "[Internal ERROR] Failed to generate a schema. Message:" << rc;
102 }
103 }
104 }
105
106 // A helper function to create random data for the row
CreateRandomRow(TensorRow * new_row)107 Status RandomDataOp::CreateRandomRow(TensorRow *new_row) {
108 if (new_row == nullptr) {
109 RETURN_STATUS_UNEXPECTED("[Internal ERROR] Missing tensor row output.");
110 }
111
112 // Create a tensor for each column, then add the tensor to the row
113 for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
114 const ColDescriptor current_col = data_schema_->Column(i);
115 std::vector<dsize_t> current_shape = current_col.Shape().AsVector();
116 std::unique_ptr<TensorShape> new_shape = nullptr;
117 std::unique_ptr<unsigned char[]> buf = nullptr;
118 std::shared_ptr<Tensor> new_tensor = nullptr;
119
120 // We need to resolve the shape to fill in any unknown dimensions with random
121 // values, then use that as our shape for this tensor.
122 for (int j = 0; j < current_shape.size(); ++j) {
123 if (current_shape[j] == TensorShape::kDimUnknown) {
124 current_shape[j] = static_cast<dsize_t>(GenRandomInt(1, kMaxDimValue));
125 }
126 }
127
128 new_shape = std::make_unique<TensorShape>(current_shape);
129 int64_t size_in_bytes = new_shape->NumOfElements() * current_col.Type().SizeInBytes();
130
131 // Generate a random byte of data. This may cause some funny data for things like doubles,floats, bools
132 // however the random data op is not too concerned about the physical data itself.
133 std::uniform_int_distribution<uint32_t> uniDist(0, UINT8_MAX);
134 uint8_t random_byte = static_cast<uint8_t>(uniDist(rand_gen_));
135
136 // Now, create a chunk of memory for the entire tensor and copy this byte in repeatedly.
137 buf = std::make_unique<unsigned char[]>(size_in_bytes);
138 int ret_code = memset_s(buf.get(), size_in_bytes, random_byte, size_in_bytes);
139 if (ret_code != EOK) {
140 std::string error_msg = "RandomData: failed to set random data, ";
141 if (ret_code == ERANGE) {
142 RETURN_STATUS_UNEXPECTED(error_msg + "memory size of total data can not be zero or exceed " +
143 std::to_string(SECUREC_MEM_MAX_LEN) + ", but got: " + std::to_string(size_in_bytes));
144 } else {
145 RETURN_STATUS_UNEXPECTED("memset_s method failed with errno_t: " + std::to_string(ret_code));
146 }
147 }
148
149 RETURN_IF_NOT_OK(Tensor::CreateFromMemory(*new_shape, current_col.Type(), buf.get(), &new_tensor));
150
151 // Add this tensor to the tensor row for output
152 (*new_row).push_back(std::move(new_tensor));
153 }
154 return Status::OK();
155 }
156
ComputeColMap()157 Status RandomDataOp::ComputeColMap() {
158 // Extract the column name mapping from the schema and save it in the class.
159 if (column_name_id_map_.empty()) {
160 RETURN_IF_NOT_OK(data_schema_->GetColumnNameMap(&(column_name_id_map_)));
161 } else {
162 MS_LOG(WARNING) << "Column name map is already set!";
163 }
164 return Status::OK();
165 }
166
LoadTensorRow(row_id_type row_id,TensorRow * row)167 Status RandomDataOp::LoadTensorRow(row_id_type row_id, TensorRow *row) {
168 CHECK_FAIL_RETURN_UNEXPECTED(row_id < total_rows_, "Wrong index.");
169 for (const auto &tensor : rows_[static_cast<size_t>(row_id)]) {
170 TensorPtr new_tensor;
171 RETURN_IF_NOT_OK(Tensor::CreateFromTensor(tensor, &new_tensor));
172 row->emplace_back(new_tensor);
173 }
174 return Status::OK();
175 }
176
PrepareData()177 Status RandomDataOp::PrepareData() {
178 for (int64_t i = 0; i < total_rows_; i++) {
179 TensorRow row;
180 RETURN_IF_NOT_OK(CreateRandomRow(&row));
181 rows_.emplace_back(row);
182 }
183 num_rows_ = total_rows_;
184 return Status::OK();
185 }
186
187 } // namespace dataset
188 } // namespace mindspore
189