• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2019-2022 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "minddata/dataset/engine/datasetops/source/random_data_op.h"
18 
19 #include <algorithm>
20 #include <iomanip>
21 #include <random>
22 #include "minddata/dataset/engine/execution_tree.h"
23 #include "minddata/dataset/core/config_manager.h"
24 #include "minddata/dataset/util/random.h"
25 #include "minddata/dataset/util/wait_post.h"
26 #include "minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h"
27 
28 namespace mindspore {
29 namespace dataset {
30 // Constructor for RandomDataOp
RandomDataOp(int32_t num_workers,int32_t op_connector_size,int64_t total_rows,std::unique_ptr<DataSchema> data_schema)31 RandomDataOp::RandomDataOp(int32_t num_workers, int32_t op_connector_size, int64_t total_rows,
32                            std::unique_ptr<DataSchema> data_schema)
33     : MappableLeafOp(num_workers, op_connector_size, std::make_shared<SequentialSamplerRT>(0, 0)),
34       total_rows_(total_rows),
35       data_schema_(std::move(data_schema)) {
36   rand_gen_.seed(GetSeed());  // seed the random generator
37   // If total rows was not given, then randomly pick a number
38   if (total_rows_ == 0) {
39     total_rows_ = GenRandomInt(1, kMaxTotalRows);
40   }
41   // If the user did not provide a schema, then we will ask the op to generate a pseudo-random schema.
42   // See details of generateSchema function to learn what type of schema it will create.
43   if (data_schema_ == nullptr) {
44     GenerateSchema();
45   }
46 }
47 
48 // A print method typically used for debugging
Print(std::ostream & out,bool show_all) const49 void RandomDataOp::Print(std::ostream &out, bool show_all) const {
50   if (!show_all) {
51     // Call the super class for displaying any common 1-liner info
52     ParallelOp::Print(out, show_all);
53     // Then show any custom derived-internal 1-liner info for this op
54     out << " [total rows: " << num_rows_ << "]\n";
55   } else {
56     // Call the super class for displaying any common detailed info
57     ParallelOp::Print(out, show_all);
58     // Then show any custom derived-internal stuff
59     out << "\nTotal_rows: " << num_rows_ << " \nSchema:\n" << *data_schema_ << "\n\n";
60   }
61 }
62 
63 // Helper function to produce a default/random schema if one didn't exist
GenerateSchema()64 void RandomDataOp::GenerateSchema() {
65   // To randomly create a schema, we need to choose:
66   // a) how many columns
67   // b) the type of each column
68   // c) the shape of each column (number of dimensions i.e. rank)
69   // d) the shape of each column (dimension values)
70   data_schema_ = std::make_unique<DataSchema>();
71   std::unique_ptr<TensorShape> new_shape;
72   std::unique_ptr<ColDescriptor> new_col;
73 
74   // Loop over the number of chosen columns
75   int32_t numColumns = GenRandomInt(1, kMaxNumColumns);
76   for (int32_t i = 0; i < numColumns; i++) {
77     // For each column:
78     // - choose a datatype
79     // - generate a shape that randomly chooses the number of dimensions and the dimension values.
80     auto newType = static_cast<DataType::Type>(GenRandomInt(1, DataType::DE_STRING - 1));
81     int32_t rank = GenRandomInt(1, kMaxRank);
82     std::vector<dsize_t> dims;
83     for (int32_t d = 0; d < rank; d++) {
84       // 0 is not a valid dimension value.  however, we can support "*" or unknown, so map the random
85       // 0 value to the unknown attribute if 0 is chosen
86       auto dim_value = static_cast<dsize_t>(GenRandomInt(0, kMaxDimValue));
87       if (dim_value == 0) {
88         dim_value = TensorShape::kDimUnknown;
89       }
90       dims.push_back(dim_value);
91     }
92     new_shape = std::make_unique<TensorShape>(dims);
93 
94     // Create the column descriptor
95     std::string col_name = "c" + std::to_string(i);
96     new_col =
97       std::make_unique<ColDescriptor>(col_name, DataType(newType), TensorImpl::kFlexible, rank, new_shape.get());
98 
99     Status rc = data_schema_->AddColumn(*new_col);
100     if (rc.IsError()) {
101       MS_LOG(ERROR) << "[Internal ERROR] Failed to generate a schema. Message:" << rc;
102     }
103   }
104 }
105 
106 // A helper function to create random data for the row
CreateRandomRow(TensorRow * new_row)107 Status RandomDataOp::CreateRandomRow(TensorRow *new_row) {
108   if (new_row == nullptr) {
109     RETURN_STATUS_UNEXPECTED("[Internal ERROR] Missing tensor row output.");
110   }
111 
112   // Create a tensor for each column, then add the tensor to the row
113   for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
114     const ColDescriptor current_col = data_schema_->Column(i);
115     std::vector<dsize_t> current_shape = current_col.Shape().AsVector();
116     std::unique_ptr<TensorShape> new_shape = nullptr;
117     std::unique_ptr<unsigned char[]> buf = nullptr;
118     std::shared_ptr<Tensor> new_tensor = nullptr;
119 
120     // We need to resolve the shape to fill in any unknown dimensions with random
121     // values, then use that as our shape for this tensor.
122     for (int j = 0; j < current_shape.size(); ++j) {
123       if (current_shape[j] == TensorShape::kDimUnknown) {
124         current_shape[j] = static_cast<dsize_t>(GenRandomInt(1, kMaxDimValue));
125       }
126     }
127 
128     new_shape = std::make_unique<TensorShape>(current_shape);
129     int64_t size_in_bytes = new_shape->NumOfElements() * current_col.Type().SizeInBytes();
130 
131     // Generate a random byte of data.  This may cause some funny data for things like doubles,floats, bools
132     // however the random data op is not too concerned about the physical data itself.
133     std::uniform_int_distribution<uint32_t> uniDist(0, UINT8_MAX);
134     uint8_t random_byte = static_cast<uint8_t>(uniDist(rand_gen_));
135 
136     // Now, create a chunk of memory for the entire tensor and copy this byte in repeatedly.
137     buf = std::make_unique<unsigned char[]>(size_in_bytes);
138     int ret_code = memset_s(buf.get(), size_in_bytes, random_byte, size_in_bytes);
139     if (ret_code != EOK) {
140       std::string error_msg = "RandomData: failed to set random data, ";
141       if (ret_code == ERANGE) {
142         RETURN_STATUS_UNEXPECTED(error_msg + "memory size of total data can not be zero or exceed " +
143                                  std::to_string(SECUREC_MEM_MAX_LEN) + ", but got: " + std::to_string(size_in_bytes));
144       } else {
145         RETURN_STATUS_UNEXPECTED("memset_s method failed with errno_t: " + std::to_string(ret_code));
146       }
147     }
148 
149     RETURN_IF_NOT_OK(Tensor::CreateFromMemory(*new_shape, current_col.Type(), buf.get(), &new_tensor));
150 
151     // Add this tensor to the tensor row for output
152     (*new_row).push_back(std::move(new_tensor));
153   }
154   return Status::OK();
155 }
156 
ComputeColMap()157 Status RandomDataOp::ComputeColMap() {
158   // Extract the column name mapping from the schema and save it in the class.
159   if (column_name_id_map_.empty()) {
160     RETURN_IF_NOT_OK(data_schema_->GetColumnNameMap(&(column_name_id_map_)));
161   } else {
162     MS_LOG(WARNING) << "Column name map is already set!";
163   }
164   return Status::OK();
165 }
166 
LoadTensorRow(row_id_type row_id,TensorRow * row)167 Status RandomDataOp::LoadTensorRow(row_id_type row_id, TensorRow *row) {
168   CHECK_FAIL_RETURN_UNEXPECTED(row_id < total_rows_, "Wrong index.");
169   for (const auto &tensor : rows_[static_cast<size_t>(row_id)]) {
170     TensorPtr new_tensor;
171     RETURN_IF_NOT_OK(Tensor::CreateFromTensor(tensor, &new_tensor));
172     row->emplace_back(new_tensor);
173   }
174   return Status::OK();
175 }
176 
PrepareData()177 Status RandomDataOp::PrepareData() {
178   for (int64_t i = 0; i < total_rows_; i++) {
179     TensorRow row;
180     RETURN_IF_NOT_OK(CreateRandomRow(&row));
181     rows_.emplace_back(row);
182   }
183   num_rows_ = total_rows_;
184   return Status::OK();
185 }
186 
187 }  // namespace dataset
188 }  // namespace mindspore
189