1 /**
2 * Copyright 2019 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "minddata/dataset/engine/data_schema.h"
17
18 #include <algorithm>
19 #include <fstream>
20 #include <iostream>
21 #include <map>
22 #include <memory>
23 #include <nlohmann/json.hpp>
24
25 #include "utils/ms_utils.h"
26 #include "minddata/dataset/util/status.h"
27 #include "minddata/dataset/core/tensor_shape.h"
28 #include "minddata/dataset/util/log_adapter.h"
29
30 namespace mindspore {
31 namespace dataset {
// A macro for converting an input string representing the column type to its actual
// numeric column type.
// Recognized strings are "cvmat", "flex" and "np"; any other input selects TensorImpl::kNone.
// Both parameters are wrapped in parentheses inside the expansion so that callers may pass
// arbitrary expressions (e.g. a conditional expression or a dereferenced pointer).
// NOTE: in_col_str is expanded once per comparison, so it should not have side effects.
#define STR_TO_TENSORIMPL(in_col_str, out_type) \
  do {                                          \
    if ((in_col_str) == "cvmat") {              \
      (out_type) = TensorImpl::kCv;             \
    } else if ((in_col_str) == "flex") {        \
      (out_type) = TensorImpl::kFlexible;       \
    } else if ((in_col_str) == "np") {          \
      (out_type) = TensorImpl::kNP;             \
    } else {                                    \
      (out_type) = TensorImpl::kNone;           \
    }                                           \
  } while (false)
46
// Constructor 1: Default constructor. Sets the type to unknown, the rank to 0, and leaves the shape unset.
ColDescriptor()48 ColDescriptor::ColDescriptor()
49 : type_(DataType::DE_UNKNOWN), rank_(0), tensor_impl_(TensorImpl::kNone), tensor_shape_(nullptr) {}
50
51 // Constructor 2: Main constructor
ColDescriptor(const std::string & col_name,DataType col_type,TensorImpl tensor_impl,int32_t rank,const TensorShape * in_shape)52 ColDescriptor::ColDescriptor(const std::string &col_name, DataType col_type, TensorImpl tensor_impl, int32_t rank,
53 const TensorShape *in_shape)
54 : type_(col_type), rank_(rank), tensor_impl_(tensor_impl), col_name_(col_name) {
55 // If a shape was provided, create unique pointer for it and copy construct it into
56 // our shape. Otherwise, set our shape to be empty.
57 if (in_shape != nullptr) {
58 // Create a shape and copy construct it into our column's shape.
59 tensor_shape_ = std::make_unique<TensorShape>(*in_shape);
60 } else {
61 tensor_shape_ = nullptr;
62 }
63 // If the user input a shape, then the rank of the input shape needs to match
64 // the input rank
65 if (in_shape != nullptr && in_shape->known() && in_shape->Size() != rank_) {
66 rank_ = in_shape->Size();
67 MS_LOG(WARNING) << "Rank does not match the number of dimensions in the provided shape."
68 << " Overriding rank with the number of dimensions in the provided shape.";
69 }
70 }
71
72 // Explicit copy constructor is required
// The shape is held through a unique_ptr, so a deep copy of the source's shape is made here
// rather than sharing the pointer.
ColDescriptor::ColDescriptor(const ColDescriptor &in_cd)
    : type_(in_cd.type_), rank_(in_cd.rank_), tensor_impl_(in_cd.tensor_impl_), col_name_(in_cd.col_name_) {
  if (in_cd.HasShape()) {
    tensor_shape_ = std::make_unique<TensorShape>(in_cd.Shape());
  } else {
    tensor_shape_ = nullptr;
  }
}
78
79 // Assignment overload
// Copies every field from in_cd; the shape (if any) is duplicated into a fresh unique_ptr.
// Self-assignment is a no-op. Returns a reference to this object.
ColDescriptor &ColDescriptor::operator=(const ColDescriptor &in_cd) {
  if (this == &in_cd) {
    return *this;
  }

  type_ = in_cd.type_;
  rank_ = in_cd.rank_;
  tensor_impl_ = in_cd.tensor_impl_;
  col_name_ = in_cd.col_name_;

  // Replace our shape with a private copy of the source's shape, or drop it if the source has none.
  if (in_cd.HasShape()) {
    tensor_shape_ = std::make_unique<TensorShape>(in_cd.Shape());
  } else {
    tensor_shape_.reset();
  }
  return *this;
}
91
92 // Destructor
// Defaulted: the shape is held in a unique_ptr and the other members manage themselves,
// so no manual cleanup is written here.
ColDescriptor::~ColDescriptor() = default;
94
95 // A print method typically used for debugging
Print(std::ostream & out) const96 void ColDescriptor::Print(std::ostream &out) const {
97 out << " Name : " << col_name_ << "\n Type : " << type_ << "\n Rank : " << rank_
98 << "\n Shape : (";
99 if (tensor_shape_) {
100 out << *tensor_shape_ << ")\n";
101 } else {
102 out << "no shape provided)\n";
103 }
104 }
105
106 // Given a number of elements, this function will compute what the actual Tensor shape would be.
107 // If there is no starting TensorShape in this column, or if there is a shape but it contains
108 // an unknown dimension, then the output shape returned shall resolve dimensions as needed.
MaterializeTensorShape(int32_t num_elements,TensorShape * out_shape) const109 Status ColDescriptor::MaterializeTensorShape(int32_t num_elements, TensorShape *out_shape) const {
110 if (out_shape == nullptr) {
111 RETURN_STATUS_UNEXPECTED("Unexpected null output shape argument.");
112 }
113
114 // If the shape is not given in this column, then we assume the shape will be: {numElements}
115 if (tensor_shape_ == nullptr) {
116 if (this->Rank() == 0 && num_elements == 1) {
117 *out_shape = TensorShape::CreateScalar();
118 return Status::OK();
119 }
120 *out_shape = TensorShape({num_elements});
121 return Status::OK();
122 }
123
124 // Build the real TensorShape based on the requested shape and the number of elements in the data.
125 // If there are unknown dimensions, then the unknown dimension needs to be filled in.
126 // Example: requestedShape: {?,4,3}.
127 // If numElements is 24, then the output shape can be computed to: {2,4,3}
128 std::vector<dsize_t> requested_shape = tensor_shape_->AsVector();
129 int64_t num_elements_of_shape = 1; // init to 1 as a starting multiplier.
130
131 // unknownDimPosition variable is overloaded to provide 2 meanings:
132 // 1) If it's set to DIM_UNKNOWN, then it provides a boolean knowledge to tell us if there are
133 // any unknown dimensions. i.e. if it's set to unknown, then there are no unknown dimensions.
134 // 2) If it's set to a numeric value, then this is the vector index position within the shape
135 // where the single unknown dimension can be found.
136 int64_t unknown_dim_position = TensorShape::kDimUnknown; // Assume there are no unknown dims to start
137
138 for (int i = 0; i < requested_shape.size(); ++i) {
139 // If we already had an unknown dimension, then we cannot have a second unknown dimension.
140 // We only support the compute of a single unknown dim.
141 if (requested_shape[i] == TensorShape::kDimUnknown && unknown_dim_position != TensorShape::kDimUnknown) {
142 return Status(StatusCode::kMDUnexpectedError, __LINE__, __FILE__,
143 "Requested shape has more than one unknown dimension!");
144 }
145
146 // If the current dimension in the requested shape is a known value, then compute the number of
147 // elements so far.
148 if (requested_shape[i] != TensorShape::kDimUnknown) {
149 num_elements_of_shape *= requested_shape[i];
150 } else {
151 // This dimension is unknown so track which dimension position has it.
152 unknown_dim_position = i;
153 }
154 }
155
156 // Sanity check the the computed element counts divide evenly into the input element count
157 if (num_elements < num_elements_of_shape || num_elements_of_shape == 0 || num_elements % num_elements_of_shape != 0) {
158 std::string err = "Requested shape has an invalid element count! Number elements: " + std::to_string(num_elements) +
159 ", number elements of shape: " + std::to_string(num_elements_of_shape);
160 RETURN_STATUS_UNEXPECTED(err);
161 }
162
163 // If there was any unknown dimensions, then update the requested shape to fill in the unknown
164 // dimension with the correct value. If there were no unknown dim's then the output shape will
165 // remain to be the same as the requested shape.
166 if (unknown_dim_position != TensorShape::kDimUnknown) {
167 requested_shape[unknown_dim_position] = (num_elements / num_elements_of_shape);
168 }
169
170 // Any unknown dimension is filled in now. Set the output shape
171 *out_shape = TensorShape(requested_shape);
172 return Status::OK();
173 }
174
175 // getter function for the shape
Shape() const176 TensorShape ColDescriptor::Shape() const {
177 if (tensor_shape_ != nullptr) {
178 return *tensor_shape_; // copy construct a shape to return
179 } else {
180 return TensorShape::CreateUnknownRankShape(); // empty shape to return
181 }
182 }
183
// Out-of-class definition of the DataSchema constant that holds the default schema file name.
const char DataSchema::DEFAULT_DATA_SCHEMA_FILENAME[] = "datasetSchema.json";
185
// Constructor 1: Default constructor. Starts with a row count of 0.
DataSchema()187 DataSchema::DataSchema() : num_rows_(0) {}
188
189 // Internal helper function. Parses the json schema file in any order and produces a schema that
190 // does not follow any particular order (json standard does not enforce any ordering protocol).
191 // This one produces a schema that contains all of the columns from the schema file.
AnyOrderLoad(nlohmann::json column_tree)192 Status DataSchema::AnyOrderLoad(nlohmann::json column_tree) {
193 // Iterate over the json file. Each parent json node is the column name,
194 // followed by the column properties in the child tree under the column.
195 // Outer loop here iterates over the parents (i.e. the column name)
196 if (!column_tree.is_array()) {
197 for (nlohmann::json::iterator it = column_tree.begin(); it != column_tree.end(); ++it) {
198 std::string col_name = it.key();
199 nlohmann::json column_child_tree = it.value();
200 RETURN_IF_NOT_OK(ColumnLoad(column_child_tree, col_name));
201 }
202 } else {
203 // Case where the schema is a list of columns not a dict
204 for (nlohmann::json::iterator it = column_tree.begin(); it != column_tree.end(); ++it) {
205 nlohmann::json column_child_tree = it.value();
206 RETURN_IF_NOT_OK(ColumnLoad(column_child_tree, ""));
207 }
208 }
209 return Status::OK();
210 }
211
212 // Internal helper function. For each input column name, perform a lookup to the json document to
213 // find the matching column. When the match is found, process that column to build the column
// descriptor and add to the schema in the order in which the input column names are given.
ColumnOrderLoad(nlohmann::json column_tree,const std::vector<std::string> & columns_to_load)215 Status DataSchema::ColumnOrderLoad(nlohmann::json column_tree, const std::vector<std::string> &columns_to_load) {
216 if (!column_tree.is_array()) {
217 // the json file is dict (e.g., {image: ...})
218 // Loop over the column name list
219 for (const auto &curr_col_name : columns_to_load) {
220 // Find the column in the json document
221 auto column_info = column_tree.find(common::SafeCStr(curr_col_name));
222 if (column_info == column_tree.end()) {
223 RETURN_STATUS_UNEXPECTED("Invalid data, failed to find column name: " + curr_col_name + " in given json file.");
224 }
225 // At this point, columnInfo.value() is the subtree in the json document that contains
226 // all of the data for a given column. This data will formulate our schema column.
227 const std::string &col_name = column_info.key();
228 nlohmann::json column_child_tree = column_info.value();
229 RETURN_IF_NOT_OK(ColumnLoad(column_child_tree, col_name));
230 }
231 } else {
232 // the json file is array (e.g., [name: image...])
233 // Loop over the column name list
234 for (const auto &curr_col_name : columns_to_load) {
235 // Find the column in the json document
236 int32_t index = -1;
237 int32_t i = 0;
238 for (const auto &it_child : column_tree.items()) {
239 auto name = it_child.value().find("name");
240 if (name == it_child.value().end()) {
241 RETURN_STATUS_UNEXPECTED("Invalid data, \"name\" field is missing for column: " + curr_col_name);
242 }
243 if (name.value() == curr_col_name) {
244 index = i;
245 break;
246 }
247 i++;
248 }
249 if (index == -1) {
250 RETURN_STATUS_UNEXPECTED("Invalid data, failed to find column name: " + curr_col_name + " in given json file.");
251 }
252 nlohmann::json column_child_tree = column_tree[index];
253 RETURN_IF_NOT_OK(ColumnLoad(column_child_tree, curr_col_name));
254 }
255 }
256 return Status::OK();
257 }
258
259 // Internal helper function for parsing shape info and building a vector for the shape construction.
BuildShape(const nlohmann::json & shapeVal,std::vector<dsize_t> * outShape)260 static Status BuildShape(const nlohmann::json &shapeVal, std::vector<dsize_t> *outShape) {
261 if (outShape == nullptr) {
262 RETURN_STATUS_UNEXPECTED("outShape can not be nullptr.");
263 }
264 if (shapeVal.empty()) return Status::OK();
265
266 // Iterate over the integer list and add those values to the output shape tensor
267 auto items = shapeVal.items();
268 using it_type = decltype(items.begin());
269 (void)std::transform(items.begin(), items.end(), std::back_inserter(*outShape), [](it_type j) { return j.value(); });
270 return Status::OK();
271 }
272
273 // Internal helper function. Given the json tree for a given column, load it into our schema.
ColumnLoad(nlohmann::json column_child_tree,const std::string & col_name)274 Status DataSchema::ColumnLoad(nlohmann::json column_child_tree, const std::string &col_name) {
275 int32_t rank_value = -1;
276 TensorImpl t_impl_value = TensorImpl::kFlexible;
277 std::string name = "";
278 std::string type_str = "";
279 std::vector<dsize_t> tmp_shape = {};
280 bool shape_field_exists = false;
281 // Iterate over this column's attributes.
282 // Manually iterating each of the child nodes/trees here so that we can provide our own error handling.
283 for (const auto &it_child : column_child_tree.items()) {
284 // Save the data for each of the attributes into variables. We'll use these to construct later.
285 if (it_child.key() == "name") {
286 name = it_child.value();
287 } else if (it_child.key() == "type") {
288 type_str = it_child.value();
289 } else if (it_child.key() == "rank") {
290 rank_value = it_child.value();
291 } else if (it_child.key() == "t_impl") {
292 STR_TO_TENSORIMPL(it_child.value(), t_impl_value);
293 } else if (it_child.key() == "shape") {
294 shape_field_exists = true;
295 RETURN_IF_NOT_OK(BuildShape(it_child.value(), &tmp_shape));
296 } else {
297 std::string err_msg = "Invalid data, unexpected column attribute " + it_child.key() + " for column " + col_name +
298 ", expected attribute: name, type, rank, t_impl or shape.";
299 RETURN_STATUS_UNEXPECTED(err_msg);
300 }
301 }
302 if (!name.empty()) {
303 if (!col_name.empty() && col_name != name) {
304 std::string err_msg =
305 "Invalid data, json schema file for column " + col_name + " has column name that does not match columnsToLoad";
306 RETURN_STATUS_UNEXPECTED(err_msg);
307 }
308 } else {
309 if (col_name.empty()) {
310 std::string err_msg =
311 "Invalid data, json schema file for column " + col_name + " has invalid or missing column name.";
312 RETURN_STATUS_UNEXPECTED(err_msg);
313 } else {
314 name = col_name;
315 }
316 }
317 // data type is mandatory field
318 if (type_str.empty())
319 return Status(StatusCode::kMDUnexpectedError, __LINE__, __FILE__,
320 "Invalid data, json schema file for column " + col_name + " has invalid or missing column type.");
321
322 // rank number is mandatory field
323 if (rank_value <= -1)
324 return Status(StatusCode::kMDUnexpectedError, __LINE__, __FILE__,
325 "Invalid data, json schema file for column " + col_name + " must define a positive rank value.");
326
327 // Create the column descriptor for this column from the data we pulled from the json file
328 TensorShape col_shape = TensorShape(tmp_shape);
329 if (shape_field_exists)
330 RETURN_IF_NOT_OK(this->AddColumn(ColDescriptor(name, DataType(type_str), t_impl_value, rank_value, &col_shape)));
331 else
332 // Create a column descriptor that doesn't have a shape
333 RETURN_IF_NOT_OK(this->AddColumn(ColDescriptor(name, DataType(type_str), t_impl_value, rank_value)));
334 return Status::OK();
335 }
336
337 // Parses a schema json file and populates the columns and meta info.
LoadSchemaFile(const std::string & schema_file_path,const std::vector<std::string> & columns_to_load)338 Status DataSchema::LoadSchemaFile(const std::string &schema_file_path,
339 const std::vector<std::string> &columns_to_load) {
340 try {
341 std::ifstream in(schema_file_path);
342
343 nlohmann::json js;
344 in >> js;
345 RETURN_IF_NOT_OK(PreLoadExceptionCheck(js));
346 try {
347 num_rows_ = js.at("numRows").get<int64_t>();
348 } catch (nlohmann::json::out_of_range &e) {
349 num_rows_ = 0;
350 } catch (nlohmann::json::exception &e) {
351 in.close();
352 RETURN_STATUS_UNEXPECTED("Invalid data, unable to parse \"numRows\" from schema file: " + schema_file_path);
353 }
354 nlohmann::json column_tree = js.at("columns");
355 if (column_tree.empty()) {
356 in.close();
357 RETURN_STATUS_UNEXPECTED("Invalid data, \"columns\" field is missing in schema file: " + schema_file_path);
358 }
359 if (columns_to_load.empty()) {
360 // Parse the json tree and load the schema's columns in whatever order that the json
361 // layout decides
362 Status rc = this->AnyOrderLoad(column_tree);
363 if (rc.IsError()) {
364 in.close();
365 return rc;
366 }
367 } else {
368 Status rc = this->ColumnOrderLoad(column_tree, columns_to_load);
369 if (rc.IsError()) {
370 rc.SetErrDescription(rc.GetErrDescription() + " file: " + schema_file_path);
371 in.close();
372 return rc;
373 }
374 }
375 in.close();
376 } catch (const std::exception &err) {
377 // Catch any exception and convert to Status return code
378 RETURN_STATUS_UNEXPECTED("Invalid file, failed to load and parse schema file: " + schema_file_path);
379 }
380 return Status::OK();
381 }
382
383 // Parses a schema json string and populates the columns and meta info.
LoadSchemaString(const std::string & schema_json_string,const std::vector<std::string> & columns_to_load)384 Status DataSchema::LoadSchemaString(const std::string &schema_json_string,
385 const std::vector<std::string> &columns_to_load) {
386 try {
387 nlohmann::json js = nlohmann::json::parse(schema_json_string);
388 RETURN_IF_NOT_OK(PreLoadExceptionCheck(js));
389 num_rows_ = js.value("numRows", 0);
390 nlohmann::json column_tree = js.at("columns");
391 if (column_tree.empty()) {
392 RETURN_STATUS_UNEXPECTED("Invalid data, \"columns\" field is missing in schema string.");
393 }
394 if (columns_to_load.empty()) {
395 // Parse the json tree and load the schema's columns in whatever order that the json
396 // layout decides
397 RETURN_IF_NOT_OK(this->AnyOrderLoad(column_tree));
398 } else {
399 Status rc = this->ColumnOrderLoad(column_tree, columns_to_load);
400 if (rc.IsError()) {
401 rc.SetErrDescription(rc.GetErrDescription() + " file content: " + schema_json_string);
402 return rc;
403 }
404 }
405 } catch (const std::exception &err) {
406 // Catch any exception and convert to Status return code
407 RETURN_STATUS_UNEXPECTED("Invalid data, failed to load and parse schema string.");
408 }
409 return Status::OK();
410 }
411
412 // Destructor
// Defaulted: no manual cleanup is written here; the members are destroyed by their own destructors.
DataSchema::~DataSchema() = default;
414
415 // Getter for the ColDescriptor by index
Column(int32_t idx) const416 const ColDescriptor &DataSchema::Column(int32_t idx) const {
417 MS_ASSERT(idx < static_cast<int>(col_descs_.size()));
418 return col_descs_[idx];
419 }
420
421 // A print method typically used for debugging
Print(std::ostream & out) const422 void DataSchema::Print(std::ostream &out) const {
423 out << "Dataset schema: (";
424 for (const auto &col_desc : col_descs_) {
425 out << col_desc << "\n";
426 }
427 }
428
429 // Adds a column descriptor to the schema
AddColumn(const ColDescriptor & cd)430 Status DataSchema::AddColumn(const ColDescriptor &cd) {
431 // Sanity check there's not a duplicate name before adding the column
432 for (auto i = 0; i < col_descs_.size(); ++i) {
433 if (col_descs_[i].Name() == cd.Name()) {
434 std::ostringstream ss;
435 ss << "column name '" << cd.Name() << "' already exists in schema.";
436 std::string err_msg = ss.str();
437 RETURN_STATUS_UNEXPECTED(err_msg);
438 }
439 }
440 col_descs_.push_back(cd);
441 return Status::OK();
442 }
443
444 // Internal helper function. Performs sanity checks on the json file setup.
PreLoadExceptionCheck(const nlohmann::json & js)445 Status DataSchema::PreLoadExceptionCheck(const nlohmann::json &js) {
446 // Check if columns node exists. It is required for building schema from file.
447 if (js.find("columns") == js.end())
448 return Status(StatusCode::kMDUnexpectedError, __LINE__, __FILE__,
449 "Invalid data, \"columns\" node is required in the schema json file.");
450 return Status::OK();
451 }
452
453 // Loops through all columns in the schema and returns a map with the column
454 // name to column index number.
GetColumnNameMap(std::unordered_map<std::string,int32_t> * out_column_name_map)455 Status DataSchema::GetColumnNameMap(std::unordered_map<std::string, int32_t> *out_column_name_map) {
456 if (out_column_name_map == nullptr) {
457 return Status(StatusCode::kMDUnexpectedError, __LINE__, __FILE__, "unexpected null output column name map.");
458 }
459
460 for (size_t i = 0; i < col_descs_.size(); ++i) {
461 if (col_descs_[i].Name().empty()) {
462 return Status(StatusCode::kMDUnexpectedError, __LINE__, __FILE__,
463 "Constructing column name map from schema, but found empty column name.");
464 }
465 (*out_column_name_map)[col_descs_[i].Name()] = i;
466 }
467
468 return Status::OK();
469 }
470 } // namespace dataset
471 } // namespace mindspore
472