1 /**
2 * Copyright 2020-2024 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "minddata/dataset/engine/data_schema.h"
17
18 #include <algorithm>
19 #include <fstream>
20 #include <iostream>
21 #include <map>
22 #include <memory>
23 #include <nlohmann/json.hpp>
24
25 #include "utils/ms_utils.h"
26 #include "minddata/dataset/util/status.h"
27 #include "minddata/dataset/core/tensor_shape.h"
28 #include "minddata/dataset/util/log_adapter.h"
29
30 namespace mindspore {
31 namespace dataset {
// Converts the textual tensor implementation name found in a schema to its TensorImpl value:
//   "cvmat" -> TensorImpl::kCv, "flex" -> TensorImpl::kFlexible, "np" -> TensorImpl::kNP,
//   anything else -> TensorImpl::kNone.
// in_col_str: any expression comparable to a string literal with ==. It may be evaluated up to
//             three times, so it should not have side effects.
// out_type:   an assignable TensorImpl expression that receives the result.
// Both parameters are parenthesized so that compound expressions (for example a conditional
// expression) expand with the intended precedence.
#define STR_TO_TENSORIMPL(in_col_str, out_type) \
  do {                                          \
    if ((in_col_str) == "cvmat") {              \
      (out_type) = TensorImpl::kCv;             \
    } else if ((in_col_str) == "flex") {        \
      (out_type) = TensorImpl::kFlexible;       \
    } else if ((in_col_str) == "np") {          \
      (out_type) = TensorImpl::kNP;             \
    } else {                                    \
      (out_type) = TensorImpl::kNone;           \
    }                                           \
  } while (false)
46
47 // Constructor 1: Simple constructor that leaves things uninitialized.
ColDescriptor()48 ColDescriptor::ColDescriptor()
49 : type_(DataType::DE_UNKNOWN), rank_(0), tensor_impl_(TensorImpl::kNone), tensor_shape_(nullptr) {}
50
51 // Constructor 2: Main constructor
ColDescriptor(const std::string & col_name,DataType col_type,TensorImpl tensor_impl,int32_t rank,const TensorShape * in_shape)52 ColDescriptor::ColDescriptor(const std::string &col_name, DataType col_type, TensorImpl tensor_impl, int32_t rank,
53 const TensorShape *in_shape)
54 : type_(col_type), rank_(rank), tensor_impl_(tensor_impl), col_name_(col_name) {
55 // If a shape was provided, create unique pointer for it and copy construct it into
56 // our shape. Otherwise, set our shape to be empty.
57 if (in_shape != nullptr) {
58 // Create a shape and copy construct it into our column's shape.
59 tensor_shape_ = std::make_unique<TensorShape>(*in_shape);
60 } else {
61 tensor_shape_ = nullptr;
62 }
63 // If the user input a shape, then the rank of the input shape needs to match
64 // the input rank
65 if (in_shape != nullptr && in_shape->known() && in_shape->Size() != rank_) {
66 rank_ = in_shape->Size();
67 MS_LOG(WARNING) << "Rank does not match the number of dimensions in the provided shape."
68 << " Overriding rank with the number of dimensions in the provided shape.";
69 }
70 }
71
72 // Explicit copy constructor is required
ColDescriptor(const ColDescriptor & in_cd)73 ColDescriptor::ColDescriptor(const ColDescriptor &in_cd)
74 : type_(in_cd.type_), rank_(in_cd.rank_), tensor_impl_(in_cd.tensor_impl_), col_name_(in_cd.col_name_) {
75 // If it has a tensor shape, make a copy of it with our own unique_ptr.
76 tensor_shape_ = in_cd.HasShape() ? std::make_unique<TensorShape>(in_cd.Shape()) : nullptr;
77 }
78
79 // Assignment overload
operator =(const ColDescriptor & in_cd)80 ColDescriptor &ColDescriptor::operator=(const ColDescriptor &in_cd) {
81 if (&in_cd != this) {
82 type_ = in_cd.type_;
83 rank_ = in_cd.rank_;
84 tensor_impl_ = in_cd.tensor_impl_;
85 col_name_ = in_cd.col_name_;
86 // If it has a tensor shape, make a copy of it with our own unique_ptr.
87 tensor_shape_ = in_cd.HasShape() ? std::make_unique<TensorShape>(in_cd.Shape()) : nullptr;
88 }
89 return *this;
90 }
91
92 // Destructor
93 ColDescriptor::~ColDescriptor() = default;
94
95 // A print method typically used for debugging
Print(std::ostream & out) const96 void ColDescriptor::Print(std::ostream &out) const {
97 out << " Name : " << col_name_ << "\n Type : " << type_ << "\n Rank : " << rank_
98 << "\n Shape : (";
99 if (tensor_shape_) {
100 out << *tensor_shape_ << ")\n";
101 } else {
102 out << "no shape provided)\n";
103 }
104 }
105
106 // Given a number of elements, this function will compute what the actual Tensor shape would be.
107 // If there is no starting TensorShape in this column, or if there is a shape but it contains
108 // an unknown dimension, then the output shape returned shall resolve dimensions as needed.
MaterializeTensorShape(int32_t num_elements,TensorShape * out_shape) const109 Status ColDescriptor::MaterializeTensorShape(int32_t num_elements, TensorShape *out_shape) const {
110 if (out_shape == nullptr) {
111 RETURN_STATUS_UNEXPECTED("Unexpected null output shape argument.");
112 }
113
114 // If the shape is not given in this column, then we assume the shape will be: {numElements}
115 if (tensor_shape_ == nullptr) {
116 if (this->Rank() == 0 && num_elements == 1) {
117 *out_shape = TensorShape::CreateScalar();
118 return Status::OK();
119 }
120 *out_shape = TensorShape({num_elements});
121 return Status::OK();
122 }
123
124 // Build the real TensorShape based on the requested shape and the number of elements in the data.
125 // If there are unknown dimensions, then the unknown dimension needs to be filled in.
126 // Example: requestedShape: {?,4,3}.
127 // If numElements is 24, then the output shape can be computed to: {2,4,3}
128 std::vector<dsize_t> requested_shape = tensor_shape_->AsVector();
129 int64_t num_elements_of_shape = 1; // init to 1 as a starting multiplier.
130
131 // unknownDimPosition variable is overloaded to provide 2 meanings:
132 // 1) If it's set to DIM_UNKNOWN, then it provides a boolean knowledge to tell us if there are
133 // any unknown dimensions. i.e. if it's set to unknown, then there are no unknown dimensions.
134 // 2) If it's set to a numeric value, then this is the vector index position within the shape
135 // where the single unknown dimension can be found.
136 int64_t unknown_dim_position = TensorShape::kDimUnknown; // Assume there are no unknown dims to start
137
138 for (int i = 0; i < requested_shape.size(); ++i) {
139 // If we already had an unknown dimension, then we cannot have a second unknown dimension.
140 // We only support the compute of a single unknown dim.
141 if (requested_shape[i] == TensorShape::kDimUnknown && unknown_dim_position != TensorShape::kDimUnknown) {
142 RETURN_STATUS_UNEXPECTED("Requested shape has more than one unknown dimension!");
143 }
144
145 // If the current dimension in the requested shape is a known value, then compute the number of
146 // elements so far.
147 if (requested_shape[i] != TensorShape::kDimUnknown) {
148 num_elements_of_shape *= requested_shape[i];
149 } else {
150 // This dimension is unknown so track which dimension position has it.
151 unknown_dim_position = i;
152 }
153 }
154
155 // Sanity check the the computed element counts divide evenly into the input element count
156 if (num_elements < num_elements_of_shape || num_elements_of_shape == 0 || num_elements % num_elements_of_shape != 0) {
157 std::string err = "Requested shape has an invalid element count! Number elements: " + std::to_string(num_elements) +
158 ", number elements of shape: " + std::to_string(num_elements_of_shape);
159 RETURN_STATUS_UNEXPECTED(err);
160 }
161
162 // If there was any unknown dimensions, then update the requested shape to fill in the unknown
163 // dimension with the correct value. If there were no unknown dim's then the output shape will
164 // remain to be the same as the requested shape.
165 if (unknown_dim_position != TensorShape::kDimUnknown) {
166 requested_shape[unknown_dim_position] = (num_elements / num_elements_of_shape);
167 }
168
169 // Any unknown dimension is filled in now. Set the output shape
170 *out_shape = TensorShape(requested_shape);
171 return Status::OK();
172 }
173
174 // getter function for the shape
Shape() const175 TensorShape ColDescriptor::Shape() const {
176 if (tensor_shape_ != nullptr) {
177 return *tensor_shape_; // copy construct a shape to return
178 } else {
179 return TensorShape::CreateUnknownRankShape(); // empty shape to return
180 }
181 }
182
183 const char DataSchema::DEFAULT_DATA_SCHEMA_FILENAME[] = "datasetSchema.json";
184
185 // Constructor 1: Simple constructor that leaves things uninitialized.
DataSchema()186 DataSchema::DataSchema() : num_rows_(0) {}
187
188 // Internal helper function. Parses the json schema file in any order and produces a schema that
189 // does not follow any particular order (json standard does not enforce any ordering protocol).
190 // This one produces a schema that contains all of the columns from the schema file.
AnyOrderLoad(nlohmann::json column_tree)191 Status DataSchema::AnyOrderLoad(nlohmann::json column_tree) {
192 // Iterate over the json file. Each parent json node is the column name,
193 // followed by the column properties in the child tree under the column.
194 // Outer loop here iterates over the parents (i.e. the column name)
195 if (!column_tree.is_array()) {
196 for (nlohmann::json::iterator it = column_tree.begin(); it != column_tree.end(); ++it) {
197 std::string col_name = it.key();
198 nlohmann::json column_child_tree = it.value();
199 RETURN_IF_NOT_OK(ColumnLoad(column_child_tree, col_name));
200 }
201 } else {
202 // Case where the schema is a list of columns not a dict
203 for (nlohmann::json::iterator it = column_tree.begin(); it != column_tree.end(); ++it) {
204 nlohmann::json column_child_tree = it.value();
205 RETURN_IF_NOT_OK(ColumnLoad(column_child_tree, ""));
206 }
207 }
208 return Status::OK();
209 }
210
211 // Internal helper function. For each input column name, perform a lookup to the json document to
212 // find the matching column. When the match is found, process that column to build the column
213 // descriptor and add to the schema in the order in which the input column names are given.id
ColumnOrderLoad(nlohmann::json column_tree,const std::vector<std::string> & columns_to_load)214 Status DataSchema::ColumnOrderLoad(nlohmann::json column_tree, const std::vector<std::string> &columns_to_load) {
215 if (!column_tree.is_array()) {
216 // the json file is dict (e.g., {image: ...})
217 // Loop over the column name list
218 for (const auto &curr_col_name : columns_to_load) {
219 // Find the column in the json document
220 auto column_info = column_tree.find(common::SafeCStr(curr_col_name));
221 if (column_info == column_tree.end()) {
222 RETURN_STATUS_UNEXPECTED("Invalid data, failed to find column: " + curr_col_name + " in JSON schema file.");
223 }
224 // At this point, columnInfo.value() is the subtree in the json document that contains
225 // all of the data for a given column. This data will formulate our schema column.
226 const std::string &col_name = column_info.key();
227 nlohmann::json column_child_tree = column_info.value();
228 RETURN_IF_NOT_OK(ColumnLoad(column_child_tree, col_name));
229 }
230 } else {
231 // the json file is array (e.g., [name: image...])
232 // Loop over the column name list
233 for (const auto &curr_col_name : columns_to_load) {
234 // Find the column in the json document
235 int32_t index = -1;
236 int32_t i = 0;
237 for (const auto &it_child : column_tree.items()) {
238 auto name = it_child.value().find("name");
239 if (name == it_child.value().end()) {
240 RETURN_STATUS_UNEXPECTED("Invalid data, \"name\" field is missing for column: " + curr_col_name +
241 " in JSON schema file.");
242 }
243 if (name.value() == curr_col_name) {
244 index = i;
245 break;
246 }
247 i++;
248 }
249 if (index == -1) {
250 RETURN_STATUS_UNEXPECTED("Invalid data, failed to find column: " + curr_col_name + " in JSON schema file.");
251 }
252 nlohmann::json column_child_tree = column_tree[index];
253 RETURN_IF_NOT_OK(ColumnLoad(column_child_tree, curr_col_name));
254 }
255 }
256 return Status::OK();
257 }
258
259 // Internal helper function for parsing shape info and building a vector for the shape construction.
BuildShape(const nlohmann::json & shapeVal,std::vector<dsize_t> * outShape)260 static Status BuildShape(const nlohmann::json &shapeVal, std::vector<dsize_t> *outShape) {
261 if (outShape == nullptr) {
262 RETURN_STATUS_UNEXPECTED("outShape can not be nullptr.");
263 }
264 if (shapeVal.empty()) {
265 return Status::OK();
266 }
267
268 // Iterate over the integer list and add those values to the output shape tensor
269 auto items = shapeVal.items();
270 using it_type = decltype(items.begin());
271 (void)std::transform(items.begin(), items.end(), std::back_inserter(*outShape), [](it_type j) { return j.value(); });
272 return Status::OK();
273 }
274
275 // Internal helper function. Given the json tree for a given column, load it into our schema.
ColumnLoad(nlohmann::json column_child_tree,const std::string & col_name)276 Status DataSchema::ColumnLoad(nlohmann::json column_child_tree, const std::string &col_name) {
277 int32_t rank_value = -1;
278 TensorImpl t_impl_value = TensorImpl::kFlexible;
279 std::string name = "";
280 std::string type_str = "";
281 std::vector<dsize_t> tmp_shape = {};
282 bool shape_field_exists = false;
283 // Iterate over this column's attributes.
284 // Manually iterating each of the child nodes/trees here so that we can provide our own error handling.
285 for (const auto &it_child : column_child_tree.items()) {
286 // Save the data for each of the attributes into variables. We'll use these to construct later.
287 if (it_child.key() == "name") {
288 name = it_child.value();
289 } else if (it_child.key() == "type") {
290 type_str = it_child.value();
291 } else if (it_child.key() == "rank") {
292 rank_value = it_child.value();
293 } else if (it_child.key() == "t_impl") {
294 STR_TO_TENSORIMPL(it_child.value(), t_impl_value);
295 } else if (it_child.key() == "shape") {
296 shape_field_exists = true;
297 RETURN_IF_NOT_OK(BuildShape(it_child.value(), &tmp_shape));
298 } else {
299 std::string err_msg = "Invalid data, unexpected column attribute " + it_child.key() + " for column " + col_name +
300 ", expected attribute: name, type, rank, t_impl or shape.";
301 RETURN_STATUS_UNEXPECTED(err_msg);
302 }
303 }
304 if (!name.empty()) {
305 if (!col_name.empty() && col_name != name) {
306 std::string err_msg = "Invalid data, failed to find column: " + col_name + " in JSON schema file.";
307 RETURN_STATUS_UNEXPECTED(err_msg);
308 }
309 } else {
310 if (col_name.empty()) {
311 std::string err_msg = "Invalid data, \"name\" field is missing for column " + col_name + " in JSON schema file.";
312 RETURN_STATUS_UNEXPECTED(err_msg);
313 } else {
314 name = col_name;
315 }
316 }
317 // data type is mandatory field
318 if (type_str.empty()) {
319 RETURN_STATUS_UNEXPECTED("Invalid data, \"type\" field is missing for column " + col_name +
320 " in JSON schema file.");
321 }
322
323 // rank number is mandatory field
324 if (rank_value <= -1) {
325 RETURN_STATUS_UNEXPECTED("Invalid data, \"rank\" field of column " + col_name +
326 " must have value >= 0 in JSON schema file.");
327 }
328
329 // Create the column descriptor for this column from the data we pulled from the json file
330 TensorShape col_shape = TensorShape(tmp_shape);
331 if (shape_field_exists) {
332 RETURN_IF_NOT_OK(this->AddColumn(ColDescriptor(name, DataType(type_str), t_impl_value, rank_value, &col_shape)));
333 } else {
334 // Create a column descriptor that doesn't have a shape
335 RETURN_IF_NOT_OK(this->AddColumn(ColDescriptor(name, DataType(type_str), t_impl_value, rank_value)));
336 }
337 return Status::OK();
338 }
339
340 // Parses a schema json file and populates the columns and meta info.
LoadSchemaFile(const std::string & schema_file_path,const std::vector<std::string> & columns_to_load)341 Status DataSchema::LoadSchemaFile(const std::string &schema_file_path,
342 const std::vector<std::string> &columns_to_load) {
343 try {
344 std::ifstream in(schema_file_path, std::ifstream::in);
345
346 nlohmann::json js;
347 in >> js;
348 auto s = PreLoadExceptionCheck(js);
349 if (s != Status::OK()) {
350 in.close();
351 return s;
352 }
353 try {
354 num_rows_ = js.at("numRows").get<int64_t>();
355 } catch (nlohmann::json::out_of_range &e) {
356 num_rows_ = 0;
357 } catch (nlohmann::json::exception &e) {
358 in.close();
359 RETURN_STATUS_UNEXPECTED("Invalid data, unable to parse \"numRows\" field from JSON schema file: " +
360 schema_file_path + ", check syntax with JSON tool.");
361 }
362 nlohmann::json column_tree = js.at("columns");
363 if (column_tree.empty()) {
364 in.close();
365 RETURN_STATUS_UNEXPECTED("Invalid data, \"columns\" field is missing in JSON schema file: " + schema_file_path);
366 }
367 if (columns_to_load.empty()) {
368 // Parse the json tree and load the schema's columns in whatever order that the json
369 // layout decides
370 Status rc = this->AnyOrderLoad(column_tree);
371 if (rc.IsError()) {
372 in.close();
373 return rc;
374 }
375 } else {
376 Status rc = this->ColumnOrderLoad(column_tree, columns_to_load);
377 if (rc.IsError()) {
378 rc.SetErrDescription(rc.GetErrDescription() + " file: " + schema_file_path);
379 in.close();
380 return rc;
381 }
382 }
383 in.close();
384 } catch (const std::exception &err) {
385 // Catch any exception and convert to Status return code
386 RETURN_STATUS_UNEXPECTED("Invalid file, failed to load and parse JSON schema file: " + schema_file_path +
387 ", check syntax with JSON tools.");
388 }
389 return Status::OK();
390 }
391
392 // Parses a schema json string and populates the columns and meta info.
LoadSchemaString(const std::string & schema_json_string,const std::vector<std::string> & columns_to_load)393 Status DataSchema::LoadSchemaString(const std::string &schema_json_string,
394 const std::vector<std::string> &columns_to_load) {
395 try {
396 nlohmann::json js = nlohmann::json::parse(schema_json_string);
397 RETURN_IF_NOT_OK(PreLoadExceptionCheck(js));
398 num_rows_ = js.value("numRows", 0);
399 nlohmann::json column_tree = js.at("columns");
400 if (column_tree.empty()) {
401 RETURN_STATUS_UNEXPECTED("Invalid data, \"columns\" field is missing in JSON schema string.");
402 }
403 if (columns_to_load.empty()) {
404 // Parse the json tree and load the schema's columns in whatever order that the json
405 // layout decides
406 RETURN_IF_NOT_OK(this->AnyOrderLoad(column_tree));
407 } else {
408 Status rc = this->ColumnOrderLoad(column_tree, columns_to_load);
409 if (rc.IsError()) {
410 rc.SetErrDescription(rc.GetErrDescription() + " file content: " + schema_json_string);
411 return rc;
412 }
413 }
414 } catch (const std::exception &err) {
415 // Catch any exception and convert to Status return code
416 RETURN_STATUS_UNEXPECTED("Invalid data, failed to load and parse JSON schema string, check syntax with JSON tool.");
417 }
418 return Status::OK();
419 }
420
421 // Destructor
422 DataSchema::~DataSchema() = default;
423
424 // Getter for the ColDescriptor by index
Column(int32_t idx) const425 const ColDescriptor &DataSchema::Column(int32_t idx) const {
426 MS_ASSERT(idx < static_cast<int>(col_descs_.size()));
427 return col_descs_[idx];
428 }
429
430 // A print method typically used for debugging
Print(std::ostream & out) const431 void DataSchema::Print(std::ostream &out) const {
432 out << "Dataset schema: (";
433 for (const auto &col_desc : col_descs_) {
434 out << col_desc << "\n";
435 }
436 }
437
438 // Adds a column descriptor to the schema
AddColumn(const ColDescriptor & cd)439 Status DataSchema::AddColumn(const ColDescriptor &cd) {
440 // Sanity check there's not a duplicate name before adding the column
441 for (auto i = 0; i < col_descs_.size(); ++i) {
442 if (col_descs_[i].Name() == cd.Name()) {
443 std::ostringstream ss;
444 ss << "column name '" << cd.Name() << "' already exists in schema.";
445 std::string err_msg = ss.str();
446 RETURN_STATUS_UNEXPECTED(err_msg);
447 }
448 }
449 col_descs_.push_back(cd);
450 return Status::OK();
451 }
452
453 // Internal helper function. Performs sanity checks on the json file setup.
PreLoadExceptionCheck(const nlohmann::json & js)454 Status DataSchema::PreLoadExceptionCheck(const nlohmann::json &js) {
455 // Check if columns node exists. It is required for building schema from file.
456 if (js.find("columns") == js.end()) {
457 RETURN_STATUS_UNEXPECTED("Invalid data, \"columns\" field is missing in the JSON schema file.");
458 }
459 return Status::OK();
460 }
461
462 // Loops through all columns in the schema and returns a map with the column
463 // name to column index number.
GetColumnNameMap(std::unordered_map<std::string,int32_t> * out_column_name_map)464 Status DataSchema::GetColumnNameMap(std::unordered_map<std::string, int32_t> *out_column_name_map) {
465 if (out_column_name_map == nullptr) {
466 RETURN_STATUS_UNEXPECTED("unexpected null output column name map.");
467 }
468
469 for (size_t i = 0; i < col_descs_.size(); ++i) {
470 if (col_descs_[i].Name().empty()) {
471 RETURN_STATUS_UNEXPECTED("Constructing column name map from schema, but found empty column name.");
472 }
473 (*out_column_name_map)[col_descs_[i].Name()] = i;
474 }
475
476 return Status::OK();
477 }
478
GetColumnName(std::vector<std::string> * column_names) const479 Status DataSchema::GetColumnName(std::vector<std::string> *column_names) const {
480 RETURN_UNEXPECTED_IF_NULL(column_names);
481 column_names->clear();
482 for (const auto &col_desc : col_descs_) {
483 if (col_desc.Name().empty()) {
484 RETURN_STATUS_UNEXPECTED("Found empty column name in schema.");
485 }
486 column_names->emplace_back(col_desc.Name());
487 }
488 return Status::OK();
489 }
490 } // namespace dataset
491 } // namespace mindspore
492