• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020-2024 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "minddata/dataset/engine/data_schema.h"
17 
18 #include <algorithm>
19 #include <fstream>
20 #include <iostream>
21 #include <map>
22 #include <memory>
23 #include <nlohmann/json.hpp>
24 
25 #include "utils/ms_utils.h"
26 #include "minddata/dataset/util/status.h"
27 #include "minddata/dataset/core/tensor_shape.h"
28 #include "minddata/dataset/util/log_adapter.h"
29 
30 namespace mindspore {
31 namespace dataset {
// Maps the textual tensor implementation name of a column ("cvmat", "flex" or "np") onto its
// TensorImpl enumerator and stores it in out_type. Any other text yields TensorImpl::kNone.
// Note: in_col_str may be evaluated several times, so pass an expression without side effects.
#define STR_TO_TENSORIMPL(in_col_str, out_type) \
  do {                                          \
    if ((in_col_str) == "np") {                 \
      (out_type) = TensorImpl::kNP;             \
    } else if ((in_col_str) == "flex") {        \
      (out_type) = TensorImpl::kFlexible;       \
    } else if ((in_col_str) == "cvmat") {       \
      (out_type) = TensorImpl::kCv;             \
    } else {                                    \
      (out_type) = TensorImpl::kNone;           \
    }                                           \
  } while (false)
46 
// Constructor 1: Default constructor. Produces a descriptor with an unknown data type, rank 0,
// TensorImpl::kNone and no tensor shape. The column name is not listed in the initializer list.
ColDescriptor::ColDescriptor()
    : type_(DataType::DE_UNKNOWN), rank_(0), tensor_impl_(TensorImpl::kNone), tensor_shape_(nullptr) {}
50 
51 // Constructor 2: Main constructor
ColDescriptor(const std::string & col_name,DataType col_type,TensorImpl tensor_impl,int32_t rank,const TensorShape * in_shape)52 ColDescriptor::ColDescriptor(const std::string &col_name, DataType col_type, TensorImpl tensor_impl, int32_t rank,
53                              const TensorShape *in_shape)
54     : type_(col_type), rank_(rank), tensor_impl_(tensor_impl), col_name_(col_name) {
55   // If a shape was provided, create unique pointer for it and copy construct it into
56   // our shape.  Otherwise, set our shape to be empty.
57   if (in_shape != nullptr) {
58     // Create a shape and copy construct it into our column's shape.
59     tensor_shape_ = std::make_unique<TensorShape>(*in_shape);
60   } else {
61     tensor_shape_ = nullptr;
62   }
63   // If the user input a shape, then the rank of the input shape needs to match
64   // the input rank
65   if (in_shape != nullptr && in_shape->known() && in_shape->Size() != rank_) {
66     rank_ = in_shape->Size();
67     MS_LOG(WARNING) << "Rank does not match the number of dimensions in the provided shape."
68                     << " Overriding rank with the number of dimensions in the provided shape.";
69   }
70 }
71 
72 // Explicit copy constructor is required
ColDescriptor(const ColDescriptor & in_cd)73 ColDescriptor::ColDescriptor(const ColDescriptor &in_cd)
74     : type_(in_cd.type_), rank_(in_cd.rank_), tensor_impl_(in_cd.tensor_impl_), col_name_(in_cd.col_name_) {
75   // If it has a tensor shape, make a copy of it with our own unique_ptr.
76   tensor_shape_ = in_cd.HasShape() ? std::make_unique<TensorShape>(in_cd.Shape()) : nullptr;
77 }
78 
79 // Assignment overload
operator =(const ColDescriptor & in_cd)80 ColDescriptor &ColDescriptor::operator=(const ColDescriptor &in_cd) {
81   if (&in_cd != this) {
82     type_ = in_cd.type_;
83     rank_ = in_cd.rank_;
84     tensor_impl_ = in_cd.tensor_impl_;
85     col_name_ = in_cd.col_name_;
86     // If it has a tensor shape, make a copy of it with our own unique_ptr.
87     tensor_shape_ = in_cd.HasShape() ? std::make_unique<TensorShape>(in_cd.Shape()) : nullptr;
88   }
89   return *this;
90 }
91 
// Destructor. Compiler-generated; no custom cleanup is performed here.
ColDescriptor::~ColDescriptor() = default;
94 
95 // A print method typically used for debugging
Print(std::ostream & out) const96 void ColDescriptor::Print(std::ostream &out) const {
97   out << "  Name          : " << col_name_ << "\n  Type          : " << type_ << "\n  Rank          : " << rank_
98       << "\n  Shape         : (";
99   if (tensor_shape_) {
100     out << *tensor_shape_ << ")\n";
101   } else {
102     out << "no shape provided)\n";
103   }
104 }
105 
106 // Given a number of elements, this function will compute what the actual Tensor shape would be.
107 // If there is no starting TensorShape in this column, or if there is a shape but it contains
108 // an unknown dimension, then the output shape returned shall resolve dimensions as needed.
MaterializeTensorShape(int32_t num_elements,TensorShape * out_shape) const109 Status ColDescriptor::MaterializeTensorShape(int32_t num_elements, TensorShape *out_shape) const {
110   if (out_shape == nullptr) {
111     RETURN_STATUS_UNEXPECTED("Unexpected null output shape argument.");
112   }
113 
114   // If the shape is not given in this column, then we assume the shape will be: {numElements}
115   if (tensor_shape_ == nullptr) {
116     if (this->Rank() == 0 && num_elements == 1) {
117       *out_shape = TensorShape::CreateScalar();
118       return Status::OK();
119     }
120     *out_shape = TensorShape({num_elements});
121     return Status::OK();
122   }
123 
124   // Build the real TensorShape based on the requested shape and the number of elements in the data.
125   // If there are unknown dimensions, then the unknown dimension needs to be filled in.
126   // Example: requestedShape: {?,4,3}.
127   // If numElements is 24, then the output shape can be computed to: {2,4,3}
128   std::vector<dsize_t> requested_shape = tensor_shape_->AsVector();
129   int64_t num_elements_of_shape = 1;  // init to 1 as a starting multiplier.
130 
131   // unknownDimPosition variable is overloaded to provide 2 meanings:
132   // 1) If it's set to DIM_UNKNOWN, then it provides a boolean knowledge to tell us if there are
133   //    any unknown dimensions.  i.e. if it's set to unknown, then there are no unknown dimensions.
134   // 2) If it's set to a numeric value, then this is the vector index position within the shape
135   //    where the single unknown dimension can be found.
136   int64_t unknown_dim_position = TensorShape::kDimUnknown;  // Assume there are no unknown dims to start
137 
138   for (int i = 0; i < requested_shape.size(); ++i) {
139     // If we already had an unknown dimension, then we cannot have a second unknown dimension.
140     // We only support the compute of a single unknown dim.
141     if (requested_shape[i] == TensorShape::kDimUnknown && unknown_dim_position != TensorShape::kDimUnknown) {
142       RETURN_STATUS_UNEXPECTED("Requested shape has more than one unknown dimension!");
143     }
144 
145     // If the current dimension in the requested shape is a known value, then compute the number of
146     // elements so far.
147     if (requested_shape[i] != TensorShape::kDimUnknown) {
148       num_elements_of_shape *= requested_shape[i];
149     } else {
150       // This dimension is unknown so track which dimension position has it.
151       unknown_dim_position = i;
152     }
153   }
154 
155   // Sanity check the the computed element counts divide evenly into the input element count
156   if (num_elements < num_elements_of_shape || num_elements_of_shape == 0 || num_elements % num_elements_of_shape != 0) {
157     std::string err = "Requested shape has an invalid element count! Number elements: " + std::to_string(num_elements) +
158                       ", number elements of shape: " + std::to_string(num_elements_of_shape);
159     RETURN_STATUS_UNEXPECTED(err);
160   }
161 
162   // If there was any unknown dimensions, then update the requested shape to fill in the unknown
163   // dimension with the correct value.  If there were no unknown dim's then the output shape will
164   // remain to be the same as the requested shape.
165   if (unknown_dim_position != TensorShape::kDimUnknown) {
166     requested_shape[unknown_dim_position] = (num_elements / num_elements_of_shape);
167   }
168 
169   // Any unknown dimension is filled in now.  Set the output shape
170   *out_shape = TensorShape(requested_shape);
171   return Status::OK();
172 }
173 
174 // getter function for the shape
Shape() const175 TensorShape ColDescriptor::Shape() const {
176   if (tensor_shape_ != nullptr) {
177     return *tensor_shape_;  // copy construct a shape to return
178   } else {
179     return TensorShape::CreateUnknownRankShape();  // empty shape to return
180   }
181 }
182 
// Out-of-class definition of the static member holding the default schema file name.
const char DataSchema::DEFAULT_DATA_SCHEMA_FILENAME[] = "datasetSchema.json";
184 
// Constructor 1: Default constructor. Starts with a row count of zero; no columns are added here.
DataSchema::DataSchema() : num_rows_(0) {}
187 
188 // Internal helper function. Parses the json schema file in any order and produces a schema that
189 // does not follow any particular order (json standard does not enforce any ordering protocol).
190 // This one produces a schema that contains all of the columns from the schema file.
AnyOrderLoad(nlohmann::json column_tree)191 Status DataSchema::AnyOrderLoad(nlohmann::json column_tree) {
192   // Iterate over the json file.  Each parent json node is the column name,
193   // followed by the column properties in the child tree under the column.
194   // Outer loop here iterates over the parents (i.e. the column name)
195   if (!column_tree.is_array()) {
196     for (nlohmann::json::iterator it = column_tree.begin(); it != column_tree.end(); ++it) {
197       std::string col_name = it.key();
198       nlohmann::json column_child_tree = it.value();
199       RETURN_IF_NOT_OK(ColumnLoad(column_child_tree, col_name));
200     }
201   } else {
202     // Case where the schema is a list of columns not a dict
203     for (nlohmann::json::iterator it = column_tree.begin(); it != column_tree.end(); ++it) {
204       nlohmann::json column_child_tree = it.value();
205       RETURN_IF_NOT_OK(ColumnLoad(column_child_tree, ""));
206     }
207   }
208   return Status::OK();
209 }
210 
211 // Internal helper function. For each input column name, perform a lookup to the json document to
212 // find the matching column.  When the match is found, process that column to build the column
213 // descriptor and add to the schema in the order in which the input column names are given.id
ColumnOrderLoad(nlohmann::json column_tree,const std::vector<std::string> & columns_to_load)214 Status DataSchema::ColumnOrderLoad(nlohmann::json column_tree, const std::vector<std::string> &columns_to_load) {
215   if (!column_tree.is_array()) {
216     // the json file is dict (e.g., {image: ...})
217     // Loop over the column name list
218     for (const auto &curr_col_name : columns_to_load) {
219       // Find the column in the json document
220       auto column_info = column_tree.find(common::SafeCStr(curr_col_name));
221       if (column_info == column_tree.end()) {
222         RETURN_STATUS_UNEXPECTED("Invalid data, failed to find column: " + curr_col_name + " in JSON schema file.");
223       }
224       // At this point, columnInfo.value() is the subtree in the json document that contains
225       // all of the data for a given column.  This data will formulate our schema column.
226       const std::string &col_name = column_info.key();
227       nlohmann::json column_child_tree = column_info.value();
228       RETURN_IF_NOT_OK(ColumnLoad(column_child_tree, col_name));
229     }
230   } else {
231     // the json file is array (e.g., [name: image...])
232     // Loop over the column name list
233     for (const auto &curr_col_name : columns_to_load) {
234       // Find the column in the json document
235       int32_t index = -1;
236       int32_t i = 0;
237       for (const auto &it_child : column_tree.items()) {
238         auto name = it_child.value().find("name");
239         if (name == it_child.value().end()) {
240           RETURN_STATUS_UNEXPECTED("Invalid data, \"name\" field is missing for column: " + curr_col_name +
241                                    " in JSON schema file.");
242         }
243         if (name.value() == curr_col_name) {
244           index = i;
245           break;
246         }
247         i++;
248       }
249       if (index == -1) {
250         RETURN_STATUS_UNEXPECTED("Invalid data, failed to find column: " + curr_col_name + " in JSON schema file.");
251       }
252       nlohmann::json column_child_tree = column_tree[index];
253       RETURN_IF_NOT_OK(ColumnLoad(column_child_tree, curr_col_name));
254     }
255   }
256   return Status::OK();
257 }
258 
259 // Internal helper function for parsing shape info and building a vector for the shape construction.
BuildShape(const nlohmann::json & shapeVal,std::vector<dsize_t> * outShape)260 static Status BuildShape(const nlohmann::json &shapeVal, std::vector<dsize_t> *outShape) {
261   if (outShape == nullptr) {
262     RETURN_STATUS_UNEXPECTED("outShape can not be nullptr.");
263   }
264   if (shapeVal.empty()) {
265     return Status::OK();
266   }
267 
268   // Iterate over the integer list and add those values to the output shape tensor
269   auto items = shapeVal.items();
270   using it_type = decltype(items.begin());
271   (void)std::transform(items.begin(), items.end(), std::back_inserter(*outShape), [](it_type j) { return j.value(); });
272   return Status::OK();
273 }
274 
275 // Internal helper function. Given the json tree for a given column, load it into our schema.
ColumnLoad(nlohmann::json column_child_tree,const std::string & col_name)276 Status DataSchema::ColumnLoad(nlohmann::json column_child_tree, const std::string &col_name) {
277   int32_t rank_value = -1;
278   TensorImpl t_impl_value = TensorImpl::kFlexible;
279   std::string name = "";
280   std::string type_str = "";
281   std::vector<dsize_t> tmp_shape = {};
282   bool shape_field_exists = false;
283   // Iterate over this column's attributes.
284   // Manually iterating each of the child nodes/trees here so that we can provide our own error handling.
285   for (const auto &it_child : column_child_tree.items()) {
286     // Save the data for each of the attributes into variables. We'll use these to construct later.
287     if (it_child.key() == "name") {
288       name = it_child.value();
289     } else if (it_child.key() == "type") {
290       type_str = it_child.value();
291     } else if (it_child.key() == "rank") {
292       rank_value = it_child.value();
293     } else if (it_child.key() == "t_impl") {
294       STR_TO_TENSORIMPL(it_child.value(), t_impl_value);
295     } else if (it_child.key() == "shape") {
296       shape_field_exists = true;
297       RETURN_IF_NOT_OK(BuildShape(it_child.value(), &tmp_shape));
298     } else {
299       std::string err_msg = "Invalid data, unexpected column attribute " + it_child.key() + " for column " + col_name +
300                             ", expected attribute: name, type, rank, t_impl or shape.";
301       RETURN_STATUS_UNEXPECTED(err_msg);
302     }
303   }
304   if (!name.empty()) {
305     if (!col_name.empty() && col_name != name) {
306       std::string err_msg = "Invalid data, failed to find column: " + col_name + " in JSON schema file.";
307       RETURN_STATUS_UNEXPECTED(err_msg);
308     }
309   } else {
310     if (col_name.empty()) {
311       std::string err_msg = "Invalid data, \"name\" field is missing for column " + col_name + " in JSON schema file.";
312       RETURN_STATUS_UNEXPECTED(err_msg);
313     } else {
314       name = col_name;
315     }
316   }
317   // data type is mandatory field
318   if (type_str.empty()) {
319     RETURN_STATUS_UNEXPECTED("Invalid data, \"type\" field is missing for column " + col_name +
320                              " in JSON schema file.");
321   }
322 
323   // rank number is mandatory field
324   if (rank_value <= -1) {
325     RETURN_STATUS_UNEXPECTED("Invalid data, \"rank\" field of column " + col_name +
326                              " must have value >= 0 in JSON schema file.");
327   }
328 
329   // Create the column descriptor for this column from the data we pulled from the json file
330   TensorShape col_shape = TensorShape(tmp_shape);
331   if (shape_field_exists) {
332     RETURN_IF_NOT_OK(this->AddColumn(ColDescriptor(name, DataType(type_str), t_impl_value, rank_value, &col_shape)));
333   } else {
334     // Create a column descriptor that doesn't have a shape
335     RETURN_IF_NOT_OK(this->AddColumn(ColDescriptor(name, DataType(type_str), t_impl_value, rank_value)));
336   }
337   return Status::OK();
338 }
339 
340 // Parses a schema json file and populates the columns and meta info.
LoadSchemaFile(const std::string & schema_file_path,const std::vector<std::string> & columns_to_load)341 Status DataSchema::LoadSchemaFile(const std::string &schema_file_path,
342                                   const std::vector<std::string> &columns_to_load) {
343   try {
344     std::ifstream in(schema_file_path, std::ifstream::in);
345 
346     nlohmann::json js;
347     in >> js;
348     auto s = PreLoadExceptionCheck(js);
349     if (s != Status::OK()) {
350       in.close();
351       return s;
352     }
353     try {
354       num_rows_ = js.at("numRows").get<int64_t>();
355     } catch (nlohmann::json::out_of_range &e) {
356       num_rows_ = 0;
357     } catch (nlohmann::json::exception &e) {
358       in.close();
359       RETURN_STATUS_UNEXPECTED("Invalid data, unable to parse \"numRows\" field from JSON schema file: " +
360                                schema_file_path + ", check syntax with JSON tool.");
361     }
362     nlohmann::json column_tree = js.at("columns");
363     if (column_tree.empty()) {
364       in.close();
365       RETURN_STATUS_UNEXPECTED("Invalid data, \"columns\" field is missing in JSON schema file: " + schema_file_path);
366     }
367     if (columns_to_load.empty()) {
368       // Parse the json tree and load the schema's columns in whatever order that the json
369       // layout decides
370       Status rc = this->AnyOrderLoad(column_tree);
371       if (rc.IsError()) {
372         in.close();
373         return rc;
374       }
375     } else {
376       Status rc = this->ColumnOrderLoad(column_tree, columns_to_load);
377       if (rc.IsError()) {
378         rc.SetErrDescription(rc.GetErrDescription() + " file: " + schema_file_path);
379         in.close();
380         return rc;
381       }
382     }
383     in.close();
384   } catch (const std::exception &err) {
385     // Catch any exception and convert to Status return code
386     RETURN_STATUS_UNEXPECTED("Invalid file, failed to load and parse JSON schema file: " + schema_file_path +
387                              ", check syntax with JSON tools.");
388   }
389   return Status::OK();
390 }
391 
392 // Parses a schema json string and populates the columns and meta info.
LoadSchemaString(const std::string & schema_json_string,const std::vector<std::string> & columns_to_load)393 Status DataSchema::LoadSchemaString(const std::string &schema_json_string,
394                                     const std::vector<std::string> &columns_to_load) {
395   try {
396     nlohmann::json js = nlohmann::json::parse(schema_json_string);
397     RETURN_IF_NOT_OK(PreLoadExceptionCheck(js));
398     num_rows_ = js.value("numRows", 0);
399     nlohmann::json column_tree = js.at("columns");
400     if (column_tree.empty()) {
401       RETURN_STATUS_UNEXPECTED("Invalid data, \"columns\" field is missing in JSON schema string.");
402     }
403     if (columns_to_load.empty()) {
404       // Parse the json tree and load the schema's columns in whatever order that the json
405       // layout decides
406       RETURN_IF_NOT_OK(this->AnyOrderLoad(column_tree));
407     } else {
408       Status rc = this->ColumnOrderLoad(column_tree, columns_to_load);
409       if (rc.IsError()) {
410         rc.SetErrDescription(rc.GetErrDescription() + " file content: " + schema_json_string);
411         return rc;
412       }
413     }
414   } catch (const std::exception &err) {
415     // Catch any exception and convert to Status return code
416     RETURN_STATUS_UNEXPECTED("Invalid data, failed to load and parse JSON schema string, check syntax with JSON tool.");
417   }
418   return Status::OK();
419 }
420 
// Destructor. Compiler-generated; no custom cleanup is performed here.
DataSchema::~DataSchema() = default;
423 
424 // Getter for the ColDescriptor by index
Column(int32_t idx) const425 const ColDescriptor &DataSchema::Column(int32_t idx) const {
426   MS_ASSERT(idx < static_cast<int>(col_descs_.size()));
427   return col_descs_[idx];
428 }
429 
430 // A print method typically used for debugging
Print(std::ostream & out) const431 void DataSchema::Print(std::ostream &out) const {
432   out << "Dataset schema: (";
433   for (const auto &col_desc : col_descs_) {
434     out << col_desc << "\n";
435   }
436 }
437 
438 // Adds a column descriptor to the schema
AddColumn(const ColDescriptor & cd)439 Status DataSchema::AddColumn(const ColDescriptor &cd) {
440   // Sanity check there's not a duplicate name before adding the column
441   for (auto i = 0; i < col_descs_.size(); ++i) {
442     if (col_descs_[i].Name() == cd.Name()) {
443       std::ostringstream ss;
444       ss << "column name '" << cd.Name() << "' already exists in schema.";
445       std::string err_msg = ss.str();
446       RETURN_STATUS_UNEXPECTED(err_msg);
447     }
448   }
449   col_descs_.push_back(cd);
450   return Status::OK();
451 }
452 
453 // Internal helper function. Performs sanity checks on the json file setup.
PreLoadExceptionCheck(const nlohmann::json & js)454 Status DataSchema::PreLoadExceptionCheck(const nlohmann::json &js) {
455   // Check if columns node exists.  It is required for building schema from file.
456   if (js.find("columns") == js.end()) {
457     RETURN_STATUS_UNEXPECTED("Invalid data, \"columns\" field is missing in the JSON schema file.");
458   }
459   return Status::OK();
460 }
461 
462 // Loops through all columns in the schema and returns a map with the column
463 // name to column index number.
GetColumnNameMap(std::unordered_map<std::string,int32_t> * out_column_name_map)464 Status DataSchema::GetColumnNameMap(std::unordered_map<std::string, int32_t> *out_column_name_map) {
465   if (out_column_name_map == nullptr) {
466     RETURN_STATUS_UNEXPECTED("unexpected null output column name map.");
467   }
468 
469   for (size_t i = 0; i < col_descs_.size(); ++i) {
470     if (col_descs_[i].Name().empty()) {
471       RETURN_STATUS_UNEXPECTED("Constructing column name map from schema, but found empty column name.");
472     }
473     (*out_column_name_map)[col_descs_[i].Name()] = i;
474   }
475 
476   return Status::OK();
477 }
478 
GetColumnName(std::vector<std::string> * column_names) const479 Status DataSchema::GetColumnName(std::vector<std::string> *column_names) const {
480   RETURN_UNEXPECTED_IF_NULL(column_names);
481   column_names->clear();
482   for (const auto &col_desc : col_descs_) {
483     if (col_desc.Name().empty()) {
484       RETURN_STATUS_UNEXPECTED("Found empty column name in schema.");
485     }
486     column_names->emplace_back(col_desc.Name());
487   }
488   return Status::OK();
489 }
490 }  // namespace dataset
491 }  // namespace mindspore
492