• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2019 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "minddata/dataset/engine/data_schema.h"
17 
18 #include <algorithm>
19 #include <fstream>
20 #include <iostream>
21 #include <map>
22 #include <memory>
23 #include <nlohmann/json.hpp>
24 
25 #include "utils/ms_utils.h"
26 #include "minddata/dataset/util/status.h"
27 #include "minddata/dataset/core/tensor_shape.h"
28 #include "minddata/dataset/util/log_adapter.h"
29 
30 namespace mindspore {
31 namespace dataset {
// A macro for converting an input string naming a tensor implementation into its
// TensorImpl enumerator.
//   in_col_str: any expression comparable with a string literal via operator==.
//               It is evaluated exactly once, so an expression with side effects is safe.
//   out_type:   an assignable TensorImpl lvalue that receives the result.
// Recognized names are "cvmat", "flex" and "np"; anything else yields TensorImpl::kNone.
#define STR_TO_TENSORIMPL(in_col_str, out_type)        \
  do {                                                 \
    const auto &tensor_impl_str_ = (in_col_str);       \
    if (tensor_impl_str_ == "cvmat") {                 \
      (out_type) = TensorImpl::kCv;                    \
    } else if (tensor_impl_str_ == "flex") {           \
      (out_type) = TensorImpl::kFlexible;              \
    } else if (tensor_impl_str_ == "np") {             \
      (out_type) = TensorImpl::kNP;                    \
    } else {                                           \
      (out_type) = TensorImpl::kNone;                  \
    }                                                  \
  } while (false)
46 
// Constructor 1: Default constructor.
// Produces a descriptor with data type DE_UNKNOWN, rank 0, tensor implementation kNone and
// no shape (tensor_shape_ is null). The column name is not set here.
ColDescriptor::ColDescriptor()
    : type_(DataType::DE_UNKNOWN), rank_(0), tensor_impl_(TensorImpl::kNone), tensor_shape_(nullptr) {}
50 
51 // Constructor 2: Main constructor
ColDescriptor(const std::string & col_name,DataType col_type,TensorImpl tensor_impl,int32_t rank,const TensorShape * in_shape)52 ColDescriptor::ColDescriptor(const std::string &col_name, DataType col_type, TensorImpl tensor_impl, int32_t rank,
53                              const TensorShape *in_shape)
54     : type_(col_type), rank_(rank), tensor_impl_(tensor_impl), col_name_(col_name) {
55   // If a shape was provided, create unique pointer for it and copy construct it into
56   // our shape.  Otherwise, set our shape to be empty.
57   if (in_shape != nullptr) {
58     // Create a shape and copy construct it into our column's shape.
59     tensor_shape_ = std::make_unique<TensorShape>(*in_shape);
60   } else {
61     tensor_shape_ = nullptr;
62   }
63   // If the user input a shape, then the rank of the input shape needs to match
64   // the input rank
65   if (in_shape != nullptr && in_shape->known() && in_shape->Size() != rank_) {
66     rank_ = in_shape->Size();
67     MS_LOG(WARNING) << "Rank does not match the number of dimensions in the provided shape."
68                     << " Overriding rank with the number of dimensions in the provided shape.";
69   }
70 }
71 
72 // Explicit copy constructor is required
ColDescriptor(const ColDescriptor & in_cd)73 ColDescriptor::ColDescriptor(const ColDescriptor &in_cd)
74     : type_(in_cd.type_), rank_(in_cd.rank_), tensor_impl_(in_cd.tensor_impl_), col_name_(in_cd.col_name_) {
75   // If it has a tensor shape, make a copy of it with our own unique_ptr.
76   tensor_shape_ = in_cd.HasShape() ? std::make_unique<TensorShape>(in_cd.Shape()) : nullptr;
77 }
78 
79 // Assignment overload
operator =(const ColDescriptor & in_cd)80 ColDescriptor &ColDescriptor::operator=(const ColDescriptor &in_cd) {
81   if (&in_cd != this) {
82     type_ = in_cd.type_;
83     rank_ = in_cd.rank_;
84     tensor_impl_ = in_cd.tensor_impl_;
85     col_name_ = in_cd.col_name_;
86     // If it has a tensor shape, make a copy of it with our own unique_ptr.
87     tensor_shape_ = in_cd.HasShape() ? std::make_unique<TensorShape>(in_cd.Shape()) : nullptr;
88   }
89   return *this;
90 }
91 
// Destructor. Defaulted: the members release whatever they hold when they are destroyed.
ColDescriptor::~ColDescriptor() = default;
94 
95 // A print method typically used for debugging
Print(std::ostream & out) const96 void ColDescriptor::Print(std::ostream &out) const {
97   out << "  Name          : " << col_name_ << "\n  Type          : " << type_ << "\n  Rank          : " << rank_
98       << "\n  Shape         : (";
99   if (tensor_shape_) {
100     out << *tensor_shape_ << ")\n";
101   } else {
102     out << "no shape provided)\n";
103   }
104 }
105 
106 // Given a number of elements, this function will compute what the actual Tensor shape would be.
107 // If there is no starting TensorShape in this column, or if there is a shape but it contains
108 // an unknown dimension, then the output shape returned shall resolve dimensions as needed.
MaterializeTensorShape(int32_t num_elements,TensorShape * out_shape) const109 Status ColDescriptor::MaterializeTensorShape(int32_t num_elements, TensorShape *out_shape) const {
110   if (out_shape == nullptr) {
111     RETURN_STATUS_UNEXPECTED("Unexpected null output shape argument.");
112   }
113 
114   // If the shape is not given in this column, then we assume the shape will be: {numElements}
115   if (tensor_shape_ == nullptr) {
116     if (this->Rank() == 0 && num_elements == 1) {
117       *out_shape = TensorShape::CreateScalar();
118       return Status::OK();
119     }
120     *out_shape = TensorShape({num_elements});
121     return Status::OK();
122   }
123 
124   // Build the real TensorShape based on the requested shape and the number of elements in the data.
125   // If there are unknown dimensions, then the unknown dimension needs to be filled in.
126   // Example: requestedShape: {?,4,3}.
127   // If numElements is 24, then the output shape can be computed to: {2,4,3}
128   std::vector<dsize_t> requested_shape = tensor_shape_->AsVector();
129   int64_t num_elements_of_shape = 1;  // init to 1 as a starting multiplier.
130 
131   // unknownDimPosition variable is overloaded to provide 2 meanings:
132   // 1) If it's set to DIM_UNKNOWN, then it provides a boolean knowledge to tell us if there are
133   //    any unknown dimensions.  i.e. if it's set to unknown, then there are no unknown dimensions.
134   // 2) If it's set to a numeric value, then this is the vector index position within the shape
135   //    where the single unknown dimension can be found.
136   int64_t unknown_dim_position = TensorShape::kDimUnknown;  // Assume there are no unknown dims to start
137 
138   for (int i = 0; i < requested_shape.size(); ++i) {
139     // If we already had an unknown dimension, then we cannot have a second unknown dimension.
140     // We only support the compute of a single unknown dim.
141     if (requested_shape[i] == TensorShape::kDimUnknown && unknown_dim_position != TensorShape::kDimUnknown) {
142       return Status(StatusCode::kMDUnexpectedError, __LINE__, __FILE__,
143                     "Requested shape has more than one unknown dimension!");
144     }
145 
146     // If the current dimension in the requested shape is a known value, then compute the number of
147     // elements so far.
148     if (requested_shape[i] != TensorShape::kDimUnknown) {
149       num_elements_of_shape *= requested_shape[i];
150     } else {
151       // This dimension is unknown so track which dimension position has it.
152       unknown_dim_position = i;
153     }
154   }
155 
156   // Sanity check the the computed element counts divide evenly into the input element count
157   if (num_elements < num_elements_of_shape || num_elements_of_shape == 0 || num_elements % num_elements_of_shape != 0) {
158     std::string err = "Requested shape has an invalid element count! Number elements: " + std::to_string(num_elements) +
159                       ", number elements of shape: " + std::to_string(num_elements_of_shape);
160     RETURN_STATUS_UNEXPECTED(err);
161   }
162 
163   // If there was any unknown dimensions, then update the requested shape to fill in the unknown
164   // dimension with the correct value.  If there were no unknown dim's then the output shape will
165   // remain to be the same as the requested shape.
166   if (unknown_dim_position != TensorShape::kDimUnknown) {
167     requested_shape[unknown_dim_position] = (num_elements / num_elements_of_shape);
168   }
169 
170   // Any unknown dimension is filled in now.  Set the output shape
171   *out_shape = TensorShape(requested_shape);
172   return Status::OK();
173 }
174 
175 // getter function for the shape
Shape() const176 TensorShape ColDescriptor::Shape() const {
177   if (tensor_shape_ != nullptr) {
178     return *tensor_shape_;  // copy construct a shape to return
179   } else {
180     return TensorShape::CreateUnknownRankShape();  // empty shape to return
181   }
182 }
183 
// Out-of-class definition of the static member naming the default schema file.
const char DataSchema::DEFAULT_DATA_SCHEMA_FILENAME[] = "datasetSchema.json";
185 
// Constructor 1: Default constructor. Starts with a row count of 0; no columns are added here.
DataSchema::DataSchema() : num_rows_(0) {}
188 
189 // Internal helper function. Parses the json schema file in any order and produces a schema that
190 // does not follow any particular order (json standard does not enforce any ordering protocol).
191 // This one produces a schema that contains all of the columns from the schema file.
AnyOrderLoad(nlohmann::json column_tree)192 Status DataSchema::AnyOrderLoad(nlohmann::json column_tree) {
193   // Iterate over the json file.  Each parent json node is the column name,
194   // followed by the column properties in the child tree under the column.
195   // Outer loop here iterates over the parents (i.e. the column name)
196   if (!column_tree.is_array()) {
197     for (nlohmann::json::iterator it = column_tree.begin(); it != column_tree.end(); ++it) {
198       std::string col_name = it.key();
199       nlohmann::json column_child_tree = it.value();
200       RETURN_IF_NOT_OK(ColumnLoad(column_child_tree, col_name));
201     }
202   } else {
203     // Case where the schema is a list of columns not a dict
204     for (nlohmann::json::iterator it = column_tree.begin(); it != column_tree.end(); ++it) {
205       nlohmann::json column_child_tree = it.value();
206       RETURN_IF_NOT_OK(ColumnLoad(column_child_tree, ""));
207     }
208   }
209   return Status::OK();
210 }
211 
212 // Internal helper function. For each input column name, perform a lookup to the json document to
213 // find the matching column.  When the match is found, process that column to build the column
214 // descriptor and add to the schema in the order in which the input column names are given.id
ColumnOrderLoad(nlohmann::json column_tree,const std::vector<std::string> & columns_to_load)215 Status DataSchema::ColumnOrderLoad(nlohmann::json column_tree, const std::vector<std::string> &columns_to_load) {
216   if (!column_tree.is_array()) {
217     // the json file is dict (e.g., {image: ...})
218     // Loop over the column name list
219     for (const auto &curr_col_name : columns_to_load) {
220       // Find the column in the json document
221       auto column_info = column_tree.find(common::SafeCStr(curr_col_name));
222       if (column_info == column_tree.end()) {
223         RETURN_STATUS_UNEXPECTED("Invalid data, failed to find column name: " + curr_col_name + " in given json file.");
224       }
225       // At this point, columnInfo.value() is the subtree in the json document that contains
226       // all of the data for a given column.  This data will formulate our schema column.
227       const std::string &col_name = column_info.key();
228       nlohmann::json column_child_tree = column_info.value();
229       RETURN_IF_NOT_OK(ColumnLoad(column_child_tree, col_name));
230     }
231   } else {
232     // the json file is array (e.g., [name: image...])
233     // Loop over the column name list
234     for (const auto &curr_col_name : columns_to_load) {
235       // Find the column in the json document
236       int32_t index = -1;
237       int32_t i = 0;
238       for (const auto &it_child : column_tree.items()) {
239         auto name = it_child.value().find("name");
240         if (name == it_child.value().end()) {
241           RETURN_STATUS_UNEXPECTED("Invalid data, \"name\" field is missing for column: " + curr_col_name);
242         }
243         if (name.value() == curr_col_name) {
244           index = i;
245           break;
246         }
247         i++;
248       }
249       if (index == -1) {
250         RETURN_STATUS_UNEXPECTED("Invalid data, failed to find column name: " + curr_col_name + " in given json file.");
251       }
252       nlohmann::json column_child_tree = column_tree[index];
253       RETURN_IF_NOT_OK(ColumnLoad(column_child_tree, curr_col_name));
254     }
255   }
256   return Status::OK();
257 }
258 
259 // Internal helper function for parsing shape info and building a vector for the shape construction.
BuildShape(const nlohmann::json & shapeVal,std::vector<dsize_t> * outShape)260 static Status BuildShape(const nlohmann::json &shapeVal, std::vector<dsize_t> *outShape) {
261   if (outShape == nullptr) {
262     RETURN_STATUS_UNEXPECTED("outShape can not be nullptr.");
263   }
264   if (shapeVal.empty()) return Status::OK();
265 
266   // Iterate over the integer list and add those values to the output shape tensor
267   auto items = shapeVal.items();
268   using it_type = decltype(items.begin());
269   (void)std::transform(items.begin(), items.end(), std::back_inserter(*outShape), [](it_type j) { return j.value(); });
270   return Status::OK();
271 }
272 
273 // Internal helper function. Given the json tree for a given column, load it into our schema.
ColumnLoad(nlohmann::json column_child_tree,const std::string & col_name)274 Status DataSchema::ColumnLoad(nlohmann::json column_child_tree, const std::string &col_name) {
275   int32_t rank_value = -1;
276   TensorImpl t_impl_value = TensorImpl::kFlexible;
277   std::string name = "";
278   std::string type_str = "";
279   std::vector<dsize_t> tmp_shape = {};
280   bool shape_field_exists = false;
281   // Iterate over this column's attributes.
282   // Manually iterating each of the child nodes/trees here so that we can provide our own error handling.
283   for (const auto &it_child : column_child_tree.items()) {
284     // Save the data for each of the attributes into variables. We'll use these to construct later.
285     if (it_child.key() == "name") {
286       name = it_child.value();
287     } else if (it_child.key() == "type") {
288       type_str = it_child.value();
289     } else if (it_child.key() == "rank") {
290       rank_value = it_child.value();
291     } else if (it_child.key() == "t_impl") {
292       STR_TO_TENSORIMPL(it_child.value(), t_impl_value);
293     } else if (it_child.key() == "shape") {
294       shape_field_exists = true;
295       RETURN_IF_NOT_OK(BuildShape(it_child.value(), &tmp_shape));
296     } else {
297       std::string err_msg = "Invalid data, unexpected column attribute " + it_child.key() + " for column " + col_name +
298                             ", expected attribute: name, type, rank, t_impl or shape.";
299       RETURN_STATUS_UNEXPECTED(err_msg);
300     }
301   }
302   if (!name.empty()) {
303     if (!col_name.empty() && col_name != name) {
304       std::string err_msg =
305         "Invalid data, json schema file for column " + col_name + " has column name that does not match columnsToLoad";
306       RETURN_STATUS_UNEXPECTED(err_msg);
307     }
308   } else {
309     if (col_name.empty()) {
310       std::string err_msg =
311         "Invalid data, json schema file for column " + col_name + " has invalid or missing column name.";
312       RETURN_STATUS_UNEXPECTED(err_msg);
313     } else {
314       name = col_name;
315     }
316   }
317   // data type is mandatory field
318   if (type_str.empty())
319     return Status(StatusCode::kMDUnexpectedError, __LINE__, __FILE__,
320                   "Invalid data, json schema file for column " + col_name + " has invalid or missing column type.");
321 
322   // rank number is mandatory field
323   if (rank_value <= -1)
324     return Status(StatusCode::kMDUnexpectedError, __LINE__, __FILE__,
325                   "Invalid data, json schema file for column " + col_name + " must define a positive rank value.");
326 
327   // Create the column descriptor for this column from the data we pulled from the json file
328   TensorShape col_shape = TensorShape(tmp_shape);
329   if (shape_field_exists)
330     RETURN_IF_NOT_OK(this->AddColumn(ColDescriptor(name, DataType(type_str), t_impl_value, rank_value, &col_shape)));
331   else
332     // Create a column descriptor that doesn't have a shape
333     RETURN_IF_NOT_OK(this->AddColumn(ColDescriptor(name, DataType(type_str), t_impl_value, rank_value)));
334   return Status::OK();
335 }
336 
337 // Parses a schema json file and populates the columns and meta info.
LoadSchemaFile(const std::string & schema_file_path,const std::vector<std::string> & columns_to_load)338 Status DataSchema::LoadSchemaFile(const std::string &schema_file_path,
339                                   const std::vector<std::string> &columns_to_load) {
340   try {
341     std::ifstream in(schema_file_path);
342 
343     nlohmann::json js;
344     in >> js;
345     RETURN_IF_NOT_OK(PreLoadExceptionCheck(js));
346     try {
347       num_rows_ = js.at("numRows").get<int64_t>();
348     } catch (nlohmann::json::out_of_range &e) {
349       num_rows_ = 0;
350     } catch (nlohmann::json::exception &e) {
351       in.close();
352       RETURN_STATUS_UNEXPECTED("Invalid data, unable to parse \"numRows\" from schema file: " + schema_file_path);
353     }
354     nlohmann::json column_tree = js.at("columns");
355     if (column_tree.empty()) {
356       in.close();
357       RETURN_STATUS_UNEXPECTED("Invalid data, \"columns\" field is missing in schema file: " + schema_file_path);
358     }
359     if (columns_to_load.empty()) {
360       // Parse the json tree and load the schema's columns in whatever order that the json
361       // layout decides
362       Status rc = this->AnyOrderLoad(column_tree);
363       if (rc.IsError()) {
364         in.close();
365         return rc;
366       }
367     } else {
368       Status rc = this->ColumnOrderLoad(column_tree, columns_to_load);
369       if (rc.IsError()) {
370         rc.SetErrDescription(rc.GetErrDescription() + " file: " + schema_file_path);
371         in.close();
372         return rc;
373       }
374     }
375     in.close();
376   } catch (const std::exception &err) {
377     // Catch any exception and convert to Status return code
378     RETURN_STATUS_UNEXPECTED("Invalid file, failed to load and parse schema file: " + schema_file_path);
379   }
380   return Status::OK();
381 }
382 
383 // Parses a schema json string and populates the columns and meta info.
LoadSchemaString(const std::string & schema_json_string,const std::vector<std::string> & columns_to_load)384 Status DataSchema::LoadSchemaString(const std::string &schema_json_string,
385                                     const std::vector<std::string> &columns_to_load) {
386   try {
387     nlohmann::json js = nlohmann::json::parse(schema_json_string);
388     RETURN_IF_NOT_OK(PreLoadExceptionCheck(js));
389     num_rows_ = js.value("numRows", 0);
390     nlohmann::json column_tree = js.at("columns");
391     if (column_tree.empty()) {
392       RETURN_STATUS_UNEXPECTED("Invalid data, \"columns\" field is missing in schema string.");
393     }
394     if (columns_to_load.empty()) {
395       // Parse the json tree and load the schema's columns in whatever order that the json
396       // layout decides
397       RETURN_IF_NOT_OK(this->AnyOrderLoad(column_tree));
398     } else {
399       Status rc = this->ColumnOrderLoad(column_tree, columns_to_load);
400       if (rc.IsError()) {
401         rc.SetErrDescription(rc.GetErrDescription() + " file content: " + schema_json_string);
402         return rc;
403       }
404     }
405   } catch (const std::exception &err) {
406     // Catch any exception and convert to Status return code
407     RETURN_STATUS_UNEXPECTED("Invalid data, failed to load and parse schema string.");
408   }
409   return Status::OK();
410 }
411 
// Destructor. Defaulted: the members release whatever they hold when they are destroyed.
DataSchema::~DataSchema() = default;
414 
415 // Getter for the ColDescriptor by index
Column(int32_t idx) const416 const ColDescriptor &DataSchema::Column(int32_t idx) const {
417   MS_ASSERT(idx < static_cast<int>(col_descs_.size()));
418   return col_descs_[idx];
419 }
420 
421 // A print method typically used for debugging
Print(std::ostream & out) const422 void DataSchema::Print(std::ostream &out) const {
423   out << "Dataset schema: (";
424   for (const auto &col_desc : col_descs_) {
425     out << col_desc << "\n";
426   }
427 }
428 
429 // Adds a column descriptor to the schema
AddColumn(const ColDescriptor & cd)430 Status DataSchema::AddColumn(const ColDescriptor &cd) {
431   // Sanity check there's not a duplicate name before adding the column
432   for (auto i = 0; i < col_descs_.size(); ++i) {
433     if (col_descs_[i].Name() == cd.Name()) {
434       std::ostringstream ss;
435       ss << "column name '" << cd.Name() << "' already exists in schema.";
436       std::string err_msg = ss.str();
437       RETURN_STATUS_UNEXPECTED(err_msg);
438     }
439   }
440   col_descs_.push_back(cd);
441   return Status::OK();
442 }
443 
444 // Internal helper function. Performs sanity checks on the json file setup.
PreLoadExceptionCheck(const nlohmann::json & js)445 Status DataSchema::PreLoadExceptionCheck(const nlohmann::json &js) {
446   // Check if columns node exists.  It is required for building schema from file.
447   if (js.find("columns") == js.end())
448     return Status(StatusCode::kMDUnexpectedError, __LINE__, __FILE__,
449                   "Invalid data, \"columns\" node is required in the schema json file.");
450   return Status::OK();
451 }
452 
453 // Loops through all columns in the schema and returns a map with the column
454 // name to column index number.
GetColumnNameMap(std::unordered_map<std::string,int32_t> * out_column_name_map)455 Status DataSchema::GetColumnNameMap(std::unordered_map<std::string, int32_t> *out_column_name_map) {
456   if (out_column_name_map == nullptr) {
457     return Status(StatusCode::kMDUnexpectedError, __LINE__, __FILE__, "unexpected null output column name map.");
458   }
459 
460   for (size_t i = 0; i < col_descs_.size(); ++i) {
461     if (col_descs_[i].Name().empty()) {
462       return Status(StatusCode::kMDUnexpectedError, __LINE__, __FILE__,
463                     "Constructing column name map from schema, but found empty column name.");
464     }
465     (*out_column_name_map)[col_descs_[i].Name()] = i;
466   }
467 
468   return Status::OK();
469 }
470 }  // namespace dataset
471 }  // namespace mindspore
472