• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2019-2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "minddata/dataset/engine/datasetops/source/manifest_op.h"
17 
18 #include <algorithm>
19 #include <fstream>
20 #include <nlohmann/json.hpp>
21 
22 #include "utils/file_utils.h"
23 #include "utils/ms_utils.h"
24 #include "minddata/dataset/core/config_manager.h"
25 #include "minddata/dataset/core/tensor_shape.h"
26 #include "minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h"
27 #include "minddata/dataset/engine/db_connector.h"
28 #include "minddata/dataset/engine/execution_tree.h"
29 
30 namespace mindspore {
31 namespace dataset {
ManifestOp(int32_t num_works,std::string file,int32_t queue_size,bool decode,const std::map<std::string,int32_t> & class_index,std::unique_ptr<DataSchema> data_schema,std::shared_ptr<SamplerRT> sampler,std::string usage)32 ManifestOp::ManifestOp(int32_t num_works, std::string file, int32_t queue_size, bool decode,
33                        const std::map<std::string, int32_t> &class_index, std::unique_ptr<DataSchema> data_schema,
34                        std::shared_ptr<SamplerRT> sampler, std::string usage)
35     : MappableLeafOp(num_works, queue_size, std::move(sampler)),
36       io_block_pushed_(0),
37       sampler_ind_(0),
38       data_schema_(std::move(data_schema)),
39       file_(std::move(file)),
40       class_index_(class_index),
41       decode_(decode),
42       usage_(usage) {
43   io_block_queues_.Init(num_workers_, queue_size);
44   (void)std::transform(usage_.begin(), usage_.end(), usage_.begin(), ::tolower);
45 }
46 
LaunchThreadsAndInitOp()47 Status ManifestOp::LaunchThreadsAndInitOp() {
48   if (tree_ == nullptr) {
49     RETURN_STATUS_UNEXPECTED("Pipeline init failed, Execution tree not set.");
50   }
51   RETURN_IF_NOT_OK(io_block_queues_.Register(tree_->AllTasks()));
52   RETURN_IF_NOT_OK(wait_for_workers_post_.Register(tree_->AllTasks()));
53 
54   RETURN_IF_NOT_OK(
55     tree_->LaunchWorkers(num_workers_, std::bind(&ManifestOp::WorkerEntry, this, std::placeholders::_1), "", id()));
56   TaskManager::FindMe()->Post();
57   RETURN_IF_NOT_OK(ParseManifestFile());
58   RETURN_IF_NOT_OK(CountDatasetInfo());
59   RETURN_IF_NOT_OK(InitSampler());
60   return Status::OK();
61 }
62 
63 // Load 1 TensorRow (image,label) using 1 ImageLabelPair. 1 function call produces 1 TensorTow
LoadTensorRow(row_id_type row_id,TensorRow * trow)64 Status ManifestOp::LoadTensorRow(row_id_type row_id, TensorRow *trow) {
65   std::pair<std::string, std::vector<std::string>> data = image_labelname_[static_cast<size_t>(row_id)];
66   std::shared_ptr<Tensor> image;
67   std::shared_ptr<Tensor> label;
68   std::vector<int32_t> label_index(data.second.size());
69   (void)std::transform(data.second.begin(), data.second.end(), label_index.begin(),
70                        [this](const std::string &label_name) { return label_index_[label_name]; });
71   RETURN_IF_NOT_OK(Tensor::CreateFromVector(label_index, &label));
72   if (label_index.size() == 1) {
73     RETURN_IF_NOT_OK(label->Reshape(TensorShape({})));
74   } else {
75     RETURN_IF_NOT_OK(label->Reshape(TensorShape(std::vector<dsize_t>(1, label_index.size()))));
76   }
77 
78   RETURN_IF_NOT_OK(Tensor::CreateFromFile(data.first, &image));
79   if (decode_ == true) {
80     Status rc = Decode(image, &image);
81     if (rc.IsError()) {
82       std::string err = "Invalid data, failed to decode image: " + data.first;
83       RETURN_STATUS_UNEXPECTED(err);
84     }
85   }
86   (*trow) = TensorRow(row_id, {std::move(image), std::move(label)});
87   trow->setPath({data.first, file_});
88   return Status::OK();
89 }
90 
Print(std::ostream & out,bool show_all) const91 void ManifestOp::Print(std::ostream &out, bool show_all) const {
92   if (!show_all) {
93     // Call the super class for displaying any common 1-liner info
94     ParallelOp::Print(out, show_all);
95     // Then show any custom derived-internal 1-liner info for this op
96     out << "\n";
97   } else {
98     // Call the super class for displaying any common detailed info
99     ParallelOp::Print(out, show_all);
100     // Then show any custom derived-internal stuff
101     out << "\nNumber of rows:" << num_rows_ << "\nManifest file: " << file_ << "\nDecode: " << (decode_ ? "yes" : "no")
102         << "\n\n";
103   }
104 }
105 
106 // Derived from RandomAccessOp
GetClassIds(std::map<int32_t,std::vector<int64_t>> * cls_ids) const107 Status ManifestOp::GetClassIds(std::map<int32_t, std::vector<int64_t>> *cls_ids) const {
108   if (cls_ids == nullptr || !cls_ids->empty() || image_labelname_.empty()) {
109     if (image_labelname_.empty()) {
110       RETURN_STATUS_UNEXPECTED("Invalid data, no image found in dataset.");
111     } else {
112       RETURN_STATUS_UNEXPECTED(
113         "[Internal ERROR] Map for containing image-index pair is nullptr or has been set in other place,"
114         "it must be empty before using GetClassIds.");
115     }
116   }
117 
118   for (size_t i = 0; i < image_labelname_.size(); i++) {
119     size_t image_index = i;
120     for (size_t j = 0; j < image_labelname_[image_index].second.size(); j++) {
121       std::string label_name = (image_labelname_[image_index].second)[j];
122       int32_t label_index = label_index_.at(label_name);
123       (*cls_ids)[label_index].emplace_back(image_index);
124     }
125   }
126 
127   for (auto &pair : (*cls_ids)) {
128     pair.second.shrink_to_fit();
129   }
130   return Status::OK();
131 }
132 
133 // Manifest file content
134 // {"source": "/path/to/image1.jpg", "usage":"train", annotation": ...}
135 // {"source": "/path/to/image2.jpg", "usage":"eval", "annotation": ...}
ParseManifestFile()136 Status ManifestOp::ParseManifestFile() {
137   auto realpath = FileUtils::GetRealPath(file_.data());
138   if (!realpath.has_value()) {
139     MS_LOG(ERROR) << "Invalid file, get real path failed, path=" << file_;
140     RETURN_STATUS_UNEXPECTED("Invalid data, get real path failed, path=" + file_);
141   }
142 
143   std::ifstream file_handle(realpath.value());
144   if (!file_handle.is_open()) {
145     RETURN_STATUS_UNEXPECTED("Invalid file, failed to open Manifest file: " + file_);
146   }
147   std::string line;
148   std::set<std::string> classes;
149   uint64_t line_count = 1;
150   while (getline(file_handle, line)) {
151     try {
152       nlohmann::json js = nlohmann::json::parse(line);
153       std::string image_file_path = js.value("source", "");
154       if (image_file_path == "") {
155         file_handle.close();
156         RETURN_STATUS_UNEXPECTED("Invalid data, 'source' is not found in Manifest file: " + file_ + " at line " +
157                                  std::to_string(line_count));
158       }
159       // If image is not JPEG/PNG/GIF/BMP, drop it
160       bool valid = false;
161       RETURN_IF_NOT_OK(CheckImageType(image_file_path, &valid));
162       if (!valid) {
163         continue;
164       }
165       std::string usage = js.value("usage", "");
166       if (usage == "") {
167         file_handle.close();
168         RETURN_STATUS_UNEXPECTED("Invalid data, 'usage' is not found in Manifest file: " + file_ + " at line " +
169                                  std::to_string(line_count));
170       }
171       (void)std::transform(usage.begin(), usage.end(), usage.begin(), ::tolower);
172       if (usage != usage_) {
173         continue;
174       }
175       std::vector<std::string> labels;
176       nlohmann::json annotations = js.at("annotation");
177       for (nlohmann::json::iterator it = annotations.begin(); it != annotations.end(); ++it) {
178         nlohmann::json annotation = it.value();
179         std::string label_name = annotation.value("name", "");
180         classes.insert(label_name);
181         if (label_name == "") {
182           file_handle.close();
183           RETURN_STATUS_UNEXPECTED("Invalid data, 'name' of label is not found in Manifest file: " + file_ +
184                                    " at line " + std::to_string(line_count));
185         }
186         if (class_index_.empty() || class_index_.find(label_name) != class_index_.end()) {
187           if (label_index_.find(label_name) == label_index_.end()) {
188             label_index_[label_name] = 0;
189           }
190           labels.emplace_back(label_name);
191         }
192       }
193       if (!labels.empty()) {
194         image_labelname_.emplace_back(std::make_pair(image_file_path, labels));
195       }
196       line_count++;
197     } catch (const std::exception &err) {
198       file_handle.close();
199       RETURN_STATUS_UNEXPECTED("Invalid file, failed to parse manifest file: " + file_);
200     }
201   }
202   num_classes_ = classes.size();
203   file_handle.close();
204 
205   return Status::OK();
206 }
207 
208 // Only support JPEG/PNG/GIF/BMP
CheckImageType(const std::string & file_name,bool * valid)209 Status ManifestOp::CheckImageType(const std::string &file_name, bool *valid) {
210   auto realpath = FileUtils::GetRealPath(file_name.data());
211   if (!realpath.has_value()) {
212     MS_LOG(ERROR) << "Invalid file, get real path failed, path=" << file_name;
213     RETURN_STATUS_UNEXPECTED("Invalid file, get real path failed, path=" + file_name);
214   }
215 
216   std::ifstream file_handle;
217   constexpr int read_num = 3;
218   *valid = false;
219   file_handle.open(realpath.value(), std::ios::binary | std::ios::in);
220   if (!file_handle.is_open()) {
221     RETURN_STATUS_UNEXPECTED("Invalid file, failed to open image file: " + file_name);
222   }
223   unsigned char file_type[read_num];
224   (void)file_handle.read(reinterpret_cast<char *>(file_type), read_num);
225 
226   if (file_handle.fail()) {
227     file_handle.close();
228     RETURN_STATUS_UNEXPECTED("Invalid data, failed to read image file: " + file_name);
229   }
230   file_handle.close();
231   if (file_type[0] == 0xff && file_type[1] == 0xd8 && file_type[2] == 0xff) {
232     // Normal JPEGs start with \xff\xd8\xff\xe0
233     // JPEG with EXIF stats with \xff\xd8\xff\xe1
234     // Use \xff\xd8\xff to cover both.
235     *valid = true;
236   } else if (file_type[0] == 0x89 && file_type[1] == 0x50 && file_type[2] == 0x4e) {
237     // It's a PNG
238     *valid = true;
239   } else if (file_type[0] == 0x47 && file_type[1] == 0x49 && file_type[2] == 0x46) {
240     // It's a GIF
241     *valid = true;
242   } else if (file_type[0] == 0x42 && file_type[1] == 0x4d) {
243     // It's a BMP
244     *valid = true;
245   }
246   return Status::OK();
247 }
248 
CountDatasetInfo()249 Status ManifestOp::CountDatasetInfo() {
250   int32_t index = 0;
251   for (auto &label : label_index_) {
252     label.second = class_index_.empty() ? index : class_index_[label.first];
253     index++;
254   }
255 
256   num_rows_ = static_cast<int64_t>(image_labelname_.size());
257   if (num_rows_ == 0) {
258     RETURN_STATUS_UNEXPECTED(
259       "Invalid data, ManifestDataset API can't read the data file (interface mismatch or no data found). "
260       "Check file path: " +
261       file_);
262   }
263   return Status::OK();
264 }
265 
CountTotalRows(int64_t * count)266 Status ManifestOp::CountTotalRows(int64_t *count) {
267   *count = 0;
268   RETURN_IF_NOT_OK(ParseManifestFile());
269   *count = static_cast<int64_t>(image_labelname_.size());
270   return Status::OK();
271 }
272 
ComputeColMap()273 Status ManifestOp::ComputeColMap() {
274   // Set the column name map (base class field)
275   if (column_name_id_map_.empty()) {
276     for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
277       column_name_id_map_[data_schema_->Column(i).Name()] = i;
278     }
279   } else {
280     MS_LOG(WARNING) << "Column name map is already set!";
281   }
282   return Status::OK();
283 }
284 
285 // Get number of classes
GetNumClasses(int64_t * num_classes)286 Status ManifestOp::GetNumClasses(int64_t *num_classes) {
287   if (num_classes_ > 0) {
288     *num_classes = num_classes_;
289     return Status::OK();
290   }
291   int64_t classes_count;
292   RETURN_IF_NOT_OK(ParseManifestFile());
293   classes_count = static_cast<int64_t>(label_index_.size());
294   *num_classes = classes_count;
295   num_classes_ = classes_count;
296   return Status::OK();
297 }
298 
GetClassIndexing(std::vector<std::pair<std::string,std::vector<int32_t>>> * output_class_indexing)299 Status ManifestOp::GetClassIndexing(std::vector<std::pair<std::string, std::vector<int32_t>>> *output_class_indexing) {
300   if ((*output_class_indexing).empty()) {
301     RETURN_IF_NOT_OK(ParseManifestFile());
302     RETURN_IF_NOT_OK(CountDatasetInfo());
303     int32_t count = 0;
304     for (const auto &label : label_index_) {
305       if (!class_index_.empty()) {
306         (*output_class_indexing)
307           .emplace_back(std::make_pair(label.first, std::vector<int32_t>(1, class_index_[label.first])));
308       } else {
309         (*output_class_indexing).emplace_back(std::make_pair(label.first, std::vector<int32_t>(1, count)));
310       }
311       count++;
312     }
313   }
314   return Status::OK();
315 }
316 
317 }  // namespace dataset
318 }  // namespace mindspore
319