1 /**
2 * Copyright 2019-2021 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "minddata/dataset/engine/datasetops/source/manifest_op.h"
17
18 #include <algorithm>
19 #include <fstream>
20 #include <nlohmann/json.hpp>
21
22 #include "utils/file_utils.h"
23 #include "utils/ms_utils.h"
24 #include "minddata/dataset/core/config_manager.h"
25 #include "minddata/dataset/core/tensor_shape.h"
26 #include "minddata/dataset/engine/datasetops/source/sampler/sequential_sampler.h"
27 #include "minddata/dataset/engine/db_connector.h"
28 #include "minddata/dataset/engine/execution_tree.h"
29
30 namespace mindspore {
31 namespace dataset {
ManifestOp(int32_t num_works,std::string file,int32_t queue_size,bool decode,const std::map<std::string,int32_t> & class_index,std::unique_ptr<DataSchema> data_schema,std::shared_ptr<SamplerRT> sampler,std::string usage)32 ManifestOp::ManifestOp(int32_t num_works, std::string file, int32_t queue_size, bool decode,
33 const std::map<std::string, int32_t> &class_index, std::unique_ptr<DataSchema> data_schema,
34 std::shared_ptr<SamplerRT> sampler, std::string usage)
35 : MappableLeafOp(num_works, queue_size, std::move(sampler)),
36 io_block_pushed_(0),
37 sampler_ind_(0),
38 data_schema_(std::move(data_schema)),
39 file_(std::move(file)),
40 class_index_(class_index),
41 decode_(decode),
42 usage_(usage) {
43 io_block_queues_.Init(num_workers_, queue_size);
44 (void)std::transform(usage_.begin(), usage_.end(), usage_.begin(), ::tolower);
45 }
46
LaunchThreadsAndInitOp()47 Status ManifestOp::LaunchThreadsAndInitOp() {
48 if (tree_ == nullptr) {
49 RETURN_STATUS_UNEXPECTED("Pipeline init failed, Execution tree not set.");
50 }
51 RETURN_IF_NOT_OK(io_block_queues_.Register(tree_->AllTasks()));
52 RETURN_IF_NOT_OK(wait_for_workers_post_.Register(tree_->AllTasks()));
53
54 RETURN_IF_NOT_OK(
55 tree_->LaunchWorkers(num_workers_, std::bind(&ManifestOp::WorkerEntry, this, std::placeholders::_1), "", id()));
56 TaskManager::FindMe()->Post();
57 RETURN_IF_NOT_OK(ParseManifestFile());
58 RETURN_IF_NOT_OK(CountDatasetInfo());
59 RETURN_IF_NOT_OK(InitSampler());
60 return Status::OK();
61 }
62
63 // Load 1 TensorRow (image,label) using 1 ImageLabelPair. 1 function call produces 1 TensorTow
LoadTensorRow(row_id_type row_id,TensorRow * trow)64 Status ManifestOp::LoadTensorRow(row_id_type row_id, TensorRow *trow) {
65 std::pair<std::string, std::vector<std::string>> data = image_labelname_[static_cast<size_t>(row_id)];
66 std::shared_ptr<Tensor> image;
67 std::shared_ptr<Tensor> label;
68 std::vector<int32_t> label_index(data.second.size());
69 (void)std::transform(data.second.begin(), data.second.end(), label_index.begin(),
70 [this](const std::string &label_name) { return label_index_[label_name]; });
71 RETURN_IF_NOT_OK(Tensor::CreateFromVector(label_index, &label));
72 if (label_index.size() == 1) {
73 RETURN_IF_NOT_OK(label->Reshape(TensorShape({})));
74 } else {
75 RETURN_IF_NOT_OK(label->Reshape(TensorShape(std::vector<dsize_t>(1, label_index.size()))));
76 }
77
78 RETURN_IF_NOT_OK(Tensor::CreateFromFile(data.first, &image));
79 if (decode_ == true) {
80 Status rc = Decode(image, &image);
81 if (rc.IsError()) {
82 std::string err = "Invalid data, failed to decode image: " + data.first;
83 RETURN_STATUS_UNEXPECTED(err);
84 }
85 }
86 (*trow) = TensorRow(row_id, {std::move(image), std::move(label)});
87 trow->setPath({data.first, file_});
88 return Status::OK();
89 }
90
Print(std::ostream & out,bool show_all) const91 void ManifestOp::Print(std::ostream &out, bool show_all) const {
92 if (!show_all) {
93 // Call the super class for displaying any common 1-liner info
94 ParallelOp::Print(out, show_all);
95 // Then show any custom derived-internal 1-liner info for this op
96 out << "\n";
97 } else {
98 // Call the super class for displaying any common detailed info
99 ParallelOp::Print(out, show_all);
100 // Then show any custom derived-internal stuff
101 out << "\nNumber of rows:" << num_rows_ << "\nManifest file: " << file_ << "\nDecode: " << (decode_ ? "yes" : "no")
102 << "\n\n";
103 }
104 }
105
106 // Derived from RandomAccessOp
GetClassIds(std::map<int32_t,std::vector<int64_t>> * cls_ids) const107 Status ManifestOp::GetClassIds(std::map<int32_t, std::vector<int64_t>> *cls_ids) const {
108 if (cls_ids == nullptr || !cls_ids->empty() || image_labelname_.empty()) {
109 if (image_labelname_.empty()) {
110 RETURN_STATUS_UNEXPECTED("Invalid data, no image found in dataset.");
111 } else {
112 RETURN_STATUS_UNEXPECTED(
113 "[Internal ERROR] Map for containing image-index pair is nullptr or has been set in other place,"
114 "it must be empty before using GetClassIds.");
115 }
116 }
117
118 for (size_t i = 0; i < image_labelname_.size(); i++) {
119 size_t image_index = i;
120 for (size_t j = 0; j < image_labelname_[image_index].second.size(); j++) {
121 std::string label_name = (image_labelname_[image_index].second)[j];
122 int32_t label_index = label_index_.at(label_name);
123 (*cls_ids)[label_index].emplace_back(image_index);
124 }
125 }
126
127 for (auto &pair : (*cls_ids)) {
128 pair.second.shrink_to_fit();
129 }
130 return Status::OK();
131 }
132
133 // Manifest file content
134 // {"source": "/path/to/image1.jpg", "usage":"train", annotation": ...}
135 // {"source": "/path/to/image2.jpg", "usage":"eval", "annotation": ...}
ParseManifestFile()136 Status ManifestOp::ParseManifestFile() {
137 auto realpath = FileUtils::GetRealPath(file_.data());
138 if (!realpath.has_value()) {
139 MS_LOG(ERROR) << "Invalid file, get real path failed, path=" << file_;
140 RETURN_STATUS_UNEXPECTED("Invalid data, get real path failed, path=" + file_);
141 }
142
143 std::ifstream file_handle(realpath.value());
144 if (!file_handle.is_open()) {
145 RETURN_STATUS_UNEXPECTED("Invalid file, failed to open Manifest file: " + file_);
146 }
147 std::string line;
148 std::set<std::string> classes;
149 uint64_t line_count = 1;
150 while (getline(file_handle, line)) {
151 try {
152 nlohmann::json js = nlohmann::json::parse(line);
153 std::string image_file_path = js.value("source", "");
154 if (image_file_path == "") {
155 file_handle.close();
156 RETURN_STATUS_UNEXPECTED("Invalid data, 'source' is not found in Manifest file: " + file_ + " at line " +
157 std::to_string(line_count));
158 }
159 // If image is not JPEG/PNG/GIF/BMP, drop it
160 bool valid = false;
161 RETURN_IF_NOT_OK(CheckImageType(image_file_path, &valid));
162 if (!valid) {
163 continue;
164 }
165 std::string usage = js.value("usage", "");
166 if (usage == "") {
167 file_handle.close();
168 RETURN_STATUS_UNEXPECTED("Invalid data, 'usage' is not found in Manifest file: " + file_ + " at line " +
169 std::to_string(line_count));
170 }
171 (void)std::transform(usage.begin(), usage.end(), usage.begin(), ::tolower);
172 if (usage != usage_) {
173 continue;
174 }
175 std::vector<std::string> labels;
176 nlohmann::json annotations = js.at("annotation");
177 for (nlohmann::json::iterator it = annotations.begin(); it != annotations.end(); ++it) {
178 nlohmann::json annotation = it.value();
179 std::string label_name = annotation.value("name", "");
180 classes.insert(label_name);
181 if (label_name == "") {
182 file_handle.close();
183 RETURN_STATUS_UNEXPECTED("Invalid data, 'name' of label is not found in Manifest file: " + file_ +
184 " at line " + std::to_string(line_count));
185 }
186 if (class_index_.empty() || class_index_.find(label_name) != class_index_.end()) {
187 if (label_index_.find(label_name) == label_index_.end()) {
188 label_index_[label_name] = 0;
189 }
190 labels.emplace_back(label_name);
191 }
192 }
193 if (!labels.empty()) {
194 image_labelname_.emplace_back(std::make_pair(image_file_path, labels));
195 }
196 line_count++;
197 } catch (const std::exception &err) {
198 file_handle.close();
199 RETURN_STATUS_UNEXPECTED("Invalid file, failed to parse manifest file: " + file_);
200 }
201 }
202 num_classes_ = classes.size();
203 file_handle.close();
204
205 return Status::OK();
206 }
207
208 // Only support JPEG/PNG/GIF/BMP
CheckImageType(const std::string & file_name,bool * valid)209 Status ManifestOp::CheckImageType(const std::string &file_name, bool *valid) {
210 auto realpath = FileUtils::GetRealPath(file_name.data());
211 if (!realpath.has_value()) {
212 MS_LOG(ERROR) << "Invalid file, get real path failed, path=" << file_name;
213 RETURN_STATUS_UNEXPECTED("Invalid file, get real path failed, path=" + file_name);
214 }
215
216 std::ifstream file_handle;
217 constexpr int read_num = 3;
218 *valid = false;
219 file_handle.open(realpath.value(), std::ios::binary | std::ios::in);
220 if (!file_handle.is_open()) {
221 RETURN_STATUS_UNEXPECTED("Invalid file, failed to open image file: " + file_name);
222 }
223 unsigned char file_type[read_num];
224 (void)file_handle.read(reinterpret_cast<char *>(file_type), read_num);
225
226 if (file_handle.fail()) {
227 file_handle.close();
228 RETURN_STATUS_UNEXPECTED("Invalid data, failed to read image file: " + file_name);
229 }
230 file_handle.close();
231 if (file_type[0] == 0xff && file_type[1] == 0xd8 && file_type[2] == 0xff) {
232 // Normal JPEGs start with \xff\xd8\xff\xe0
233 // JPEG with EXIF stats with \xff\xd8\xff\xe1
234 // Use \xff\xd8\xff to cover both.
235 *valid = true;
236 } else if (file_type[0] == 0x89 && file_type[1] == 0x50 && file_type[2] == 0x4e) {
237 // It's a PNG
238 *valid = true;
239 } else if (file_type[0] == 0x47 && file_type[1] == 0x49 && file_type[2] == 0x46) {
240 // It's a GIF
241 *valid = true;
242 } else if (file_type[0] == 0x42 && file_type[1] == 0x4d) {
243 // It's a BMP
244 *valid = true;
245 }
246 return Status::OK();
247 }
248
CountDatasetInfo()249 Status ManifestOp::CountDatasetInfo() {
250 int32_t index = 0;
251 for (auto &label : label_index_) {
252 label.second = class_index_.empty() ? index : class_index_[label.first];
253 index++;
254 }
255
256 num_rows_ = static_cast<int64_t>(image_labelname_.size());
257 if (num_rows_ == 0) {
258 RETURN_STATUS_UNEXPECTED(
259 "Invalid data, ManifestDataset API can't read the data file (interface mismatch or no data found). "
260 "Check file path: " +
261 file_);
262 }
263 return Status::OK();
264 }
265
CountTotalRows(int64_t * count)266 Status ManifestOp::CountTotalRows(int64_t *count) {
267 *count = 0;
268 RETURN_IF_NOT_OK(ParseManifestFile());
269 *count = static_cast<int64_t>(image_labelname_.size());
270 return Status::OK();
271 }
272
ComputeColMap()273 Status ManifestOp::ComputeColMap() {
274 // Set the column name map (base class field)
275 if (column_name_id_map_.empty()) {
276 for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
277 column_name_id_map_[data_schema_->Column(i).Name()] = i;
278 }
279 } else {
280 MS_LOG(WARNING) << "Column name map is already set!";
281 }
282 return Status::OK();
283 }
284
285 // Get number of classes
GetNumClasses(int64_t * num_classes)286 Status ManifestOp::GetNumClasses(int64_t *num_classes) {
287 if (num_classes_ > 0) {
288 *num_classes = num_classes_;
289 return Status::OK();
290 }
291 int64_t classes_count;
292 RETURN_IF_NOT_OK(ParseManifestFile());
293 classes_count = static_cast<int64_t>(label_index_.size());
294 *num_classes = classes_count;
295 num_classes_ = classes_count;
296 return Status::OK();
297 }
298
GetClassIndexing(std::vector<std::pair<std::string,std::vector<int32_t>>> * output_class_indexing)299 Status ManifestOp::GetClassIndexing(std::vector<std::pair<std::string, std::vector<int32_t>>> *output_class_indexing) {
300 if ((*output_class_indexing).empty()) {
301 RETURN_IF_NOT_OK(ParseManifestFile());
302 RETURN_IF_NOT_OK(CountDatasetInfo());
303 int32_t count = 0;
304 for (const auto &label : label_index_) {
305 if (!class_index_.empty()) {
306 (*output_class_indexing)
307 .emplace_back(std::make_pair(label.first, std::vector<int32_t>(1, class_index_[label.first])));
308 } else {
309 (*output_class_indexing).emplace_back(std::make_pair(label.first, std::vector<int32_t>(1, count)));
310 }
311 count++;
312 }
313 }
314 return Status::OK();
315 }
316
317 } // namespace dataset
318 } // namespace mindspore
319