1 /**
2 * Copyright 2019-2021 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "minddata/dataset/engine/datasetops/dataset_op.h"
17
18 #include <iomanip>
19 #include <iostream>
20 #include <memory>
21 #include <regex>
22 #include <utility>
23 #include <string>
24 #include <algorithm>
25
26 #include "minddata/dataset/engine/datasetops/device_queue_op.h"
27 #include "minddata/dataset/engine/datasetops/source/sampler/sampler.h"
28
29 #include "minddata/dataset/engine/db_connector.h"
30 #ifndef ENABLE_ANDROID
31 #include "utils/system/crc32c.h"
32 #include "utils/log_adapter.h"
33 #else
34 #include "mindspore/lite/src/common/log_adapter.h"
35 #endif
36
37 namespace mindspore {
38 namespace dataset {
39 // Constructor
DatasetOp(int32_t op_connector_size,std::shared_ptr<SamplerRT> sampler)40 DatasetOp::DatasetOp(int32_t op_connector_size, std::shared_ptr<SamplerRT> sampler)
41 : oc_queue_size_(op_connector_size),
42 sampler_(sampler),
43 operator_id_(kInvalidOperatorId),
44 tree_(nullptr),
45 state_(OpState::kDeOpIdle),
46 op_total_repeats_(kInfiniteRepeat),
47 op_num_repeats_per_epoch_(kInfiniteRepeat),
48 op_current_repeats_(0),
49 op_current_epochs_(0),
50 out_connector_(nullptr),
51 dataset_size_(-1),
52 num_classes_(-1) {
53 // The operator starts out with an invalid operator id. The only way to
54 // get it out of invalid state is to assign the operator to an execution tree.
55 }
56
57 // Adds a operator to become our child.
AddChild(std::shared_ptr<DatasetOp> child)58 Status DatasetOp::AddChild(std::shared_ptr<DatasetOp> child) {
59 if (std::dynamic_pointer_cast<DeviceQueueOp>(child) != nullptr) {
60 std::string err_msg(
61 "Unsupported scenario, \'send\' operator can only be after \'device_queue\' operation, but got " + Name());
62 RETURN_STATUS_UNEXPECTED(err_msg);
63 }
64 if (operator_id_ == kInvalidOperatorId) {
65 std::string err_msg(
66 "Cannot add child node. Tree node connections can only "
67 "be made if the node belongs to a tree.");
68 RETURN_STATUS_UNEXPECTED(err_msg);
69 }
70
71 // disallow relationships with other trees
72 if (tree_ != child->tree_) {
73 std::string err_msg(
74 "Cannot add child node. Tree node connections can only be made if both nodes belong to the same tree.");
75 RETURN_STATUS_UNEXPECTED(err_msg);
76 }
77 child_.push_back(child);
78 child->AddParent(this);
79 return Status::OK();
80 }
81
RemoveChild(std::shared_ptr<DatasetOp> child)82 Status DatasetOp::RemoveChild(std::shared_ptr<DatasetOp> child) {
83 if (operator_id_ == kInvalidOperatorId) {
84 std::string err_msg(
85 "Cannot remove child node. Tree node connections can only "
86 "be made if the node belongs to a tree.");
87 RETURN_STATUS_UNEXPECTED(err_msg);
88 }
89
90 // disallow relationships with other trees
91 if (tree_ != child->tree_) {
92 std::string err_msg(
93 "Cannot remove child node. Tree node connections can only be made if both nodes belong to the same tree.");
94 RETURN_STATUS_UNEXPECTED(err_msg);
95 }
96
97 child_.erase(std::remove(child_.begin(), child_.end(), child), child_.end());
98 child->RemoveParent(this);
99 return Status::OK();
100 }
101
InsertAsParent(std::shared_ptr<DatasetOp> to_add)102 Status DatasetOp::InsertAsParent(std::shared_ptr<DatasetOp> to_add) {
103 RETURN_UNEXPECTED_IF_NULL(to_add);
104 for (auto &prev_parent : this->parent_) {
105 RETURN_IF_NOT_OK(prev_parent->RemoveChild(shared_from_this()));
106 RETURN_IF_NOT_OK(prev_parent->AddChild(to_add));
107 }
108 RETURN_IF_NOT_OK(to_add->AddChild(shared_from_this()));
109 if (tree_->root()->id() == this->id()) {
110 RETURN_IF_NOT_OK(tree_->AssignRoot(to_add));
111 }
112 return Status::OK();
113 }
114 // Removes child operator in this operator.
RemoveChildren()115 Status DatasetOp::RemoveChildren() {
116 for (const auto &child : child_) {
117 child->RemoveParent(this);
118 }
119 child_.clear();
120
121 return Status::OK();
122 }
123
124 // Adds a parent operator to this operator
AddParent(DatasetOp * parent)125 void DatasetOp::AddParent(DatasetOp *parent) { parent_.push_back(parent); }
126
127 // Removes a parent operator from this operator
RemoveParent(const DatasetOp * parent)128 void DatasetOp::RemoveParent(const DatasetOp *parent) {
129 parent_.erase(std::remove(parent_.begin(), parent_.end(), parent), parent_.end());
130 }
131
132 // Removes this node from the tree and connects it's parent/child together
Remove()133 Status DatasetOp::Remove() {
134 if (parent_.size() > 1) {
135 std::string err_msg("[Internal ERROR], no support for the relationship between operators is not one-to-one.");
136 RETURN_STATUS_UNEXPECTED(err_msg);
137 }
138 if (child_.size() > 1) {
139 std::string err_msg("[Internal ERROR], no support for the relationship between operators is not one-to-one.");
140 RETURN_STATUS_UNEXPECTED(err_msg);
141 }
142
143 // Scenario's when removing node B:
144 // A -> B -> C
145 // A -> B
146 // B -> C
147 //
148 // If we remove B, then first take our child A and update it's parent to be C
149 // It's possible the parent is null if we are the root node being removed.
150 if (!child_.empty()) {
151 // If we have a parent, then assign child's parent to point to our parent.
152 if (!parent_.empty()) {
153 CHECK_FAIL_RETURN_UNEXPECTED(parent_[0]->Children().size() == 1,
154 "Removing a node whose parent has more than 1 child is not supported.");
155 child_[0]->parent_[0] = parent_[0];
156 } else {
157 // We don't have a parent, so we are the root node being removed.
158 // clear the parent list of our child so that it becomes the new root.
159 child_[0]->parent_.clear();
160 RETURN_IF_NOT_OK(tree_->AssignRoot(child_[0]));
161 }
162 }
163
164 // Next, if we had a parent, then set it's child to be our child.
165 if (!parent_.empty()) {
166 // if we have a child, then set our parent to point to it
167 if (!child_.empty()) {
168 parent_[0]->child_[0] = child_[0];
169 } else {
170 // We don't have a child, so clear the child list of the current
171 // parent because it will be empty once we are removed.
172 parent_[0]->child_.clear();
173 }
174 }
175
176 // Finally, clear "this" op's parent and child pointers since we have just
177 // disconnected it from the tree and invalidate it's fields.
178 child_.clear();
179 parent_.clear();
180 operator_id_ = kInvalidOperatorId;
181 tree_ = nullptr;
182
183 return Status::OK();
184 }
185
186 // Getter function to get a shared pointer to our child
child(int32_t child_index) const187 std::shared_ptr<DatasetOp> DatasetOp::child(int32_t child_index) const {
188 std::shared_ptr<DatasetOp> return_op = nullptr;
189 if (child_.empty()) {
190 return return_op;
191 }
192 MS_ASSERT(child_index < static_cast<int>(child_.size()));
193 // Return a shared pointer
194 return child_[child_index];
195 }
196
197 // Getter function to get the parent pointer
Parent(DatasetOp ** parent,int32_t parent_index) const198 void DatasetOp::Parent(DatasetOp **parent, int32_t parent_index) const {
199 if (parent_.empty()) {
200 // common case if this is a root node
201 *parent = nullptr;
202 } else {
203 MS_ASSERT(parent_index < static_cast<int>(parent_.size()));
204 *parent = parent_[parent_index];
205 }
206 }
207
208 // Getter function to get all of our parents.
parents() const209 std::vector<DatasetOp *> DatasetOp::parents() const { return parent_; }
210
211 // Creates the connector within this operator
CreateConnector(int32_t num_producers,int32_t num_consumers)212 void DatasetOp::CreateConnector(int32_t num_producers, int32_t num_consumers) {
213 MS_LOG(DEBUG) << "Creating connector in tree operator: " << operator_id_ << ". Producer: " << num_producers
214 << ". Consumer: " << num_consumers << ".";
215 if (oc_queue_size_ > 0) {
216 out_connector_ = std::make_unique<DbConnector>(num_producers, // The number of producers
217 num_consumers, // Only one consumer (the training App)
218 oc_queue_size_);
219 } else {
220 // Some op's may choose not to have an output connector
221 MS_LOG(DEBUG) << "Bypassed connector creation for tree operator: " << operator_id_ << ".";
222 out_connector_ = nullptr;
223 }
224 }
225
226 // A print method typically used for debugging. showAll of true will recursively descend to child prints
Print(std::ostream & out,bool show_all) const227 void DatasetOp::Print(std::ostream &out, bool show_all) const {
228 // When show_all is false, we display a 1 liner piece of text for the op.
229 // When show_all is true, we display more detailed output for the op.
230 // Derived printers should show their own header info, then call base class printer, followed by
231 // derived-specific items.
232
233 // Always show the id and name as first line regardless if this summary or detailed print
234 out << "(" << std::setw(2) << operator_id_ << ") <" << Name() << ">:";
235
236 if (show_all) {
237 // The detailed display will show common base class info of the op. Allow the derived class to print
238 // it's own id and name though as the first line.
239 out << "\nNumber of children : " << child_.size();
240 for (size_t i = 0; i < child_.size(); i++) {
241 out << "\n Child[" << i << "] id: " << child_[i]->id();
242 }
243 out << "\nNumber of parents : " << parent_.size();
244 for (size_t i = 0; i < parent_.size(); i++) {
245 out << "\n Parent[" << i << "] id: " << parent_[i]->id();
246 }
247 out << "\nConnector queue size : " << oc_queue_size_ << "\nTotal repeats : " << op_total_repeats_
248 << "\nNumber repeats per epoch : " << op_num_repeats_per_epoch_;
249 if (sampler_) {
250 out << "\nSampler:\n";
251 sampler_->SamplerPrint(out, show_all);
252 }
253 }
254 }
255
GetNextRowPullMode(TensorRow * const row)256 Status DatasetOp::GetNextRowPullMode(TensorRow *const row) {
257 RETURN_UNEXPECTED_IF_NULL(child_[0]);
258 return child_[0]->GetNextRowPullMode(row);
259 }
260
261 // Gets the next row from the given child
GetNextRow(TensorRow * row,int32_t worker_id,bool retry_if_eoe)262 Status DatasetOp::GetNextRow(TensorRow *row, int32_t worker_id, bool retry_if_eoe) {
263 // pop is a blocked call and will throw an interruption if the whole group shuts down.
264 RETURN_IF_NOT_OK(out_connector_->PopWithRetry(static_cast<int>(worker_id), row, retry_if_eoe));
265 return Status::OK();
266 }
267
268 // Gets the number of classes
GetNumClasses(int64_t * num_classes)269 Status DatasetOp::GetNumClasses(int64_t *num_classes) {
270 if (child_.size() == 1) {
271 return child_[0]->GetNumClasses(num_classes);
272 } else if (child_.size() > 1) {
273 // It is okay for dataset to have more than 1 child, GetNumClasses shouldn't fail in this case.
274 // This is done mostly for cache, which injects cache lookup/merge operators. Cache path will
275 // always be in front of the child_ structure, so we get num classes from the last child.
276 return child_[child_.size() - 1]->GetNumClasses(num_classes);
277 } else {
278 // when num classes isn't found, the default behavior is to return -1
279 MS_LOG(WARNING) << "Num classes not defined for : " << Name();
280 *num_classes = -1;
281 return Status::OK();
282 }
283 }
284
GetClassIndexing(std::vector<std::pair<std::string,std::vector<int32_t>>> * output_class_indexing)285 Status DatasetOp::GetClassIndexing(std::vector<std::pair<std::string, std::vector<int32_t>>> *output_class_indexing) {
286 if (child_.size() == 1) {
287 return child_[0]->GetClassIndexing(output_class_indexing);
288 } else if (child_.size() > 1) {
289 // It is okay for dataset to have more than 1 child, GetClassIndexing shouldn't fail in this case.
290 // This is done mostly for cache, which injects cache lookup/merge operators. Cache path will
291 // always be in the front of the child_ structure, so we get data from the last child.
292 return child_[child_.size() - 1]->GetClassIndexing(output_class_indexing);
293 } else {
294 *output_class_indexing = {};
295 RETURN_STATUS_UNEXPECTED("Trying to get class index from leaf node, missing override.");
296 }
297 }
298
299 // Performs handling for when an eoe message is received.
300 // The base class implementation simply flows the eoe message to output. Derived classes
301 // may override if they need to perform special eoe handling.
EoeReceived(int32_t worker_id)302 Status DatasetOp::EoeReceived(int32_t worker_id) { return out_connector_->SendEOE(worker_id); }
303
304 // Performs handling for when an eof message is received.
305 // The base class implementation simply flows the eof message to output. Derived classes
306 // may override if they need to perform special eof handling.
EofReceived(int32_t worker_id)307 Status DatasetOp::EofReceived(int32_t worker_id) { return out_connector_->SendEOF(worker_id); }
308
309 // During tree prepare phase, operators may have specific post-operations to perform depending on their role.
PrepareOperator()310 Status DatasetOp::PrepareOperator() {
311 // Creating Connector object for each op.
312 // The consumer of the root node is assumed to be one thread.
313 // If multiple threads are consuming from the root node, they will get the ordered data in round robin fashion.
314 if (parent_.empty()) {
315 this->CreateConnector(NumProducers(), 1);
316 } else {
317 this->CreateConnector(NumProducers(), parent_[0]->NumConsumers());
318 }
319 if (out_connector_) {
320 RETURN_IF_NOT_OK(out_connector_->Register(tree_->AllTasks()));
321 }
322 RETURN_IF_NOT_OK(this->RegisterWorkerConnectors());
323
324 // Generate the column name map for the current op.
325 RETURN_IF_NOT_OK(this->ComputeColMap());
326
327 return Status::OK();
328 }
329
330 // Derived classes may implement the reset function if the operator is stateful and needs
331 // specific reset handling that is not contained in this common code version of the reset.
Reset()332 Status DatasetOp::Reset() {
333 state_ = OpState::kDeOpRunning;
334 return Status::OK();
335 }
336
337 // gives a string output for the column map for handy debug printing
ColumnNameMapAsString() const338 std::string DatasetOp::ColumnNameMapAsString() const {
339 std::string outStr = "Column name id map: ";
340 for (auto &it : column_name_id_map_) {
341 outStr += (" " + it.first + ":" + std::to_string(it.second));
342 }
343 return outStr;
344 }
345
346 // Computing the assignment of the column name map.
347 // This just inherits the column map from its first child, can only be used if the number of children is 1.
348 // Operations changing the column map must overwrite this function.
ComputeColMap()349 Status DatasetOp::ComputeColMap() {
350 if (child_.size() > 1) {
351 RETURN_STATUS_UNEXPECTED("[Internal ERROR], no support for the relationship between operators is not one-to-one.");
352 }
353 if (column_name_id_map_.empty()) {
354 column_name_id_map_ = child_[0]->column_name_id_map();
355 if (column_name_id_map_.empty()) {
356 RETURN_STATUS_UNEXPECTED("Child column name map cannot be empty!");
357 }
358 MS_LOG(DEBUG) << "Setting column map:\n" << DatasetOp::ColumnNameMapAsString();
359 } else {
360 MS_LOG(WARNING) << "Column name map is already set!";
361 }
362 return Status::OK();
363 }
364
365 // Getter for the sampler, and it also removes the sampler from the op
FetchRemoveSampler(std::shared_ptr<SamplerRT> * sampler)366 Status DatasetOp::FetchRemoveSampler(std::shared_ptr<SamplerRT> *sampler) {
367 *sampler = sampler_; // It's okay if it sampler_ points to nullptr
368 sampler_.reset(); // clear our member-copy of this pointer. We no longer have this sampler
369 return Status::OK();
370 }
371
372 #ifndef ENABLE_ANDROID
GenerateCRC(const std::shared_ptr<DatasetOp> & op)373 uint32_t DatasetOp::GenerateCRC(const std::shared_ptr<DatasetOp> &op) {
374 std::stringstream ss;
375 op->tree_->Print(ss, op);
376 std::string ss_str = ss.str();
377
378 // Filter out the Num workers field when generating the check sum
379 ss_str = std::regex_replace(ss_str, std::regex("Number of ShardReader workers.*\n"), "");
380 ss_str = std::regex_replace(ss_str, std::regex("Num workers.*\n"), "");
381 ss_str = std::regex_replace(ss_str, std::regex("\\[workers.*?\\]"), "");
382 ss_str = std::regex_replace(ss_str, std::regex("Connector queue size.*\n"), "");
383
384 // Filter out tcp/ip information
385 ss_str = std::regex_replace(ss_str, std::regex("Hostname.*\n"), "");
386 ss_str = std::regex_replace(ss_str, std::regex("Port.*\n"), "");
387 ss_str = std::regex_replace(ss_str, std::regex("Number of rpc workers.*\n"), "");
388 ss_str = std::regex_replace(ss_str, std::regex("Prefetch size.*\n"), "");
389 ss_str = std::regex_replace(ss_str, std::regex("Local client support.*\n"), "");
390
391 // Filter out Number of rows when generating the check sum
392 ss_str = std::regex_replace(ss_str, std::regex("Number of rows.*\n"), "");
393
394 // Filter out the Operator control flags field when generating the check sum
395 ss_str = std::regex_replace(ss_str, std::regex("Operator control flags.*\n"), "");
396
397 // Filter out the Device id field to allow cache sharing for a distributed run of the same pipeline
398 ss_str = std::regex_replace(ss_str, std::regex("Device id.*\n"), "");
399 ss_str = std::regex_replace(ss_str, std::regex("device_id.*\n"), "");
400
401 // Filter out the operator id field
402 ss_str = std::regex_replace(ss_str, std::regex(" *Parent.*\n"), "");
403 ss_str = std::regex_replace(ss_str, std::regex(" *Child.*\n"), "");
404 ss_str = std::regex_replace(ss_str, std::regex(R"(\(\s*\d+?\))"), "");
405
406 // Doesn't matter whether there is any parent node above CacheOp or not.
407 ss_str = std::regex_replace(ss_str, std::regex("Number of parents.*\n"), "");
408
409 // Filter out shuffle seed from ShuffleOp
410 ss_str = std::regex_replace(ss_str, std::regex("Shuffle seed.*\n"), "");
411
412 // Filter out the total repeats and number repeats per epoch field
413 ss_str = std::regex_replace(ss_str, std::regex("Total repeats.*\n"), "");
414 ss_str = std::regex_replace(ss_str, std::regex("Number repeats per epoch.*\n"), "");
415
416 // The Cache crc and Server cache id field is different when creating new cache_client and re-using the same
417 // cache_client later. So we filter out these two fields to allow cache sharing.
418 ss_str = std::regex_replace(ss_str, std::regex("Cache crc.*\n"), "");
419 ss_str = std::regex_replace(ss_str, std::regex("Server cache id.*\n"), "");
420
421 MS_LOG(DEBUG) << "Printing the tree for generating crc:\n" << ss_str;
422
423 uint32_t cache_crc = system::Crc32c::GetMaskCrc32cValue(ss_str.c_str(), ss_str.length());
424 return cache_crc;
425 }
426 #endif
427
UpdateRepeatAndEpochCounter()428 void DatasetOp::UpdateRepeatAndEpochCounter() {
429 op_current_repeats_++;
430 if (op_current_repeats_ % op_num_repeats_per_epoch_ == 0) op_current_epochs_++;
431 MS_LOG(DEBUG) << Name() << " current repeats: " << op_current_repeats_ << ", current epochs: " << op_current_epochs_;
432 }
433
GetTreeBatchSize()434 int64_t DatasetOp::GetTreeBatchSize() {
435 if (child_.size() == 1) {
436 return child_[0]->GetTreeBatchSize();
437 } else if (child_.size() > 1) {
438 // It is okay for dataset to have more than 1 child, GetBatchSize shouldn't fail in this case.
439 // This is done mostly for cache, which injects cache lookup/merge operators. Cache path will
440 // always be in front of the child_ structure, so we get data from the last child.
441 return child_[child_.size() - 1]->GetTreeBatchSize();
442 } else {
443 return 1;
444 }
445 }
446
GetTreeRepeatCount()447 int64_t DatasetOp::GetTreeRepeatCount() {
448 if (child_.size() == 1) {
449 return child_[0]->GetTreeRepeatCount();
450 } else if (child_.size() > 1) {
451 // It is okay for dataset to have more than 1 child, GetRepeatCount shouldn't fail in this case.
452 // This is done mostly for cache, which injects cache lookup/merge operators. Cache path will
453 // always be in front of the child_ structure, so we get data from the last child.
454 return child_[child_.size() - 1]->GetTreeRepeatCount();
455 } else {
456 return 1;
457 }
458 }
459 } // namespace dataset
460 } // namespace mindspore
461