1 /** 2 * Copyright 2019-2021 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASET_ITERATOR_H_ 17 #define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASET_ITERATOR_H_ 18 19 #include <memory> 20 #include <string> 21 #include <unordered_map> 22 #include <utility> 23 #include <vector> 24 25 #include "minddata/dataset/core/tensor.h" 26 #include "minddata/dataset/engine/datasetops/dataset_op.h" 27 #include "minddata/dataset/engine/execution_tree.h" 28 #include "minddata/dataset/engine/perf/dataset_iterator_tracing.h" 29 #include "minddata/dataset/util/status.h" 30 31 namespace mindspore { 32 namespace dataset { 33 using TensorMap = std::unordered_map<std::string, std::shared_ptr<Tensor>>; 34 35 // forward declare 36 class ExecutionTree; 37 38 // The DatasetIterator derived class is for fetching rows off the end/root of the execution tree. 39 class DatasetIterator { 40 public: 41 // Constructor of the DatasetIterator 42 // @param exe_tree The execution tree we want to pull/iterate the data from using it's root node. 43 explicit DatasetIterator(std::shared_ptr<ExecutionTree> exe_tree); 44 45 // Destructor 46 ~DatasetIterator(); 47 48 // Getter 49 // @return The string to column id mapping. 50 std::unordered_map<std::string, int32_t> GetColumnNameMap() const; 51 EofHandled()52 bool EofHandled() const { return eof_handled_; } 53 54 // Fetches one row of data from the iterator. 55 // the base class version simply performs error handling and returns empty row. Actual 56 // functionality exists in the derived versions of this function. 57 // @param out_row - A TensorRow (vector of shared pointers to Tensors). If any of the of data 58 // messages are encountered (such as eoe or eof), then an empty TensorRow is returned back. 59 // @return Status The status code returned 60 // @note The position of a Tensor/column might be different from the initial column order 61 // in corresponding Dataset Op. User must be aware that MapOp, ZipOps, and others might change 62 // the column ordering. 63 Status FetchNextTensorRow(TensorRow *out_row); 64 65 // Fetches one row of data from the iterator as a column map. 66 // @return A unordered map from column name to shared pointer to Tensor. 67 Status GetNextAsMap(TensorMap *out_map); 68 69 private: 70 std::shared_ptr<DatasetOp> root_; // saves the root of the executionTree 71 TensorRow device_queue_row_; 72 #ifndef ENABLE_SECURITY 73 std::shared_ptr<DatasetIteratorTracing> tracing_; // trace profiling data 74 #endif 75 int32_t cur_batch_num_; // current batch number,used for profiling 76 int32_t cur_connector_size_; // current connector size of root op,used for profiling 77 int32_t cur_connector_capacity_; // current connector capacity of root op, used for profiling 78 bool eof_handled_; // T/F if this op got an eof 79 std::unordered_map<std::string, int32_t> col_name_id_map_; 80 std::vector<std::pair<std::string, int32_t>> column_order_; // key: column name, val: column id 81 }; 82 83 // The ChildIterator derived class is for fetching rows from intermediate nodes of execution tree. 84 // This one should only be used by internal Dataset operators, rather than an end-user. 85 class ChildIterator { 86 public: 87 // Constructor of the DatasetIterator 88 // @param current_op - The parent op from which we'll fetch from it's children. 89 // @param worker_id - The worker id to use when fetching from the children. 90 // @param child_idx - The index to the child to fetch from. 91 ChildIterator(DatasetOp *current_op, int32_t worker_id, int32_t child_idx); 92 93 // Destructor 94 ~ChildIterator(); 95 96 // Fetches one row of data from the iterator. Overrides the base class. This one fetches 97 // only from the child/worker id as given from the constructor. 98 // @param out_row - A TensorRow (vector of shared pointers to Tensors). If any of the of data 99 // messages are encountered (such as eoe or eof), then an empty TensorRow is returned back. 100 // @return Status The status code returned 101 Status FetchNextTensorRow(TensorRow *out_row); 102 103 // This function drains buffer until next eoe has been received. 104 // It will be a no-op if the previous row returned is empty. 105 // @return Status The status code returned 106 Status Drain(); 107 108 // Getter 109 // @return The string to column id mapping. 110 std::unordered_map<std::string, int32_t> GetColumnNameMap() const; 111 112 // Return T/F if end of epoch EndOfEpoch()113 bool EndOfEpoch() { return end_epoch_; } 114 115 // Getter 116 // @return T/F if this iterator is completely done after getting an eof EofHandled()117 bool EofHandled() const { return eof_handled_; } 118 119 private: 120 DatasetOp *current_op_; // The parent operator. We consume from it's children. 121 int32_t child_idx_; // The specific child this iterator will fetch from. 122 int32_t worker_id_; // The worker id uses for fetching the child data. 123 bool end_epoch_; // the flag used when an empty row has been returned. 124 bool eof_handled_; // T/F if this op got an eof 125 }; 126 } // namespace dataset 127 } // namespace mindspore 128 129 #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASET_ITERATOR_H_ 130