1 /** 2 * Copyright 2020 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_MINDDATA_NODE_H_ 18 #define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_MINDDATA_NODE_H_ 19 20 #include <map> 21 #include <memory> 22 #include <string> 23 #include <vector> 24 25 #include "minddata/dataset/engine/datasetops/source/mindrecord_op.h" 26 #include "minddata/dataset/engine/ir/datasetops/dataset_node.h" 27 28 namespace mindspore { 29 namespace dataset { 30 31 class MindDataNode : public MappableSourceNode { 32 public: 33 /// \brief Constructor 34 MindDataNode(const std::vector<std::string> &dataset_files, const std::vector<std::string> &columns_list, 35 const std::shared_ptr<SamplerObj> &sampler, nlohmann::json padded_sample, int64_t num_padded, 36 ShuffleMode shuffle_mode = ShuffleMode::kGlobal, std::shared_ptr<DatasetCache> cache = nullptr); 37 38 /// \brief Constructor 39 MindDataNode(const std::string &dataset_file, const std::vector<std::string> &columns_list, 40 const std::shared_ptr<SamplerObj> &sampler, nlohmann::json padded_sample, int64_t num_padded, 41 ShuffleMode shuffle_mode = ShuffleMode::kGlobal, std::shared_ptr<DatasetCache> cache = nullptr); 42 43 /// \brief Destructor 44 ~MindDataNode() = default; 45 46 /// \brief Node name getter 47 /// \return Name of the current node Name()48 std::string Name() const override { return kMindDataNode; } 49 50 /// \brief Print the description 51 /// \param out - The output stream to write output to 52 void Print(std::ostream &out) const override; 53 54 /// \brief Copy the node to a new object 55 /// \return A shared pointer to the new copy 56 std::shared_ptr<DatasetNode> Copy() override; 57 58 /// \brief a base class override function to create the required runtime dataset op objects for this class 59 /// \param node_ops - A vector containing shared pointer to the Dataset Ops that this object will create 60 /// \return Status Status::OK() if build successfully 61 Status Build(std::vector<std::shared_ptr<DatasetOp>> *const node_ops) override; 62 63 /// \brief Parameters validation 64 /// \return Status Status::OK() if all the parameters are valid 65 Status ValidateParams() override; 66 67 /// \brief Get the shard id of node 68 /// \return Status Status::OK() if get shard id successfully 69 Status GetShardId(int32_t *shard_id) override; 70 71 /// \brief Build sampler chain for minddata dataset 72 /// \return Status Status::OK() if input sampler is valid 73 Status BuildMindDatasetSamplerChain(const std::shared_ptr<SamplerObj> &sampler, 74 std::vector<std::shared_ptr<mindrecord::ShardOperator>> *operators_, 75 int64_t num_padded, ShuffleMode shuffle_mode); 76 77 /// \brief Set sample_bytes when padded_sample has py::byte value 78 /// \note Pybind will use this function to set sample_bytes into MindDataNode 79 void SetSampleBytes(std::map<std::string, std::string> *sample_bytes); 80 81 /// \brief Base-class override for GetDatasetSize 82 /// \param[in] size_getter Shared pointer to DatasetSizeGetter 83 /// \param[in] estimate This is only supported by some of the ops and it's used to speed up the process of getting 84 /// dataset size at the expense of accuracy. 85 /// \param[out] dataset_size the size of the dataset 86 /// \return Status of the function 87 Status GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_getter, bool estimate, 88 int64_t *dataset_size) override; 89 90 /// \brief Sampler getter 91 /// \return SamplerObj of the current node Sampler()92 std::shared_ptr<SamplerObj> Sampler() override { return sampler_; } 93 94 /// \brief Sampler setter SetSampler(std::shared_ptr<SamplerObj> sampler)95 void SetSampler(std::shared_ptr<SamplerObj> sampler) override { sampler_ = sampler; } 96 97 /// \brief Base-class override for accepting IRNodePass visitor 98 /// \param[in] p The node to visit 99 /// \param[out] modified Indicator if the node was modified 100 /// \return Status of the node visit 101 Status Accept(IRNodePass *const p, bool *const modified) override; 102 103 /// \brief Base-class override for accepting IRNodePass visitor 104 /// \param[in] p The node to visit 105 /// \param[out] modified Indicator if the node was modified 106 /// \return Status of the node visit 107 Status AcceptAfter(IRNodePass *const p, bool *const modified) override; 108 109 private: 110 std::string dataset_file_; // search_for_pattern_ will be true in this mode 111 std::vector<std::string> dataset_files_; // search_for_pattern_ will be false in this mode 112 bool search_for_pattern_; 113 std::vector<std::string> columns_list_; 114 std::shared_ptr<SamplerObj> input_sampler_; // The sampler from users input, will be used to create a set of shard 115 // operators. 116 std::shared_ptr<SamplerObj> sampler_; // An auto-created sampler, IR of runtime MindRecordSamplerRT sampler 117 nlohmann::json padded_sample_; 118 std::map<std::string, std::string> sample_bytes_; // enable in python 119 int64_t num_padded_; 120 std::vector<std::shared_ptr<ShardOperator>> operators_; 121 ShuffleMode shuffle_mode_; 122 }; 123 124 } // namespace dataset 125 } // namespace mindspore 126 #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_MINDDATA_NODE_H_ 127