1 /** 2 * Copyright 2020 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_MINDDATA_NODE_H_ 18 #define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_MINDDATA_NODE_H_ 19 20 #include <map> 21 #include <memory> 22 #include <string> 23 #include <vector> 24 25 #include "minddata/dataset/engine/datasetops/source/mindrecord_op.h" 26 #include "minddata/dataset/engine/ir/datasetops/dataset_node.h" 27 28 namespace mindspore { 29 namespace dataset { 30 class MindDataNode : public MappableSourceNode { 31 public: 32 /// \brief Constructor 33 MindDataNode(const std::vector<std::string> &dataset_files, const std::vector<std::string> &columns_list, 34 const std::shared_ptr<SamplerObj> &sampler, nlohmann::json padded_sample, int64_t num_padded, 35 ShuffleMode shuffle_mode = ShuffleMode::kGlobal, std::shared_ptr<DatasetCache> cache = nullptr); 36 37 /// \brief Constructor 38 MindDataNode(const std::string &dataset_file, const std::vector<std::string> &columns_list, 39 const std::shared_ptr<SamplerObj> &sampler, nlohmann::json padded_sample, int64_t num_padded, 40 ShuffleMode shuffle_mode = ShuffleMode::kGlobal, std::shared_ptr<DatasetCache> cache = nullptr); 41 42 /// \brief Destructor 43 ~MindDataNode() override = default; 44 45 /// \brief Node name getter 46 /// \return Name of the current node Name()47 std::string Name() const override { return kMindDataNode; } 48 49 /// \brief Print the description 50 /// \param out - The output stream to write output to 51 void Print(std::ostream &out) const override; 52 53 /// \brief Copy the node to a new object 54 /// \return A shared pointer to the new copy 55 std::shared_ptr<DatasetNode> Copy() override; 56 57 /// \brief a base class override function to create the required runtime dataset op objects for this class 58 /// \param node_ops - A vector containing shared pointer to the Dataset Ops that this object will create 59 /// \return Status Status::OK() if build successfully 60 Status Build(std::vector<std::shared_ptr<DatasetOp>> *const node_ops) override; 61 62 /// \brief Parameters validation 63 /// \return Status Status::OK() if all the parameters are valid 64 Status ValidateParams() override; 65 66 /// \brief Get the shard id of node 67 /// \return Status Status::OK() if get shard id successfully 68 Status GetShardId(int32_t *shard_id) override; 69 70 /// \brief Build sampler chain for minddata dataset 71 /// \return Status Status::OK() if input sampler is valid 72 Status BuildMindDatasetSamplerChain(const std::shared_ptr<SamplerObj> &sampler, 73 std::vector<std::shared_ptr<mindrecord::ShardOperator>> *operators_, 74 int64_t num_padded, ShuffleMode shuffle_mode); 75 76 /// \brief Set sample_bytes when padded_sample has py::byte value 77 /// \note Pybind will use this function to set sample_bytes into MindDataNode 78 void SetSampleBytes(std::map<std::string, std::string> *sample_bytes); 79 80 /// \brief Base-class override for GetDatasetSize 81 /// \param[in] size_getter Shared pointer to DatasetSizeGetter 82 /// \param[in] estimate This is only supported by some of the ops and it's used to speed up the process of getting 83 /// dataset size at the expense of accuracy. 84 /// \param[out] dataset_size the size of the dataset 85 /// \return Status of the function 86 Status GetDatasetSize(const std::shared_ptr<DatasetSizeGetter> &size_getter, bool estimate, 87 int64_t *dataset_size) override; 88 89 /// \brief Sampler getter 90 /// \return SamplerObj of the current node Sampler()91 std::shared_ptr<SamplerObj> Sampler() override { return sampler_; } 92 93 /// \brief Sampler setter SetSampler(std::shared_ptr<SamplerObj> sampler)94 void SetSampler(std::shared_ptr<SamplerObj> sampler) override { sampler_ = sampler; } 95 96 /// \brief Base-class override for accepting IRNodePass visitor 97 /// \param[in] p The node to visit 98 /// \param[out] modified Indicator if the node was modified 99 /// \return Status of the node visit 100 Status Accept(IRNodePass *const p, bool *const modified) override; 101 102 /// \brief Base-class override for accepting IRNodePass visitor 103 /// \param[in] p The node to visit 104 /// \param[out] modified Indicator if the node was modified 105 /// \return Status of the node visit 106 Status AcceptAfter(IRNodePass *const p, bool *const modified) override; 107 108 private: 109 std::string dataset_file_; // search_for_pattern_ will be true in this mode 110 std::vector<std::string> dataset_files_; // search_for_pattern_ will be false in this mode 111 bool search_for_pattern_; 112 std::vector<std::string> columns_list_; 113 std::shared_ptr<SamplerObj> input_sampler_; // The sampler from users input, will be used to create a set of shard 114 // operators. 115 std::shared_ptr<SamplerObj> sampler_; // An auto-created sampler, IR of runtime MindRecordSamplerRT sampler 116 nlohmann::json padded_sample_; 117 std::map<std::string, std::string> sample_bytes_; // enable in python 118 int64_t num_padded_; 119 std::vector<std::shared_ptr<ShardOperator>> operators_; 120 ShuffleMode shuffle_mode_; 121 }; 122 } // namespace dataset 123 } // namespace mindspore 124 #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_IR_DATASETOPS_SOURCE_MINDDATA_NODE_H_ 125