1 /** 2 * Copyright 2020-2021 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_CLUE_OP_H_ 17 #define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_CLUE_OP_H_ 18 19 #include <memory> 20 #include <map> 21 #include <mutex> 22 #include <string> 23 #include <utility> 24 #include <vector> 25 #include <nlohmann/json.hpp> 26 27 #include "minddata/dataset/util/auto_index.h" 28 #include "minddata/dataset/engine/datasetops/parallel_op.h" 29 #include "minddata/dataset/engine/datasetops/source/nonmappable_leaf_op.h" 30 #include "minddata/dataset/engine/jagged_connector.h" 31 32 namespace mindspore { 33 namespace dataset { 34 using StringIndex = AutoIndexObj<std::string>; 35 using ColKeyMap = std::map<std::string, std::vector<std::string>>; 36 37 class JaggedConnector; 38 39 class ClueOp : public NonMappableLeafOp { 40 public: 41 // Constructor of ClueOp 42 ClueOp(int32_t num_workers, int64_t num_samples, int32_t worker_connector_size, ColKeyMap cols_to_keyword, 43 std::vector<std::string> clue_files_list, int32_t op_connector_size, bool shuffle_files, int32_t num_devices, 44 int32_t device_id); 45 46 // Default destructor 47 ~ClueOp() = default; 48 49 // A print method typically used for debugging 50 // @param out - The output stream to write output to 51 // @param show_all - A bool to control if you want to show all info or just a summary 52 void Print(std::ostream &out, bool show_all) const override; 53 54 // Instantiates the internal queues and connectors 55 // @return Status - the error code returned 56 Status Init() override; 57 58 // Get total rows in files. 59 // @param files - all clue files. 60 // @param count - number of rows. 61 // @return Status - the error coed returned. 62 static Status CountAllFileRows(const std::vector<std::string> &files, int64_t *count); 63 64 // File names getter 65 // @return Vector of the input file names FileNames()66 std::vector<std::string> FileNames() { return clue_files_list_; } 67 68 // Op name getter 69 // @return Name of the current Op Name()70 std::string Name() const override { return "ClueOp"; } 71 72 private: 73 // Reads a clue file and loads the data into multiple TensorRows. 74 // @param file - the file to read. 75 // @param start_offset - the start offset of file. 76 // @param end_offset - the end offset of file. 77 // @param worker_id - the id of the worker that is executing this function. 78 // @return Status - the error code returned. 79 Status LoadFile(const std::string &file, int64_t start_offset, int64_t end_offset, int32_t worker_id) override; 80 81 // Fill the IOBlockQueue. 82 // @para i_keys - keys of file to fill to the IOBlockQueue 83 // @return Status - the error code returned. 84 Status FillIOBlockQueue(const std::vector<int64_t> &i_keys) override; 85 86 // Calculate number of rows in each shard. 87 // @return Status - the error code returned. 88 Status CalculateNumRowsPerShard() override; 89 90 // Count number of rows in each file. 91 // @param filename - clue file name. 92 // @return int64_t - the total number of rows in file. 93 int64_t CountTotalRows(const std::string &file); 94 95 // @return Status - the error code returned. 96 Status GetValue(const nlohmann::json &js, std::vector<std::string> key_chain, std::shared_ptr<Tensor> *t); 97 98 // Private function for computing the assignment of the column name map. 99 // @return - Status 100 Status ComputeColMap() override; 101 102 std::vector<std::string> clue_files_list_; 103 ColKeyMap cols_to_keyword_; 104 }; 105 } // namespace dataset 106 } // namespace mindspore 107 #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_CLUE_OP_H_ 108