• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020-2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_CLUE_OP_H_
17 #define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_CLUE_OP_H_
18 
19 #include <memory>
20 #include <map>
21 #include <mutex>
22 #include <string>
23 #include <utility>
24 #include <vector>
25 #include <nlohmann/json.hpp>
26 
27 #include "minddata/dataset/util/auto_index.h"
28 #include "minddata/dataset/engine/datasetops/parallel_op.h"
29 #include "minddata/dataset/engine/datasetops/source/nonmappable_leaf_op.h"
30 #include "minddata/dataset/engine/jagged_connector.h"
31 
32 namespace mindspore {
33 namespace dataset {
// Alias for AutoIndexObj instantiated with std::string.
34 using StringIndex = AutoIndexObj<std::string>;
// Maps a string to a list of strings; this is the type of ClueOp's cols_to_keyword constructor argument and of its
// cols_to_keyword_ member.
// NOTE(review): presumably column name -> chain of JSON keys (compare ClueOp::GetValue's key_chain) - confirm.
35 using ColKeyMap = std::map<std::string, std::vector<std::string>>;
36 
// Forward declaration of JaggedConnector.
// NOTE(review): jagged_connector.h is already included above, so this declaration looks redundant - confirm.
37 class JaggedConnector;
38 
// Dataset source op for clue files, derived from NonMappableLeafOp.
// It keeps the list of input files (clue_files_list_) and a ColKeyMap (cols_to_keyword_); all non-inline member
// functions are defined outside this header.
39 class ClueOp : public NonMappableLeafOp {
40  public:
41   // Constructor of ClueOp
     // NOTE(review): parameter meanings below are inferred from their names only; the definition is not in this
     // header - confirm against the implementation file.
     // @param num_workers - presumably the number of parallel workers.
     // @param num_samples - presumably the number of rows to read.
     // @param worker_connector_size - presumably the queue size of the worker connector.
     // @param cols_to_keyword - ColKeyMap; presumably kept in cols_to_keyword_.
     // @param clue_files_list - list of clue file paths; presumably kept in clue_files_list_.
     // @param op_connector_size - presumably the size of the op's output connector.
     // @param shuffle_files - presumably whether the file order is shuffled.
     // @param num_devices - presumably the number of shards/devices.
     // @param device_id - presumably the shard/device index of this op.
42   ClueOp(int32_t num_workers, int64_t num_samples, int32_t worker_connector_size, ColKeyMap cols_to_keyword,
43          std::vector<std::string> clue_files_list, int32_t op_connector_size, bool shuffle_files, int32_t num_devices,
44          int32_t device_id);
45 
46   // Default destructor
47   ~ClueOp() = default;
48 
49   // A print method typically used for debugging
50   // @param out - The output stream to write output to
51   // @param show_all - A bool to control if you want to show all info or just a summary
52   void Print(std::ostream &out, bool show_all) const override;
53 
54   // Instantiates the internal queues and connectors
55   // @return Status - the error code returned
56   Status Init() override;
57 
58   // Get total rows in files.
59   // @param files - all clue files.
60   // @param count - output pointer that receives the number of rows.
61   // @return Status - the error code returned.
62   static Status CountAllFileRows(const std::vector<std::string> &files, int64_t *count);
63 
64   // File names getter
65   // @return Vector of the input file names (a copy of clue_files_list_, since it is returned by value)
FileNames()66   std::vector<std::string> FileNames() { return clue_files_list_; }
67 
68   // Op name getter
69   // @return Name of the current Op, always the literal "ClueOp"
Name()70   std::string Name() const override { return "ClueOp"; }
71 
72  private:
73   // Reads a clue file and loads the data into multiple TensorRows.
74   // @param file - the file to read.
75   // @param start_offset - the start offset of file.
76   // @param end_offset - the end offset of file.
77   // @param worker_id - the id of the worker that is executing this function.
78   // @return Status - the error code returned.
79   Status LoadFile(const std::string &file, int64_t start_offset, int64_t end_offset, int32_t worker_id) override;
80 
81   // Fill the IOBlockQueue.
82   // @param i_keys - keys of file to fill to the IOBlockQueue
83   // @return Status - the error code returned.
84   Status FillIOBlockQueue(const std::vector<int64_t> &i_keys) override;
85 
86   // Calculate number of rows in each shard.
87   // @return Status - the error code returned.
88   Status CalculateNumRowsPerShard() override;
89 
90   // Count number of rows in each file.
91   // @param file - clue file name.
92   // @return int64_t - the total number of rows in file.
93   int64_t CountTotalRows(const std::string &file);
94 
      // Produces a Tensor from a JSON object.
      // @param js - the JSON object to read from.
      // @param key_chain - NOTE(review): presumably the successive keys to descend through js - confirm.
      // @param t - output pointer that receives the resulting Tensor.
95   // @return Status - the error code returned.
96   Status GetValue(const nlohmann::json &js, std::vector<std::string> key_chain, std::shared_ptr<Tensor> *t);
97 
98   // Private function for computing the assignment of the column name map.
99   // @return - Status
100   Status ComputeColMap() override;
101 
       // Input clue file names; returned (by copy) from FileNames().
102   std::vector<std::string> clue_files_list_;
       // NOTE(review): presumably maps each column name to the key chain used by GetValue - confirm.
103   ColKeyMap cols_to_keyword_;
104 };
105 }  // namespace dataset
106 }  // namespace mindspore
107 #endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_DATASETOPS_SOURCE_CLUE_OP_H_
108