1 /** 2 * Copyright 2024 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_DATA_PARSE_EXAMPLE_OP_H_ 17 #define MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_DATA_PARSE_EXAMPLE_OP_H_ 18 19 #include <unsupported/Eigen/CXX11/ThreadPool> 20 21 #include <memory> 22 #include <string> 23 #include <unordered_map> 24 #include <utility> 25 #include <vector> 26 27 #include "minddata/dataset/core/tensor.h" 28 #include "minddata/dataset/engine/data_schema.h" 29 #include "minddata/dataset/kernels/tensor_op.h" 30 31 namespace mindspore { 32 namespace dataset { 33 constexpr int kThreadPoolSize = 32; 34 35 struct VarLenTensorBuffer { 36 std::vector<std::shared_ptr<Tensor>> numeric_tensor; // store the minibatch of numeric tensors 37 std::vector<std::string> string_tensor; // store the minibatch of strings 38 size_t string_length; // store the lengtn of string in minibatch 39 }; 40 41 class ParseExampleOp : public TensorOp { 42 public: ParseExampleOp(DataSchema data_schema,std::vector<std::string> column_list,bool parallel_parse)43 ParseExampleOp(DataSchema data_schema, std::vector<std::string> column_list, bool parallel_parse) 44 : data_schema_(std::move(data_schema)), 45 column_list_(std::move(column_list)), 46 parallel_parse_(parallel_parse), 47 pool_(nullptr) { 48 if (parallel_parse) { 49 pool_ = std::make_unique<Eigen::ThreadPool>(kThreadPoolSize); 50 } 51 } 52 53 ~ParseExampleOp() override = default; 54 55 Status Compute(const TensorRow &input, TensorRow *output) override; 56 Name()57 std::string Name() const override { return kParseExampleOp; } 58 59 private: 60 Status ParseSingleExample(const TensorRow &raw_bytes, TensorRow *parsed_row); 61 62 Status ParallelParseExample(const TensorRow &raw_bytes, TensorRow *parsed_row); 63 64 Status ParseSerializedExample(const std::string &example_bytes, TensorRow *parsed_row, 65 std::unordered_map<int32_t, std::vector<std::string>> *string_column_map, 66 std::vector<VarLenTensorBuffer> *varlen_tensor_vector, size_t tensor_index); 67 68 Status ConstructColumnMap(const std::string &example_bytes); 69 70 DataSchema data_schema_; 71 std::vector<std::string> column_list_; 72 bool parallel_parse_; 73 std::unique_ptr<Eigen::ThreadPool> pool_; 74 std::unordered_map<std::string, int32_t> column_name_id_map_; 75 }; 76 } // namespace dataset 77 } // namespace mindspore 78 #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_DATA_PARSE_EXAMPLE_OP_H_ 79