• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2024 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_DATA_PARSE_EXAMPLE_OP_H_
17 #define MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_DATA_PARSE_EXAMPLE_OP_H_
18 
19 #include <unsupported/Eigen/CXX11/ThreadPool>
20 
21 #include <memory>
22 #include <string>
23 #include <unordered_map>
24 #include <utility>
25 #include <vector>
26 
27 #include "minddata/dataset/core/tensor.h"
28 #include "minddata/dataset/engine/data_schema.h"
29 #include "minddata/dataset/kernels/tensor_op.h"
30 
31 namespace mindspore {
32 namespace dataset {
// Number of threads handed to the Eigen::ThreadPool that ParseExampleOp creates when parallel parsing is on.
constexpr int kThreadPoolSize{32};
34 
35 struct VarLenTensorBuffer {
36   std::vector<std::shared_ptr<Tensor>> numeric_tensor;  // store the minibatch of numeric tensors
37   std::vector<std::string> string_tensor;               // store the minibatch of strings
38   size_t string_length;                                 // store the lengtn of string in minibatch
39 };
40 
41 class ParseExampleOp : public TensorOp {
42  public:
ParseExampleOp(DataSchema data_schema,std::vector<std::string> column_list,bool parallel_parse)43   ParseExampleOp(DataSchema data_schema, std::vector<std::string> column_list, bool parallel_parse)
44       : data_schema_(std::move(data_schema)),
45         column_list_(std::move(column_list)),
46         parallel_parse_(parallel_parse),
47         pool_(nullptr) {
48     if (parallel_parse) {
49       pool_ = std::make_unique<Eigen::ThreadPool>(kThreadPoolSize);
50     }
51   }
52 
53   ~ParseExampleOp() override = default;
54 
55   Status Compute(const TensorRow &input, TensorRow *output) override;
56 
Name()57   std::string Name() const override { return kParseExampleOp; }
58 
59  private:
60   Status ParseSingleExample(const TensorRow &raw_bytes, TensorRow *parsed_row);
61 
62   Status ParallelParseExample(const TensorRow &raw_bytes, TensorRow *parsed_row);
63 
64   Status ParseSerializedExample(const std::string &example_bytes, TensorRow *parsed_row,
65                                 std::unordered_map<int32_t, std::vector<std::string>> *string_column_map,
66                                 std::vector<VarLenTensorBuffer> *varlen_tensor_vector, size_t tensor_index);
67 
68   Status ConstructColumnMap(const std::string &example_bytes);
69 
70   DataSchema data_schema_;
71   std::vector<std::string> column_list_;
72   bool parallel_parse_;
73   std::unique_ptr<Eigen::ThreadPool> pool_;
74   std::unordered_map<std::string, int32_t> column_name_id_map_;
75 };
76 }  // namespace dataset
77 }  // namespace mindspore
78 #endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_KERNELS_DATA_PARSE_EXAMPLE_OP_H_
79