1 /**
2 * Copyright 2020-2023 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "minddata/dataset/api/python/pybind_conversion.h"
17
18 namespace mindspore {
19 namespace dataset {
toFloat(const py::handle & handle)20 float toFloat(const py::handle &handle) { return py::reinterpret_borrow<py::float_>(handle); }
21
toInt(const py::handle & handle)22 int toInt(const py::handle &handle) { return py::reinterpret_borrow<py::int_>(handle); }
23
toInt64(const py::handle & handle)24 int64_t toInt64(const py::handle &handle) { return py::reinterpret_borrow<py::int_>(handle); }
25
toBool(const py::handle & handle)26 bool toBool(const py::handle &handle) { return py::reinterpret_borrow<py::bool_>(handle); }
27
toString(const py::handle & handle)28 std::string toString(const py::handle &handle) { return py::reinterpret_borrow<py::str>(handle); }
29
toStringSet(const py::list list)30 std::set<std::string> toStringSet(const py::list list) {
31 std::set<std::string> set;
32 if (!list.empty()) {
33 for (auto l : list) {
34 if (!l.is_none()) {
35 (void)set.insert(py::str(l));
36 }
37 }
38 }
39 return set;
40 }
41
toStringMap(const py::dict dict)42 std::map<std::string, int32_t> toStringMap(const py::dict dict) {
43 std::map<std::string, int32_t> map;
44 if (!dict.empty()) {
45 for (auto p : dict) {
46 (void)map.emplace(toString(p.first), toInt(p.second));
47 }
48 }
49 return map;
50 }
51
toStringFloatMap(const py::dict dict)52 std::map<std::string, float> toStringFloatMap(const py::dict dict) {
53 std::map<std::string, float> map;
54 if (!dict.empty()) {
55 for (auto p : dict) {
56 (void)map.emplace(toString(p.first), toFloat(p.second));
57 }
58 }
59 return map;
60 }
61
toStringVector(const py::list list)62 std::vector<std::string> toStringVector(const py::list list) {
63 std::vector<std::string> vector;
64 if (!list.empty()) {
65 for (auto l : list) {
66 if (l.is_none()) {
67 vector.emplace_back("");
68 } else {
69 vector.push_back(py::str(l));
70 }
71 }
72 }
73 return vector;
74 }
75
toIntVector(const py::list input_list)76 std::vector<pid_t> toIntVector(const py::list input_list) {
77 std::vector<pid_t> vector;
78 if (!input_list.empty()) {
79 std::transform(input_list.begin(), input_list.end(), std::back_inserter(vector),
80 [&](const py::handle &handle) { return static_cast<pid_t>(toInt(handle)); });
81 }
82 return vector;
83 }
84
toInt64Vector(const py::list input_list)85 std::vector<int64_t> toInt64Vector(const py::list input_list) {
86 std::vector<int64_t> vector;
87 if (!input_list.empty()) {
88 std::transform(input_list.begin(), input_list.end(), std::back_inserter(vector),
89 [&](const py::handle &handle) { return static_cast<int64_t>(toInt64(handle)); });
90 }
91 return vector;
92 }
93
toIntMap(const py::dict input_dict)94 std::unordered_map<int32_t, std::vector<pid_t>> toIntMap(const py::dict input_dict) {
95 std::unordered_map<int32_t, std::vector<pid_t>> map;
96 if (!input_dict.empty()) {
97 for (auto p : input_dict) {
98 (void)map.emplace(toInt(p.first), toIntVector(py::reinterpret_borrow<py::list>(p.second)));
99 }
100 }
101 return map;
102 }
103
toIntPair(const py::tuple tuple)104 std::pair<int64_t, int64_t> toIntPair(const py::tuple tuple) {
105 std::pair<int64_t, int64_t> pair;
106 if (tuple.size() == 2) {
107 pair = std::make_pair(toInt64((tuple)[0]), toInt64((tuple)[1]));
108 }
109 return pair;
110 }
111
toPairVector(const py::list list)112 std::vector<std::pair<int, int>> toPairVector(const py::list list) {
113 std::vector<std::pair<int, int>> vector;
114 if (list) {
115 for (auto data : list) {
116 auto l = data.cast<py::tuple>();
117 if (l[1].is_none()) {
118 vector.emplace_back(toInt64(l[0]), 0);
119 } else {
120 vector.emplace_back(toInt64(l[0]), toInt64(l[1]));
121 }
122 }
123 }
124 return vector;
125 }
126
toTensorOperations(py::list operations)127 std::vector<std::shared_ptr<TensorOperation>> toTensorOperations(py::list operations) {
128 std::vector<std::shared_ptr<TensorOperation>> vector;
129 if (!operations.empty()) {
130 for (auto op : operations) {
131 std::shared_ptr<TensorOp> tensor_op;
132 if (py::isinstance<TensorOp>(op)) {
133 tensor_op = op.cast<std::shared_ptr<TensorOp>>();
134 vector.push_back(std::make_shared<transforms::PreBuiltOperation>(tensor_op));
135 } else if (py::isinstance<py::function>(op)) {
136 tensor_op = std::make_shared<PyFuncOp>(op.cast<py::function>());
137 vector.push_back(std::make_shared<transforms::PreBuiltOperation>(tensor_op));
138 } else {
139 if (py::isinstance<TensorOperation>(op)) {
140 vector.push_back(op.cast<std::shared_ptr<TensorOperation>>());
141 } else {
142 THROW_IF_ERROR([]() {
143 RETURN_STATUS_UNEXPECTED(
144 "Error: tensor_op is not recognised (not TensorOp, TensorOperation and not pyfunc).");
145 }());
146 }
147 }
148 }
149 }
150 return vector;
151 }
152
toTensorOperation(py::handle operation)153 std::shared_ptr<TensorOperation> toTensorOperation(py::handle operation) {
154 std::shared_ptr<TensorOperation> op;
155 std::shared_ptr<TensorOp> tensor_op;
156 if (py::isinstance<TensorOperation>(operation)) {
157 op = operation.cast<std::shared_ptr<TensorOperation>>();
158 } else if (py::isinstance<TensorOp>(operation)) {
159 tensor_op = operation.cast<std::shared_ptr<TensorOp>>();
160 op = std::make_shared<transforms::PreBuiltOperation>(tensor_op);
161 } else {
162 THROW_IF_ERROR(
163 []() { RETURN_STATUS_UNEXPECTED("Error: input operation is not a tensor_op or TensorOperation."); }());
164 }
165 return op;
166 }
167
toDatasetNode(std::shared_ptr<DatasetNode> self,py::list datasets)168 std::vector<std::shared_ptr<DatasetNode>> toDatasetNode(std::shared_ptr<DatasetNode> self, py::list datasets) {
169 std::vector<std::shared_ptr<DatasetNode>> vector;
170 vector.push_back(self);
171 if (datasets) {
172 for (auto ds : *datasets) {
173 if (py::isinstance<DatasetNode>(ds)) {
174 vector.push_back(ds.cast<std::shared_ptr<DatasetNode>>());
175 } else {
176 THROW_IF_ERROR(
177 []() { RETURN_STATUS_UNEXPECTED("Error: datasets is not recognised (not a DatasetNode instance)."); }());
178 }
179 }
180 }
181 return vector;
182 }
183
toSamplerObj(const py::handle py_sampler,bool isMindDataset)184 std::shared_ptr<SamplerObj> toSamplerObj(const py::handle py_sampler, bool isMindDataset) {
185 if (py_sampler.is_none()) {
186 return nullptr;
187 }
188 if (py_sampler) {
189 std::shared_ptr<SamplerObj> sampler_obj;
190 if (!isMindDataset) {
191 auto parse = py::reinterpret_borrow<py::object>(py_sampler).attr("parse");
192 sampler_obj = parse().cast<std::shared_ptr<SamplerObj>>();
193 } else {
194 // Mindrecord Sampler
195 std::shared_ptr<mindrecord::ShardOperator> sampler;
196 auto parse = py::reinterpret_borrow<py::object>(py_sampler).attr("parse_for_minddataset");
197 sampler = parse().cast<std::shared_ptr<mindrecord::ShardOperator>>();
198 sampler_obj = std::make_shared<PreBuiltSamplerObj>(std::move(sampler));
199 }
200 return sampler_obj;
201 } else {
202 THROW_IF_ERROR([]() { RETURN_STATUS_UNEXPECTED("Error: sampler input is not SamplerRT."); }());
203 }
204 return nullptr;
205 }
206
207 // Here we take in a python object, that holds a reference to a C++ object
toDatasetCache(std::shared_ptr<CacheClient> cc)208 std::shared_ptr<DatasetCache> toDatasetCache(std::shared_ptr<CacheClient> cc) {
209 if (cc) {
210 std::shared_ptr<DatasetCache> built_cache;
211 built_cache = std::make_shared<PreBuiltDatasetCache>(std::move(cc));
212 return built_cache;
213 } else {
214 // don't need to check here as cache is not enabled.
215 return nullptr;
216 }
217 }
218
toShuffleMode(const int32_t shuffle)219 ShuffleMode toShuffleMode(const int32_t shuffle) {
220 if (shuffle == 0) {
221 return ShuffleMode::kFalse;
222 }
223 if (shuffle == 1) {
224 return ShuffleMode::kFiles;
225 }
226 if (shuffle == 2) {
227 return ShuffleMode::kGlobal;
228 }
229 return ShuffleMode();
230 }
231
toCSVBase(py::list csv_bases)232 std::vector<std::shared_ptr<CsvBase>> toCSVBase(py::list csv_bases) {
233 std::vector<std::shared_ptr<CsvBase>> vector;
234 if (csv_bases) {
235 for (auto base : *csv_bases) {
236 if (py::isinstance<py::int_>(base)) {
237 vector.push_back(std::make_shared<CsvRecord<int>>(CsvType::INT, toInt(base)));
238 } else if (py::isinstance<py::float_>(base)) {
239 vector.push_back(std::make_shared<CsvRecord<float>>(CsvType::FLOAT, toFloat(base)));
240 } else if (py::isinstance<py::str>(base)) {
241 vector.push_back(std::make_shared<CsvRecord<std::string>>(CsvType::STRING, toString(base)));
242 } else {
243 THROW_IF_ERROR([]() { RETURN_STATUS_UNEXPECTED("Error: each default value must be int, float, or string"); }());
244 }
245 }
246 }
247 return vector;
248 }
249
ToJson(const py::handle & padded_sample,nlohmann::json * const padded_sample_json,std::map<std::string,std::string> * sample_bytes)250 Status ToJson(const py::handle &padded_sample, nlohmann::json *const padded_sample_json,
251 std::map<std::string, std::string> *sample_bytes) {
252 RETURN_UNEXPECTED_IF_NULL(padded_sample_json);
253 for (const py::handle &key : padded_sample) {
254 if (py::isinstance<py::bytes>(padded_sample[key])) {
255 (*sample_bytes)[py::str(key).cast<std::string>()] = padded_sample[key].cast<std::string>();
256 // py::str(key) enter here will loss its key name, so we create an unuse key for it in json, to pass ValidateParam
257 (*padded_sample_json)[py::str(key).cast<std::string>()] = nlohmann::json::object();
258 } else {
259 nlohmann::json obj_json;
260 if (padded_sample[key].is_none()) {
261 obj_json = nullptr;
262 } else if (py::isinstance<py::int_>(padded_sample[key])) {
263 obj_json = padded_sample[key].cast<int64_t>();
264 } else if (py::isinstance<py::float_>(padded_sample[key])) {
265 obj_json = padded_sample[key].cast<double>();
266 } else if (py::isinstance<py::str>(padded_sample[key])) {
267 obj_json = padded_sample[key].cast<std::string>(); // also catch py::bytes
268 } else {
269 std::string err_msg = "Python object convert to json failed: " + py::cast<std::string>(padded_sample[key]);
270 LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
271 }
272 (*padded_sample_json)[py::str(key).cast<std::string>()] = obj_json;
273 }
274 }
275 return Status::OK();
276 }
277
toPadInfo(const py::dict & value,std::map<std::string,std::pair<TensorShape,std::shared_ptr<Tensor>>> * pad_info)278 Status toPadInfo(const py::dict &value,
279 std::map<std::string, std::pair<TensorShape, std::shared_ptr<Tensor>>> *pad_info) {
280 RETURN_UNEXPECTED_IF_NULL(pad_info);
281 constexpr size_t kExpectedTupleSize = 2;
282 for (auto p : value) {
283 if (!p.second.is_none()) {
284 auto tp = py::reinterpret_borrow<py::tuple>(p.second);
285 CHECK_FAIL_RETURN_UNEXPECTED(tp.size() == kExpectedTupleSize,
286 "tuple in pad_info must be (list,int) or (list,float)");
287 TensorShape shape = tp[0].is_none() ? TensorShape::CreateUnknownRankShape() : TensorShape(tp[0]);
288 std::shared_ptr<Tensor> pad_val = nullptr;
289 // Do not change the order of py::bytes and py::str. Because py::bytes is also an instance of py::str.
290 if (py::isinstance<py::bytes>(tp[1])) {
291 std::string pad_val_string = tp[1].is_none() ? "" : toString(tp[1]);
292 CHECK_FAIL_RETURN_UNEXPECTED(
293 Tensor::CreateFromVector(std::vector<std::string>{pad_val_string}, TensorShape::CreateScalar(),
294 DataType(DataType::DE_BYTES), &pad_val),
295 "Cannot create pad_value Tensor");
296 } else if (py::isinstance<py::str>(tp[1])) {
297 std::string pad_val_string = tp[1].is_none() ? "" : toString(tp[1]);
298 CHECK_FAIL_RETURN_UNEXPECTED(
299 Tensor::CreateFromVector(std::vector<std::string>{pad_val_string}, TensorShape::CreateScalar(),
300 DataType(DataType::DE_STRING), &pad_val),
301 "Cannot create pad_value Tensor");
302 } else {
303 float pad_val_float = tp[1].is_none() ? 0 : toFloat(tp[1]);
304 CHECK_FAIL_RETURN_UNEXPECTED(
305 Tensor::CreateEmpty(TensorShape::CreateScalar(), DataType(DataType::DE_FLOAT32), &pad_val),
306 "Cannot create pad_value Tensor");
307 RETURN_IF_NOT_OK(pad_val->SetItemAt<float>({}, pad_val_float));
308 }
309 (void)pad_info->insert({toString(p.first), {shape, pad_val}});
310 } else { // tuple is None
311 (void)pad_info->insert({toString(p.first), {TensorShape({}), nullptr}});
312 }
313 }
314 return Status::OK();
315 }
316
toPyFuncOp(py::object func,DataType::Type data_type)317 std::shared_ptr<TensorOp> toPyFuncOp(py::object func, DataType::Type data_type) {
318 std::shared_ptr<TensorOp> py_func;
319 if (!func.is_none()) {
320 py::function py_function = func.cast<py::function>();
321 py_func = std::make_shared<PyFuncOp>(py_function, data_type);
322 } else {
323 py_func = nullptr;
324 }
325 return py_func;
326 }
327
shapesToListOfShape(std::vector<TensorShape> shapes)328 py::list shapesToListOfShape(std::vector<TensorShape> shapes) {
329 py::list shape_list;
330 for (const auto &shape : shapes) {
331 py::list per_col_shape;
332 for (auto &elem : shape.AsVector()) {
333 if (elem == -1) {
334 per_col_shape.append(py::none());
335 } else {
336 per_col_shape.append(elem);
337 }
338 }
339 shape_list.append(per_col_shape);
340 }
341 return shape_list;
342 }
343
typesToListOfType(std::vector<DataType> types)344 py::list typesToListOfType(std::vector<DataType> types) {
345 py::list type_list;
346 for (const auto &type : types) {
347 type_list.append(type.AsNumpyType());
348 }
349 return type_list;
350 }
351
toIntMapTensor(py::dict value,std::unordered_map<std::int16_t,std::shared_ptr<Tensor>> * feature)352 Status toIntMapTensor(py::dict value, std::unordered_map<std::int16_t, std::shared_ptr<Tensor>> *feature) {
353 RETURN_UNEXPECTED_IF_NULL(feature);
354 for (const auto &p : value) {
355 // do some judge, as whether it is none
356 std::shared_ptr<Tensor> feat_tensor = nullptr;
357 RETURN_IF_NOT_OK(Tensor::CreateFromNpArray(py::reinterpret_borrow<py::array>(p.second), &feat_tensor));
358 (void)feature->insert({toInt(p.first), feat_tensor});
359 }
360 return Status::OK();
361 }
362 } // namespace dataset
363 } // namespace mindspore
364