1 /**
2 * Copyright 2020-2021 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "minddata/dataset/text/kernels/to_number_op.h"
18
19 #include "minddata/dataset/core/data_type.h"
20 #include "minddata/dataset/core/tensor.h"
21 #include "minddata/dataset/core/tensor_shape.h"
22 #include "minddata/dataset/kernels/data/data_utils.h"
23 #include "minddata/dataset/util/status.h"
24
25 namespace mindspore {
26 namespace dataset {
27
ToNumberOp(const DataType & data_type)28 ToNumberOp::ToNumberOp(const DataType &data_type) : cast_to_type_(data_type) {}
29
ToNumberOp(const std::string & data_type)30 ToNumberOp::ToNumberOp(const std::string &data_type) : cast_to_type_(DataType(data_type)) {}
31
Compute(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output)32 Status ToNumberOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
33 CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "ToNumber: input should be string datatype.");
34
35 switch (cast_to_type_.value()) {
36 case DataType::DE_INT8:
37 RETURN_IF_NOT_OK(ToSignedIntegral<int8_t>(input, output));
38 break;
39 case DataType::DE_INT16:
40 RETURN_IF_NOT_OK(ToSignedIntegral<int16_t>(input, output));
41 break;
42 case DataType::DE_INT32:
43 RETURN_IF_NOT_OK(ToSignedIntegral<int32_t>(input, output));
44 break;
45 case DataType::DE_INT64:
46 RETURN_IF_NOT_OK(ToSignedIntegral<int64_t>(input, output));
47 break;
48 case DataType::DE_UINT8:
49 RETURN_IF_NOT_OK(ToUnsignedIntegral<uint8_t>(input, output));
50 break;
51 case DataType::DE_UINT16:
52 RETURN_IF_NOT_OK(ToUnsignedIntegral<uint16_t>(input, output));
53 break;
54 case DataType::DE_UINT32:
55 RETURN_IF_NOT_OK(ToUnsignedIntegral<uint32_t>(input, output));
56 break;
57 case DataType::DE_UINT64:
58 RETURN_IF_NOT_OK(ToUnsignedIntegral<uint64_t>(input, output));
59 break;
60 case DataType::DE_FLOAT16:
61 RETURN_IF_NOT_OK(this->ToFloat16(input, output));
62 break;
63 case DataType::DE_FLOAT32:
64 RETURN_IF_NOT_OK(ToFloat(input, output));
65 break;
66 case DataType::DE_FLOAT64:
67 RETURN_IF_NOT_OK(ToDouble(input, output));
68 break;
69 default:
70 RETURN_STATUS_UNEXPECTED(
71 "ToNumber: "
72 "unsupported cast type: " +
73 cast_to_type_.ToString());
74 }
75
76 return Status::OK();
77 }
78
Print(std::ostream & out) const79 void ToNumberOp::Print(std::ostream &out) const { out << "ToNumberOp: casting to " << '\n'; }
80
OutputShape(const std::vector<TensorShape> & input_shapes,std::vector<TensorShape> & output_shapes)81 Status ToNumberOp::OutputShape(const std::vector<TensorShape> &input_shapes, std::vector<TensorShape> &output_shapes) {
82 (void)std::copy(input_shapes.begin(), input_shapes.end(), std::back_inserter(output_shapes));
83 return Status::OK();
84 }
85
86 template <typename T>
ToSignedIntegral(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output) const87 Status ToNumberOp::ToSignedIntegral(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) const {
88 std::vector<T> casted;
89
90 for (auto it = input->begin<std::string_view>(); it != input->end<std::string_view>(); ++it) {
91 bool is_cast_out_of_range = false;
92 int64_t result = 0;
93
94 try {
95 result = std::stoll(std::string(*it));
96 } catch (const std::out_of_range &) {
97 is_cast_out_of_range = true;
98 } catch (const std::invalid_argument &) {
99 RETURN_STATUS_UNEXPECTED(
100 "ToNumber: "
101 "it is invalid to convert \"" +
102 std::string(*it) + "\" to a number.");
103 }
104
105 if (result > std::numeric_limits<T>::max() || result < std::numeric_limits<T>::min() || is_cast_out_of_range) {
106 std::string error_message =
107 "ToNumber: "
108 "string input " +
109 std::string(*it) + " will be out of bounds if cast to " + cast_to_type_.ToString() + ". The valid range is: [" +
110 std::to_string(std::numeric_limits<T>::min()) + ", " + std::to_string(std::numeric_limits<T>::max()) + "].";
111
112 RETURN_STATUS_UNEXPECTED(error_message);
113 }
114
115 T casted_result = static_cast<T>(result);
116 casted.push_back(casted_result);
117 }
118
119 RETURN_IF_NOT_OK(Tensor::CreateFromVector(casted, input->shape(), output));
120 return Status::OK();
121 }
122
123 template <typename T>
ToUnsignedIntegral(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output) const124 Status ToNumberOp::ToUnsignedIntegral(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) const {
125 std::vector<T> casted;
126
127 for (auto it = input->begin<std::string_view>(); it != input->end<std::string_view>(); ++it) {
128 bool is_cast_out_of_range = false;
129 uint64_t result = 0;
130
131 // If there is a - at the start of the string, it is considered by us to
132 // be out of bounds. If the - is somewhere else in the string, it is
133 // deemed invalid by std::stoull and will throw std::invalid_argument
134 for (int i = 0; i < (*it).size(); i++) {
135 if ((*it)[i] == '-') {
136 is_cast_out_of_range = true;
137 break;
138 }
139 }
140
141 try {
142 result = std::stoull(std::string(*it));
143 } catch (const std::out_of_range &) {
144 is_cast_out_of_range = true;
145 } catch (const std::invalid_argument &) {
146 RETURN_STATUS_UNEXPECTED(
147 "ToNumber: "
148 "It is invalid to convert \"" +
149 std::string(*it) + "\" to an unsigned integer.");
150 }
151
152 if (result > std::numeric_limits<T>::max() || result < std::numeric_limits<T>::min() || is_cast_out_of_range) {
153 std::string error_message =
154 "ToNumber: "
155 "string input " +
156 std::string(*it) + " will be out of bounds if cast to " + cast_to_type_.ToString() + ". The valid range is: [" +
157 std::to_string(std::numeric_limits<T>::min()) + ", " + std::to_string(std::numeric_limits<T>::max()) + "].";
158
159 RETURN_STATUS_UNEXPECTED(error_message);
160 }
161
162 T casted_result = static_cast<T>(result);
163 casted.push_back(casted_result);
164 }
165
166 RETURN_IF_NOT_OK(Tensor::CreateFromVector(casted, input->shape(), output));
167 return Status::OK();
168 }
169
ToFloat16(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output) const170 Status ToNumberOp::ToFloat16(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) const {
171 // special case, float16 does not exist in c++, no native support for
172 // casting, so cast to float first then use this method, which use Eigen.
173 std::shared_ptr<Tensor> temp;
174 RETURN_IF_NOT_OK(Tensor::CreateEmpty(input->shape(), DataType("float32"), &temp));
175 RETURN_IF_NOT_OK(ToFloat(input, &temp));
176 RETURN_IF_NOT_OK(mindspore::dataset::ToFloat16(temp, output));
177 return Status::OK();
178 }
179
ToFloat(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output) const180 Status ToNumberOp::ToFloat(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) const {
181 std::vector<float> casted;
182
183 for (auto it = input->begin<std::string_view>(); it != input->end<std::string_view>(); ++it) {
184 bool is_cast_out_of_range = false;
185 float result = 0;
186
187 try {
188 result = std::stof(std::string(*it));
189 } catch (const std::out_of_range &) {
190 is_cast_out_of_range = true;
191 } catch (const std::invalid_argument &) {
192 RETURN_STATUS_UNEXPECTED(
193 "ToNumber: "
194 "it is invalid to convert \"" +
195 std::string(*it) + "\" to an unsigned integer.");
196 }
197
198 if (result > std::numeric_limits<float>::max() || result < std::numeric_limits<float>::lowest() ||
199 is_cast_out_of_range) {
200 std::string error_message =
201 "ToNumber: "
202 "string input " +
203 std::string(*it) + " will be out of bounds if cast to " + cast_to_type_.ToString() + ". The valid range is: [" +
204 std::to_string(std::numeric_limits<float>::lowest()) + ", " +
205 std::to_string(std::numeric_limits<float>::max()) + "].";
206
207 RETURN_STATUS_UNEXPECTED(error_message);
208 }
209
210 float casted_result = static_cast<float>(result);
211 casted.push_back(casted_result);
212 }
213
214 RETURN_IF_NOT_OK(Tensor::CreateFromVector(casted, input->shape(), output));
215 return Status::OK();
216 }
217
ToDouble(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output) const218 Status ToNumberOp::ToDouble(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) const {
219 std::vector<double> casted;
220
221 for (auto it = input->begin<std::string_view>(); it != input->end<std::string_view>(); ++it) {
222 bool is_cast_out_of_range = false;
223 double result = 0;
224
225 try {
226 result = std::stod(std::string(*it));
227 } catch (const std::out_of_range &) {
228 is_cast_out_of_range = true;
229 } catch (const std::invalid_argument &) {
230 RETURN_STATUS_UNEXPECTED(
231 "ToNumber: "
232 "it is invalid to convert \"" +
233 std::string(*it) + "\" to an unsigned integer.");
234 }
235
236 if (result > std::numeric_limits<double>::max() || result < std::numeric_limits<double>::lowest() ||
237 is_cast_out_of_range) {
238 std::string error_message =
239 "ToNumber: "
240 "string input " +
241 std::string(*it) + " will be out of bounds if cast to " + cast_to_type_.ToString() + ". The valid range is: [" +
242 std::to_string(std::numeric_limits<double>::lowest()) + ", " +
243 std::to_string(std::numeric_limits<double>::max()) + "].";
244
245 RETURN_STATUS_UNEXPECTED(error_message);
246 }
247
248 double casted_result = static_cast<double>(result);
249 casted.push_back(casted_result);
250 }
251
252 RETURN_IF_NOT_OK(Tensor::CreateFromVector(casted, input->shape(), output));
253 return Status::OK();
254 }
255
256 } // namespace dataset
257 } // namespace mindspore
258