• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020-2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "minddata/dataset/text/kernels/to_number_op.h"
18 
19 #include "minddata/dataset/core/data_type.h"
20 #include "minddata/dataset/core/tensor.h"
21 #include "minddata/dataset/core/tensor_shape.h"
22 #include "minddata/dataset/kernels/data/data_utils.h"
23 #include "minddata/dataset/util/status.h"
24 
25 namespace mindspore {
26 namespace dataset {
27 
ToNumberOp(const DataType & data_type)28 ToNumberOp::ToNumberOp(const DataType &data_type) : cast_to_type_(data_type) {}
29 
ToNumberOp(const std::string & data_type)30 ToNumberOp::ToNumberOp(const std::string &data_type) : cast_to_type_(DataType(data_type)) {}
31 
Compute(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output)32 Status ToNumberOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
33   CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "ToNumber: input should be string datatype.");
34 
35   switch (cast_to_type_.value()) {
36     case DataType::DE_INT8:
37       RETURN_IF_NOT_OK(ToSignedIntegral<int8_t>(input, output));
38       break;
39     case DataType::DE_INT16:
40       RETURN_IF_NOT_OK(ToSignedIntegral<int16_t>(input, output));
41       break;
42     case DataType::DE_INT32:
43       RETURN_IF_NOT_OK(ToSignedIntegral<int32_t>(input, output));
44       break;
45     case DataType::DE_INT64:
46       RETURN_IF_NOT_OK(ToSignedIntegral<int64_t>(input, output));
47       break;
48     case DataType::DE_UINT8:
49       RETURN_IF_NOT_OK(ToUnsignedIntegral<uint8_t>(input, output));
50       break;
51     case DataType::DE_UINT16:
52       RETURN_IF_NOT_OK(ToUnsignedIntegral<uint16_t>(input, output));
53       break;
54     case DataType::DE_UINT32:
55       RETURN_IF_NOT_OK(ToUnsignedIntegral<uint32_t>(input, output));
56       break;
57     case DataType::DE_UINT64:
58       RETURN_IF_NOT_OK(ToUnsignedIntegral<uint64_t>(input, output));
59       break;
60     case DataType::DE_FLOAT16:
61       RETURN_IF_NOT_OK(this->ToFloat16(input, output));
62       break;
63     case DataType::DE_FLOAT32:
64       RETURN_IF_NOT_OK(ToFloat(input, output));
65       break;
66     case DataType::DE_FLOAT64:
67       RETURN_IF_NOT_OK(ToDouble(input, output));
68       break;
69     default:
70       RETURN_STATUS_UNEXPECTED(
71         "ToNumber: "
72         "unsupported cast type: " +
73         cast_to_type_.ToString());
74   }
75 
76   return Status::OK();
77 }
78 
Print(std::ostream & out) const79 void ToNumberOp::Print(std::ostream &out) const { out << "ToNumberOp: casting to " << '\n'; }
80 
OutputShape(const std::vector<TensorShape> & input_shapes,std::vector<TensorShape> & output_shapes)81 Status ToNumberOp::OutputShape(const std::vector<TensorShape> &input_shapes, std::vector<TensorShape> &output_shapes) {
82   (void)std::copy(input_shapes.begin(), input_shapes.end(), std::back_inserter(output_shapes));
83   return Status::OK();
84 }
85 
86 template <typename T>
ToSignedIntegral(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output) const87 Status ToNumberOp::ToSignedIntegral(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) const {
88   std::vector<T> casted;
89 
90   for (auto it = input->begin<std::string_view>(); it != input->end<std::string_view>(); ++it) {
91     bool is_cast_out_of_range = false;
92     int64_t result = 0;
93 
94     try {
95       result = std::stoll(std::string(*it));
96     } catch (const std::out_of_range &) {
97       is_cast_out_of_range = true;
98     } catch (const std::invalid_argument &) {
99       RETURN_STATUS_UNEXPECTED(
100         "ToNumber: "
101         "it is invalid to convert \"" +
102         std::string(*it) + "\" to a number.");
103     }
104 
105     if (result > std::numeric_limits<T>::max() || result < std::numeric_limits<T>::min() || is_cast_out_of_range) {
106       std::string error_message =
107         "ToNumber: "
108         "string input " +
109         std::string(*it) + " will be out of bounds if cast to " + cast_to_type_.ToString() + ". The valid range is: [" +
110         std::to_string(std::numeric_limits<T>::min()) + ", " + std::to_string(std::numeric_limits<T>::max()) + "].";
111 
112       RETURN_STATUS_UNEXPECTED(error_message);
113     }
114 
115     T casted_result = static_cast<T>(result);
116     casted.push_back(casted_result);
117   }
118 
119   RETURN_IF_NOT_OK(Tensor::CreateFromVector(casted, input->shape(), output));
120   return Status::OK();
121 }
122 
123 template <typename T>
ToUnsignedIntegral(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output) const124 Status ToNumberOp::ToUnsignedIntegral(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) const {
125   std::vector<T> casted;
126 
127   for (auto it = input->begin<std::string_view>(); it != input->end<std::string_view>(); ++it) {
128     bool is_cast_out_of_range = false;
129     uint64_t result = 0;
130 
131     // If there is a - at the start of the string, it is considered by us to
132     // be out of bounds. If the - is somewhere else in the string, it is
133     // deemed invalid by std::stoull and will throw std::invalid_argument
134     for (int i = 0; i < (*it).size(); i++) {
135       if ((*it)[i] == '-') {
136         is_cast_out_of_range = true;
137         break;
138       }
139     }
140 
141     try {
142       result = std::stoull(std::string(*it));
143     } catch (const std::out_of_range &) {
144       is_cast_out_of_range = true;
145     } catch (const std::invalid_argument &) {
146       RETURN_STATUS_UNEXPECTED(
147         "ToNumber: "
148         "It is invalid to convert \"" +
149         std::string(*it) + "\" to an unsigned integer.");
150     }
151 
152     if (result > std::numeric_limits<T>::max() || result < std::numeric_limits<T>::min() || is_cast_out_of_range) {
153       std::string error_message =
154         "ToNumber: "
155         "string input " +
156         std::string(*it) + " will be out of bounds if cast to " + cast_to_type_.ToString() + ". The valid range is: [" +
157         std::to_string(std::numeric_limits<T>::min()) + ", " + std::to_string(std::numeric_limits<T>::max()) + "].";
158 
159       RETURN_STATUS_UNEXPECTED(error_message);
160     }
161 
162     T casted_result = static_cast<T>(result);
163     casted.push_back(casted_result);
164   }
165 
166   RETURN_IF_NOT_OK(Tensor::CreateFromVector(casted, input->shape(), output));
167   return Status::OK();
168 }
169 
ToFloat16(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output) const170 Status ToNumberOp::ToFloat16(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) const {
171   // special case, float16 does not exist in c++, no native support for
172   // casting, so cast to float first then use this method, which use Eigen.
173   std::shared_ptr<Tensor> temp;
174   RETURN_IF_NOT_OK(Tensor::CreateEmpty(input->shape(), DataType("float32"), &temp));
175   RETURN_IF_NOT_OK(ToFloat(input, &temp));
176   RETURN_IF_NOT_OK(mindspore::dataset::ToFloat16(temp, output));
177   return Status::OK();
178 }
179 
ToFloat(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output) const180 Status ToNumberOp::ToFloat(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) const {
181   std::vector<float> casted;
182 
183   for (auto it = input->begin<std::string_view>(); it != input->end<std::string_view>(); ++it) {
184     bool is_cast_out_of_range = false;
185     float result = 0;
186 
187     try {
188       result = std::stof(std::string(*it));
189     } catch (const std::out_of_range &) {
190       is_cast_out_of_range = true;
191     } catch (const std::invalid_argument &) {
192       RETURN_STATUS_UNEXPECTED(
193         "ToNumber: "
194         "it is invalid to convert \"" +
195         std::string(*it) + "\" to an unsigned integer.");
196     }
197 
198     if (result > std::numeric_limits<float>::max() || result < std::numeric_limits<float>::lowest() ||
199         is_cast_out_of_range) {
200       std::string error_message =
201         "ToNumber: "
202         "string input " +
203         std::string(*it) + " will be out of bounds if cast to " + cast_to_type_.ToString() + ". The valid range is: [" +
204         std::to_string(std::numeric_limits<float>::lowest()) + ", " +
205         std::to_string(std::numeric_limits<float>::max()) + "].";
206 
207       RETURN_STATUS_UNEXPECTED(error_message);
208     }
209 
210     float casted_result = static_cast<float>(result);
211     casted.push_back(casted_result);
212   }
213 
214   RETURN_IF_NOT_OK(Tensor::CreateFromVector(casted, input->shape(), output));
215   return Status::OK();
216 }
217 
ToDouble(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output) const218 Status ToNumberOp::ToDouble(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) const {
219   std::vector<double> casted;
220 
221   for (auto it = input->begin<std::string_view>(); it != input->end<std::string_view>(); ++it) {
222     bool is_cast_out_of_range = false;
223     double result = 0;
224 
225     try {
226       result = std::stod(std::string(*it));
227     } catch (const std::out_of_range &) {
228       is_cast_out_of_range = true;
229     } catch (const std::invalid_argument &) {
230       RETURN_STATUS_UNEXPECTED(
231         "ToNumber: "
232         "it is invalid to convert \"" +
233         std::string(*it) + "\" to an unsigned integer.");
234     }
235 
236     if (result > std::numeric_limits<double>::max() || result < std::numeric_limits<double>::lowest() ||
237         is_cast_out_of_range) {
238       std::string error_message =
239         "ToNumber: "
240         "string input " +
241         std::string(*it) + " will be out of bounds if cast to " + cast_to_type_.ToString() + ". The valid range is: [" +
242         std::to_string(std::numeric_limits<double>::lowest()) + ", " +
243         std::to_string(std::numeric_limits<double>::max()) + "].";
244 
245       RETURN_STATUS_UNEXPECTED(error_message);
246     }
247 
248     double casted_result = static_cast<double>(result);
249     casted.push_back(casted_result);
250   }
251 
252   RETURN_IF_NOT_OK(Tensor::CreateFromVector(casted, input->shape(), output));
253   return Status::OK();
254 }
255 
256 }  // namespace dataset
257 }  // namespace mindspore
258