1 /**
2 * Copyright 2021 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "minddata/dataset/text/kernels/to_vectors_op.h"
17
18 namespace mindspore {
19 namespace dataset {
ToVectorsOp(const std::shared_ptr<Vectors> & vectors,const std::vector<float> & unk_init,bool lower_case_backup)20 ToVectorsOp::ToVectorsOp(const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init,
21 bool lower_case_backup)
22 : vectors_(vectors), unk_init_(unk_init), lower_case_backup_(lower_case_backup) {}
23
Compute(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output)24 Status ToVectorsOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
25 IO_CHECK(input, output);
26 CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "ToVectors: input tensor type should be string.");
27 CHECK_FAIL_RETURN_UNEXPECTED(unk_init_.size() == 0 || unk_init_.size() == vectors_->Dim(),
28 "ToVectors: unk_init must be the same length as vectors, but got unk_init: " +
29 std::to_string(unk_init_.size()) + " and vectors: " + std::to_string(vectors_->Dim()));
30
31 std::vector<float> vectors_vec;
32 int len = 0;
33 for (auto itr = input->begin<std::string_view>(); itr != input->end<std::string_view>(); ++itr) {
34 std::vector<float> vectors_value = vectors_->Lookup(std::string(*itr), unk_init_, lower_case_backup_);
35 CHECK_FAIL_RETURN_UNEXPECTED(!vectors_value.empty(), "ToVectors: invalid data, token: \"" + std::string(*itr) +
36 "\" doesn't exist in vectors and no unk_init is specified.");
37 vectors_vec.insert(vectors_vec.end(), vectors_value.begin(), vectors_value.end());
38 len++;
39 }
40
41 int dim = static_cast<int>(vectors_vec.size() / len);
42 if (vectors_vec.size() == dim) {
43 RETURN_IF_NOT_OK(Tensor::CreateFromVector(vectors_vec, output));
44 } else {
45 RETURN_IF_NOT_OK(Tensor::CreateFromVector(vectors_vec, TensorShape({len, dim}), output));
46 }
47 return Status::OK();
48 }
49
OutputType(const std::vector<DataType> & inputs,std::vector<DataType> & outputs)50 Status ToVectorsOp::OutputType(const std::vector<DataType> &inputs, std::vector<DataType> &outputs) {
51 CHECK_FAIL_RETURN_UNEXPECTED(inputs.size() == NumInput() && outputs.size() == NumOutput(),
52 "ToVectors: input and output size don't match.");
53 CHECK_FAIL_RETURN_UNEXPECTED(inputs[0] == DataType::DE_STRING, "ToVectors: input tensor type should be string.");
54 outputs[0] = DataType(DataType::DE_FLOAT32);
55 return Status::OK();
56 }
57 } // namespace dataset
58 } // namespace mindspore
59