1 /**
2 * Copyright 2020-2023 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "minddata/dataset/text/kernels/data_utils.h"
18
19 #include <algorithm>
20 #include <string>
21
22 #include "minddata/dataset/core/pybind_support.h"
23 #include "minddata/dataset/kernels/data/slice_op.h"
24 #include "minddata/dataset/kernels/data/concatenate_op.h"
25 #include "minddata/dataset/kernels/data/data_utils.h"
26
27 namespace mindspore {
28 namespace dataset {
SlidingWindowHelper(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,TensorShape out_shape,uint32_t width,int32_t axis)29 Status SlidingWindowHelper(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, TensorShape out_shape,
30 uint32_t width, int32_t axis) {
31 // if the data row has fewer items than width, the corresponding result row will be empty
32 if (out_shape.Size() == 0) {
33 MS_LOG(WARNING) << "The data row has fewer items than width, the result will be empty.";
34 return Tensor::CreateEmpty(TensorShape({0}), input->type(), output);
35 }
36
37 axis = Tensor::HandleNeg(axis, input->shape().Size());
38 int32_t axis_end = input->shape()[axis];
39 std::shared_ptr<Tensor> tmp;
40 auto concatenate_op = std::make_unique<ConcatenateOp>(axis, nullptr, nullptr);
41
42 // Slice on specified axis and concatenate on new axis
43 for (int32_t i = 0; i + width <= axis_end; i++) {
44 auto slice_op = std::make_unique<SliceOp>(Slice(i, i + width, 1));
45 RETURN_IF_NOT_OK(slice_op->Compute(input, &tmp));
46 if (i == 0) {
47 *output = tmp;
48 } else {
49 TensorRow in({*output, tmp});
50 TensorRow out_row;
51 RETURN_IF_NOT_OK(concatenate_op->Compute(in, &out_row));
52 *output = out_row[0];
53 }
54 }
55 RETURN_IF_NOT_OK((*output)->Reshape(out_shape));
56 return Status::OK();
57 }
58
AppendOffsetsHelper(const std::vector<uint32_t> & offsets_start,const std::vector<uint32_t> & offsets_limit,TensorRow * output)59 Status AppendOffsetsHelper(const std::vector<uint32_t> &offsets_start, const std::vector<uint32_t> &offsets_limit,
60 TensorRow *output) {
61 std::shared_ptr<Tensor> offsets_start_tensor, offsets_limit_tensor;
62 RETURN_IF_NOT_OK(Tensor::CreateFromVector(offsets_start, &offsets_start_tensor));
63 RETURN_IF_NOT_OK(Tensor::CreateFromVector(offsets_limit, &offsets_limit_tensor));
64
65 output->push_back(offsets_start_tensor);
66 output->push_back(offsets_limit_tensor);
67 return Status::OK();
68 }
69
AddToken(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,const std::string & token,bool begin)70 Status AddToken(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, const std::string &token,
71 bool begin) {
72 if (input->Rank() == 1) {
73 std::shared_ptr<Tensor> append;
74 RETURN_IF_NOT_OK(Tensor::CreateFromVector(std::vector<std::string>({token}), &append));
75 TensorRow in({input});
76 TensorRow out;
77 RETURN_IF_NOT_OK(Concatenate(in, &out, 0, begin ? append : nullptr, begin ? nullptr : append));
78 *output = out[0];
79 } else {
80 std::vector<std::string> output_vector;
81 int dim = input->shape()[0];
82 int shape = input->shape()[-1];
83 int count = 0;
84 for (auto it = input->begin<std::string_view>(); it != input->end<std::string_view>(); ++it) {
85 if (count >= shape) {
86 count = 0;
87 }
88 if (begin && count == 0) {
89 output_vector.emplace_back(token);
90 }
91 output_vector.emplace_back(*it);
92 if (!begin && count == shape - 1) {
93 output_vector.emplace_back(token);
94 }
95 count++;
96 }
97 shape++;
98 RETURN_IF_NOT_OK(Tensor::CreateFromVector(output_vector, TensorShape({dim, shape}), output));
99 }
100 return Status::OK();
101 }
102
Truncate(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int max_seq_len)103 Status Truncate(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int max_seq_len) {
104 if (input->shape().Rank() == 1) {
105 return input->Slice(output, {SliceOption(Slice(max_seq_len))});
106 } else {
107 int dim = input->shape()[0];
108 Slice slice_dim = Slice(dim);
109 Slice slice_len = Slice(max_seq_len);
110 return input->Slice(output, {SliceOption(slice_dim), SliceOption(slice_len)});
111 }
112 }
113 } // namespace dataset
114 } // namespace mindspore
115