1 /**
2 * Copyright 2020 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "minddata/dataset/text/kernels/tokenizer_op.h"
17 #include "minddata/dataset/text/kernels/data_utils.h"
18
19 namespace mindspore {
20 namespace dataset {
21 const bool TokenizerOp::kDefWithOffsets = false;
22
Compute(const TensorRow & input,TensorRow * output)23 Status TokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
24 IO_CHECK_VECTOR(input, output);
25 CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, Name() + ": input should be one column data.");
26 if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
27 RETURN_STATUS_UNEXPECTED(Name() + ": the input shape should be scalar and the input datatype should be string.");
28 }
29 std::string_view str;
30 RETURN_IF_NOT_OK(input[0]->GetItemAt(&str, {}));
31 std::shared_ptr<Tensor> token_tensor;
32 std::vector<uint32_t> offsets_start, offsets_limit;
33 std::vector<std::string> splits;
34 RETURN_IF_NOT_OK(Tokenize(str, &splits, &offsets_start, &offsets_limit));
35
36 if (splits.empty()) {
37 (void)splits.emplace_back("");
38 offsets_start.push_back(0);
39 offsets_limit.push_back(0);
40 }
41 RETURN_IF_NOT_OK(Tensor::CreateFromVector(splits, &token_tensor));
42 output->push_back(token_tensor);
43 if (with_offsets_) {
44 RETURN_IF_NOT_OK(AppendOffsetsHelper(offsets_start, offsets_limit, output));
45 }
46 return Status::OK();
47 }
48 } // namespace dataset
49 } // namespace mindspore
50