• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "minddata/dataset/text/kernels/tokenizer_op.h"
17 #include "minddata/dataset/text/kernels/data_utils.h"
18 
19 namespace mindspore {
20 namespace dataset {
// Out-of-class definition of the static constant TokenizerOp::kDefWithOffsets (false).
// NOTE(review): presumably the default for the with_offsets_ flag read in Compute —
// confirm against the declaration in tokenizer_op.h.
21 const bool TokenizerOp::kDefWithOffsets = false;
22 
Compute(const TensorRow & input,TensorRow * output)23 Status TokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
24   IO_CHECK_VECTOR(input, output);
25   CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, Name() + ": input should be one column data.");
26   if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
27     RETURN_STATUS_UNEXPECTED(Name() + ": the input shape should be scalar and the input datatype should be string.");
28   }
29   std::string_view str;
30   RETURN_IF_NOT_OK(input[0]->GetItemAt(&str, {}));
31   std::shared_ptr<Tensor> token_tensor;
32   std::vector<uint32_t> offsets_start, offsets_limit;
33   std::vector<std::string> splits;
34   RETURN_IF_NOT_OK(Tokenize(str, &splits, &offsets_start, &offsets_limit));
35 
36   if (splits.empty()) {
37     (void)splits.emplace_back("");
38     offsets_start.push_back(0);
39     offsets_limit.push_back(0);
40   }
41   RETURN_IF_NOT_OK(Tensor::CreateFromVector(splits, &token_tensor));
42   output->push_back(token_tensor);
43   if (with_offsets_) {
44     RETURN_IF_NOT_OK(AppendOffsetsHelper(offsets_start, offsets_limit, output));
45   }
46   return Status::OK();
47 }
48 }  // namespace dataset
49 }  // namespace mindspore
50