• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "minddata/dataset/text/kernels/normalize_utf8_op.h"
17 #include <memory>
18 #include <string_view>
19 #include <vector>
20 
21 #include "unicode/errorcode.h"
22 #include "unicode/normalizer2.h"
23 
24 namespace mindspore {
25 namespace dataset {
// Out-of-class definition of the static member kDefNormalizeForm, set to NFKC.
// NOTE(review): presumably the normalization form used when the caller does not pick one - confirm in the header.
26 const NormalizeForm NormalizeUTF8Op::kDefNormalizeForm = NormalizeForm::kNfkc;
Compute(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output)27 Status NormalizeUTF8Op::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
28   IO_CHECK(input, output);
29   CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "NormalizeUTF8: input is not string datatype.");
30 
31   icu::ErrorCode error;
32   const icu::Normalizer2 *normalize = nullptr;
33   switch (normalize_form_) {
34     case NormalizeForm::kNone: {
35       *output = input;
36       return Status::OK();
37     }
38     case NormalizeForm::kNfc: {
39       normalize = icu::Normalizer2::getNFCInstance(error);
40       CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "NormalizeUTF8: getNFCInstance failed.");
41       break;
42     }
43     case NormalizeForm::kNfkc: {
44       normalize = icu::Normalizer2::getNFKCInstance(error);
45       CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "NormalizeUTF8: getNFKCInstance failed.");
46       break;
47     }
48     case NormalizeForm::kNfd: {
49       normalize = icu::Normalizer2::getNFDInstance(error);
50       CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "NormalizeUTF8: getNFDInstance failed.");
51       break;
52     }
53     case NormalizeForm::kNfkd: {
54       normalize = icu::Normalizer2::getNFKDInstance(error);
55       CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "NormalizeUTF8: getNFKDInstance failed.");
56       break;
57     }
58     default: {
59       RETURN_STATUS_UNEXPECTED("NormalizeUTF8: unknown normalize form.");
60       break;
61     }
62   }
63   std::vector<std::string> strs(input->Size());
64   int i = 0;
65   for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
66     icu::StringByteSink<std::string> sink(&strs[i++]);
67     normalize->normalizeUTF8(0, icu::StringPiece((*iter).data(), (*iter).size()), sink, nullptr, error);
68     CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "NormalizeUTF8: NormalizeUTF8 failed.");
69   }
70   return Tensor::CreateFromVector(strs, input->shape(), output);
71 }
72 }  // namespace dataset
73 }  // namespace mindspore
74