• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "minddata/dataset/text/kernels/normalize_utf8_op.h"
17 #include <memory>
18 #include <string_view>
19 #include <vector>
20 
21 #include "unicode/errorcode.h"
22 #include "unicode/normalizer2.h"
23 
24 namespace mindspore {
25 namespace dataset {
26 // Global mutex; NormalizeUTF8Op::Compute holds it around every icu::Normalizer2::normalizeUTF8 call.
27 std::mutex icu_normalizer2_mux_;
28 // Out-of-class definition of kDefNormalizeForm, set to NFKC (presumably the default form; confirm in the header).
29 const NormalizeForm NormalizeUTF8Op::kDefNormalizeForm = NormalizeForm::kNfkc;
Compute(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output)30 Status NormalizeUTF8Op::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
31   IO_CHECK(input, output);
32   CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "NormalizeUTF8: input is not string datatype.");
33 
34   icu::ErrorCode error;
35   const icu::Normalizer2 *normalize = nullptr;
36   switch (normalize_form_) {
37     case NormalizeForm::kNone: {
38       *output = input;
39       return Status::OK();
40     }
41     case NormalizeForm::kNfc: {
42       normalize = icu::Normalizer2::getNFCInstance(error);
43       CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "NormalizeUTF8: getNFCInstance failed.");
44       break;
45     }
46     case NormalizeForm::kNfkc: {
47       normalize = icu::Normalizer2::getNFKCInstance(error);
48       CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "NormalizeUTF8: getNFKCInstance failed.");
49       break;
50     }
51     case NormalizeForm::kNfd: {
52       normalize = icu::Normalizer2::getNFDInstance(error);
53       CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "NormalizeUTF8: getNFDInstance failed.");
54       break;
55     }
56     case NormalizeForm::kNfkd: {
57       normalize = icu::Normalizer2::getNFKDInstance(error);
58       CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "NormalizeUTF8: getNFKDInstance failed.");
59       break;
60     }
61     default: {
62       RETURN_STATUS_UNEXPECTED("NormalizeUTF8: unknown normalize form.");
63       break;
64     }
65   }
66   std::vector<std::string> strs(input->Size());
67   int i = 0;
68   for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
69     icu::StringByteSink<std::string> sink(&strs[i++]);
70     {
71       std::unique_lock<std::mutex> _lock(icu_normalizer2_mux_);
72       normalize->normalizeUTF8(0, icu::StringPiece((*iter).data(), (*iter).size()), sink, nullptr, error);
73     }
74     CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "NormalizeUTF8: NormalizeUTF8 failed.");
75   }
76   return Tensor::CreateFromVector(strs, input->shape(), output);
77 }
78 }  // namespace dataset
79 }  // namespace mindspore
80