1 /**
2 * Copyright 2020 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "minddata/dataset/text/kernels/normalize_utf8_op.h"
17 #include <memory>
18 #include <string_view>
19 #include <vector>
20
21 #include "unicode/errorcode.h"
22 #include "unicode/normalizer2.h"
23
24 namespace mindspore {
25 namespace dataset {
26 // Global lock for icu::Normalizer2 *normalize: NormalizeUTF8Op::Compute acquires it around each
// normalize->normalizeUTF8() call, so those calls never run concurrently with one another.
27 std::mutex icu_normalizer2_mux_;
28
// Out-of-class definition of the static constant; its value is NFKC.
// NOTE(review): presumably the form used when the caller does not pick one -- confirm against the header.
29 const NormalizeForm NormalizeUTF8Op::kDefNormalizeForm = NormalizeForm::kNfkc;
Compute(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output)30 Status NormalizeUTF8Op::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
31 IO_CHECK(input, output);
32 CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "NormalizeUTF8: input is not string datatype.");
33
34 icu::ErrorCode error;
35 const icu::Normalizer2 *normalize = nullptr;
36 switch (normalize_form_) {
37 case NormalizeForm::kNone: {
38 *output = input;
39 return Status::OK();
40 }
41 case NormalizeForm::kNfc: {
42 normalize = icu::Normalizer2::getNFCInstance(error);
43 CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "NormalizeUTF8: getNFCInstance failed.");
44 break;
45 }
46 case NormalizeForm::kNfkc: {
47 normalize = icu::Normalizer2::getNFKCInstance(error);
48 CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "NormalizeUTF8: getNFKCInstance failed.");
49 break;
50 }
51 case NormalizeForm::kNfd: {
52 normalize = icu::Normalizer2::getNFDInstance(error);
53 CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "NormalizeUTF8: getNFDInstance failed.");
54 break;
55 }
56 case NormalizeForm::kNfkd: {
57 normalize = icu::Normalizer2::getNFKDInstance(error);
58 CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "NormalizeUTF8: getNFKDInstance failed.");
59 break;
60 }
61 default: {
62 RETURN_STATUS_UNEXPECTED("NormalizeUTF8: unknown normalize form.");
63 break;
64 }
65 }
66 std::vector<std::string> strs(input->Size());
67 int i = 0;
68 for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
69 icu::StringByteSink<std::string> sink(&strs[i++]);
70 {
71 std::unique_lock<std::mutex> _lock(icu_normalizer2_mux_);
72 normalize->normalizeUTF8(0, icu::StringPiece((*iter).data(), (*iter).size()), sink, nullptr, error);
73 }
74 CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "NormalizeUTF8: NormalizeUTF8 failed.");
75 }
76 return Tensor::CreateFromVector(strs, input->shape(), output);
77 }
78 } // namespace dataset
79 } // namespace mindspore
80