1 /**
2 * Copyright 2020 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "minddata/dataset/text/kernels/normalize_utf8_op.h"
17 #include <memory>
18 #include <string_view>
19 #include <vector>
20
21 #include "unicode/errorcode.h"
22 #include "unicode/normalizer2.h"
23
24 namespace mindspore {
25 namespace dataset {
// Out-of-class definition of the op's default normalization form constant; it is set to NFKC.
const NormalizeForm NormalizeUTF8Op::kDefNormalizeForm = NormalizeForm::kNfkc;
Compute(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output)27 Status NormalizeUTF8Op::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
28 IO_CHECK(input, output);
29 CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "NormalizeUTF8: input is not string datatype.");
30
31 icu::ErrorCode error;
32 const icu::Normalizer2 *normalize = nullptr;
33 switch (normalize_form_) {
34 case NormalizeForm::kNone: {
35 *output = input;
36 return Status::OK();
37 }
38 case NormalizeForm::kNfc: {
39 normalize = icu::Normalizer2::getNFCInstance(error);
40 CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "NormalizeUTF8: getNFCInstance failed.");
41 break;
42 }
43 case NormalizeForm::kNfkc: {
44 normalize = icu::Normalizer2::getNFKCInstance(error);
45 CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "NormalizeUTF8: getNFKCInstance failed.");
46 break;
47 }
48 case NormalizeForm::kNfd: {
49 normalize = icu::Normalizer2::getNFDInstance(error);
50 CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "NormalizeUTF8: getNFDInstance failed.");
51 break;
52 }
53 case NormalizeForm::kNfkd: {
54 normalize = icu::Normalizer2::getNFKDInstance(error);
55 CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "NormalizeUTF8: getNFKDInstance failed.");
56 break;
57 }
58 default: {
59 RETURN_STATUS_UNEXPECTED("NormalizeUTF8: unknown normalize form.");
60 break;
61 }
62 }
63 std::vector<std::string> strs(input->Size());
64 int i = 0;
65 for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
66 icu::StringByteSink<std::string> sink(&strs[i++]);
67 normalize->normalizeUTF8(0, icu::StringPiece((*iter).data(), (*iter).size()), sink, nullptr, error);
68 CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "NormalizeUTF8: NormalizeUTF8 failed.");
69 }
70 return Tensor::CreateFromVector(strs, input->shape(), output);
71 }
72 } // namespace dataset
73 } // namespace mindspore
74