1 /**
2 * Copyright 2020 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "minddata/dataset/text/kernels/basic_tokenizer_op.h"
17 #include <memory>
18 #include <queue>
19 #include <string>
20 #include <string_view>
21 #include <utility>
22 #include <vector>
23
24 #include "unicode/errorcode.h"
25 #include "unicode/normalizer2.h"
26
27 namespace mindspore {
28 namespace dataset {
29
30 const bool BasicTokenizerOp::kDefLowerCase = false;
31 const bool BasicTokenizerOp::kDefKeepWhitespace = false;
32 const NormalizeForm BasicTokenizerOp::kDefNormalizationForm = NormalizeForm::kNone;
33 const bool BasicTokenizerOp::kDefPreserveUnusedToken = true;
34 const char BasicTokenizerOp::kCommonPattern[] =
35 "[!-/]"
36 "|[:-@]"
37 "|[\\[-`]"
38 "|[{-~]"
39 "|[\\p{P}]"
40 "|[\\x{4E00}-\\x{9FFF}]"
41 "|[\\x{3400}-\\x{4DBF}]"
42 "|[\\x{20000}-\\x{2A6DF}]"
43 "|[\\x{2A700}-\\x{2B73F}]"
44 "|[\\x{2B740}-\\x{2B81F}]"
45 "|[\\x{2B820}-\\x{2CEAF}]"
46 "|[\\x{F900}-\\x{FAFF}]"
47 "|[\\x{2F800}-\\x{2FA1F}]";
48 const char BasicTokenizerOp::kUnusedPattern[] = "\\[CLS\\]|\\[SEP\\]|\\[UNK\\]|\\[PAD\\]|\\[MASK\\]|\\[unused\\d+\\]|";
49 const std::unordered_set<std::string> BasicTokenizerOp::kUnusedWords{"[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]"};
50
BasicTokenizerOp(const bool & lower_case,const bool & keep_whitespace,const NormalizeForm & normalization_form,const bool & preserve_unused_token,const bool & with_offsets)51 BasicTokenizerOp::BasicTokenizerOp(const bool &lower_case, const bool &keep_whitespace,
52 const NormalizeForm &normalization_form, const bool &preserve_unused_token,
53 const bool &with_offsets)
54 : TokenizerOp(with_offsets),
55 lower_case_(lower_case),
56 keep_whitespace_(keep_whitespace),
57 normalization_form_(normalization_form),
58 preserve_unused_token_(preserve_unused_token),
59 case_fold_(std::make_unique<CaseFoldOp>()),
60 nfd_normalize_(std::make_unique<NormalizeUTF8Op>(NormalizeForm::kNfd)),
61 common_normalize_(std::make_unique<NormalizeUTF8Op>(normalization_form)),
62 replace_accent_chars_(std::make_unique<RegexReplaceOp>("\\p{Mn}", "")),
63 replace_control_chars_(std::make_unique<RegexReplaceOp>("\\p{Cc}|\\p{Cf}", " ")) {
64 std::string delim_pattern = std::string("\\s+|") + kCommonPattern;
65 std::string keep_delim_pattern;
66 if (keep_whitespace_) {
67 keep_delim_pattern = delim_pattern;
68 } else {
69 keep_delim_pattern = kCommonPattern;
70 }
71 if (preserve_unused_token_) {
72 keep_delim_pattern = kUnusedPattern + keep_delim_pattern;
73 delim_pattern = kUnusedPattern + delim_pattern;
74 }
75 regex_tokenizer_ = std::make_unique<RegexTokenizerOp>(delim_pattern, keep_delim_pattern, with_offsets_);
76 }
77
CaseFoldWithoutUnusedWords(const std::string_view & text,const std::unordered_set<std::string> & unused_words,std::string * output)78 Status BasicTokenizerOp::CaseFoldWithoutUnusedWords(const std::string_view &text,
79 const std::unordered_set<std::string> &unused_words,
80 std::string *output) {
81 icu::ErrorCode error;
82 const icu::Normalizer2 *nfkc_case_fold = icu::Normalizer2::getNFKCCasefoldInstance(error);
83 CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "BasicTokenizer: getNFKCCasefoldInstance failed.");
84 RETURN_UNEXPECTED_IF_NULL(output);
85 output->clear();
86
87 // 1. get start and end offsets of not case fold strs
88 std::queue<std::pair<int, int>> offsets; // offsets of not used words
89 int start = -1;
90 int len = 0;
91 for (int i = 0; i < text.length(); i++) {
92 if (text[i] == '[') {
93 start = i;
94 ++len;
95 } else if (text[i] == ']' && start >= 0) {
96 ++len;
97 std::string word(text.substr(start, len));
98 if (unused_words.find(word) != unused_words.end()) {
99 offsets.push(std::make_pair(start, start + len - 1));
100 }
101 start = -1;
102 len = 0;
103 } else if (start >= 0) {
104 ++len;
105 }
106 }
107
108 // 2. Do not apply case fold on `unused_words`
109 start = 0;
110 for (int i = 0; i < text.length();) {
111 std::string_view process_text;
112 std::string preserve_token;
113 if (offsets.empty()) {
114 i = text.length();
115 process_text = text.substr(start, i - start);
116 } else {
117 preserve_token = text.substr(offsets.front().first, offsets.front().second - offsets.front().first + 1);
118 process_text = text.substr(start, offsets.front().first - start);
119 i = offsets.front().second + 1;
120 offsets.pop();
121 }
122 std::string temp;
123 icu::StringByteSink<std::string> sink(&temp);
124 nfkc_case_fold->normalizeUTF8(0, icu::StringPiece(process_text.data(), process_text.size()), sink, nullptr, error);
125 *output += temp + preserve_token;
126 }
127 return Status::OK();
128 }
129
CaseFoldWithoutUnusedWords(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output)130 Status BasicTokenizerOp::CaseFoldWithoutUnusedWords(const std::shared_ptr<Tensor> &input,
131 std::shared_ptr<Tensor> *output) {
132 IO_CHECK(input, output);
133 CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "BasicTokenizer: input is not string datatype.");
134 std::vector<std::string> strs(input->Size());
135 size_t i = 0;
136 for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
137 RETURN_IF_NOT_OK(CaseFoldWithoutUnusedWords(*iter, kUnusedWords, &strs[i++]));
138 }
139 return Tensor::CreateFromVector(strs, input->shape(), output);
140 }
141
Compute(const TensorRow & input,TensorRow * output)142 Status BasicTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
143 IO_CHECK_VECTOR(input, output);
144 CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "BasicTokenizer: input only support one column data.");
145 if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
146 RETURN_STATUS_UNEXPECTED("BasicTokenizer: the input should be scalar with string datatype");
147 }
148 std::shared_ptr<Tensor> cur_input;
149 std::shared_ptr<Tensor> processed_tensor;
150 if (lower_case_) {
151 if (!preserve_unused_token_) {
152 // to lower case
153 RETURN_IF_NOT_OK(case_fold_->Compute(input[0], &processed_tensor));
154 } else {
155 // to lower case except words in kUnusedWords
156 RETURN_IF_NOT_OK(CaseFoldWithoutUnusedWords(input[0], &processed_tensor));
157 }
158 cur_input = processed_tensor;
159 // strip accent characters
160 RETURN_IF_NOT_OK(nfd_normalize_->Compute(cur_input, &processed_tensor));
161 cur_input = processed_tensor;
162 RETURN_IF_NOT_OK(replace_accent_chars_->Compute(cur_input, &processed_tensor));
163 } else {
164 RETURN_IF_NOT_OK(common_normalize_->Compute(input[0], &processed_tensor));
165 }
166 // strip control characters
167 cur_input = processed_tensor;
168 RETURN_IF_NOT_OK(replace_control_chars_->Compute(cur_input, &processed_tensor));
169 return regex_tokenizer_->Compute(TensorRow(0, {std::move(processed_tensor)}), output);
170 }
171 } // namespace dataset
172 } // namespace mindspore
173