1 /**
2 * Copyright 2022 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
#include "minddata/dataset/text/kernels/filter_wikipedia_xml_op.h"

#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
21
22 namespace mindspore {
23 namespace dataset {
24 std::map<icu::UnicodeString, icu::UnicodeString> patterns = {{R"(<.*>)", ""},
25 {R"(&)", "&"},
26 {"<", "<"},
27 {">", ">"},
28 {R"(<ef[^<]*<\/ef>)", ""},
29 {"<[^>]*>", ""},
30 {R"(\[http:[^] ]*)", "["},
31 {R"(\|thumb)", ""},
32 {R"(\|left)", ""},
33 {R"(\|right)", ""},
34 {R"(\|\d+px)", ""},
35 {R"(\[\[image:[^\[\]]*\|)", ""},
36 {R"(\[\[category:([^|\]]*)[^]]*\]\])", "[[$1]]"},
37 {R"(\[\[[a-z\-]*:[^\]]*\]\])", ""},
38 {R"(\[\[[^\|\]]*\|)", "[["},
39 {R"(\{\{[^\}]*\}\})", ""},
40 {R"(\{[^\}]*\})", ""},
41 {R"(\[)", ""},
42 {R"(\])", ""},
43 {"&[^;]*;", " "},
44 {"A", "a"},
45 {"B", "b"},
46 {"C", "c"},
47 {"D", "d"},
48 {"E", "e"},
49 {"F", "f"},
50 {"G", "g"},
51 {"H", "h"},
52 {"I", "i"},
53 {"J", "j"},
54 {"K", "k"},
55 {"L", "l"},
56 {"M", "m"},
57 {"N", "n"},
58 {"O", "o"},
59 {"P", "p"},
60 {"Q", "q"},
61 {"R", ""},
62 {"S", "s"},
63 {"T", "t"},
64 {"U", "u"},
65 {"V", "v"},
66 {"W", "w"},
67 {"X", "x"},
68 {"Y", "y"},
69 {"Z", "z"},
70 {"0", " zero "},
71 {"1", " one "},
72 {"2", " two "},
73 {"3", " three "},
74 {"4", " four "},
75 {"5", " five "},
76 {"6", " six "},
77 {"7", " seven "},
78 {"8", " eight "},
79 {"9", " nine "},
80 {R"([^a-z\n]+)", " "},
81 {R"(\n )", ""},
82 {R"(\s+)", " "},
83 {R"(\n\s*\n)", R"(\n)"}};
84
FilterWikipediaXML(const std::string_view & text,std::string * out) const85 Status FilterWikipediaXMLOp::FilterWikipediaXML(const std::string_view &text, std::string *out) const {
86 CHECK_FAIL_RETURN_UNEXPECTED((out != nullptr), "FilterWikipediaXML: icu init failed.");
87 if (((text).find("#redirect") == std::string::npos) && ((text).find("#REDIRECT") == std::string::npos)) {
88 (*out) = text;
89 UErrorCode icu_error = U_ZERO_ERROR;
90 for (auto pattern_iter = patterns.begin(); pattern_iter != patterns.end(); pattern_iter++) {
91 icu::RegexMatcher matcher(pattern_iter->first, 0, icu_error);
92 CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(icu_error),
93 "RegexReplace: create icu RegexMatcher failed, you may input an error pattern.");
94 icu::UnicodeString unicode_text = icu::UnicodeString::fromUTF8(*out);
95 matcher.reset(unicode_text);
96 icu::UnicodeString unicode_out = matcher.replaceAll(pattern_iter->second, icu_error);
97 CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(icu_error), "FilterWikipediaXML: FilterWikipediaXML failed.");
98 (*out) = "";
99 unicode_out.trim().toUTF8String(*out);
100 }
101 } else {
102 (*out) = "";
103 }
104 return Status::OK();
105 }
106
Compute(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output)107 Status FilterWikipediaXMLOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
108 IO_CHECK(input, output);
109 CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "RegexReplace: input is not of type string.");
110 std::vector<std::string> strs(input->Size());
111 auto iter = input->begin<std::string_view>();
112 RETURN_IF_NOT_OK(FilterWikipediaXML(*iter, &strs[0]));
113 RETURN_IF_NOT_OK(Tensor::CreateFromVector(strs, input->shape(), output));
114 return Status::OK();
115 }
116 } // namespace dataset
117 } // namespace mindspore
118