• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2022 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "minddata/dataset/text/kernels/filter_wikipedia_xml_op.h"
17 
18 #include <memory>
19 #include <string_view>
20 #include <vector>
21 
22 namespace mindspore {
23 namespace dataset {
24 std::map<icu::UnicodeString, icu::UnicodeString> patterns = {{R"(<.*>)", ""},
25                                                              {R"(&amp;)", "&"},
26                                                              {"&lt;", "<"},
27                                                              {"&gt;", ">"},
28                                                              {R"(<ef[^<]*<\/ef>)", ""},
29                                                              {"<[^>]*>", ""},
30                                                              {R"(\[http:[^] ]*)", "["},
31                                                              {R"(\|thumb)", ""},
32                                                              {R"(\|left)", ""},
33                                                              {R"(\|right)", ""},
34                                                              {R"(\|\d+px)", ""},
35                                                              {R"(\[\[image:[^\[\]]*\|)", ""},
36                                                              {R"(\[\[category:([^|\]]*)[^]]*\]\])", "[[$1]]"},
37                                                              {R"(\[\[[a-z\-]*:[^\]]*\]\])", ""},
38                                                              {R"(\[\[[^\|\]]*\|)", "[["},
39                                                              {R"(\{\{[^\}]*\}\})", ""},
40                                                              {R"(\{[^\}]*\})", ""},
41                                                              {R"(\[)", ""},
42                                                              {R"(\])", ""},
43                                                              {"&[^;]*;", " "},
44                                                              {"A", "a"},
45                                                              {"B", "b"},
46                                                              {"C", "c"},
47                                                              {"D", "d"},
48                                                              {"E", "e"},
49                                                              {"F", "f"},
50                                                              {"G", "g"},
51                                                              {"H", "h"},
52                                                              {"I", "i"},
53                                                              {"J", "j"},
54                                                              {"K", "k"},
55                                                              {"L", "l"},
56                                                              {"M", "m"},
57                                                              {"N", "n"},
58                                                              {"O", "o"},
59                                                              {"P", "p"},
60                                                              {"Q", "q"},
61                                                              {"R", ""},
62                                                              {"S", "s"},
63                                                              {"T", "t"},
64                                                              {"U", "u"},
65                                                              {"V", "v"},
66                                                              {"W", "w"},
67                                                              {"X", "x"},
68                                                              {"Y", "y"},
69                                                              {"Z", "z"},
70                                                              {"0", " zero "},
71                                                              {"1", " one "},
72                                                              {"2", " two "},
73                                                              {"3", " three "},
74                                                              {"4", " four "},
75                                                              {"5", " five "},
76                                                              {"6", " six "},
77                                                              {"7", " seven "},
78                                                              {"8", " eight "},
79                                                              {"9", " nine "},
80                                                              {R"([^a-z\n]+)", " "},
81                                                              {R"(\n )", ""},
82                                                              {R"(\s+)", " "},
83                                                              {R"(\n\s*\n)", R"(\n)"}};
84 
FilterWikipediaXML(const std::string_view & text,std::string * out) const85 Status FilterWikipediaXMLOp::FilterWikipediaXML(const std::string_view &text, std::string *out) const {
86   CHECK_FAIL_RETURN_UNEXPECTED((out != nullptr), "FilterWikipediaXML: icu init failed.");
87   if (((text).find("#redirect") == std::string::npos) && ((text).find("#REDIRECT") == std::string::npos)) {
88     (*out) = text;
89     UErrorCode icu_error = U_ZERO_ERROR;
90     for (auto pattern_iter = patterns.begin(); pattern_iter != patterns.end(); pattern_iter++) {
91       icu::RegexMatcher matcher(pattern_iter->first, 0, icu_error);
92       CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(icu_error),
93                                    "RegexReplace: create icu RegexMatcher failed, you may input an error pattern.");
94       icu::UnicodeString unicode_text = icu::UnicodeString::fromUTF8(*out);
95       matcher.reset(unicode_text);
96       icu::UnicodeString unicode_out = matcher.replaceAll(pattern_iter->second, icu_error);
97       CHECK_FAIL_RETURN_UNEXPECTED(U_SUCCESS(icu_error), "FilterWikipediaXML: FilterWikipediaXML failed.");
98       (*out) = "";
99       unicode_out.trim().toUTF8String(*out);
100     }
101   } else {
102     (*out) = "";
103   }
104   return Status::OK();
105 }
106 
Compute(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output)107 Status FilterWikipediaXMLOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
108   IO_CHECK(input, output);
109   CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "RegexReplace: input is not of type string.");
110   std::vector<std::string> strs(input->Size());
111   auto iter = input->begin<std::string_view>();
112   RETURN_IF_NOT_OK(FilterWikipediaXML(*iter, &strs[0]));
113   RETURN_IF_NOT_OK(Tensor::CreateFromVector(strs, input->shape(), output));
114   return Status::OK();
115 }
116 }  // namespace dataset
117 }  // namespace mindspore
118