1 /*
2 * Copyright (c) 2024-2025 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
#include <climits>
#include <cstdlib>
#include <dlfcn.h>
#include <libxml/HTMLparser.h>
#include <memory>
#include <regex>
20
21 #include "pasteboard_hilog.h"
22 #include "pasteboard_lib_guard.h"
23 #include "pasteboard_pattern.h"
24
25 namespace OHOS::MiscServices {
26 constexpr const char *LIBXML_SO_PATH = "libxml2.z.so";
27 using htmlReadMemoryFuncPtr = htmlDocPtr (*)(const char *, int, const char *, const char *, int);
28 std::map<uint32_t, std::string> PatternDetection::patterns_{
29 { static_cast<uint32_t>(Pattern::URL), std::string("[a-zA-Z0-9+.-]+://[-a-zA-Z0-9+&@#/%?"
30 "=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_]") },
31 { static_cast<uint32_t>(Pattern::NUMBER), std::string("[-+]?[0-9]*\\.?[0-9]+") },
32 { static_cast<uint32_t>(Pattern::EMAIL_ADDRESS), std::string("(([a-zA-Z0-9_\\-\\.\\%\\+]+)@"
33 "(([a-zA-Z0-9\\-]+(?:\\.[a-zA-Z0-9\\-]+)*)|"
34 "(?:\\[([0-9]{1,3}\\.){3}[0-9]{1,3}\\]))"
35 "([a-zA-Z]{1,}|[0-9]{1,3}))") },
36 };
37
Detect(const std::set<Pattern> & patternsToCheck,const PasteData & pasteData,bool hasHTML,bool hasPlain)38 const std::set<Pattern> PatternDetection::Detect(
39 const std::set<Pattern> &patternsToCheck, const PasteData &pasteData, bool hasHTML, bool hasPlain)
40 {
41 std::set<Pattern> existedPatterns;
42 for (auto &record : pasteData.AllRecords()) {
43 if (patternsToCheck == existedPatterns) {
44 break;
45 }
46 if (hasPlain && record->GetPlainTextV0() != nullptr) {
47 std::string recordText = *(record->GetPlainTextV0());
48 DetectPlainText(existedPatterns, patternsToCheck, recordText);
49 }
50 if (hasHTML && record->GetHtmlTextV0() != nullptr) {
51 std::string recordText = ExtractHtmlContent(*(record->GetHtmlTextV0()));
52 DetectPlainText(existedPatterns, patternsToCheck, recordText);
53 }
54 }
55 return existedPatterns;
56 }
57
DetectPlainText(std::set<Pattern> & patternsOut,const std::set<Pattern> & patternsIn,const std::string & plainText)58 void PatternDetection::DetectPlainText(
59 std::set<Pattern> &patternsOut, const std::set<Pattern> &patternsIn, const std::string &plainText)
60 {
61 for (Pattern pattern : patternsIn) {
62 if (patternsOut.find(pattern) != patternsOut.end()) {
63 continue;
64 }
65 uint32_t patternUint32 = static_cast<uint32_t>(pattern);
66 auto it = patterns_.find(patternUint32);
67 if (it == patterns_.end()) {
68 PASTEBOARD_HILOGE(PASTEBOARD_MODULE_SERVICE, "pasteboard pattern, unexpected Pattern value!");
69 continue;
70 }
71 std::regex curRegex(it->second);
72 if (std::regex_search(plainText, curRegex)) {
73 patternsOut.insert(pattern);
74 }
75 }
76 }
77
ExtractHtmlContent(const std::string & html_str)78 std::string PatternDetection::ExtractHtmlContent(const std::string &html_str)
79 {
80 LibGuard libxmlGuard{ LIBXML_SO_PATH};
81 void *libHandle = libxmlGuard.GetLibHandle();
82 PASTEBOARD_CHECK_AND_RETURN_RET_LOGE(libHandle != nullptr, "", PASTEBOARD_MODULE_SERVICE,
83 "dlopen libxml2 failed");
84
85 auto htmlReadMemory = reinterpret_cast<htmlReadMemoryFuncPtr>(dlsym(libHandle, "htmlReadMemory"));
86 auto xmlDocGetRootElement = reinterpret_cast<xmlNode *(*)(xmlDoc *)>(dlsym(libHandle, "xmlDocGetRootElement"));
87 auto xmlNodeGetContent = reinterpret_cast<xmlChar *(*)(xmlNode *)>(dlsym(libHandle, "xmlNodeGetContent"));
88 auto xmlFreeDoc = reinterpret_cast<void (*)(xmlDoc *)>(dlsym(libHandle, "xmlFreeDoc"));
89 PASTEBOARD_CHECK_AND_RETURN_RET_LOGE(htmlReadMemory != nullptr && xmlDocGetRootElement != nullptr &&
90 xmlNodeGetContent != nullptr && xmlFreeDoc != nullptr, "", PASTEBOARD_MODULE_SERVICE,
91 "dlsym libxml2 failed");
92 xmlDocPtr doc = htmlReadMemory(html_str.c_str(), html_str.size(), nullptr, nullptr, 0);
93 PASTEBOARD_CHECK_AND_RETURN_RET_LOGE(doc != nullptr, "", PASTEBOARD_MODULE_SERVICE,
94 "Parse html failed! doc nullptr");
95 xmlNode *rootNode = xmlDocGetRootElement(doc);
96 if (rootNode == nullptr) {
97 PASTEBOARD_HILOGE(PASTEBOARD_MODULE_SERVICE, "parse html failed, rootNode is null");
98 xmlFreeDoc(doc);
99 return "";
100 }
101 xmlChar *xmlStr = xmlNodeGetContent(rootNode);
102 if (xmlStr == nullptr) {
103 PASTEBOARD_HILOGE(PASTEBOARD_MODULE_SERVICE, "parse html failed, xmlStr is null");
104 xmlFreeDoc(doc);
105 return "";
106 }
107
108 std::string result(reinterpret_cast<const char *>(xmlStr));
109 free(xmlStr);
110 xmlFreeDoc(doc);
111 return result;
112 }
113 } // namespace OHOS::MiscServices