• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2024-2025 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include <cstdlib>
17 #include <dlfcn.h>
18 #include <libxml/HTMLparser.h>
19 #include <regex>
20 
21 #include "pasteboard_hilog.h"
22 #include "pasteboard_lib_guard.h"
23 #include "pasteboard_pattern.h"
24 
25 namespace OHOS::MiscServices {
26 constexpr const char *LIBXML_SO_PATH = "libxml2.z.so";
27 using htmlReadMemoryFuncPtr = htmlDocPtr (*)(const char *, int, const char *, const char *, int);
28 std::map<uint32_t, std::string> PatternDetection::patterns_{
29     { static_cast<uint32_t>(Pattern::URL), std::string("[a-zA-Z0-9+.-]+://[-a-zA-Z0-9+&@#/%?"
30                                                        "=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_]") },
31     { static_cast<uint32_t>(Pattern::NUMBER), std::string("[-+]?[0-9]*\\.?[0-9]+") },
32     { static_cast<uint32_t>(Pattern::EMAIL_ADDRESS), std::string("(([a-zA-Z0-9_\\-\\.\\%\\+]+)@"
33                                                                 "(([a-zA-Z0-9\\-]+(?:\\.[a-zA-Z0-9\\-]+)*)|"
34                                                                 "(?:\\[([0-9]{1,3}\\.){3}[0-9]{1,3}\\]))"
35                                                                 "([a-zA-Z]{1,}|[0-9]{1,3}))") },
36 };
37 
Detect(const std::set<Pattern> & patternsToCheck,const PasteData & pasteData,bool hasHTML,bool hasPlain)38 const std::set<Pattern> PatternDetection::Detect(
39     const std::set<Pattern> &patternsToCheck, const PasteData &pasteData, bool hasHTML, bool hasPlain)
40 {
41     std::set<Pattern> existedPatterns;
42     for (auto &record : pasteData.AllRecords()) {
43         if (patternsToCheck == existedPatterns) {
44             break;
45         }
46         if (hasPlain && record->GetPlainTextV0() != nullptr) {
47             std::string recordText = *(record->GetPlainTextV0());
48             DetectPlainText(existedPatterns, patternsToCheck, recordText);
49         }
50         if (hasHTML && record->GetHtmlTextV0() != nullptr) {
51             std::string recordText = ExtractHtmlContent(*(record->GetHtmlTextV0()));
52             DetectPlainText(existedPatterns, patternsToCheck, recordText);
53         }
54     }
55     return existedPatterns;
56 }
57 
DetectPlainText(std::set<Pattern> & patternsOut,const std::set<Pattern> & patternsIn,const std::string & plainText)58 void PatternDetection::DetectPlainText(
59     std::set<Pattern> &patternsOut, const std::set<Pattern> &patternsIn, const std::string &plainText)
60 {
61     for (Pattern pattern : patternsIn) {
62         if (patternsOut.find(pattern) != patternsOut.end()) {
63             continue;
64         }
65         uint32_t patternUint32 = static_cast<uint32_t>(pattern);
66         auto it = patterns_.find(patternUint32);
67         if (it == patterns_.end()) {
68             PASTEBOARD_HILOGE(PASTEBOARD_MODULE_SERVICE, "pasteboard pattern, unexpected Pattern value!");
69             continue;
70         }
71         std::regex curRegex(it->second);
72         if (std::regex_search(plainText, curRegex)) {
73             patternsOut.insert(pattern);
74         }
75     }
76 }
77 
ExtractHtmlContent(const std::string & html_str)78 std::string PatternDetection::ExtractHtmlContent(const std::string &html_str)
79 {
80     LibGuard libxmlGuard{ LIBXML_SO_PATH};
81     void *libHandle = libxmlGuard.GetLibHandle();
82     PASTEBOARD_CHECK_AND_RETURN_RET_LOGE(libHandle != nullptr, "", PASTEBOARD_MODULE_SERVICE,
83         "dlopen libxml2 failed");
84 
85     auto htmlReadMemory = reinterpret_cast<htmlReadMemoryFuncPtr>(dlsym(libHandle, "htmlReadMemory"));
86     auto xmlDocGetRootElement = reinterpret_cast<xmlNode *(*)(xmlDoc *)>(dlsym(libHandle, "xmlDocGetRootElement"));
87     auto xmlNodeGetContent = reinterpret_cast<xmlChar *(*)(xmlNode *)>(dlsym(libHandle, "xmlNodeGetContent"));
88     auto xmlFreeDoc = reinterpret_cast<void (*)(xmlDoc *)>(dlsym(libHandle, "xmlFreeDoc"));
89     PASTEBOARD_CHECK_AND_RETURN_RET_LOGE(htmlReadMemory != nullptr && xmlDocGetRootElement != nullptr &&
90         xmlNodeGetContent != nullptr && xmlFreeDoc != nullptr, "", PASTEBOARD_MODULE_SERVICE,
91         "dlsym libxml2 failed");
92     xmlDocPtr doc = htmlReadMemory(html_str.c_str(), html_str.size(), nullptr, nullptr, 0);
93     PASTEBOARD_CHECK_AND_RETURN_RET_LOGE(doc != nullptr, "", PASTEBOARD_MODULE_SERVICE,
94         "Parse html failed! doc nullptr");
95     xmlNode *rootNode = xmlDocGetRootElement(doc);
96     if (rootNode == nullptr) {
97         PASTEBOARD_HILOGE(PASTEBOARD_MODULE_SERVICE, "parse html failed, rootNode is null");
98         xmlFreeDoc(doc);
99         return "";
100     }
101     xmlChar *xmlStr = xmlNodeGetContent(rootNode);
102     if (xmlStr == nullptr) {
103         PASTEBOARD_HILOGE(PASTEBOARD_MODULE_SERVICE, "parse html failed, xmlStr is null");
104         xmlFreeDoc(doc);
105         return "";
106     }
107 
108     std::string result(reinterpret_cast<const char *>(xmlStr));
109     free(xmlStr);
110     xmlFreeDoc(doc);
111     return result;
112 }
113 } // namespace OHOS::MiscServices