1 // Copyright 2016 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fpdftext/cpdf_linkextract.h"
8
9 #include <wchar.h>
10
11 #include <vector>
12
13 #include "core/fpdftext/cpdf_textpage.h"
14 #include "core/fxcrt/fx_extension.h"
15 #include "core/fxcrt/fx_string.h"
16 #include "core/fxcrt/fx_system.h"
17
18 namespace {
19
20 // Find the end of a web link starting from offset |start| and ending at offset
21 // |end|. The purpose of this function is to separate url from the surrounding
22 // context characters, we do not intend to fully validate the url. |str|
23 // contains lower case characters only.
FindWebLinkEnding(const WideString & str,size_t start,size_t end)24 size_t FindWebLinkEnding(const WideString& str, size_t start, size_t end) {
25 if (str.Contains(L'/', start)) {
26 // When there is a path and query after '/', most ASCII chars are allowed.
27 // We don't sanitize in this case.
28 return end;
29 }
30
31 // When there is no path, it only has IP address or host name.
32 // Port is optional at the end.
33 if (str[start] == L'[') {
34 // IPv6 reference.
35 // Find the end of the reference.
36 auto result = str.Find(L']', start + 1);
37 if (result.has_value()) {
38 end = result.value();
39 if (end > start + 1) { // Has content inside brackets.
40 size_t len = str.GetLength();
41 size_t off = end + 1;
42 if (off < len && str[off] == L':') {
43 off++;
44 while (off < len && FXSYS_IsDecimalDigit(str[off]))
45 off++;
46 if (off > end + 2 &&
47 off <= len) // At least one digit in port number.
48 end = off - 1; // |off| is offset of the first invalid char.
49 }
50 }
51 }
52 return end;
53 }
54
55 // According to RFC1123, host name only has alphanumeric chars, hyphens,
56 // and periods. Hyphen should not at the end though.
57 // Non-ASCII chars are ignored during checking.
58 while (end > start && str[end] < 0x80) {
59 if (FXSYS_IsDecimalDigit(str[end]) ||
60 (str[end] >= L'a' && str[end] <= L'z') || str[end] == L'.') {
61 break;
62 }
63 end--;
64 }
65 return end;
66 }
67
68 // Remove characters from the end of |str|, delimited by |start| and |end|, up
69 // to and including |charToFind|. No-op if |charToFind| is not present. Updates
70 // |end| if characters were removed.
TrimBackwardsToChar(const WideString & str,wchar_t charToFind,size_t start,size_t * end)71 void TrimBackwardsToChar(const WideString& str,
72 wchar_t charToFind,
73 size_t start,
74 size_t* end) {
75 for (size_t pos = *end; pos >= start; pos--) {
76 if (str[pos] == charToFind) {
77 *end = pos - 1;
78 break;
79 }
80 }
81 }
82
83 // Finds opening brackets ()[]{}<> and quotes "' before the URL delimited by
84 // |start| and |end| in |str|. Matches a closing bracket or quote for each
85 // opening character and, if present, removes everything afterwards. Returns the
86 // new end position for the string.
TrimExternalBracketsFromWebLink(const WideString & str,size_t start,size_t end)87 size_t TrimExternalBracketsFromWebLink(const WideString& str,
88 size_t start,
89 size_t end) {
90 for (size_t pos = 0; pos < start; pos++) {
91 if (str[pos] == '(') {
92 TrimBackwardsToChar(str, ')', start, &end);
93 } else if (str[pos] == '[') {
94 TrimBackwardsToChar(str, ']', start, &end);
95 } else if (str[pos] == '{') {
96 TrimBackwardsToChar(str, '}', start, &end);
97 } else if (str[pos] == '<') {
98 TrimBackwardsToChar(str, '>', start, &end);
99 } else if (str[pos] == '"') {
100 TrimBackwardsToChar(str, '"', start, &end);
101 } else if (str[pos] == '\'') {
102 TrimBackwardsToChar(str, '\'', start, &end);
103 }
104 }
105 return end;
106 }
107
108 } // namespace
109
CPDF_LinkExtract(const CPDF_TextPage * pTextPage)110 CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)
111 : m_pTextPage(pTextPage) {}
112
113 CPDF_LinkExtract::~CPDF_LinkExtract() = default;
114
ExtractLinks()115 void CPDF_LinkExtract::ExtractLinks() {
116 m_LinkArray.clear();
117 size_t start = 0;
118 size_t pos = 0;
119 bool bAfterHyphen = false;
120 bool bLineBreak = false;
121 const size_t nTotalChar = m_pTextPage->CountChars();
122 const WideString page_text = m_pTextPage->GetAllPageText();
123 while (pos < nTotalChar) {
124 const CPDF_TextPage::CharInfo& char_info = m_pTextPage->GetCharInfo(pos);
125 if (char_info.char_type() != CPDF_TextPage::CharType::kGenerated &&
126 char_info.unicode() != L' ' && pos != nTotalChar - 1) {
127 bAfterHyphen =
128 (char_info.char_type() == CPDF_TextPage::CharType::kHyphen ||
129 (char_info.char_type() == CPDF_TextPage::CharType::kNormal &&
130 char_info.unicode() == L'-'));
131 ++pos;
132 continue;
133 }
134
135 size_t nCount = pos - start;
136 if (pos == nTotalChar - 1) {
137 ++nCount;
138 } else if (bAfterHyphen &&
139 (char_info.unicode() == L'\n' || char_info.unicode() == L'\r')) {
140 // Handle text breaks with a hyphen to the next line.
141 bLineBreak = true;
142 ++pos;
143 continue;
144 }
145
146 WideString strBeCheck = page_text.Substr(start, nCount);
147 if (bLineBreak) {
148 strBeCheck.Remove(L'\n');
149 strBeCheck.Remove(L'\r');
150 bLineBreak = false;
151 }
152 // Replace the generated code with the hyphen char.
153 strBeCheck.Replace(L"\xfffe", L"-");
154
155 if (strBeCheck.GetLength() > 5) {
156 while (strBeCheck.GetLength() > 0) {
157 wchar_t ch = strBeCheck.Back();
158 if (ch != L')' && ch != L',' && ch != L'>' && ch != L'.')
159 break;
160
161 strBeCheck = strBeCheck.First(strBeCheck.GetLength() - 1);
162 nCount--;
163 }
164
165 // Check for potential web URLs and email addresses.
166 // Ftp address, file system links, data, blob etc. are not checked.
167 if (nCount > 5) {
168 auto maybe_link = CheckWebLink(strBeCheck);
169 if (maybe_link.has_value()) {
170 maybe_link.value().m_Start += start;
171 m_LinkArray.push_back(maybe_link.value());
172 } else if (CheckMailLink(&strBeCheck)) {
173 m_LinkArray.push_back(Link{{start, nCount}, strBeCheck});
174 }
175 }
176 }
177 start = ++pos;
178 }
179 }
180
CheckWebLink(const WideString & strBeCheck)181 std::optional<CPDF_LinkExtract::Link> CPDF_LinkExtract::CheckWebLink(
182 const WideString& strBeCheck) {
183 static const wchar_t kHttpScheme[] = L"http";
184 static const wchar_t kWWWAddrStart[] = L"www.";
185
186 const size_t kHttpSchemeLen = wcslen(kHttpScheme);
187 const size_t kWWWAddrStartLen = wcslen(kWWWAddrStart);
188
189 WideString str = strBeCheck;
190 str.MakeLower();
191
192 // First, try to find the scheme.
193 auto start = str.Find(kHttpScheme);
194 if (start.has_value()) {
195 size_t off = start.value() + kHttpSchemeLen; // move after "http".
196 if (str.GetLength() > off + 4) { // At least "://<char>" follows.
197 if (str[off] == L's') // "https" scheme is accepted.
198 off++;
199 if (str[off] == L':' && str[off + 1] == L'/' && str[off + 2] == L'/') {
200 off += 3;
201 const size_t end =
202 FindWebLinkEnding(str, off,
203 TrimExternalBracketsFromWebLink(
204 str, start.value(), str.GetLength() - 1));
205 if (end > off) { // Non-empty host name.
206 const size_t nStart = start.value();
207 const size_t nCount = end - nStart + 1;
208 return Link{{nStart, nCount}, strBeCheck.Substr(nStart, nCount)};
209 }
210 }
211 }
212 }
213
214 // When there is no scheme, try to find url starting with "www.".
215 start = str.Find(kWWWAddrStart);
216 if (start.has_value()) {
217 size_t off = start.value() + kWWWAddrStartLen;
218 if (str.GetLength() > off) {
219 const size_t end =
220 FindWebLinkEnding(str, start.value(),
221 TrimExternalBracketsFromWebLink(
222 str, start.value(), str.GetLength() - 1));
223 if (end > off) {
224 const size_t nStart = start.value();
225 const size_t nCount = end - nStart + 1;
226 return Link{{nStart, nCount},
227 L"http://" + strBeCheck.Substr(nStart, nCount)};
228 }
229 }
230 }
231
232 return std::nullopt;
233 }
234
CheckMailLink(WideString * str)235 bool CPDF_LinkExtract::CheckMailLink(WideString* str) {
236 auto aPos = str->Find(L'@');
237 // Invalid when no '@' or when starts/ends with '@'.
238 if (!aPos.has_value() || aPos.value() == 0 || aPos == str->GetLength() - 1)
239 return false;
240
241 // Check the local part.
242 size_t pPos = aPos.value(); // Used to track the position of '@' or '.'.
243 for (size_t i = aPos.value(); i > 0; i--) {
244 wchar_t ch = (*str)[i - 1];
245 if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch))
246 continue;
247
248 if (ch != L'.' || i == pPos || i == 1) {
249 if (i == aPos.value()) {
250 // There is '.' or invalid char before '@'.
251 return false;
252 }
253 // End extracting for other invalid chars, '.' at the beginning, or
254 // consecutive '.'.
255 size_t removed_len = i == pPos ? i + 1 : i;
256 *str = str->Last(str->GetLength() - removed_len);
257 break;
258 }
259 // Found a valid '.'.
260 pPos = i - 1;
261 }
262
263 // Check the domain name part.
264 aPos = str->Find(L'@');
265 if (!aPos.has_value() || aPos.value() == 0)
266 return false;
267
268 str->TrimBack(L'.');
269 // At least one '.' in domain name, but not at the beginning.
270 // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
271 // Check whether we should remove this check.
272 auto ePos = str->Find(L'.', aPos.value() + 1);
273 if (!ePos.has_value() || ePos.value() == aPos.value() + 1)
274 return false;
275
276 // Validate all other chars in domain name.
277 size_t nLen = str->GetLength();
278 pPos = 0; // Used to track the position of '.'.
279 for (size_t i = aPos.value() + 1; i < nLen; i++) {
280 wchar_t wch = (*str)[i];
281 if (wch == L'-' || FXSYS_iswalnum(wch))
282 continue;
283
284 if (wch != L'.' || i == pPos + 1) {
285 // Domain name should end before invalid char.
286 size_t host_end = i == pPos + 1 ? i - 2 : i - 1;
287 if (pPos > 0 && host_end - aPos.value() >= 3) {
288 // Trim the ending invalid chars if there is at least one '.' and name.
289 *str = str->First(host_end + 1);
290 break;
291 }
292 return false;
293 }
294 pPos = i;
295 }
296
297 if (!str->Contains(L"mailto:"))
298 *str = L"mailto:" + *str;
299
300 return true;
301 }
302
GetURL(size_t index) const303 WideString CPDF_LinkExtract::GetURL(size_t index) const {
304 return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl
305 : WideString();
306 }
307
GetRects(size_t index) const308 std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const {
309 if (index >= m_LinkArray.size())
310 return std::vector<CFX_FloatRect>();
311
312 return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start,
313 m_LinkArray[index].m_Count);
314 }
315
GetTextRange(size_t index) const316 std::optional<CPDF_LinkExtract::Range> CPDF_LinkExtract::GetTextRange(
317 size_t index) const {
318 if (index >= m_LinkArray.size())
319 return std::nullopt;
320 return m_LinkArray[index];
321 }
322