1 // Copyright 2016 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fpdftext/cpdf_linkextract.h"
8
9 #include <vector>
10
11 #include "core/fpdftext/cpdf_textpage.h"
12 #include "core/fxcrt/fx_extension.h"
13 #include "core/fxcrt/fx_string.h"
14 #include "core/fxcrt/fx_system.h"
15
16 namespace {
17
18 // Find the end of a web link starting from offset |start| and ending at offset
19 // |end|. The purpose of this function is to separate url from the surrounding
20 // context characters, we do not intend to fully validate the url. |str|
21 // contains lower case characters only.
FindWebLinkEnding(const WideString & str,size_t start,size_t end)22 size_t FindWebLinkEnding(const WideString& str, size_t start, size_t end) {
23 if (str.Contains(L'/', start)) {
24 // When there is a path and query after '/', most ASCII chars are allowed.
25 // We don't sanitize in this case.
26 return end;
27 }
28
29 // When there is no path, it only has IP address or host name.
30 // Port is optional at the end.
31 if (str[start] == L'[') {
32 // IPv6 reference.
33 // Find the end of the reference.
34 auto result = str.Find(L']', start + 1);
35 if (result.has_value()) {
36 end = result.value();
37 if (end > start + 1) { // Has content inside brackets.
38 size_t len = str.GetLength();
39 size_t off = end + 1;
40 if (off < len && str[off] == L':') {
41 off++;
42 while (off < len && FXSYS_IsDecimalDigit(str[off]))
43 off++;
44 if (off > end + 2 &&
45 off <= len) // At least one digit in port number.
46 end = off - 1; // |off| is offset of the first invalid char.
47 }
48 }
49 }
50 return end;
51 }
52
53 // According to RFC1123, host name only has alphanumeric chars, hyphens,
54 // and periods. Hyphen should not at the end though.
55 // Non-ASCII chars are ignored during checking.
56 while (end > start && str[end] < 0x80) {
57 if (FXSYS_IsDecimalDigit(str[end]) ||
58 (str[end] >= L'a' && str[end] <= L'z') || str[end] == L'.') {
59 break;
60 }
61 end--;
62 }
63 return end;
64 }
65
66 // Remove characters from the end of |str|, delimited by |start| and |end|, up
67 // to and including |charToFind|. No-op if |charToFind| is not present. Updates
68 // |end| if characters were removed.
TrimBackwardsToChar(const WideString & str,wchar_t charToFind,size_t start,size_t * end)69 void TrimBackwardsToChar(const WideString& str,
70 wchar_t charToFind,
71 size_t start,
72 size_t* end) {
73 for (size_t pos = *end; pos >= start; pos--) {
74 if (str[pos] == charToFind) {
75 *end = pos - 1;
76 break;
77 }
78 }
79 }
80
81 // Finds opening brackets ()[]{}<> and quotes "' before the URL delimited by
82 // |start| and |end| in |str|. Matches a closing bracket or quote for each
83 // opening character and, if present, removes everything afterwards. Returns the
84 // new end position for the string.
TrimExternalBracketsFromWebLink(const WideString & str,size_t start,size_t end)85 size_t TrimExternalBracketsFromWebLink(const WideString& str,
86 size_t start,
87 size_t end) {
88 for (size_t pos = 0; pos < start; pos++) {
89 if (str[pos] == '(') {
90 TrimBackwardsToChar(str, ')', start, &end);
91 } else if (str[pos] == '[') {
92 TrimBackwardsToChar(str, ']', start, &end);
93 } else if (str[pos] == '{') {
94 TrimBackwardsToChar(str, '}', start, &end);
95 } else if (str[pos] == '<') {
96 TrimBackwardsToChar(str, '>', start, &end);
97 } else if (str[pos] == '"') {
98 TrimBackwardsToChar(str, '"', start, &end);
99 } else if (str[pos] == '\'') {
100 TrimBackwardsToChar(str, '\'', start, &end);
101 }
102 }
103 return end;
104 }
105
106 } // namespace
107
CPDF_LinkExtract(const CPDF_TextPage * pTextPage)108 CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)
109 : m_pTextPage(pTextPage) {}
110
111 CPDF_LinkExtract::~CPDF_LinkExtract() = default;
112
ExtractLinks()113 void CPDF_LinkExtract::ExtractLinks() {
114 m_LinkArray.clear();
115 int start = 0;
116 int pos = 0;
117 bool bAfterHyphen = false;
118 bool bLineBreak = false;
119 const int nTotalChar = m_pTextPage->CountChars();
120 const WideString page_text = m_pTextPage->GetAllPageText();
121 while (pos < nTotalChar) {
122 const CPDF_TextPage::CharInfo& char_info = m_pTextPage->GetCharInfo(pos);
123 if (char_info.m_CharType != CPDF_TextPage::CharType::kGenerated &&
124 char_info.m_Unicode != L' ' && pos != nTotalChar - 1) {
125 bAfterHyphen =
126 (char_info.m_CharType == CPDF_TextPage::CharType::kHyphen ||
127 (char_info.m_CharType == CPDF_TextPage::CharType::kNormal &&
128 char_info.m_Unicode == L'-'));
129 ++pos;
130 continue;
131 }
132
133 int nCount = pos - start;
134 if (pos == nTotalChar - 1) {
135 ++nCount;
136 } else if (bAfterHyphen &&
137 (char_info.m_Unicode == L'\n' || char_info.m_Unicode == L'\r')) {
138 // Handle text breaks with a hyphen to the next line.
139 bLineBreak = true;
140 ++pos;
141 continue;
142 }
143
144 WideString strBeCheck = page_text.Substr(start, nCount);
145 if (bLineBreak) {
146 strBeCheck.Remove(L'\n');
147 strBeCheck.Remove(L'\r');
148 bLineBreak = false;
149 }
150 // Replace the generated code with the hyphen char.
151 strBeCheck.Replace(L"\xfffe", L"-");
152
153 if (strBeCheck.GetLength() > 5) {
154 while (strBeCheck.GetLength() > 0) {
155 wchar_t ch = strBeCheck.Back();
156 if (ch != L')' && ch != L',' && ch != L'>' && ch != L'.')
157 break;
158
159 strBeCheck = strBeCheck.First(strBeCheck.GetLength() - 1);
160 nCount--;
161 }
162
163 // Check for potential web URLs and email addresses.
164 // Ftp address, file system links, data, blob etc. are not checked.
165 if (nCount > 5) {
166 int32_t nStartOffset;
167 int32_t nCountOverload;
168 if (CheckWebLink(&strBeCheck, &nStartOffset, &nCountOverload)) {
169 m_LinkArray.push_back(
170 {start + nStartOffset, nCountOverload, strBeCheck});
171 } else if (CheckMailLink(&strBeCheck)) {
172 m_LinkArray.push_back({start, nCount, strBeCheck});
173 }
174 }
175 }
176 start = ++pos;
177 }
178 }
179
CheckWebLink(WideString * strBeCheck,int32_t * nStart,int32_t * nCount)180 bool CPDF_LinkExtract::CheckWebLink(WideString* strBeCheck,
181 int32_t* nStart,
182 int32_t* nCount) {
183 static const wchar_t kHttpScheme[] = L"http";
184 static const wchar_t kWWWAddrStart[] = L"www.";
185
186 const size_t kHttpSchemeLen = FXSYS_len(kHttpScheme);
187 const size_t kWWWAddrStartLen = FXSYS_len(kWWWAddrStart);
188
189 WideString str = *strBeCheck;
190 str.MakeLower();
191
192 size_t len = str.GetLength();
193 // First, try to find the scheme.
194 auto start = str.Find(kHttpScheme);
195 if (start.has_value()) {
196 size_t off = start.value() + kHttpSchemeLen; // move after "http".
197 if (len > off + 4) { // At least "://<char>" follows.
198 if (str[off] == L's') // "https" scheme is accepted.
199 off++;
200 if (str[off] == L':' && str[off + 1] == L'/' && str[off + 2] == L'/') {
201 off += 3;
202 size_t end = TrimExternalBracketsFromWebLink(str, start.value(),
203 str.GetLength() - 1);
204 end = FindWebLinkEnding(str, off, end);
205 if (end > off) { // Non-empty host name.
206 *nStart = start.value();
207 *nCount = end - start.value() + 1;
208 *strBeCheck = strBeCheck->Substr(*nStart, *nCount);
209 return true;
210 }
211 }
212 }
213 }
214
215 // When there is no scheme, try to find url starting with "www.".
216 start = str.Find(kWWWAddrStart);
217 if (start.has_value() && len > start.value() + kWWWAddrStartLen) {
218 size_t end = TrimExternalBracketsFromWebLink(str, start.value(),
219 str.GetLength() - 1);
220 end = FindWebLinkEnding(str, start.value(), end);
221 if (end > start.value() + kWWWAddrStartLen) {
222 *nStart = start.value();
223 *nCount = end - start.value() + 1;
224 *strBeCheck = L"http://" + strBeCheck->Substr(*nStart, *nCount);
225 return true;
226 }
227 }
228 return false;
229 }
230
CheckMailLink(WideString * str)231 bool CPDF_LinkExtract::CheckMailLink(WideString* str) {
232 auto aPos = str->Find(L'@');
233 // Invalid when no '@' or when starts/ends with '@'.
234 if (!aPos.has_value() || aPos.value() == 0 || aPos == str->GetLength() - 1)
235 return false;
236
237 // Check the local part.
238 size_t pPos = aPos.value(); // Used to track the position of '@' or '.'.
239 for (size_t i = aPos.value(); i > 0; i--) {
240 wchar_t ch = (*str)[i - 1];
241 if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch))
242 continue;
243
244 if (ch != L'.' || i == pPos || i == 1) {
245 if (i == aPos.value()) {
246 // There is '.' or invalid char before '@'.
247 return false;
248 }
249 // End extracting for other invalid chars, '.' at the beginning, or
250 // consecutive '.'.
251 size_t removed_len = i == pPos ? i + 1 : i;
252 *str = str->Last(str->GetLength() - removed_len);
253 break;
254 }
255 // Found a valid '.'.
256 pPos = i - 1;
257 }
258
259 // Check the domain name part.
260 aPos = str->Find(L'@');
261 if (!aPos.has_value() || aPos.value() == 0)
262 return false;
263
264 str->TrimRight(L'.');
265 // At least one '.' in domain name, but not at the beginning.
266 // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
267 // Check whether we should remove this check.
268 auto ePos = str->Find(L'.', aPos.value() + 1);
269 if (!ePos.has_value() || ePos.value() == aPos.value() + 1)
270 return false;
271
272 // Validate all other chars in domain name.
273 size_t nLen = str->GetLength();
274 pPos = 0; // Used to track the position of '.'.
275 for (size_t i = aPos.value() + 1; i < nLen; i++) {
276 wchar_t wch = (*str)[i];
277 if (wch == L'-' || FXSYS_iswalnum(wch))
278 continue;
279
280 if (wch != L'.' || i == pPos + 1) {
281 // Domain name should end before invalid char.
282 size_t host_end = i == pPos + 1 ? i - 2 : i - 1;
283 if (pPos > 0 && host_end - aPos.value() >= 3) {
284 // Trim the ending invalid chars if there is at least one '.' and name.
285 *str = str->First(host_end + 1);
286 break;
287 }
288 return false;
289 }
290 pPos = i;
291 }
292
293 if (!str->Contains(L"mailto:"))
294 *str = L"mailto:" + *str;
295
296 return true;
297 }
298
GetURL(size_t index) const299 WideString CPDF_LinkExtract::GetURL(size_t index) const {
300 return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl
301 : WideString();
302 }
303
GetRects(size_t index) const304 std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const {
305 if (index >= m_LinkArray.size())
306 return std::vector<CFX_FloatRect>();
307
308 return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start,
309 m_LinkArray[index].m_Count);
310 }
311
GetTextRange(size_t index,int * start_char_index,int * char_count) const312 bool CPDF_LinkExtract::GetTextRange(size_t index,
313 int* start_char_index,
314 int* char_count) const {
315 if (index >= m_LinkArray.size())
316 return false;
317 *start_char_index = m_LinkArray[index].m_Start;
318 *char_count = m_LinkArray[index].m_Count;
319 return true;
320 }
321