1 // Copyright 2016 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fpdftext/cpdf_linkextract.h"
8
9 #include <vector>
10
11 #include "core/fpdftext/cpdf_textpage.h"
12 #include "core/fxcrt/fx_extension.h"
13 #include "core/fxcrt/fx_string.h"
14 #include "core/fxcrt/fx_system.h"
15
16 namespace {
17
18 // Find the end of a web link starting from offset |start| and ending at offset
19 // |end|. The purpose of this function is to separate url from the surrounding
20 // context characters, we do not intend to fully validate the url. |str|
21 // contains lower case characters only.
FindWebLinkEnding(const WideString & str,size_t start,size_t end)22 size_t FindWebLinkEnding(const WideString& str, size_t start, size_t end) {
23 if (str.Contains(L'/', start)) {
24 // When there is a path and query after '/', most ASCII chars are allowed.
25 // We don't sanitize in this case.
26 return end;
27 }
28
29 // When there is no path, it only has IP address or host name.
30 // Port is optional at the end.
31 if (str[start] == L'[') {
32 // IPv6 reference.
33 // Find the end of the reference.
34 auto result = str.Find(L']', start + 1);
35 if (result.has_value()) {
36 end = result.value();
37 if (end > start + 1) { // Has content inside brackets.
38 size_t len = str.GetLength();
39 size_t off = end + 1;
40 if (off < len && str[off] == L':') {
41 off++;
42 while (off < len && str[off] >= L'0' && str[off] <= L'9')
43 off++;
44 if (off > end + 2 &&
45 off <= len) // At least one digit in port number.
46 end = off - 1; // |off| is offset of the first invalid char.
47 }
48 }
49 }
50 return end;
51 }
52
53 // According to RFC1123, host name only has alphanumeric chars, hyphens,
54 // and periods. Hyphen should not at the end though.
55 // Non-ASCII chars are ignored during checking.
56 while (end > start && str[end] < 0x80) {
57 if ((str[end] >= L'0' && str[end] <= L'9') ||
58 (str[end] >= L'a' && str[end] <= L'z') || str[end] == L'.')
59 break;
60 end--;
61 }
62 return end;
63 }
64
65 // Remove characters from the end of |str|, delimited by |start| and |end|, up
66 // to and including |charToFind|. No-op if |charToFind| is not present. Updates
67 // |end| if characters were removed.
TrimBackwardsToChar(const WideString & str,wchar_t charToFind,size_t start,size_t * end)68 void TrimBackwardsToChar(const WideString& str,
69 wchar_t charToFind,
70 size_t start,
71 size_t* end) {
72 for (size_t pos = *end; pos >= start; pos--) {
73 if (str[pos] == charToFind) {
74 *end = pos - 1;
75 break;
76 }
77 }
78 }
79
80 // Finds opening brackets ()[]{}<> and quotes "' before the URL delimited by
81 // |start| and |end| in |str|. Matches a closing bracket or quote for each
82 // opening character and, if present, removes everything afterwards. Returns the
83 // new end position for the string.
TrimExternalBracketsFromWebLink(const WideString & str,size_t start,size_t end)84 size_t TrimExternalBracketsFromWebLink(const WideString& str,
85 size_t start,
86 size_t end) {
87 for (size_t pos = 0; pos < start; pos++) {
88 if (str[pos] == '(') {
89 TrimBackwardsToChar(str, ')', start, &end);
90 } else if (str[pos] == '[') {
91 TrimBackwardsToChar(str, ']', start, &end);
92 } else if (str[pos] == '{') {
93 TrimBackwardsToChar(str, '}', start, &end);
94 } else if (str[pos] == '<') {
95 TrimBackwardsToChar(str, '>', start, &end);
96 } else if (str[pos] == '"') {
97 TrimBackwardsToChar(str, '"', start, &end);
98 } else if (str[pos] == '\'') {
99 TrimBackwardsToChar(str, '\'', start, &end);
100 }
101 }
102 return end;
103 }
104
105 } // namespace
106
CPDF_LinkExtract(const CPDF_TextPage * pTextPage)107 CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)
108 : m_pTextPage(pTextPage) {}
109
~CPDF_LinkExtract()110 CPDF_LinkExtract::~CPDF_LinkExtract() {}
111
ExtractLinks()112 void CPDF_LinkExtract::ExtractLinks() {
113 m_LinkArray.clear();
114 if (!m_pTextPage->IsParsed())
115 return;
116
117 m_strPageText = m_pTextPage->GetAllPageText();
118 if (m_strPageText.IsEmpty())
119 return;
120
121 ParseLink();
122 }
123
ParseLink()124 void CPDF_LinkExtract::ParseLink() {
125 int start = 0;
126 int pos = 0;
127 int nTotalChar = m_pTextPage->CountChars();
128 bool bAfterHyphen = false;
129 bool bLineBreak = false;
130 while (pos < nTotalChar) {
131 FPDF_CHAR_INFO pageChar;
132 m_pTextPage->GetCharInfo(pos, &pageChar);
133 if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED ||
134 pageChar.m_Unicode == TEXT_SPACE_CHAR || pos == nTotalChar - 1) {
135 int nCount = pos - start;
136 if (pos == nTotalChar - 1) {
137 nCount++;
138 } else if (bAfterHyphen && (pageChar.m_Unicode == TEXT_LINEFEED_CHAR ||
139 pageChar.m_Unicode == TEXT_RETURN_CHAR)) {
140 // Handle text breaks with a hyphen to the next line.
141 bLineBreak = true;
142 pos++;
143 continue;
144 }
145 WideString strBeCheck;
146 strBeCheck = m_pTextPage->GetPageText(start, nCount);
147 if (bLineBreak) {
148 strBeCheck.Remove(TEXT_LINEFEED_CHAR);
149 strBeCheck.Remove(TEXT_RETURN_CHAR);
150 bLineBreak = false;
151 }
152 // Replace the generated code with the hyphen char.
153 strBeCheck.Replace(L"\xfffe", TEXT_HYPHEN);
154
155 if (strBeCheck.GetLength() > 5) {
156 while (strBeCheck.GetLength() > 0) {
157 wchar_t ch = strBeCheck[strBeCheck.GetLength() - 1];
158 if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') {
159 strBeCheck = strBeCheck.Left(strBeCheck.GetLength() - 1);
160 nCount--;
161 } else {
162 break;
163 }
164 }
165 // Check for potential web URLs and email addresses.
166 // Ftp address, file system links, data, blob etc. are not checked.
167 if (nCount > 5) {
168 int32_t nStartOffset;
169 int32_t nCountOverload;
170 if (CheckWebLink(&strBeCheck, &nStartOffset, &nCountOverload)) {
171 m_LinkArray.push_back(
172 {start + nStartOffset, nCountOverload, strBeCheck});
173 } else if (CheckMailLink(&strBeCheck)) {
174 m_LinkArray.push_back({start, nCount, strBeCheck});
175 }
176 }
177 }
178 start = ++pos;
179 } else {
180 bAfterHyphen = (pageChar.m_Flag == FPDFTEXT_CHAR_HYPHEN ||
181 (pageChar.m_Flag == FPDFTEXT_CHAR_NORMAL &&
182 pageChar.m_Unicode == TEXT_HYPHEN_CHAR));
183 pos++;
184 }
185 }
186 }
187
CheckWebLink(WideString * strBeCheck,int32_t * nStart,int32_t * nCount)188 bool CPDF_LinkExtract::CheckWebLink(WideString* strBeCheck,
189 int32_t* nStart,
190 int32_t* nCount) {
191 static const wchar_t kHttpScheme[] = L"http";
192 static const size_t kHttpSchemeLen = FXSYS_len(kHttpScheme);
193 static const wchar_t kWWWAddrStart[] = L"www.";
194 static const size_t kWWWAddrStartLen = FXSYS_len(kWWWAddrStart);
195
196 WideString str = *strBeCheck;
197 str.MakeLower();
198
199 size_t len = str.GetLength();
200 // First, try to find the scheme.
201 auto start = str.Find(kHttpScheme);
202 if (start.has_value()) {
203 size_t off = start.value() + kHttpSchemeLen; // move after "http".
204 if (len > off + 4) { // At least "://<char>" follows.
205 if (str[off] == L's') // "https" scheme is accepted.
206 off++;
207 if (str[off] == L':' && str[off + 1] == L'/' && str[off + 2] == L'/') {
208 off += 3;
209 size_t end = TrimExternalBracketsFromWebLink(str, start.value(),
210 str.GetLength() - 1);
211 end = FindWebLinkEnding(str, off, end);
212 if (end > off) { // Non-empty host name.
213 *nStart = start.value();
214 *nCount = end - start.value() + 1;
215 *strBeCheck = strBeCheck->Mid(*nStart, *nCount);
216 return true;
217 }
218 }
219 }
220 }
221
222 // When there is no scheme, try to find url starting with "www.".
223 start = str.Find(kWWWAddrStart);
224 if (start.has_value() && len > start.value() + kWWWAddrStartLen) {
225 size_t end = TrimExternalBracketsFromWebLink(str, start.value(),
226 str.GetLength() - 1);
227 end = FindWebLinkEnding(str, start.value(), end);
228 if (end > start.value() + kWWWAddrStartLen) {
229 *nStart = start.value();
230 *nCount = end - start.value() + 1;
231 *strBeCheck = L"http://" + strBeCheck->Mid(*nStart, *nCount);
232 return true;
233 }
234 }
235 return false;
236 }
237
CheckMailLink(WideString * str)238 bool CPDF_LinkExtract::CheckMailLink(WideString* str) {
239 auto aPos = str->Find(L'@');
240 // Invalid when no '@' or when starts/ends with '@'.
241 if (!aPos.has_value() || aPos.value() == 0 || aPos == str->GetLength() - 1)
242 return false;
243
244 // Check the local part.
245 size_t pPos = aPos.value(); // Used to track the position of '@' or '.'.
246 for (size_t i = aPos.value(); i > 0; i--) {
247 wchar_t ch = (*str)[i - 1];
248 if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch))
249 continue;
250
251 if (ch != L'.' || i == pPos || i == 1) {
252 if (i == aPos.value()) {
253 // There is '.' or invalid char before '@'.
254 return false;
255 }
256 // End extracting for other invalid chars, '.' at the beginning, or
257 // consecutive '.'.
258 size_t removed_len = i == pPos ? i + 1 : i;
259 *str = str->Right(str->GetLength() - removed_len);
260 break;
261 }
262 // Found a valid '.'.
263 pPos = i - 1;
264 }
265
266 // Check the domain name part.
267 aPos = str->Find(L'@');
268 if (!aPos.has_value() || aPos.value() == 0)
269 return false;
270
271 str->TrimRight(L'.');
272 // At least one '.' in domain name, but not at the beginning.
273 // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
274 // Check whether we should remove this check.
275 auto ePos = str->Find(L'.', aPos.value() + 1);
276 if (!ePos.has_value() || ePos.value() == aPos.value() + 1)
277 return false;
278
279 // Validate all other chars in domain name.
280 size_t nLen = str->GetLength();
281 pPos = 0; // Used to track the position of '.'.
282 for (size_t i = aPos.value() + 1; i < nLen; i++) {
283 wchar_t wch = (*str)[i];
284 if (wch == L'-' || FXSYS_iswalnum(wch))
285 continue;
286
287 if (wch != L'.' || i == pPos + 1) {
288 // Domain name should end before invalid char.
289 size_t host_end = i == pPos + 1 ? i - 2 : i - 1;
290 if (pPos > 0 && host_end - aPos.value() >= 3) {
291 // Trim the ending invalid chars if there is at least one '.' and name.
292 *str = str->Left(host_end + 1);
293 break;
294 }
295 return false;
296 }
297 pPos = i;
298 }
299
300 if (!str->Contains(L"mailto:"))
301 *str = L"mailto:" + *str;
302
303 return true;
304 }
305
GetURL(size_t index) const306 WideString CPDF_LinkExtract::GetURL(size_t index) const {
307 return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl : L"";
308 }
309
GetRects(size_t index) const310 std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const {
311 if (index >= m_LinkArray.size())
312 return std::vector<CFX_FloatRect>();
313
314 return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start,
315 m_LinkArray[index].m_Count);
316 }
317