• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fpdftext/cpdf_linkextract.h"
8 
9 #include <wchar.h>
10 
11 #include <vector>
12 
13 #include "core/fpdftext/cpdf_textpage.h"
14 #include "core/fxcrt/fx_extension.h"
15 #include "core/fxcrt/fx_string.h"
16 #include "core/fxcrt/fx_system.h"
17 
18 namespace {
19 
20 // Find the end of a web link starting from offset |start| and ending at offset
21 // |end|. The purpose of this function is to separate url from the surrounding
22 // context characters, we do not intend to fully validate the url. |str|
23 // contains lower case characters only.
FindWebLinkEnding(const WideString & str,size_t start,size_t end)24 size_t FindWebLinkEnding(const WideString& str, size_t start, size_t end) {
25   if (str.Contains(L'/', start)) {
26     // When there is a path and query after '/', most ASCII chars are allowed.
27     // We don't sanitize in this case.
28     return end;
29   }
30 
31   // When there is no path, it only has IP address or host name.
32   // Port is optional at the end.
33   if (str[start] == L'[') {
34     // IPv6 reference.
35     // Find the end of the reference.
36     auto result = str.Find(L']', start + 1);
37     if (result.has_value()) {
38       end = result.value();
39       if (end > start + 1) {  // Has content inside brackets.
40         size_t len = str.GetLength();
41         size_t off = end + 1;
42         if (off < len && str[off] == L':') {
43           off++;
44           while (off < len && FXSYS_IsDecimalDigit(str[off]))
45             off++;
46           if (off > end + 2 &&
47               off <= len)   // At least one digit in port number.
48             end = off - 1;  // |off| is offset of the first invalid char.
49         }
50       }
51     }
52     return end;
53   }
54 
55   // According to RFC1123, host name only has alphanumeric chars, hyphens,
56   // and periods. Hyphen should not at the end though.
57   // Non-ASCII chars are ignored during checking.
58   while (end > start && str[end] < 0x80) {
59     if (FXSYS_IsDecimalDigit(str[end]) ||
60         (str[end] >= L'a' && str[end] <= L'z') || str[end] == L'.') {
61       break;
62     }
63     end--;
64   }
65   return end;
66 }
67 
68 // Remove characters from the end of |str|, delimited by |start| and |end|, up
69 // to and including |charToFind|. No-op if |charToFind| is not present. Updates
70 // |end| if characters were removed.
TrimBackwardsToChar(const WideString & str,wchar_t charToFind,size_t start,size_t * end)71 void TrimBackwardsToChar(const WideString& str,
72                          wchar_t charToFind,
73                          size_t start,
74                          size_t* end) {
75   for (size_t pos = *end; pos >= start; pos--) {
76     if (str[pos] == charToFind) {
77       *end = pos - 1;
78       break;
79     }
80   }
81 }
82 
83 // Finds opening brackets ()[]{}<> and quotes "'  before the URL delimited by
84 // |start| and |end| in |str|. Matches a closing bracket or quote for each
85 // opening character and, if present, removes everything afterwards. Returns the
86 // new end position for the string.
TrimExternalBracketsFromWebLink(const WideString & str,size_t start,size_t end)87 size_t TrimExternalBracketsFromWebLink(const WideString& str,
88                                        size_t start,
89                                        size_t end) {
90   for (size_t pos = 0; pos < start; pos++) {
91     if (str[pos] == '(') {
92       TrimBackwardsToChar(str, ')', start, &end);
93     } else if (str[pos] == '[') {
94       TrimBackwardsToChar(str, ']', start, &end);
95     } else if (str[pos] == '{') {
96       TrimBackwardsToChar(str, '}', start, &end);
97     } else if (str[pos] == '<') {
98       TrimBackwardsToChar(str, '>', start, &end);
99     } else if (str[pos] == '"') {
100       TrimBackwardsToChar(str, '"', start, &end);
101     } else if (str[pos] == '\'') {
102       TrimBackwardsToChar(str, '\'', start, &end);
103     }
104   }
105   return end;
106 }
107 
108 }  // namespace
109 
CPDF_LinkExtract(const CPDF_TextPage * pTextPage)110 CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)
111     : m_pTextPage(pTextPage) {}
112 
113 CPDF_LinkExtract::~CPDF_LinkExtract() = default;
114 
ExtractLinks()115 void CPDF_LinkExtract::ExtractLinks() {
116   m_LinkArray.clear();
117   size_t start = 0;
118   size_t pos = 0;
119   bool bAfterHyphen = false;
120   bool bLineBreak = false;
121   const size_t nTotalChar = m_pTextPage->CountChars();
122   const WideString page_text = m_pTextPage->GetAllPageText();
123   while (pos < nTotalChar) {
124     const CPDF_TextPage::CharInfo& char_info = m_pTextPage->GetCharInfo(pos);
125     if (char_info.char_type() != CPDF_TextPage::CharType::kGenerated &&
126         char_info.unicode() != L' ' && pos != nTotalChar - 1) {
127       bAfterHyphen =
128           (char_info.char_type() == CPDF_TextPage::CharType::kHyphen ||
129            (char_info.char_type() == CPDF_TextPage::CharType::kNormal &&
130             char_info.unicode() == L'-'));
131       ++pos;
132       continue;
133     }
134 
135     size_t nCount = pos - start;
136     if (pos == nTotalChar - 1) {
137       ++nCount;
138     } else if (bAfterHyphen &&
139                (char_info.unicode() == L'\n' || char_info.unicode() == L'\r')) {
140       // Handle text breaks with a hyphen to the next line.
141       bLineBreak = true;
142       ++pos;
143       continue;
144     }
145 
146     WideString strBeCheck = page_text.Substr(start, nCount);
147     if (bLineBreak) {
148       strBeCheck.Remove(L'\n');
149       strBeCheck.Remove(L'\r');
150       bLineBreak = false;
151     }
152     // Replace the generated code with the hyphen char.
153     strBeCheck.Replace(L"\xfffe", L"-");
154 
155     if (strBeCheck.GetLength() > 5) {
156       while (strBeCheck.GetLength() > 0) {
157         wchar_t ch = strBeCheck.Back();
158         if (ch != L')' && ch != L',' && ch != L'>' && ch != L'.')
159           break;
160 
161         strBeCheck = strBeCheck.First(strBeCheck.GetLength() - 1);
162         nCount--;
163       }
164 
165       // Check for potential web URLs and email addresses.
166       // Ftp address, file system links, data, blob etc. are not checked.
167       if (nCount > 5) {
168         auto maybe_link = CheckWebLink(strBeCheck);
169         if (maybe_link.has_value()) {
170           maybe_link.value().m_Start += start;
171           m_LinkArray.push_back(maybe_link.value());
172         } else if (CheckMailLink(&strBeCheck)) {
173           m_LinkArray.push_back(Link{{start, nCount}, strBeCheck});
174         }
175       }
176     }
177     start = ++pos;
178   }
179 }
180 
CheckWebLink(const WideString & strBeCheck)181 std::optional<CPDF_LinkExtract::Link> CPDF_LinkExtract::CheckWebLink(
182     const WideString& strBeCheck) {
183   static const wchar_t kHttpScheme[] = L"http";
184   static const wchar_t kWWWAddrStart[] = L"www.";
185 
186   const size_t kHttpSchemeLen = wcslen(kHttpScheme);
187   const size_t kWWWAddrStartLen = wcslen(kWWWAddrStart);
188 
189   WideString str = strBeCheck;
190   str.MakeLower();
191 
192   // First, try to find the scheme.
193   auto start = str.Find(kHttpScheme);
194   if (start.has_value()) {
195     size_t off = start.value() + kHttpSchemeLen;  // move after "http".
196     if (str.GetLength() > off + 4) {  // At least "://<char>" follows.
197       if (str[off] == L's')  // "https" scheme is accepted.
198         off++;
199       if (str[off] == L':' && str[off + 1] == L'/' && str[off + 2] == L'/') {
200         off += 3;
201         const size_t end =
202             FindWebLinkEnding(str, off,
203                               TrimExternalBracketsFromWebLink(
204                                   str, start.value(), str.GetLength() - 1));
205         if (end > off) {  // Non-empty host name.
206           const size_t nStart = start.value();
207           const size_t nCount = end - nStart + 1;
208           return Link{{nStart, nCount}, strBeCheck.Substr(nStart, nCount)};
209         }
210       }
211     }
212   }
213 
214   // When there is no scheme, try to find url starting with "www.".
215   start = str.Find(kWWWAddrStart);
216   if (start.has_value()) {
217     size_t off = start.value() + kWWWAddrStartLen;
218     if (str.GetLength() > off) {
219       const size_t end =
220           FindWebLinkEnding(str, start.value(),
221                             TrimExternalBracketsFromWebLink(
222                                 str, start.value(), str.GetLength() - 1));
223       if (end > off) {
224         const size_t nStart = start.value();
225         const size_t nCount = end - nStart + 1;
226         return Link{{nStart, nCount},
227                     L"http://" + strBeCheck.Substr(nStart, nCount)};
228       }
229     }
230   }
231 
232   return std::nullopt;
233 }
234 
CheckMailLink(WideString * str)235 bool CPDF_LinkExtract::CheckMailLink(WideString* str) {
236   auto aPos = str->Find(L'@');
237   // Invalid when no '@' or when starts/ends with '@'.
238   if (!aPos.has_value() || aPos.value() == 0 || aPos == str->GetLength() - 1)
239     return false;
240 
241   // Check the local part.
242   size_t pPos = aPos.value();  // Used to track the position of '@' or '.'.
243   for (size_t i = aPos.value(); i > 0; i--) {
244     wchar_t ch = (*str)[i - 1];
245     if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch))
246       continue;
247 
248     if (ch != L'.' || i == pPos || i == 1) {
249       if (i == aPos.value()) {
250         // There is '.' or invalid char before '@'.
251         return false;
252       }
253       // End extracting for other invalid chars, '.' at the beginning, or
254       // consecutive '.'.
255       size_t removed_len = i == pPos ? i + 1 : i;
256       *str = str->Last(str->GetLength() - removed_len);
257       break;
258     }
259     // Found a valid '.'.
260     pPos = i - 1;
261   }
262 
263   // Check the domain name part.
264   aPos = str->Find(L'@');
265   if (!aPos.has_value() || aPos.value() == 0)
266     return false;
267 
268   str->TrimBack(L'.');
269   // At least one '.' in domain name, but not at the beginning.
270   // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
271   // Check whether we should remove this check.
272   auto ePos = str->Find(L'.', aPos.value() + 1);
273   if (!ePos.has_value() || ePos.value() == aPos.value() + 1)
274     return false;
275 
276   // Validate all other chars in domain name.
277   size_t nLen = str->GetLength();
278   pPos = 0;  // Used to track the position of '.'.
279   for (size_t i = aPos.value() + 1; i < nLen; i++) {
280     wchar_t wch = (*str)[i];
281     if (wch == L'-' || FXSYS_iswalnum(wch))
282       continue;
283 
284     if (wch != L'.' || i == pPos + 1) {
285       // Domain name should end before invalid char.
286       size_t host_end = i == pPos + 1 ? i - 2 : i - 1;
287       if (pPos > 0 && host_end - aPos.value() >= 3) {
288         // Trim the ending invalid chars if there is at least one '.' and name.
289         *str = str->First(host_end + 1);
290         break;
291       }
292       return false;
293     }
294     pPos = i;
295   }
296 
297   if (!str->Contains(L"mailto:"))
298     *str = L"mailto:" + *str;
299 
300   return true;
301 }
302 
GetURL(size_t index) const303 WideString CPDF_LinkExtract::GetURL(size_t index) const {
304   return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl
305                                     : WideString();
306 }
307 
GetRects(size_t index) const308 std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const {
309   if (index >= m_LinkArray.size())
310     return std::vector<CFX_FloatRect>();
311 
312   return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start,
313                                    m_LinkArray[index].m_Count);
314 }
315 
GetTextRange(size_t index) const316 std::optional<CPDF_LinkExtract::Range> CPDF_LinkExtract::GetTextRange(
317     size_t index) const {
318   if (index >= m_LinkArray.size())
319     return std::nullopt;
320   return m_LinkArray[index];
321 }
322