// Copyright 2015 The PDFium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4 
5 #include "core/fpdftext/cpdf_linkextract.h"
6 
7 #include <utility>
8 
9 #include "testing/gtest/include/gtest/gtest.h"
10 
11 // Class to help test functions in CPDF_LinkExtract class.
12 class CPDF_TestLinkExtract final : public CPDF_LinkExtract {
13  public:
CPDF_TestLinkExtract()14   CPDF_TestLinkExtract() : CPDF_LinkExtract(nullptr) {}
15 
16  private:
17   // Add test cases as friends to access protected member functions.
18   // Access CheckMailLink and CheckWebLink.
19   FRIEND_TEST(CPDFLinkExtractTest, CheckMailLink);
20   FRIEND_TEST(CPDFLinkExtractTest, CheckWebLink);
21 };
22 
// Exercises CheckMailLink() through the test subclass:
//  - every entry of kInvalidStrings must be rejected (returns false);
//  - every entry of kValidStrings must be accepted (returns true) and the
//    string passed in must afterwards equal "mailto:" + expected_output.
TEST(CPDFLinkExtractTest, CheckMailLink) {
  CPDF_TestLinkExtract extractor;
  // Check cases that fail to extract valid mail link.
  const wchar_t* const kInvalidStrings[] = {
      L"",
      L"peter.pan",       // '@' is required.
      L"abc@server",      // Domain name needs at least one '.'.
      L"abc.@gmail.com",  // '.' can not immediately precede '@'.
      L"abc@xyz&q.org",   // Domain name should not contain '&'.
      L"abc@.xyz.org",    // Domain name should not start with '.'.
      L"fan@g..com"       // Domain name should not have consecutive '.'
  };
  for (const wchar_t* input : kInvalidStrings) {
    WideString text_str(input);
    EXPECT_FALSE(extractor.CheckMailLink(&text_str)) << input;
  }

  // A struct of {input_string, expected_extracted_email_address}.
  struct IOPair {
    const wchar_t* input;
    const wchar_t* expected_output;
  };
  // Check cases that can extract valid mail link.
  constexpr IOPair kValidStrings[] = {
      {L"peter@abc.d", L"peter@abc.d"},
      {L"red.teddy.b@abc.com", L"red.teddy.b@abc.com"},
      {L"abc_@gmail.com", L"abc_@gmail.com"},  // '_' is ok before '@'.
      {L"dummy-hi@gmail.com",
       L"dummy-hi@gmail.com"},                  // '-' is ok in user name.
      {L"a..df@gmail.com", L"df@gmail.com"},    // Stop at consecutive '.'.
      {L".john@yahoo.com", L"john@yahoo.com"},  // Remove heading '.'.
      {L"abc@xyz.org?/", L"abc@xyz.org"},       // Trim ending invalid chars.
      {L"fan{abc@xyz.org", L"abc@xyz.org"},     // Trim beginning invalid chars.
      {L"fan@g.com..", L"fan@g.com"},           // Trim the ending periods.
      {L"CAP.cap@Gmail.Com", L"CAP.cap@Gmail.Com"},  // Keep the original case.
  };
  for (const auto& it : kValidStrings) {
    WideString text_str(it.input);
    WideString expected_str(L"mailto:");
    expected_str += it.expected_output;
    EXPECT_TRUE(extractor.CheckMailLink(&text_str)) << it.input;
    // Stream the input here as well so that a mismatch identifies the failing
    // table row, matching the other assertions in this test.
    EXPECT_EQ(expected_str.c_str(), text_str) << it.input;
  }
}
67 
TEST(CPDFLinkExtractTest,CheckWebLink)68 TEST(CPDFLinkExtractTest, CheckWebLink) {
69   CPDF_TestLinkExtract extractor;
70   // Check cases that fail to extract valid web link.
71   // The last few are legit web addresses that we don't handle now.
72   const wchar_t* const kInvalidCases[] = {
73       L"",                          // Blank.
74       L"http",                      // No colon.
75       L"www.",                      // Missing domain.
76       L"https-and-www",             // Dash not colon.
77       L"http:/abc.com",             // Missing slash.
78       L"http://((()),",             // Only invalid chars in host name.
79       L"ftp://example.com",         // Ftp scheme is not supported.
80       L"http:example.com",          // Missing slashes.
81       L"http//[example.com",        // Invalid IPv6 address.
82       L"http//[00:00:00:00:00:00",  // Invalid IPv6 address.
83       L"http//[]",                  // Empty IPv6 address.
84       L"abc.example.com",           // URL without scheme.
85   };
86   for (const wchar_t* input : kInvalidCases) {
87     auto maybe_link = extractor.CheckWebLink(input);
88     EXPECT_FALSE(maybe_link.has_value()) << input;
89   }
90 
91   // Check cases that can extract valid web link.
92   // An array of {input_string, expected_extracted_web_link}.
93   struct ValidCase {
94     const wchar_t* const input_string;
95     const wchar_t* const url_extracted;
96     const size_t start_offset;
97     const size_t count;
98   };
99   const ValidCase kValidCases[] = {
100       {L"http://www.example.com", L"http://www.example.com", 0,
101        22},  // standard URL.
102       {L"http://www.example.com:88", L"http://www.example.com:88", 0,
103        25},  // URL with port number.
104       {L"http://test@www.example.com", L"http://test@www.example.com", 0,
105        27},  // URL with username.
106       {L"http://test:test@example.com", L"http://test:test@example.com", 0,
107        28},  // URL with username and password.
108       {L"http://example", L"http://example", 0,
109        14},  // URL with short domain name.
110       {L"http////www.server", L"http://www.server", 8,
111        10},  // URL starts with "www.".
112       {L"http:/www.abc.com", L"http://www.abc.com", 6,
113        11},                                       // URL starts with "www.".
114       {L"www.a.b.c", L"http://www.a.b.c", 0, 9},  // URL starts with "www.".
115       {L"https://a.us", L"https://a.us", 0, 12},  // Secure http URL.
116       {L"https://www.t.us", L"https://www.t.us", 0, 16},  // Secure http URL.
117       {L"www.example-test.com", L"http://www.example-test.com", 0,
118        20},  // '-' in host is ok.
119       {L"www.example.com,", L"http://www.example.com", 0,
120        15},  // Trim ending invalid chars.
121       {L"www.example.com;(", L"http://www.example.com", 0,
122        15},  // Trim ending invalid chars.
123       {L"test:www.abc.com", L"http://www.abc.com", 5,
124        11},  // Trim chars before URL.
125       {L"(http://www.abc.com)", L"http://www.abc.com", 1,
126        18},  // Trim external brackets.
127       {L"0(http://www.abc.com)0", L"http://www.abc.com", 2,
128        18},  // Trim chars outside brackets as well.
129       {L"0(www.abc.com)0", L"http://www.abc.com", 2,
130        11},  // Links without http should also have brackets trimmed.
131       {L"http://www.abc.com)0", L"http://www.abc.com)0", 0,
132        20},  // Do not trim brackets that were not opened.
133       {L"{(<http://www.abc.com>)}", L"http://www.abc.com", 3,
134        18},  // Trim chars with multiple levels of brackets.
135       {L"[http://www.abc.com/z(1)]", L"http://www.abc.com/z(1)", 1,
136        23},  // Brackets opened inside the URL should not be trimmed.
137       {L"(http://www.abc.com/z(1))", L"http://www.abc.com/z(1)", 1,
138        23},  // Brackets opened inside the URL should not be trimmed.
139       {L"\"http://www.abc.com\"", L"http://www.abc.com", 1,
140        18},  // External quotes can also be escaped
141       {L"www.g.com..", L"http://www.g.com..", 0, 11},  // Leave ending periods.
142 
143       // Web links can contain IP addresses too.
144       {L"http://192.168.0.1", L"http://192.168.0.1", 0, 18},  // IPv4 address.
145       {L"http://192.168.0.1:80", L"http://192.168.0.1:80", 0,
146        21},  // IPv4 address with port.
147       {L"http://[aa::00:bb::00:cc:00]", L"http://[aa::00:bb::00:cc:00]", 0,
148        28},  // IPv6 reference.
149       {L"http://[aa::00:bb::00:cc:00]:12", L"http://[aa::00:bb::00:cc:00]:12",
150        0, 31},  // IPv6 reference with port.
151       {L"http://[aa]:12", L"http://[aa]:12", 0,
152        14},  // Not validate IP address.
153       {L"http://[aa]:12abc", L"http://[aa]:12", 0,
154        14},                                      // Trim for IPv6 address.
155       {L"http://[aa]:", L"http://[aa]", 0, 11},  // Trim for IPv6 address.
156 
157       // Path and query parts can be anything.
158       {L"www.abc.com/#%%^&&*(", L"http://www.abc.com/#%%^&&*(", 0, 20},
159       {L"www.a.com/#a=@?q=rr&r=y", L"http://www.a.com/#a=@?q=rr&r=y", 0, 23},
160       {L"http://a.com/1/2/3/4\5\6", L"http://a.com/1/2/3/4\5\6", 0, 22},
161       {L"http://www.example.com/foo;bar", L"http://www.example.com/foo;bar", 0,
162        30},
163 
164       // Invalid chars inside host name are ok as we don't validate them.
165       {L"http://ex[am]ple", L"http://ex[am]ple", 0, 16},
166       {L"http://:example.com", L"http://:example.com", 0, 19},
167       {L"http://((())/path?", L"http://((())/path?", 0, 18},
168       {L"http:////abc.server", L"http:////abc.server", 0, 19},
169 
170       // Non-ASCII chars are not validated either.
171       {L"www.测试.net", L"http://www.测试.net", 0, 10},
172       {L"www.测试。net。", L"http://www.测试。net。", 0, 11},
173       {L"www.测试.net;", L"http://www.测试.net;", 0, 11},
174   };
175   for (const auto& it : kValidCases) {
176     auto maybe_link = extractor.CheckWebLink(it.input_string);
177     ASSERT_TRUE(maybe_link.has_value()) << it.input_string;
178     EXPECT_EQ(it.url_extracted, maybe_link.value().m_strUrl);
179     EXPECT_EQ(it.start_offset, maybe_link.value().m_Start) << it.input_string;
180     EXPECT_EQ(it.count, maybe_link.value().m_Count) << it.input_string;
181   }
182 }
183