1 // Copyright 2015 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include <memory>
6
7 #include "core/fxcrt/fx_memory.h"
8 #include "public/fpdf_text.h"
9 #include "public/fpdfview.h"
10 #include "testing/embedder_test.h"
11 #include "testing/gtest/include/gtest/gtest.h"
12 #include "testing/test_support.h"
13
namespace {

// Returns true when the first |length| elements of |actual| equal the
// corresponding characters of |expected|.
//
// |length| may cover the terminating NUL of |expected|, i.e. it may be at most
// strlen(expected) + 1. Anything longer is rejected so that |expected| is
// never read past its terminator. Null pointers are rejected as well.
bool check_unsigned_shorts(const char* expected,
                           const unsigned short* actual,
                           size_t length) {
  if (!expected || !actual)
    return false;
  if (length > strlen(expected) + 1)
    return false;

  for (size_t i = 0; i < length; ++i) {
    // Widen via unsigned char: where plain char is signed, a byte >= 0x80
    // would otherwise sign-extend to 0xFFxx and never match.
    const unsigned short wanted =
        static_cast<unsigned short>(static_cast<unsigned char>(expected[i]));
    if (actual[i] != wanted)
      return false;
  }
  return true;
}

}  // namespace
30
// Fixture for the text and web-link embedder tests in this file. It adds no
// members of its own on top of EmbedderTest.
class FPDFTextEmbeddertest : public EmbedderTest {};
32
// Exercises text extraction, per-character queries, hit-testing and rect
// lookups of the FPDFText_* API against hello_world.pdf.
TEST_F(FPDFTextEmbeddertest, Text) {
  // The document, page and text page are preconditions for everything below,
  // so stop the test on failure rather than passing null handles onwards.
  ASSERT_TRUE(OpenDocument("hello_world.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
  ASSERT_TRUE(textpage);

  static const char expected[] = "Hello, world!\r\nGoodbye, world!";
  unsigned short fixed_buffer[128];
  memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));

  // Check that edge cases are handled gracefully.
  EXPECT_EQ(0, FPDFText_GetText(textpage, 0, 128, nullptr));
  EXPECT_EQ(0, FPDFText_GetText(textpage, -1, 128, fixed_buffer));
  EXPECT_EQ(0, FPDFText_GetText(textpage, 0, -1, fixed_buffer));
  EXPECT_EQ(1, FPDFText_GetText(textpage, 0, 0, fixed_buffer));
  EXPECT_EQ(0, fixed_buffer[0]);

  // Keep going and check the next case.
  memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
  EXPECT_EQ(2, FPDFText_GetText(textpage, 0, 1, fixed_buffer));
  EXPECT_EQ(expected[0], fixed_buffer[0]);
  EXPECT_EQ(0, fixed_buffer[1]);

  // Check includes the terminating NUL that is provided.
  int num_chars = FPDFText_GetText(textpage, 0, 128, fixed_buffer);
  ASSERT_GE(num_chars, 0);
  EXPECT_EQ(sizeof(expected), static_cast<size_t>(num_chars));
  EXPECT_TRUE(check_unsigned_shorts(expected, fixed_buffer, sizeof(expected)));

  // Count does not include the terminating NUL in the string literal.
  EXPECT_EQ(sizeof(expected) - 1,
            static_cast<size_t>(FPDFText_CountChars(textpage)));
  for (size_t i = 0; i < sizeof(expected) - 1; ++i) {
    EXPECT_EQ(static_cast<unsigned int>(expected[i]),
              FPDFText_GetUnicode(textpage, i))
        << " at " << i;
  }

  // Extract a prefix into a separate, smaller buffer. "Hello" plus the
  // terminating NUL occupies 6 elements. Pre-fill the buffer that is actually
  // passed in, so stale stack contents cannot satisfy the comparison below.
  static const char small_expected[] = "Hello";
  unsigned short small_buffer[12];
  memset(small_buffer, 0xbd, sizeof(small_buffer));
  EXPECT_EQ(6, FPDFText_GetText(textpage, 0, 5, small_buffer));
  EXPECT_TRUE(check_unsigned_shorts(small_expected, small_buffer,
                                    sizeof(small_expected)));

  EXPECT_EQ(12.0, FPDFText_GetFontSize(textpage, 0));
  EXPECT_EQ(16.0, FPDFText_GetFontSize(textpage, 15));

  // Failed char-box lookups are expected to leave the outputs at 0.0.
  double left = 0.0;
  double right = 0.0;
  double bottom = 0.0;
  double top = 0.0;
  EXPECT_FALSE(FPDFText_GetCharBox(nullptr, 4, &left, &right, &bottom, &top));
  EXPECT_DOUBLE_EQ(0.0, left);
  EXPECT_DOUBLE_EQ(0.0, right);
  EXPECT_DOUBLE_EQ(0.0, bottom);
  EXPECT_DOUBLE_EQ(0.0, top);
  EXPECT_FALSE(FPDFText_GetCharBox(textpage, -1, &left, &right, &bottom, &top));
  EXPECT_DOUBLE_EQ(0.0, left);
  EXPECT_DOUBLE_EQ(0.0, right);
  EXPECT_DOUBLE_EQ(0.0, bottom);
  EXPECT_DOUBLE_EQ(0.0, top);
  EXPECT_FALSE(FPDFText_GetCharBox(textpage, 55, &left, &right, &bottom, &top));
  EXPECT_DOUBLE_EQ(0.0, left);
  EXPECT_DOUBLE_EQ(0.0, right);
  EXPECT_DOUBLE_EQ(0.0, bottom);
  EXPECT_DOUBLE_EQ(0.0, top);

  EXPECT_TRUE(FPDFText_GetCharBox(textpage, 4, &left, &right, &bottom, &top));
  EXPECT_NEAR(41.071, left, 0.001);
  EXPECT_NEAR(46.243, right, 0.001);
  EXPECT_NEAR(49.844, bottom, 0.001);
  EXPECT_NEAR(55.520, top, 0.001);

  double x = 0.0;
  double y = 0.0;
  EXPECT_TRUE(FPDFText_GetCharOrigin(textpage, 4, &x, &y));
  EXPECT_NEAR(40.664, x, 0.001);
  EXPECT_NEAR(50.000, y, 0.001);

  EXPECT_EQ(4, FPDFText_GetCharIndexAtPos(textpage, 42.0, 50.0, 1.0, 1.0));
  EXPECT_EQ(-1, FPDFText_GetCharIndexAtPos(textpage, 0.0, 0.0, 1.0, 1.0));
  EXPECT_EQ(-1, FPDFText_GetCharIndexAtPos(textpage, 199.0, 199.0, 1.0, 1.0));

  // Test out of range positions.
  EXPECT_EQ(-1,
            FPDFText_GetCharIndexAtPos(textpage, 42.0, 10000000.0, 1.0, 1.0));
  EXPECT_EQ(-1, FPDFText_GetCharIndexAtPos(textpage, -1.0, 50.0, 1.0, 1.0));

  // Count does not include the terminating NUL in the string literal.
  EXPECT_EQ(2, FPDFText_CountRects(textpage, 0, sizeof(expected) - 1));

  left = 0.0;
  right = 0.0;
  bottom = 0.0;
  top = 0.0;
  EXPECT_TRUE(FPDFText_GetRect(textpage, 1, &left, &top, &right, &bottom));
  EXPECT_NEAR(20.847, left, 0.001);
  EXPECT_NEAR(135.167, right, 0.001);
  EXPECT_NEAR(96.655, bottom, 0.001);
  EXPECT_NEAR(116.000, top, 0.001);

  // Test out of range indices set outputs to (0.0, 0.0, 0.0, 0.0).
  left = -1.0;
  right = -1.0;
  bottom = -1.0;
  top = -1.0;
  EXPECT_FALSE(FPDFText_GetRect(textpage, -1, &left, &top, &right, &bottom));
  EXPECT_EQ(0.0, left);
  EXPECT_EQ(0.0, right);
  EXPECT_EQ(0.0, bottom);
  EXPECT_EQ(0.0, top);

  left = -2.0;
  right = -2.0;
  bottom = -2.0;
  top = -2.0;
  EXPECT_FALSE(FPDFText_GetRect(textpage, 2, &left, &top, &right, &bottom));
  EXPECT_EQ(0.0, left);
  EXPECT_EQ(0.0, right);
  EXPECT_EQ(0.0, bottom);
  EXPECT_EQ(0.0, top);

  // With no buffer, only the count is queried.
  EXPECT_EQ(9, FPDFText_GetBoundedText(textpage, 41.0, 56.0, 82.0, 48.0,
                                       nullptr, 0));

  // Extract starting at character 4 as above.
  memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
  EXPECT_EQ(1, FPDFText_GetBoundedText(textpage, 41.0, 56.0, 82.0, 48.0,
                                       fixed_buffer, 1));
  EXPECT_TRUE(check_unsigned_shorts(expected + 4, fixed_buffer, 1));
  EXPECT_EQ(0xbdbd, fixed_buffer[1]);

  memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
  EXPECT_EQ(9, FPDFText_GetBoundedText(textpage, 41.0, 56.0, 82.0, 48.0,
                                       fixed_buffer, 9));
  EXPECT_TRUE(check_unsigned_shorts(expected + 4, fixed_buffer, 9));
  EXPECT_EQ(0xbdbd, fixed_buffer[9]);

  memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
  EXPECT_EQ(10, FPDFText_GetBoundedText(textpage, 41.0, 56.0, 82.0, 48.0,
                                        fixed_buffer, 128));
  EXPECT_TRUE(check_unsigned_shorts(expected + 4, fixed_buffer, 9));
  EXPECT_EQ(0u, fixed_buffer[9]);
  EXPECT_EQ(0xbdbd, fixed_buffer[10]);

  FPDFText_ClosePage(textpage);
  UnloadPage(page);
}
186
// Exercises FPDFText_FindStart/FindNext/FindPrev and the result accessors
// against hello_world.pdf, including the match-case and whole-word flags.
TEST_F(FPDFTextEmbeddertest, TextSearch) {
  // Handles are preconditions for everything below; stop on failure instead of
  // passing null handles onwards.
  ASSERT_TRUE(OpenDocument("hello_world.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
  ASSERT_TRUE(textpage);

  std::unique_ptr<unsigned short, pdfium::FreeDeleter> nope =
      GetFPDFWideString(L"nope");
  std::unique_ptr<unsigned short, pdfium::FreeDeleter> world =
      GetFPDFWideString(L"world");
  std::unique_ptr<unsigned short, pdfium::FreeDeleter> world_caps =
      GetFPDFWideString(L"WORLD");
  std::unique_ptr<unsigned short, pdfium::FreeDeleter> world_substr =
      GetFPDFWideString(L"orld");

  // No occurrences of "nope" in test page.
  FPDF_SCHHANDLE search = FPDFText_FindStart(textpage, nope.get(), 0, 0);
  ASSERT_TRUE(search);
  EXPECT_EQ(0, FPDFText_GetSchResultIndex(search));
  EXPECT_EQ(0, FPDFText_GetSchCount(search));

  // Advancing finds nothing.
  EXPECT_FALSE(FPDFText_FindNext(search));
  EXPECT_EQ(0, FPDFText_GetSchResultIndex(search));
  EXPECT_EQ(0, FPDFText_GetSchCount(search));

  // Retreating finds nothing.
  EXPECT_FALSE(FPDFText_FindPrev(search));
  EXPECT_EQ(0, FPDFText_GetSchResultIndex(search));
  EXPECT_EQ(0, FPDFText_GetSchCount(search));
  FPDFText_FindClose(search);

  // Two occurrences of "world" in test page.
  search = FPDFText_FindStart(textpage, world.get(), 0, 2);
  ASSERT_TRUE(search);

  // Remains not found until advanced.
  EXPECT_EQ(0, FPDFText_GetSchResultIndex(search));
  EXPECT_EQ(0, FPDFText_GetSchCount(search));

  // First occurrence of "world" in this test page.
  EXPECT_TRUE(FPDFText_FindNext(search));
  EXPECT_EQ(7, FPDFText_GetSchResultIndex(search));
  EXPECT_EQ(5, FPDFText_GetSchCount(search));

  // Last occurrence of "world" in this test page.
  EXPECT_TRUE(FPDFText_FindNext(search));
  EXPECT_EQ(24, FPDFText_GetSchResultIndex(search));
  EXPECT_EQ(5, FPDFText_GetSchCount(search));

  // Found position unchanged when fails to advance.
  EXPECT_FALSE(FPDFText_FindNext(search));
  EXPECT_EQ(24, FPDFText_GetSchResultIndex(search));
  EXPECT_EQ(5, FPDFText_GetSchCount(search));

  // Back to first occurrence.
  EXPECT_TRUE(FPDFText_FindPrev(search));
  EXPECT_EQ(7, FPDFText_GetSchResultIndex(search));
  EXPECT_EQ(5, FPDFText_GetSchCount(search));

  // Found position unchanged when fails to retreat.
  EXPECT_FALSE(FPDFText_FindPrev(search));
  EXPECT_EQ(7, FPDFText_GetSchResultIndex(search));
  EXPECT_EQ(5, FPDFText_GetSchCount(search));
  FPDFText_FindClose(search);

  // Exact search unaffected by case sensitivity and whole word flags.
  search = FPDFText_FindStart(textpage, world.get(),
                              FPDF_MATCHCASE | FPDF_MATCHWHOLEWORD, 0);
  ASSERT_TRUE(search);
  EXPECT_TRUE(FPDFText_FindNext(search));
  EXPECT_EQ(7, FPDFText_GetSchResultIndex(search));
  EXPECT_EQ(5, FPDFText_GetSchCount(search));
  FPDFText_FindClose(search);

  // Default is case-insensitive, so matching against caps works.
  search = FPDFText_FindStart(textpage, world_caps.get(), 0, 0);
  ASSERT_TRUE(search);
  EXPECT_TRUE(FPDFText_FindNext(search));
  EXPECT_EQ(7, FPDFText_GetSchResultIndex(search));
  EXPECT_EQ(5, FPDFText_GetSchCount(search));
  FPDFText_FindClose(search);

  // But can be made case sensitive, in which case this fails.
  search = FPDFText_FindStart(textpage, world_caps.get(), FPDF_MATCHCASE, 0);
  // Check the handle here too, so the "not found" expectations below are
  // made against a real search rather than a null handle.
  ASSERT_TRUE(search);
  EXPECT_FALSE(FPDFText_FindNext(search));
  EXPECT_EQ(0, FPDFText_GetSchResultIndex(search));
  EXPECT_EQ(0, FPDFText_GetSchCount(search));
  FPDFText_FindClose(search);

  // Default is match anywhere within word, so matching substring works.
  search = FPDFText_FindStart(textpage, world_substr.get(), 0, 0);
  ASSERT_TRUE(search);
  EXPECT_TRUE(FPDFText_FindNext(search));
  EXPECT_EQ(8, FPDFText_GetSchResultIndex(search));
  EXPECT_EQ(4, FPDFText_GetSchCount(search));
  FPDFText_FindClose(search);

  // But can be made to match word boundaries, in which case this fails.
  search =
      FPDFText_FindStart(textpage, world_substr.get(), FPDF_MATCHWHOLEWORD, 0);
  ASSERT_TRUE(search);
  EXPECT_FALSE(FPDFText_FindNext(search));
  // TODO(tsepez): investigate strange index/count values in this state.
  FPDFText_FindClose(search);

  FPDFText_ClosePage(textpage);
  UnloadPage(page);
}
296
297 // Test that the page has characters despite a bad stream length.
TEST_F(FPDFTextEmbeddertest,StreamLengthPastEndOfFile)298 TEST_F(FPDFTextEmbeddertest, StreamLengthPastEndOfFile) {
299 EXPECT_TRUE(OpenDocument("bug_57.pdf"));
300 FPDF_PAGE page = LoadPage(0);
301 EXPECT_TRUE(page);
302
303 FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
304 EXPECT_TRUE(textpage);
305 EXPECT_EQ(13, FPDFText_CountChars(textpage));
306
307 FPDFText_ClosePage(textpage);
308 UnloadPage(page);
309 }
310
// Exercises the FPDFLink_* web-link API against weblinks.pdf: link counting,
// URL retrieval with various buffer sizes, and link rect queries.
TEST_F(FPDFTextEmbeddertest, WebLinks) {
  // Handles are preconditions for everything below; stop on failure instead of
  // passing null handles onwards.
  ASSERT_TRUE(OpenDocument("weblinks.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
  ASSERT_TRUE(textpage);

  FPDF_PAGELINK pagelink = FPDFLink_LoadWebLinks(textpage);
  ASSERT_TRUE(pagelink);

  // Page contains two HTTP-style URLs.
  EXPECT_EQ(2, FPDFLink_CountWebLinks(pagelink));

  // Only a terminating NUL required for bogus links.
  EXPECT_EQ(1, FPDFLink_GetURL(pagelink, 2, nullptr, 0));
  EXPECT_EQ(1, FPDFLink_GetURL(pagelink, 1400, nullptr, 0));
  EXPECT_EQ(1, FPDFLink_GetURL(pagelink, -1, nullptr, 0));

  // Query the number of characters required for each link (incl NUL).
  EXPECT_EQ(25, FPDFLink_GetURL(pagelink, 0, nullptr, 0));
  EXPECT_EQ(26, FPDFLink_GetURL(pagelink, 1, nullptr, 0));

  static const char expected_url[] = "http://example.com?q=foo";
  static const size_t expected_len = sizeof(expected_url);
  unsigned short fixed_buffer[128];

  // Retrieve a link with too small a buffer. Buffer will not be
  // NUL-terminated, but must not be modified past indicated length,
  // so pre-fill with a pattern to check write bounds.
  memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
  EXPECT_EQ(1, FPDFLink_GetURL(pagelink, 0, fixed_buffer, 1));
  EXPECT_TRUE(check_unsigned_shorts(expected_url, fixed_buffer, 1));
  EXPECT_EQ(0xbdbd, fixed_buffer[1]);

  // Check buffer that doesn't have space for a terminating NUL.
  memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
  EXPECT_EQ(static_cast<int>(expected_len - 1),
            FPDFLink_GetURL(pagelink, 0, fixed_buffer, expected_len - 1));
  EXPECT_TRUE(
      check_unsigned_shorts(expected_url, fixed_buffer, expected_len - 1));
  EXPECT_EQ(0xbdbd, fixed_buffer[expected_len - 1]);

  // Retrieve link with exactly-sized buffer.
  memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
  EXPECT_EQ(static_cast<int>(expected_len),
            FPDFLink_GetURL(pagelink, 0, fixed_buffer, expected_len));
  EXPECT_TRUE(check_unsigned_shorts(expected_url, fixed_buffer, expected_len));
  EXPECT_EQ(0u, fixed_buffer[expected_len - 1]);
  EXPECT_EQ(0xbdbd, fixed_buffer[expected_len]);

  // Retrieve link with ample-sized-buffer.
  memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
  EXPECT_EQ(static_cast<int>(expected_len),
            FPDFLink_GetURL(pagelink, 0, fixed_buffer, 128));
  EXPECT_TRUE(check_unsigned_shorts(expected_url, fixed_buffer, expected_len));
  EXPECT_EQ(0u, fixed_buffer[expected_len - 1]);
  EXPECT_EQ(0xbdbd, fixed_buffer[expected_len]);

  // Each link rendered in a single rect in this test page.
  EXPECT_EQ(1, FPDFLink_CountRects(pagelink, 0));
  EXPECT_EQ(1, FPDFLink_CountRects(pagelink, 1));

  // Out-of-range link indices report zero rects.
  EXPECT_EQ(0, FPDFLink_CountRects(pagelink, -1));
  EXPECT_EQ(0, FPDFLink_CountRects(pagelink, 2));
  EXPECT_EQ(0, FPDFLink_CountRects(pagelink, 10000));

  // Check boundary of valid link index with valid rect index.
  double left = 0.0;
  double right = 0.0;
  double top = 0.0;
  double bottom = 0.0;
  EXPECT_TRUE(FPDFLink_GetRect(pagelink, 0, 0, &left, &top, &right, &bottom));
  EXPECT_NEAR(50.791, left, 0.001);
  EXPECT_NEAR(187.963, right, 0.001);
  EXPECT_NEAR(97.624, bottom, 0.001);
  EXPECT_NEAR(108.736, top, 0.001);

  // Check that valid link with invalid rect index leaves parameters unchanged.
  left = -1.0;
  right = -1.0;
  top = -1.0;
  bottom = -1.0;
  EXPECT_FALSE(FPDFLink_GetRect(pagelink, 0, 1, &left, &top, &right, &bottom));
  EXPECT_EQ(-1.0, left);
  EXPECT_EQ(-1.0, right);
  EXPECT_EQ(-1.0, bottom);
  EXPECT_EQ(-1.0, top);

  // Check that invalid link index leaves parameters unchanged.
  left = -2.0;
  right = -2.0;
  top = -2.0;
  bottom = -2.0;
  EXPECT_FALSE(FPDFLink_GetRect(pagelink, -1, 0, &left, &top, &right, &bottom));
  EXPECT_EQ(-2.0, left);
  EXPECT_EQ(-2.0, right);
  EXPECT_EQ(-2.0, bottom);
  EXPECT_EQ(-2.0, top);

  FPDFLink_CloseWebLinks(pagelink);
  FPDFText_ClosePage(textpage);
  UnloadPage(page);
}
416
// Checks the URLs extracted from weblinks_across_lines.pdf, where link text is
// split over line breaks.
TEST_F(FPDFTextEmbeddertest, WebLinksAcrossLines) {
  // Handles are preconditions for everything below; stop on failure instead of
  // passing null handles onwards.
  ASSERT_TRUE(OpenDocument("weblinks_across_lines.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
  ASSERT_TRUE(textpage);

  FPDF_PAGELINK pagelink = FPDFLink_LoadWebLinks(textpage);
  ASSERT_TRUE(pagelink);

  static const char* const kExpectedUrls[] = {
      "http://example.com",           // from "http://www.example.com?\r\nfoo"
      "http://example.com/",          // from "http://www.example.com/\r\nfoo"
      "http://example.com/test-foo",  // from "http://example.com/test-\r\nfoo"
      "http://abc.com/test-foo",      // from "http://abc.com/test-\r\n\r\nfoo"
      // Next two links from "http://www.example.com/\r\nhttp://www.abc.com/"
      "http://example.com/", "http://www.abc.com",
  };
  static const int kNumLinks = static_cast<int>(FX_ArraySize(kExpectedUrls));

  EXPECT_EQ(kNumLinks, FPDFLink_CountWebLinks(pagelink));

  unsigned short fixed_buffer[128];
  for (int i = 0; i < kNumLinks; i++) {
    const size_t expected_len = strlen(kExpectedUrls[i]) + 1;
    // memset() takes a byte count. Passing the element count would clear only
    // half of this unsigned short buffer.
    memset(fixed_buffer, 0, sizeof(fixed_buffer));
    EXPECT_EQ(static_cast<int>(expected_len),
              FPDFLink_GetURL(pagelink, i, nullptr, 0));
    EXPECT_EQ(
        static_cast<int>(expected_len),
        FPDFLink_GetURL(pagelink, i, fixed_buffer, FX_ArraySize(fixed_buffer)));
    EXPECT_TRUE(
        check_unsigned_shorts(kExpectedUrls[i], fixed_buffer, expected_len));
  }

  FPDFLink_CloseWebLinks(pagelink);
  FPDFText_ClosePage(textpage);
  UnloadPage(page);
}
457
// Checks the second web link extracted from bug_650.pdf.
TEST_F(FPDFTextEmbeddertest, WebLinksAcrossLinesBug) {
  // Handles are preconditions for everything below; stop on failure instead of
  // passing null handles onwards.
  ASSERT_TRUE(OpenDocument("bug_650.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
  ASSERT_TRUE(textpage);

  FPDF_PAGELINK pagelink = FPDFLink_LoadWebLinks(textpage);
  ASSERT_TRUE(pagelink);

  EXPECT_EQ(2, FPDFLink_CountWebLinks(pagelink));
  unsigned short fixed_buffer[128] = {0};
  static const char kExpectedUrl[] =
      "http://tutorial45.com/learn-autocad-basics-day-166/";
  // Size of the expected URL including its terminating NUL.
  static const int kUrlSize = static_cast<int>(sizeof(kExpectedUrl));

  EXPECT_EQ(kUrlSize, FPDFLink_GetURL(pagelink, 1, nullptr, 0));
  EXPECT_EQ(kUrlSize, FPDFLink_GetURL(pagelink, 1, fixed_buffer,
                                      FX_ArraySize(fixed_buffer)));
  EXPECT_TRUE(check_unsigned_shorts(kExpectedUrl, fixed_buffer, kUrlSize));

  FPDFLink_CloseWebLinks(pagelink);
  FPDFText_ClosePage(textpage);
  UnloadPage(page);
}
484
// Checks FPDFText_GetFontSize() for every character of hello_world.pdf.
TEST_F(FPDFTextEmbeddertest, GetFontSize) {
  // Stop on a failed load instead of passing null handles onwards.
  ASSERT_TRUE(OpenDocument("hello_world.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
  ASSERT_TRUE(textpage);

  const double kExpectedFontsSizes[] = {12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
                                        12, 12, 12, 1,  1,  16, 16, 16, 16, 16,
                                        16, 16, 16, 16, 16, 16, 16, 16, 16, 16};

  // The count must match the table before indexing it per character.
  int count = FPDFText_CountChars(textpage);
  ASSERT_EQ(FX_ArraySize(kExpectedFontsSizes), static_cast<size_t>(count));
  for (int i = 0; i < count; ++i)
    EXPECT_EQ(kExpectedFontsSizes[i], FPDFText_GetFontSize(textpage, i)) << i;

  FPDFText_ClosePage(textpage);
  UnloadPage(page);
}
505
// Checks that the single character of bug_583.pdf is reported as code point 0.
TEST_F(FPDFTextEmbeddertest, ToUnicode) {
  // Stop on a failed load instead of passing null handles onwards.
  ASSERT_TRUE(OpenDocument("bug_583.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
  ASSERT_TRUE(textpage);

  ASSERT_EQ(1, FPDFText_CountChars(textpage));
  EXPECT_EQ(static_cast<unsigned int>(0), FPDFText_GetUnicode(textpage, 0));

  FPDFText_ClosePage(textpage);
  UnloadPage(page);
}
520
// Checks a run of characters in bug_921.pdf through both
// FPDFText_GetUnicode() and FPDFText_GetText().
TEST_F(FPDFTextEmbeddertest, Bug_921) {
  // Stop on a failed load instead of passing null handles onwards.
  ASSERT_TRUE(OpenDocument("bug_921.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
  ASSERT_TRUE(textpage);

  // Expected code points for the characters starting at |kStartIndex|.
  static constexpr unsigned int kData[] = {
      1095, 1077, 1083, 1086, 1074, 1077, 1095, 1077, 1089, 1082, 1086, 1077,
      32,   1089, 1090, 1088, 1072, 1076, 1072, 1085, 1080, 1077, 46,   32};
  static constexpr int kStartIndex = 238;

  ASSERT_EQ(268, FPDFText_CountChars(textpage));
  for (size_t i = 0; i < FX_ArraySize(kData); ++i)
    EXPECT_EQ(kData[i], FPDFText_GetUnicode(textpage, kStartIndex + i));

  // One extra element for the terminating NUL.
  unsigned short buffer[FX_ArraySize(kData) + 1];
  memset(buffer, 0xbd, sizeof(buffer));
  int count =
      FPDFText_GetText(textpage, kStartIndex, FX_ArraySize(kData), buffer);
  ASSERT_GT(count, 0);
  ASSERT_EQ(FX_ArraySize(kData) + 1, static_cast<size_t>(count));
  for (size_t i = 0; i < FX_ArraySize(kData); ++i)
    EXPECT_EQ(kData[i], buffer[i]);
  EXPECT_EQ(0, buffer[FX_ArraySize(kData)]);

  FPDFText_ClosePage(textpage);
  UnloadPage(page);
}
551
// Checks how FPDFText_GetText() reports soft and hard hyphens in
// bug_781804.pdf.
TEST_F(FPDFTextEmbeddertest, GetTextWithHyphen) {
  // Stop on a failed load instead of passing null handles onwards.
  ASSERT_TRUE(OpenDocument("bug_781804.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
  ASSERT_TRUE(textpage);

  // Check that soft hyphens are not included
  // Expecting 'Veritaserum', except there is a \uFFFE where the hyphen was in
  // the original text. This is a weird thing that Adobe does, which we
  // replicate.
  constexpr unsigned short soft_expected[] = {
      0x0056, 0x0065, 0x0072, 0x0069, 0x0074, 0x0061, 0xfffe,
      0x0073, 0x0065, 0x0072, 0x0075, 0x006D, 0x0000};
  {
    constexpr int count = FX_ArraySize(soft_expected) - 1;
    unsigned short buffer[FX_ArraySize(soft_expected)];
    memset(buffer, 0, sizeof(buffer));

    EXPECT_EQ(count + 1, FPDFText_GetText(textpage, 0, count, buffer));
    for (int i = 0; i < count; i++)
      EXPECT_EQ(soft_expected[i], buffer[i]);
  }

  // Check that hard hyphens are included
  {
    // There isn't the \0 in the actual doc, but there is a \r\n, so need to
    // add 1 to get aligned.
    constexpr size_t offset = FX_ArraySize(soft_expected) + 1;
    // Expecting 'User-\r\ngenerated', the - is a unicode character, so cannot
    // store in a char[].
    constexpr unsigned short hard_expected[] = {
        0x0055, 0x0073, 0x0065, 0x0072, 0x2010, 0x000d, 0x000a, 0x0067, 0x0065,
        0x006e, 0x0065, 0x0072, 0x0061, 0x0074, 0x0065, 0x0064, 0x0000};
    constexpr int count = FX_ArraySize(hard_expected) - 1;
    unsigned short buffer[FX_ArraySize(hard_expected)];
    // Zero the buffer as in the soft-hyphen case, so a short read is compared
    // against known contents rather than uninitialized memory.
    memset(buffer, 0, sizeof(buffer));

    EXPECT_EQ(count + 1, FPDFText_GetText(textpage, offset, count, buffer));
    for (int i = 0; i < count; i++)
      EXPECT_EQ(hard_expected[i], buffer[i]);
  }

  FPDFText_ClosePage(textpage);
  UnloadPage(page);
}
598
// Loads and closes the text page of bug_782596.pdf.
TEST_F(FPDFTextEmbeddertest, bug_782596) {
  // If there is a regression in this test, it will only fail under ASAN
  // Stop on a failed load instead of passing null handles onwards.
  ASSERT_TRUE(OpenDocument("bug_782596.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);
  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
  ASSERT_TRUE(textpage);
  FPDFText_ClosePage(textpage);
  UnloadPage(page);
}
609
// Checks that text extracted from control_characters.pdf omits the control
// characters, both for the whole page and for a range starting after them.
TEST_F(FPDFTextEmbeddertest, ControlCharacters) {
  // Stop on a failed load instead of passing null handles onwards.
  ASSERT_TRUE(OpenDocument("control_characters.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
  ASSERT_TRUE(textpage);

  // Should not include the control characters in the output
  static const char expected[] = "Hello, world!\r\nGoodbye, world!";
  unsigned short fixed_buffer[128];
  memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
  int num_chars = FPDFText_GetText(textpage, 0, 128, fixed_buffer);

  ASSERT_GE(num_chars, 0);
  EXPECT_EQ(sizeof(expected), static_cast<size_t>(num_chars));
  EXPECT_TRUE(check_unsigned_shorts(expected, fixed_buffer, sizeof(expected)));

  // Attempting to get a chunk of text after the control characters
  static const char expected_substring[] = "Goodbye, world!";
  // Offset is the length of 'Hello, world!\r\n' + 2 control characters in the
  // original stream
  static const int offset = 17;
  memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
  num_chars = FPDFText_GetText(textpage, offset, 128, fixed_buffer);

  ASSERT_GE(num_chars, 0);
  EXPECT_EQ(sizeof(expected_substring), static_cast<size_t>(num_chars));
  EXPECT_TRUE(check_unsigned_shorts(expected_substring, fixed_buffer,
                                    sizeof(expected_substring)));

  FPDFText_ClosePage(textpage);
  UnloadPage(page);
}
644