1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fpdftext/cpdf_textpage.h"
8
9 #include <algorithm>
10 #include <utility>
11 #include <vector>
12
13 #include "core/fpdfapi/font/cpdf_font.h"
14 #include "core/fpdfapi/page/cpdf_form.h"
15 #include "core/fpdfapi/page/cpdf_formobject.h"
16 #include "core/fpdfapi/page/cpdf_page.h"
17 #include "core/fpdfapi/page/cpdf_pageobject.h"
18 #include "core/fpdfapi/page/cpdf_textobject.h"
19 #include "core/fpdfapi/parser/cpdf_dictionary.h"
20 #include "core/fpdfapi/parser/cpdf_string.h"
21 #include "core/fpdftext/unicodenormalizationdata.h"
22 #include "core/fxcrt/fx_bidi.h"
23 #include "core/fxcrt/fx_extension.h"
24 #include "core/fxcrt/fx_unicode.h"
25 #include "third_party/base/stl_util.h"
26
27 namespace {
28
// Font size reported by CPDF_TextPage::GetCharInfo() when the character has
// no text object or that object has no font.
const float kDefaultFontSize = 1.0f;
// Normalization sub-tables, indexed by the selector that
// Unicode_GetNormalization() derives from g_UnicodeData_Normalization.
// Slot 0 is null.
const uint16_t* const g_UnicodeData_Normalization_Maps[5] = {
    nullptr, g_UnicodeData_Normalization_Map1, g_UnicodeData_Normalization_Map2,
    g_UnicodeData_Normalization_Map3, g_UnicodeData_Normalization_Map4};
33
// Divides |threshold| by a divisor that depends on its magnitude:
// 2 below 300, 4 below 500, 5 below 700 and 6 for everything else.
float NormalizeThreshold(float threshold) {
  float divisor = 6.0f;
  if (threshold < 300)
    divisor = 2.0f;
  else if (threshold < 500)
    divisor = 4.0f;
  else if (threshold < 700)
    divisor = 5.0f;
  return threshold / divisor;
}
43
CalculateBaseSpace(const CPDF_TextObject * pTextObj,const CFX_Matrix & matrix)44 float CalculateBaseSpace(const CPDF_TextObject* pTextObj,
45 const CFX_Matrix& matrix) {
46 float baseSpace = 0.0;
47 const size_t nItems = pTextObj->CountItems();
48 if (pTextObj->m_TextState.GetCharSpace() && nItems >= 3) {
49 bool bAllChar = true;
50 float spacing =
51 matrix.TransformDistance(pTextObj->m_TextState.GetCharSpace());
52 baseSpace = spacing;
53 for (size_t i = 0; i < nItems; ++i) {
54 CPDF_TextObjectItem item;
55 pTextObj->GetItemInfo(i, &item);
56 if (item.m_CharCode == static_cast<uint32_t>(-1)) {
57 float fontsize_h = pTextObj->m_TextState.GetFontSizeH();
58 float kerning = -fontsize_h * item.m_Origin.x / 1000;
59 baseSpace = std::min(baseSpace, kerning + spacing);
60 bAllChar = false;
61 }
62 }
63 if (baseSpace < 0.0 || (nItems == 3 && !bAllChar))
64 baseSpace = 0.0;
65 }
66 return baseSpace;
67 }
68
Unicode_GetNormalization(wchar_t wch,wchar_t * pDst)69 size_t Unicode_GetNormalization(wchar_t wch, wchar_t* pDst) {
70 wch = wch & 0xFFFF;
71 wchar_t wFind = g_UnicodeData_Normalization[wch];
72 if (!wFind) {
73 if (pDst)
74 *pDst = wch;
75 return 1;
76 }
77 if (wFind >= 0x8000) {
78 wch = wFind - 0x8000;
79 wFind = 1;
80 } else {
81 wch = wFind & 0x0FFF;
82 wFind >>= 12;
83 }
84 const uint16_t* pMap = g_UnicodeData_Normalization_Maps[wFind];
85 if (pMap == g_UnicodeData_Normalization_Map4) {
86 pMap = g_UnicodeData_Normalization_Map4 + wch;
87 wFind = (wchar_t)(*pMap++);
88 } else {
89 pMap += wch;
90 }
91 if (pDst) {
92 wchar_t n = wFind;
93 while (n--)
94 *pDst++ = *pMap++;
95 }
96 return static_cast<size_t>(wFind);
97 }
98
// Returns the fraction (0..1) of entries in mask[start, end) that are set.
// An empty or inverted range yields 0.
float MaskPercentFilled(const std::vector<bool>& mask,
                        int32_t start,
                        int32_t end) {
  if (start >= end)
    return 0;

  int32_t filled = 0;
  for (int32_t pos = start; pos < end; ++pos) {
    if (mask[pos])
      ++filled;
  }
  return static_cast<float>(filled) / (end - start);
}
108
// True for U+002D (hyphen-minus) and U+00AD (soft hyphen).
bool IsHyphenCode(wchar_t c) {
  switch (c) {
    case 0x2D:
    case 0xAD:
      return true;
    default:
      return false;
  }
}
112
113 } // namespace
114
PDFTEXT_Obj()115 PDFTEXT_Obj::PDFTEXT_Obj() {}
116
117 PDFTEXT_Obj::PDFTEXT_Obj(const PDFTEXT_Obj& that) = default;
118
~PDFTEXT_Obj()119 PDFTEXT_Obj::~PDFTEXT_Obj() {}
120
FPDF_CHAR_INFO()121 FPDF_CHAR_INFO::FPDF_CHAR_INFO()
122 : m_Unicode(0),
123 m_Charcode(0),
124 m_Flag(0),
125 m_FontSize(0),
126 m_pTextObj(nullptr) {}
127
~FPDF_CHAR_INFO()128 FPDF_CHAR_INFO::~FPDF_CHAR_INFO() {}
129
PAGECHAR_INFO()130 PAGECHAR_INFO::PAGECHAR_INFO()
131 : m_Index(0), m_CharCode(0), m_Unicode(0), m_Flag(0), m_pTextObj(nullptr) {}
132
133 PAGECHAR_INFO::PAGECHAR_INFO(const PAGECHAR_INFO&) = default;
134
~PAGECHAR_INFO()135 PAGECHAR_INFO::~PAGECHAR_INFO() {}
136
CPDF_TextPage(const CPDF_Page * pPage,FPDFText_Direction flags)137 CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage, FPDFText_Direction flags)
138 : m_pPage(pPage),
139 m_parserflag(flags),
140 m_pPreTextObj(nullptr),
141 m_bIsParsed(false),
142 m_TextlineDir(TextOrientation::Unknown) {
143 m_TextBuf.EstimateSize(0, 10240);
144 m_DisplayMatrix =
145 pPage->GetDisplayMatrix(0, 0, static_cast<int>(pPage->GetPageWidth()),
146 static_cast<int>(pPage->GetPageHeight()), 0);
147 }
148
~CPDF_TextPage()149 CPDF_TextPage::~CPDF_TextPage() {}
150
IsControlChar(const PAGECHAR_INFO & charInfo)151 bool CPDF_TextPage::IsControlChar(const PAGECHAR_INFO& charInfo) {
152 switch (charInfo.m_Unicode) {
153 case 0x2:
154 case 0x3:
155 case 0x93:
156 case 0x94:
157 case 0x96:
158 case 0x97:
159 case 0x98:
160 case 0xfffe:
161 return charInfo.m_Flag != FPDFTEXT_CHAR_HYPHEN;
162 default:
163 return false;
164 }
165 }
166
ParseTextPage()167 void CPDF_TextPage::ParseTextPage() {
168 m_bIsParsed = false;
169 m_TextBuf.Clear();
170 m_CharList.clear();
171 m_pPreTextObj = nullptr;
172 ProcessObject();
173
174 m_bIsParsed = true;
175 m_CharIndex.clear();
176 int nCount = pdfium::CollectionSize<int>(m_CharList);
177 if (nCount)
178 m_CharIndex.push_back(0);
179
180 for (int i = 0; i < nCount; i++) {
181 int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
182 const PAGECHAR_INFO& charinfo = m_CharList[i];
183 if (charinfo.m_Flag == FPDFTEXT_CHAR_GENERATED ||
184 (charinfo.m_Unicode != 0 && !IsControlChar(charinfo))) {
185 if (indexSize % 2) {
186 m_CharIndex.push_back(1);
187 } else {
188 if (indexSize <= 0)
189 continue;
190 m_CharIndex[indexSize - 1] += 1;
191 }
192 } else {
193 if (indexSize % 2) {
194 if (indexSize <= 0)
195 continue;
196 m_CharIndex[indexSize - 1] = i + 1;
197 } else {
198 m_CharIndex.push_back(i + 1);
199 }
200 }
201 }
202 int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
203 if (indexSize % 2)
204 m_CharIndex.erase(m_CharIndex.begin() + indexSize - 1);
205 }
206
CountChars() const207 int CPDF_TextPage::CountChars() const {
208 return pdfium::CollectionSize<int>(m_CharList);
209 }
210
CharIndexFromTextIndex(int TextIndex) const211 int CPDF_TextPage::CharIndexFromTextIndex(int TextIndex) const {
212 int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
213 int count = 0;
214 for (int i = 0; i < indexSize; i += 2) {
215 count += m_CharIndex[i + 1];
216 if (count > TextIndex)
217 return TextIndex - count + m_CharIndex[i + 1] + m_CharIndex[i];
218 }
219 return -1;
220 }
221
TextIndexFromCharIndex(int CharIndex) const222 int CPDF_TextPage::TextIndexFromCharIndex(int CharIndex) const {
223 int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
224 int count = 0;
225 for (int i = 0; i < indexSize; i += 2) {
226 count += m_CharIndex[i + 1];
227 if (m_CharIndex[i + 1] + m_CharIndex[i] > CharIndex) {
228 if (CharIndex - m_CharIndex[i] < 0)
229 return -1;
230
231 return CharIndex - m_CharIndex[i] + count - m_CharIndex[i + 1];
232 }
233 }
234 return -1;
235 }
236
GetRectArray(int start,int nCount) const237 std::vector<CFX_FloatRect> CPDF_TextPage::GetRectArray(int start,
238 int nCount) const {
239 if (start < 0 || nCount == 0 || !m_bIsParsed)
240 return std::vector<CFX_FloatRect>();
241
242 if (nCount + start > pdfium::CollectionSize<int>(m_CharList) ||
243 nCount == -1) {
244 nCount = pdfium::CollectionSize<int>(m_CharList) - start;
245 }
246
247 std::vector<CFX_FloatRect> rectArray;
248 CPDF_TextObject* pCurObj = nullptr;
249 CFX_FloatRect rect;
250 int curPos = start;
251 bool bFlagNewRect = true;
252 while (nCount--) {
253 PAGECHAR_INFO info_curchar = m_CharList[curPos++];
254 if (info_curchar.m_Flag == FPDFTEXT_CHAR_GENERATED)
255 continue;
256 if (info_curchar.m_CharBox.Width() < 0.01 ||
257 info_curchar.m_CharBox.Height() < 0.01) {
258 continue;
259 }
260 if (!pCurObj)
261 pCurObj = info_curchar.m_pTextObj.Get();
262 if (pCurObj != info_curchar.m_pTextObj) {
263 rectArray.push_back(rect);
264 pCurObj = info_curchar.m_pTextObj.Get();
265 bFlagNewRect = true;
266 }
267 if (bFlagNewRect) {
268 CFX_Matrix matrix = info_curchar.m_pTextObj->GetTextMatrix();
269 matrix.Concat(info_curchar.m_Matrix);
270
271 CFX_PointF origin = matrix.GetInverse().Transform(info_curchar.m_Origin);
272 rect.left = info_curchar.m_CharBox.left;
273 rect.right = info_curchar.m_CharBox.right;
274 if (pCurObj->GetFont()->GetTypeDescent()) {
275 rect.bottom = origin.y +
276 pCurObj->GetFont()->GetTypeDescent() *
277 pCurObj->GetFontSize() / 1000;
278
279 rect.bottom = matrix.Transform(CFX_PointF(origin.x, rect.bottom)).y;
280 } else {
281 rect.bottom = info_curchar.m_CharBox.bottom;
282 }
283 if (pCurObj->GetFont()->GetTypeAscent()) {
284 rect.top =
285 origin.y +
286 pCurObj->GetFont()->GetTypeAscent() * pCurObj->GetFontSize() / 1000;
287 float xPosTemp =
288 origin.x +
289 GetCharWidth(info_curchar.m_CharCode, pCurObj->GetFont()) *
290 pCurObj->GetFontSize() / 1000;
291 rect.top = matrix.Transform(CFX_PointF(xPosTemp, rect.top)).y;
292 } else {
293 rect.top = info_curchar.m_CharBox.top;
294 }
295 bFlagNewRect = false;
296 rect = info_curchar.m_CharBox;
297 rect.Normalize();
298 } else {
299 info_curchar.m_CharBox.Normalize();
300 rect.left = std::min(rect.left, info_curchar.m_CharBox.left);
301 rect.right = std::max(rect.right, info_curchar.m_CharBox.right);
302 rect.top = std::max(rect.top, info_curchar.m_CharBox.top);
303 rect.bottom = std::min(rect.bottom, info_curchar.m_CharBox.bottom);
304 }
305 }
306 rectArray.push_back(rect);
307 return rectArray;
308 }
309
GetIndexAtPos(const CFX_PointF & point,const CFX_SizeF & tolerance) const310 int CPDF_TextPage::GetIndexAtPos(const CFX_PointF& point,
311 const CFX_SizeF& tolerance) const {
312 if (!m_bIsParsed)
313 return -3;
314
315 int pos = 0;
316 int NearPos = -1;
317 double xdif = 5000;
318 double ydif = 5000;
319 while (pos < pdfium::CollectionSize<int>(m_CharList)) {
320 PAGECHAR_INFO charinfo = m_CharList[pos];
321 CFX_FloatRect charrect = charinfo.m_CharBox;
322 if (charrect.Contains(point))
323 break;
324 if (tolerance.width > 0 || tolerance.height > 0) {
325 CFX_FloatRect charRectExt;
326 charrect.Normalize();
327 charRectExt.left = charrect.left - tolerance.width / 2;
328 charRectExt.right = charrect.right + tolerance.width / 2;
329 charRectExt.top = charrect.top + tolerance.height / 2;
330 charRectExt.bottom = charrect.bottom - tolerance.height / 2;
331 if (charRectExt.Contains(point)) {
332 double curXdif, curYdif;
333 curXdif = fabs(point.x - charrect.left) < fabs(point.x - charrect.right)
334 ? fabs(point.x - charrect.left)
335 : fabs(point.x - charrect.right);
336 curYdif = fabs(point.y - charrect.bottom) < fabs(point.y - charrect.top)
337 ? fabs(point.y - charrect.bottom)
338 : fabs(point.y - charrect.top);
339 if (curYdif + curXdif < xdif + ydif) {
340 ydif = curYdif;
341 xdif = curXdif;
342 NearPos = pos;
343 }
344 }
345 }
346 ++pos;
347 }
348 return pos < pdfium::CollectionSize<int>(m_CharList) ? pos : NearPos;
349 }
350
GetTextByRect(const CFX_FloatRect & rect) const351 WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const {
352 if (!m_bIsParsed)
353 return WideString();
354
355 float posy = 0;
356 bool IsContainPreChar = false;
357 bool IsAddLineFeed = false;
358 WideString strText;
359 for (const auto& charinfo : m_CharList) {
360 if (IsRectIntersect(rect, charinfo.m_CharBox)) {
361 if (fabs(posy - charinfo.m_Origin.y) > 0 && !IsContainPreChar &&
362 IsAddLineFeed) {
363 posy = charinfo.m_Origin.y;
364 if (!strText.IsEmpty())
365 strText += L"\r\n";
366 }
367 IsContainPreChar = true;
368 IsAddLineFeed = false;
369 if (charinfo.m_Unicode)
370 strText += charinfo.m_Unicode;
371 } else if (charinfo.m_Unicode == 32) {
372 if (IsContainPreChar && charinfo.m_Unicode) {
373 strText += charinfo.m_Unicode;
374 IsContainPreChar = false;
375 IsAddLineFeed = false;
376 }
377 } else {
378 IsContainPreChar = false;
379 IsAddLineFeed = true;
380 }
381 }
382 return strText;
383 }
384
GetCharInfo(int index,FPDF_CHAR_INFO * info) const385 void CPDF_TextPage::GetCharInfo(int index, FPDF_CHAR_INFO* info) const {
386 if (!m_bIsParsed || !pdfium::IndexInBounds(m_CharList, index))
387 return;
388
389 const PAGECHAR_INFO& charinfo = m_CharList[index];
390 info->m_Charcode = charinfo.m_CharCode;
391 info->m_Origin = charinfo.m_Origin;
392 info->m_Unicode = charinfo.m_Unicode;
393 info->m_Flag = charinfo.m_Flag;
394 info->m_CharBox = charinfo.m_CharBox;
395 info->m_pTextObj = charinfo.m_pTextObj;
396 if (charinfo.m_pTextObj && charinfo.m_pTextObj->GetFont())
397 info->m_FontSize = charinfo.m_pTextObj->GetFontSize();
398 else
399 info->m_FontSize = kDefaultFontSize;
400 info->m_Matrix = charinfo.m_Matrix;
401 }
402
CheckMarkedContentObject(int32_t & start,int32_t & nCount) const403 void CPDF_TextPage::CheckMarkedContentObject(int32_t& start,
404 int32_t& nCount) const {
405 PAGECHAR_INFO charinfo = m_CharList[start];
406 PAGECHAR_INFO charinfo2 = m_CharList[start + nCount - 1];
407 if (FPDFTEXT_CHAR_PIECE != charinfo.m_Flag &&
408 FPDFTEXT_CHAR_PIECE != charinfo2.m_Flag) {
409 return;
410 }
411 if (FPDFTEXT_CHAR_PIECE == charinfo.m_Flag) {
412 PAGECHAR_INFO charinfo1 = charinfo;
413 int startIndex = start;
414 while (FPDFTEXT_CHAR_PIECE == charinfo1.m_Flag &&
415 charinfo1.m_Index == charinfo.m_Index) {
416 startIndex--;
417 if (startIndex < 0)
418 break;
419 charinfo1 = m_CharList[startIndex];
420 }
421 startIndex++;
422 start = startIndex;
423 }
424 if (FPDFTEXT_CHAR_PIECE == charinfo2.m_Flag) {
425 PAGECHAR_INFO charinfo3 = charinfo2;
426 int endIndex = start + nCount - 1;
427 while (FPDFTEXT_CHAR_PIECE == charinfo3.m_Flag &&
428 charinfo3.m_Index == charinfo2.m_Index) {
429 endIndex++;
430 if (endIndex >= pdfium::CollectionSize<int>(m_CharList))
431 break;
432 charinfo3 = m_CharList[endIndex];
433 }
434 endIndex--;
435 nCount = endIndex - start + 1;
436 }
437 }
438
GetPageText(int start,int count) const439 WideString CPDF_TextPage::GetPageText(int start, int count) const {
440 if (start < 0 || start >= CountChars() || count <= 0 || !m_bIsParsed ||
441 m_CharList.empty() || m_TextBuf.GetLength() == 0) {
442 return L"";
443 }
444
445 int text_start = TextIndexFromCharIndex(start);
446 if (text_start < 0)
447 return L"";
448
449 count = std::min(count, CountChars() - start);
450
451 int last = start + count - 1;
452 int text_last = TextIndexFromCharIndex(last);
453 if (text_last < 0 || text_last < text_start)
454 return L"";
455
456 int text_count = text_last - text_start + 1;
457
458 return WideString(m_TextBuf.AsStringView().Mid(
459 static_cast<size_t>(text_start), static_cast<size_t>(text_count)));
460 }
461
CountRects(int start,int nCount)462 int CPDF_TextPage::CountRects(int start, int nCount) {
463 if (!m_bIsParsed || start < 0)
464 return -1;
465
466 if (nCount == -1 ||
467 nCount + start > pdfium::CollectionSize<int>(m_CharList)) {
468 nCount = pdfium::CollectionSize<int>(m_CharList) - start;
469 }
470 m_SelRects = GetRectArray(start, nCount);
471 return pdfium::CollectionSize<int>(m_SelRects);
472 }
473
GetRect(int rectIndex,CFX_FloatRect * pRect) const474 bool CPDF_TextPage::GetRect(int rectIndex, CFX_FloatRect* pRect) const {
475 if (!m_bIsParsed || !pdfium::IndexInBounds(m_SelRects, rectIndex))
476 return false;
477
478 *pRect = m_SelRects[rectIndex];
479 return true;
480 }
481
FindTextlineFlowOrientation() const482 CPDF_TextPage::TextOrientation CPDF_TextPage::FindTextlineFlowOrientation()
483 const {
484 if (m_pPage->GetPageObjectList()->empty())
485 return TextOrientation::Unknown;
486
487 const int32_t nPageWidth = static_cast<int32_t>(m_pPage->GetPageWidth());
488 const int32_t nPageHeight = static_cast<int32_t>(m_pPage->GetPageHeight());
489 if (nPageWidth <= 0 || nPageHeight <= 0)
490 return TextOrientation::Unknown;
491
492 std::vector<bool> nHorizontalMask(nPageWidth);
493 std::vector<bool> nVerticalMask(nPageHeight);
494 float fLineHeight = 0.0f;
495 int32_t nStartH = nPageWidth;
496 int32_t nEndH = 0;
497 int32_t nStartV = nPageHeight;
498 int32_t nEndV = 0;
499 for (const auto& pPageObj : *m_pPage->GetPageObjectList()) {
500 if (!pPageObj->IsText())
501 continue;
502
503 int32_t minH = std::max(static_cast<int32_t>(pPageObj->m_Left), 0);
504 int32_t maxH =
505 std::min(static_cast<int32_t>(pPageObj->m_Right), nPageWidth);
506 int32_t minV = std::max(static_cast<int32_t>(pPageObj->m_Bottom), 0);
507 int32_t maxV = std::min(static_cast<int32_t>(pPageObj->m_Top), nPageHeight);
508 if (minH >= maxH || minV >= maxV)
509 continue;
510
511 for (int32_t i = minH; i < maxH; ++i)
512 nHorizontalMask[i] = true;
513 for (int32_t i = minV; i < maxV; ++i)
514 nVerticalMask[i] = true;
515
516 nStartH = std::min(nStartH, minH);
517 nEndH = std::max(nEndH, maxH);
518 nStartV = std::min(nStartV, minV);
519 nEndV = std::max(nEndV, maxV);
520
521 if (fLineHeight <= 0.0f)
522 fLineHeight = pPageObj->m_Top - pPageObj->m_Bottom;
523 }
524 const int32_t nDoubleLineHeight = 2 * fLineHeight;
525 if ((nEndV - nStartV) < nDoubleLineHeight)
526 return TextOrientation::Horizontal;
527 if ((nEndH - nStartH) < nDoubleLineHeight)
528 return TextOrientation::Vertical;
529
530 const float nSumH = MaskPercentFilled(nHorizontalMask, nStartH, nEndH);
531 if (nSumH > 0.8f)
532 return TextOrientation::Horizontal;
533
534 const float nSumV = MaskPercentFilled(nVerticalMask, nStartV, nEndV);
535 if (nSumH > nSumV)
536 return TextOrientation::Horizontal;
537 if (nSumH < nSumV)
538 return TextOrientation::Vertical;
539 return TextOrientation::Unknown;
540 }
541
AppendGeneratedCharacter(wchar_t unicode,const CFX_Matrix & formMatrix)542 void CPDF_TextPage::AppendGeneratedCharacter(wchar_t unicode,
543 const CFX_Matrix& formMatrix) {
544 PAGECHAR_INFO generateChar;
545 if (!GenerateCharInfo(unicode, generateChar))
546 return;
547
548 m_TextBuf.AppendChar(unicode);
549 if (!formMatrix.IsIdentity())
550 generateChar.m_Matrix = formMatrix;
551 m_CharList.push_back(generateChar);
552 }
553
ProcessObject()554 void CPDF_TextPage::ProcessObject() {
555 if (m_pPage->GetPageObjectList()->empty())
556 return;
557
558 m_TextlineDir = FindTextlineFlowOrientation();
559 const CPDF_PageObjectList* pObjList = m_pPage->GetPageObjectList();
560 for (auto it = pObjList->begin(); it != pObjList->end(); ++it) {
561 if (CPDF_PageObject* pObj = it->get()) {
562 if (pObj->IsText()) {
563 CFX_Matrix matrix;
564 ProcessTextObject(pObj->AsText(), matrix, pObjList, it);
565 } else if (pObj->IsForm()) {
566 CFX_Matrix formMatrix;
567 ProcessFormObject(pObj->AsForm(), formMatrix);
568 }
569 }
570 }
571 for (const auto& obj : m_LineObj)
572 ProcessTextObject(obj);
573
574 m_LineObj.clear();
575 CloseTempLine();
576 }
577
ProcessFormObject(CPDF_FormObject * pFormObj,const CFX_Matrix & formMatrix)578 void CPDF_TextPage::ProcessFormObject(CPDF_FormObject* pFormObj,
579 const CFX_Matrix& formMatrix) {
580 const CPDF_PageObjectList* pObjectList =
581 pFormObj->form()->GetPageObjectList();
582 if (pObjectList->empty())
583 return;
584
585 CFX_Matrix curFormMatrix = pFormObj->form_matrix();
586 curFormMatrix.Concat(formMatrix);
587
588 for (auto it = pObjectList->begin(); it != pObjectList->end(); ++it) {
589 if (CPDF_PageObject* pPageObj = it->get()) {
590 if (pPageObj->IsText())
591 ProcessTextObject(pPageObj->AsText(), curFormMatrix, pObjectList, it);
592 else if (pPageObj->IsForm())
593 ProcessFormObject(pPageObj->AsForm(), curFormMatrix);
594 }
595 }
596 }
597
GetCharWidth(uint32_t charCode,CPDF_Font * pFont) const598 int CPDF_TextPage::GetCharWidth(uint32_t charCode, CPDF_Font* pFont) const {
599 if (charCode == CPDF_Font::kInvalidCharCode)
600 return 0;
601
602 if (int w = pFont->GetCharWidthF(charCode))
603 return w;
604
605 ByteString str;
606 pFont->AppendChar(&str, charCode);
607 if (int w = pFont->GetStringWidth(str.c_str(), 1))
608 return w;
609
610 return pFont->GetCharBBox(charCode).Width();
611 }
612
AddCharInfoByLRDirection(wchar_t wChar,PAGECHAR_INFO info)613 void CPDF_TextPage::AddCharInfoByLRDirection(wchar_t wChar,
614 PAGECHAR_INFO info) {
615 if (IsControlChar(info)) {
616 info.m_Index = -1;
617 m_CharList.push_back(info);
618 return;
619 }
620
621 info.m_Index = m_TextBuf.GetLength();
622 if (wChar >= 0xFB00 && wChar <= 0xFB06) {
623 wchar_t* pDst = nullptr;
624 size_t nCount = Unicode_GetNormalization(wChar, pDst);
625 if (nCount >= 1) {
626 pDst = FX_Alloc(wchar_t, nCount);
627 Unicode_GetNormalization(wChar, pDst);
628 for (size_t nIndex = 0; nIndex < nCount; nIndex++) {
629 PAGECHAR_INFO info2 = info;
630 info2.m_Unicode = pDst[nIndex];
631 info2.m_Flag = FPDFTEXT_CHAR_PIECE;
632 m_TextBuf.AppendChar(info2.m_Unicode);
633 m_CharList.push_back(info2);
634 }
635 FX_Free(pDst);
636 return;
637 }
638 }
639 m_TextBuf.AppendChar(wChar);
640 m_CharList.push_back(info);
641 }
642
AddCharInfoByRLDirection(wchar_t wChar,PAGECHAR_INFO info)643 void CPDF_TextPage::AddCharInfoByRLDirection(wchar_t wChar,
644 PAGECHAR_INFO info) {
645 if (IsControlChar(info)) {
646 info.m_Index = -1;
647 m_CharList.push_back(info);
648 return;
649 }
650
651 info.m_Index = m_TextBuf.GetLength();
652 wChar = FX_GetMirrorChar(wChar);
653 wchar_t* pDst = nullptr;
654 size_t nCount = Unicode_GetNormalization(wChar, pDst);
655 if (nCount >= 1) {
656 pDst = FX_Alloc(wchar_t, nCount);
657 Unicode_GetNormalization(wChar, pDst);
658 for (size_t nIndex = 0; nIndex < nCount; nIndex++) {
659 PAGECHAR_INFO info2 = info;
660 info2.m_Unicode = pDst[nIndex];
661 info2.m_Flag = FPDFTEXT_CHAR_PIECE;
662 m_TextBuf.AppendChar(info2.m_Unicode);
663 m_CharList.push_back(info2);
664 }
665 FX_Free(pDst);
666 return;
667 }
668 info.m_Unicode = wChar;
669 m_TextBuf.AppendChar(info.m_Unicode);
670 m_CharList.push_back(info);
671 }
672
CloseTempLine()673 void CPDF_TextPage::CloseTempLine() {
674 if (m_TempCharList.empty())
675 return;
676
677 WideString str = m_TempTextBuf.MakeString();
678 bool bPrevSpace = false;
679 for (size_t i = 0; i < str.GetLength(); i++) {
680 if (str[i] != ' ') {
681 bPrevSpace = false;
682 continue;
683 }
684 if (bPrevSpace) {
685 m_TempTextBuf.Delete(i, 1);
686 m_TempCharList.erase(m_TempCharList.begin() + i);
687 str.Delete(i);
688 i--;
689 }
690 bPrevSpace = true;
691 }
692 CFX_BidiString bidi(str);
693 if (m_parserflag == FPDFText_Direction::Right)
694 bidi.SetOverallDirectionRight();
695 CFX_BidiChar::Direction eCurrentDirection = bidi.OverallDirection();
696 for (const auto& segment : bidi) {
697 if (segment.direction == CFX_BidiChar::RIGHT ||
698 (segment.direction == CFX_BidiChar::NEUTRAL &&
699 eCurrentDirection == CFX_BidiChar::RIGHT)) {
700 eCurrentDirection = CFX_BidiChar::RIGHT;
701 for (int m = segment.start + segment.count; m > segment.start; --m)
702 AddCharInfoByRLDirection(bidi.CharAt(m - 1), m_TempCharList[m - 1]);
703 } else {
704 eCurrentDirection = CFX_BidiChar::LEFT;
705 for (int m = segment.start; m < segment.start + segment.count; m++)
706 AddCharInfoByLRDirection(bidi.CharAt(m), m_TempCharList[m]);
707 }
708 }
709 m_TempCharList.clear();
710 m_TempTextBuf.Delete(0, m_TempTextBuf.GetLength());
711 }
712
ProcessTextObject(CPDF_TextObject * pTextObj,const CFX_Matrix & formMatrix,const CPDF_PageObjectList * pObjList,CPDF_PageObjectList::const_iterator ObjPos)713 void CPDF_TextPage::ProcessTextObject(
714 CPDF_TextObject* pTextObj,
715 const CFX_Matrix& formMatrix,
716 const CPDF_PageObjectList* pObjList,
717 CPDF_PageObjectList::const_iterator ObjPos) {
718 if (fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f)
719 return;
720
721 size_t count = m_LineObj.size();
722 PDFTEXT_Obj Obj;
723 Obj.m_pTextObj = pTextObj;
724 Obj.m_formMatrix = formMatrix;
725 if (count == 0) {
726 m_LineObj.push_back(Obj);
727 return;
728 }
729 if (IsSameAsPreTextObject(pTextObj, pObjList, ObjPos))
730 return;
731
732 PDFTEXT_Obj prev_Obj = m_LineObj[count - 1];
733 size_t nItem = prev_Obj.m_pTextObj->CountItems();
734 if (nItem == 0)
735 return;
736
737 CPDF_TextObjectItem item;
738 prev_Obj.m_pTextObj->GetItemInfo(nItem - 1, &item);
739 float prev_width =
740 GetCharWidth(item.m_CharCode, prev_Obj.m_pTextObj->GetFont()) *
741 prev_Obj.m_pTextObj->GetFontSize() / 1000;
742
743 CFX_Matrix prev_matrix = prev_Obj.m_pTextObj->GetTextMatrix();
744 prev_width = fabs(prev_width);
745 prev_matrix.Concat(prev_Obj.m_formMatrix);
746 prev_width = prev_matrix.TransformDistance(prev_width);
747 pTextObj->GetItemInfo(0, &item);
748 float this_width = GetCharWidth(item.m_CharCode, pTextObj->GetFont()) *
749 pTextObj->GetFontSize() / 1000;
750 this_width = fabs(this_width);
751
752 CFX_Matrix this_matrix = pTextObj->GetTextMatrix();
753 this_width = fabs(this_width);
754 this_matrix.Concat(formMatrix);
755 this_width = this_matrix.TransformDistance(this_width);
756
757 float threshold = prev_width > this_width ? prev_width / 4 : this_width / 4;
758 CFX_PointF prev_pos = m_DisplayMatrix.Transform(
759 prev_Obj.m_formMatrix.Transform(prev_Obj.m_pTextObj->GetPos()));
760 CFX_PointF this_pos =
761 m_DisplayMatrix.Transform(formMatrix.Transform(pTextObj->GetPos()));
762 if (fabs(this_pos.y - prev_pos.y) > threshold * 2) {
763 for (size_t i = 0; i < count; i++)
764 ProcessTextObject(m_LineObj[i]);
765 m_LineObj.clear();
766 m_LineObj.push_back(Obj);
767 return;
768 }
769
770 for (size_t i = count; i > 0; --i) {
771 PDFTEXT_Obj prev_text_obj = m_LineObj[i - 1];
772 CFX_PointF new_prev_pos =
773 m_DisplayMatrix.Transform(prev_text_obj.m_formMatrix.Transform(
774 prev_text_obj.m_pTextObj->GetPos()));
775 if (this_pos.x >= new_prev_pos.x) {
776 m_LineObj.insert(m_LineObj.begin() + i, Obj);
777 return;
778 }
779 }
780 m_LineObj.insert(m_LineObj.begin(), Obj);
781 }
782
PreMarkedContent(PDFTEXT_Obj Obj)783 FPDFText_MarkedContent CPDF_TextPage::PreMarkedContent(PDFTEXT_Obj Obj) {
784 CPDF_TextObject* pTextObj = Obj.m_pTextObj.Get();
785 if (!pTextObj->m_ContentMark.HasRef())
786 return FPDFText_MarkedContent::Pass;
787
788 size_t nContentMark = pTextObj->m_ContentMark.CountItems();
789 if (nContentMark == 0)
790 return FPDFText_MarkedContent::Pass;
791
792 WideString actText;
793 bool bExist = false;
794 CPDF_Dictionary* pDict = nullptr;
795 for (size_t i = 0; i < nContentMark; ++i) {
796 const CPDF_ContentMarkItem& item = pTextObj->m_ContentMark.GetItem(i);
797 pDict = item.GetParam();
798 if (!pDict)
799 continue;
800 CPDF_String* temp = ToString(pDict->GetObjectFor("ActualText"));
801 if (temp) {
802 bExist = true;
803 actText = temp->GetUnicodeText();
804 }
805 }
806 if (!bExist)
807 return FPDFText_MarkedContent::Pass;
808
809 if (m_pPreTextObj) {
810 const CPDF_ContentMark& mark = m_pPreTextObj->m_ContentMark;
811 if (mark.HasRef() && mark.CountItems() == nContentMark &&
812 mark.GetItem(nContentMark - 1).GetParam() == pDict) {
813 return FPDFText_MarkedContent::Done;
814 }
815 }
816
817 if (actText.IsEmpty())
818 return FPDFText_MarkedContent::Pass;
819
820 CPDF_Font* pFont = pTextObj->GetFont();
821 bExist = false;
822 for (size_t i = 0; i < actText.GetLength(); i++) {
823 if (pFont->CharCodeFromUnicode(actText[i]) != CPDF_Font::kInvalidCharCode) {
824 bExist = true;
825 break;
826 }
827 }
828 if (!bExist)
829 return FPDFText_MarkedContent::Pass;
830
831 bExist = false;
832 for (size_t i = 0; i < actText.GetLength(); i++) {
833 wchar_t wChar = actText[i];
834 if ((wChar > 0x80 && wChar < 0xFFFD) || (wChar <= 0x80 && isprint(wChar))) {
835 bExist = true;
836 break;
837 }
838 }
839 if (!bExist)
840 return FPDFText_MarkedContent::Done;
841
842 return FPDFText_MarkedContent::Delay;
843 }
844
ProcessMarkedContent(PDFTEXT_Obj Obj)845 void CPDF_TextPage::ProcessMarkedContent(PDFTEXT_Obj Obj) {
846 CPDF_TextObject* pTextObj = Obj.m_pTextObj.Get();
847 if (!pTextObj->m_ContentMark.HasRef())
848 return;
849
850 int nContentMark = pTextObj->m_ContentMark.CountItems();
851 if (nContentMark < 1)
852 return;
853
854 WideString actText;
855 for (int n = 0; n < nContentMark; n++) {
856 const CPDF_ContentMarkItem& item = pTextObj->m_ContentMark.GetItem(n);
857 CPDF_Dictionary* pDict = item.GetParam();
858 if (pDict)
859 actText = pDict->GetUnicodeTextFor("ActualText");
860 }
861 if (actText.IsEmpty())
862 return;
863
864 CPDF_Font* pFont = pTextObj->GetFont();
865 CFX_Matrix matrix = pTextObj->GetTextMatrix();
866 matrix.Concat(Obj.m_formMatrix);
867
868 for (size_t k = 0; k < actText.GetLength(); k++) {
869 wchar_t wChar = actText[k];
870 if (wChar <= 0x80 && !isprint(wChar))
871 wChar = 0x20;
872 if (wChar >= 0xFFFD)
873 continue;
874
875 PAGECHAR_INFO charinfo;
876 charinfo.m_Origin = pTextObj->GetPos();
877 charinfo.m_Index = m_TextBuf.GetLength();
878 charinfo.m_Unicode = wChar;
879 charinfo.m_CharCode = pFont->CharCodeFromUnicode(wChar);
880 charinfo.m_Flag = FPDFTEXT_CHAR_PIECE;
881 charinfo.m_pTextObj = pTextObj;
882 charinfo.m_CharBox = pTextObj->GetRect();
883 charinfo.m_Matrix = matrix;
884 m_TempTextBuf.AppendChar(wChar);
885 m_TempCharList.push_back(charinfo);
886 }
887 }
888
FindPreviousTextObject()889 void CPDF_TextPage::FindPreviousTextObject() {
890 if (m_TempCharList.empty() && m_CharList.empty())
891 return;
892
893 PAGECHAR_INFO preChar =
894 m_TempCharList.empty() ? m_CharList.back() : m_TempCharList.back();
895
896 if (preChar.m_pTextObj)
897 m_pPreTextObj = preChar.m_pTextObj;
898 }
899
SwapTempTextBuf(int32_t iCharListStartAppend,int32_t iBufStartAppend)900 void CPDF_TextPage::SwapTempTextBuf(int32_t iCharListStartAppend,
901 int32_t iBufStartAppend) {
902 int32_t i = iCharListStartAppend;
903 int32_t j = pdfium::CollectionSize<int32_t>(m_TempCharList) - 1;
904 for (; i < j; i++, j--) {
905 std::swap(m_TempCharList[i], m_TempCharList[j]);
906 std::swap(m_TempCharList[i].m_Index, m_TempCharList[j].m_Index);
907 }
908 wchar_t* pTempBuffer = m_TempTextBuf.GetBuffer();
909 i = iBufStartAppend;
910 j = m_TempTextBuf.GetLength() - 1;
911 for (; i < j; i++, j--)
912 std::swap(pTempBuffer[i], pTempBuffer[j]);
913 }
914
IsRightToLeft(const CPDF_TextObject * pTextObj,const CPDF_Font * pFont,size_t nItems) const915 bool CPDF_TextPage::IsRightToLeft(const CPDF_TextObject* pTextObj,
916 const CPDF_Font* pFont,
917 size_t nItems) const {
918 WideString str;
919 for (size_t i = 0; i < nItems; ++i) {
920 CPDF_TextObjectItem item;
921 pTextObj->GetItemInfo(i, &item);
922 if (item.m_CharCode == static_cast<uint32_t>(-1))
923 continue;
924 WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
925 wchar_t wChar = !wstrItem.IsEmpty() ? wstrItem[0] : 0;
926 if (wChar == 0)
927 wChar = item.m_CharCode;
928 if (wChar)
929 str += wChar;
930 }
931 return CFX_BidiString(str).OverallDirection() == CFX_BidiChar::RIGHT;
932 }
933
ProcessTextObject(PDFTEXT_Obj Obj)934 void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj) {
935 CPDF_TextObject* pTextObj = Obj.m_pTextObj.Get();
936 if (fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f)
937 return;
938 CFX_Matrix formMatrix = Obj.m_formMatrix;
939 CPDF_Font* pFont = pTextObj->GetFont();
940 CFX_Matrix matrix = pTextObj->GetTextMatrix();
941 matrix.Concat(formMatrix);
942
943 FPDFText_MarkedContent ePreMKC = PreMarkedContent(Obj);
944 if (ePreMKC == FPDFText_MarkedContent::Done) {
945 m_pPreTextObj = pTextObj;
946 m_perMatrix = formMatrix;
947 return;
948 }
949 GenerateCharacter result = GenerateCharacter::None;
950 if (m_pPreTextObj) {
951 result = ProcessInsertObject(pTextObj, formMatrix);
952 if (result == GenerateCharacter::LineBreak)
953 m_CurlineRect = Obj.m_pTextObj->GetRect();
954 else
955 m_CurlineRect.Union(Obj.m_pTextObj->GetRect());
956
957 switch (result) {
958 case GenerateCharacter::None:
959 break;
960 case GenerateCharacter::Space: {
961 PAGECHAR_INFO generateChar;
962 if (GenerateCharInfo(TEXT_SPACE_CHAR, generateChar)) {
963 if (!formMatrix.IsIdentity())
964 generateChar.m_Matrix = formMatrix;
965 m_TempTextBuf.AppendChar(TEXT_SPACE_CHAR);
966 m_TempCharList.push_back(generateChar);
967 }
968 break;
969 }
970 case GenerateCharacter::LineBreak:
971 CloseTempLine();
972 if (m_TextBuf.GetSize()) {
973 AppendGeneratedCharacter(TEXT_RETURN_CHAR, formMatrix);
974 AppendGeneratedCharacter(TEXT_LINEFEED_CHAR, formMatrix);
975 }
976 break;
977 case GenerateCharacter::Hyphen:
978 if (pTextObj->CountChars() == 1) {
979 CPDF_TextObjectItem item;
980 pTextObj->GetCharInfo(0, &item);
981 WideString wstrItem =
982 pTextObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
983 if (wstrItem.IsEmpty())
984 wstrItem += (wchar_t)item.m_CharCode;
985 wchar_t curChar = wstrItem[0];
986 if (curChar == 0x2D || curChar == 0xAD)
987 return;
988 }
989 while (m_TempTextBuf.GetSize() > 0 &&
990 m_TempTextBuf.AsStringView()[m_TempTextBuf.GetLength() - 1] ==
991 0x20) {
992 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
993 m_TempCharList.pop_back();
994 }
995 PAGECHAR_INFO* charinfo = &m_TempCharList.back();
996 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
997 charinfo->m_Unicode = 0x2;
998 charinfo->m_Flag = FPDFTEXT_CHAR_HYPHEN;
999 m_TempTextBuf.AppendChar(0xfffe);
1000 break;
1001 }
1002 } else {
1003 m_CurlineRect = Obj.m_pTextObj->GetRect();
1004 }
1005
1006 if (ePreMKC == FPDFText_MarkedContent::Delay) {
1007 ProcessMarkedContent(Obj);
1008 m_pPreTextObj = pTextObj;
1009 m_perMatrix = formMatrix;
1010 return;
1011 }
1012 m_pPreTextObj = pTextObj;
1013 m_perMatrix = formMatrix;
1014 size_t nItems = pTextObj->CountItems();
1015 float baseSpace = CalculateBaseSpace(pTextObj, matrix);
1016
1017 const bool bR2L = IsRightToLeft(pTextObj, pFont, nItems);
1018 const bool bIsBidiAndMirrorInverse =
1019 bR2L && (matrix.a * matrix.d - matrix.b * matrix.c) < 0;
1020 int32_t iBufStartAppend = m_TempTextBuf.GetLength();
1021 int32_t iCharListStartAppend =
1022 pdfium::CollectionSize<int32_t>(m_TempCharList);
1023
1024 float spacing = 0;
1025 for (size_t i = 0; i < nItems; ++i) {
1026 CPDF_TextObjectItem item;
1027 PAGECHAR_INFO charinfo;
1028 pTextObj->GetItemInfo(i, &item);
1029 if (item.m_CharCode == static_cast<uint32_t>(-1)) {
1030 WideString str = m_TempTextBuf.MakeString();
1031 if (str.IsEmpty())
1032 str = m_TextBuf.AsStringView();
1033 if (str.IsEmpty() || str[str.GetLength() - 1] == TEXT_SPACE_CHAR)
1034 continue;
1035
1036 float fontsize_h = pTextObj->m_TextState.GetFontSizeH();
1037 spacing = -fontsize_h * item.m_Origin.x / 1000;
1038 continue;
1039 }
1040 float charSpace = pTextObj->m_TextState.GetCharSpace();
1041 if (charSpace > 0.001)
1042 spacing += matrix.TransformDistance(charSpace);
1043 else if (charSpace < -0.001)
1044 spacing -= matrix.TransformDistance(fabs(charSpace));
1045 spacing -= baseSpace;
1046 if (spacing && i > 0) {
1047 int last_width = 0;
1048 float fontsize_h = pTextObj->m_TextState.GetFontSizeH();
1049 uint32_t space_charcode = pFont->CharCodeFromUnicode(' ');
1050 float threshold = 0;
1051 if (space_charcode != CPDF_Font::kInvalidCharCode)
1052 threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
1053 if (threshold > fontsize_h / 3)
1054 threshold = 0;
1055 else
1056 threshold /= 2;
1057 if (threshold == 0) {
1058 threshold = fontsize_h;
1059 int this_width = abs(GetCharWidth(item.m_CharCode, pFont));
1060 threshold =
1061 this_width > last_width ? (float)this_width : (float)last_width;
1062 threshold = NormalizeThreshold(threshold);
1063 threshold = fontsize_h * threshold / 1000;
1064 }
1065 if (threshold && (spacing && spacing >= threshold)) {
1066 charinfo.m_Unicode = TEXT_SPACE_CHAR;
1067 charinfo.m_Flag = FPDFTEXT_CHAR_GENERATED;
1068 charinfo.m_pTextObj = pTextObj;
1069 charinfo.m_Index = m_TextBuf.GetLength();
1070 m_TempTextBuf.AppendChar(TEXT_SPACE_CHAR);
1071 charinfo.m_CharCode = CPDF_Font::kInvalidCharCode;
1072 charinfo.m_Matrix = formMatrix;
1073 charinfo.m_Origin = matrix.Transform(item.m_Origin);
1074 charinfo.m_CharBox =
1075 CFX_FloatRect(charinfo.m_Origin.x, charinfo.m_Origin.y,
1076 charinfo.m_Origin.x, charinfo.m_Origin.y);
1077 m_TempCharList.push_back(charinfo);
1078 }
1079 if (item.m_CharCode == CPDF_Font::kInvalidCharCode)
1080 continue;
1081 }
1082 spacing = 0;
1083 WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
1084 bool bNoUnicode = false;
1085 if (wstrItem.IsEmpty() && item.m_CharCode) {
1086 wstrItem += static_cast<wchar_t>(item.m_CharCode);
1087 bNoUnicode = true;
1088 }
1089 charinfo.m_Index = -1;
1090 charinfo.m_CharCode = item.m_CharCode;
1091 if (bNoUnicode)
1092 charinfo.m_Flag = FPDFTEXT_CHAR_UNUNICODE;
1093 else
1094 charinfo.m_Flag = FPDFTEXT_CHAR_NORMAL;
1095
1096 charinfo.m_pTextObj = pTextObj;
1097 charinfo.m_Origin = matrix.Transform(item.m_Origin);
1098
1099 FX_RECT rect =
1100 charinfo.m_pTextObj->GetFont()->GetCharBBox(charinfo.m_CharCode);
1101 charinfo.m_CharBox.top =
1102 rect.top * pTextObj->GetFontSize() / 1000 + item.m_Origin.y;
1103 charinfo.m_CharBox.left =
1104 rect.left * pTextObj->GetFontSize() / 1000 + item.m_Origin.x;
1105 charinfo.m_CharBox.right =
1106 rect.right * pTextObj->GetFontSize() / 1000 + item.m_Origin.x;
1107 charinfo.m_CharBox.bottom =
1108 rect.bottom * pTextObj->GetFontSize() / 1000 + item.m_Origin.y;
1109 if (fabsf(charinfo.m_CharBox.top - charinfo.m_CharBox.bottom) < 0.01f) {
1110 charinfo.m_CharBox.top =
1111 charinfo.m_CharBox.bottom + pTextObj->GetFontSize();
1112 }
1113 if (fabsf(charinfo.m_CharBox.right - charinfo.m_CharBox.left) < 0.01f) {
1114 charinfo.m_CharBox.right =
1115 charinfo.m_CharBox.left + pTextObj->GetCharWidth(charinfo.m_CharCode);
1116 }
1117 charinfo.m_CharBox = matrix.TransformRect(charinfo.m_CharBox);
1118 charinfo.m_Matrix = matrix;
1119 if (wstrItem.IsEmpty()) {
1120 charinfo.m_Unicode = 0;
1121 m_TempCharList.push_back(charinfo);
1122 m_TempTextBuf.AppendChar(0xfffe);
1123 continue;
1124 }
1125 int nTotal = wstrItem.GetLength();
1126 bool bDel = false;
1127 const int count = std::min(pdfium::CollectionSize<int>(m_TempCharList), 7);
1128 float threshold = charinfo.m_Matrix.TransformXDistance(
1129 (float)TEXT_CHARRATIO_GAPDELTA * pTextObj->GetFontSize());
1130 for (int n = pdfium::CollectionSize<int>(m_TempCharList);
1131 n > pdfium::CollectionSize<int>(m_TempCharList) - count; n--) {
1132 const PAGECHAR_INFO& charinfo1 = m_TempCharList[n - 1];
1133 CFX_PointF diff = charinfo1.m_Origin - charinfo.m_Origin;
1134 if (charinfo1.m_CharCode == charinfo.m_CharCode &&
1135 charinfo1.m_pTextObj->GetFont() == charinfo.m_pTextObj->GetFont() &&
1136 fabs(diff.x) < threshold && fabs(diff.y) < threshold) {
1137 bDel = true;
1138 break;
1139 }
1140 }
1141 if (!bDel) {
1142 for (int nIndex = 0; nIndex < nTotal; nIndex++) {
1143 charinfo.m_Unicode = wstrItem[nIndex];
1144 if (charinfo.m_Unicode) {
1145 charinfo.m_Index = m_TextBuf.GetLength();
1146 m_TempTextBuf.AppendChar(charinfo.m_Unicode);
1147 } else {
1148 m_TempTextBuf.AppendChar(0xfffe);
1149 }
1150 m_TempCharList.push_back(charinfo);
1151 }
1152 } else if (i == 0) {
1153 WideString str = m_TempTextBuf.MakeString();
1154 if (!str.IsEmpty() && str[str.GetLength() - 1] == TEXT_SPACE_CHAR) {
1155 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1156 m_TempCharList.pop_back();
1157 }
1158 }
1159 }
1160 if (bIsBidiAndMirrorInverse)
1161 SwapTempTextBuf(iCharListStartAppend, iBufStartAppend);
1162 }
1163
GetTextObjectWritingMode(const CPDF_TextObject * pTextObj) const1164 CPDF_TextPage::TextOrientation CPDF_TextPage::GetTextObjectWritingMode(
1165 const CPDF_TextObject* pTextObj) const {
1166 size_t nChars = pTextObj->CountChars();
1167 if (nChars <= 1)
1168 return m_TextlineDir;
1169
1170 CPDF_TextObjectItem first, last;
1171 pTextObj->GetCharInfo(0, &first);
1172 pTextObj->GetCharInfo(nChars - 1, &last);
1173
1174 CFX_Matrix textMatrix = pTextObj->GetTextMatrix();
1175 first.m_Origin = textMatrix.Transform(first.m_Origin);
1176 last.m_Origin = textMatrix.Transform(last.m_Origin);
1177
1178 float dX = fabs(last.m_Origin.x - first.m_Origin.x);
1179 float dY = fabs(last.m_Origin.y - first.m_Origin.y);
1180 if (dX <= 0.0001f && dY <= 0.0001f)
1181 return TextOrientation::Unknown;
1182
1183 CFX_VectorF v(dX, dY);
1184 v.Normalize();
1185 if (v.y <= 0.0872f)
1186 return v.x <= 0.0872f ? m_TextlineDir : TextOrientation::Horizontal;
1187
1188 if (v.x <= 0.0872f)
1189 return TextOrientation::Vertical;
1190
1191 return m_TextlineDir;
1192 }
1193
IsHyphen(wchar_t curChar) const1194 bool CPDF_TextPage::IsHyphen(wchar_t curChar) const {
1195 WideStringView curText = m_TempTextBuf.AsStringView();
1196 if (curText.IsEmpty())
1197 curText = m_TextBuf.AsStringView();
1198
1199 if (curText.IsEmpty())
1200 return false;
1201
1202 auto iter = curText.rbegin();
1203 for (; (iter + 1) != curText.rend() && *iter == 0x20; iter++) {
1204 // Do nothing
1205 }
1206
1207 if (!IsHyphenCode(*iter))
1208 return false;
1209
1210 if ((iter + 1) != curText.rend()) {
1211 iter++;
1212 if (FXSYS_iswalpha(*iter) && FXSYS_iswalpha(*iter))
1213 return true;
1214 }
1215
1216 const PAGECHAR_INFO* preInfo;
1217 if (!m_TempCharList.empty())
1218 preInfo = &m_TempCharList.back();
1219 else if (!m_CharList.empty())
1220 preInfo = &m_CharList.back();
1221 else
1222 return false;
1223
1224 return FPDFTEXT_CHAR_PIECE == preInfo->m_Flag &&
1225 IsHyphenCode(preInfo->m_Unicode);
1226 }
1227
ProcessInsertObject(const CPDF_TextObject * pObj,const CFX_Matrix & formMatrix)1228 CPDF_TextPage::GenerateCharacter CPDF_TextPage::ProcessInsertObject(
1229 const CPDF_TextObject* pObj,
1230 const CFX_Matrix& formMatrix) {
1231 FindPreviousTextObject();
1232 TextOrientation WritingMode = GetTextObjectWritingMode(pObj);
1233 if (WritingMode == TextOrientation::Unknown)
1234 WritingMode = GetTextObjectWritingMode(m_pPreTextObj.Get());
1235
1236 size_t nItem = m_pPreTextObj->CountItems();
1237 if (nItem == 0)
1238 return GenerateCharacter::None;
1239
1240 CPDF_TextObjectItem PrevItem;
1241 m_pPreTextObj->GetItemInfo(nItem - 1, &PrevItem);
1242
1243 CPDF_TextObjectItem item;
1244 pObj->GetItemInfo(0, &item);
1245
1246 CFX_FloatRect this_rect = pObj->GetRect();
1247 CFX_FloatRect prev_rect = m_pPreTextObj->GetRect();
1248
1249 WideString wstrItem = pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
1250 if (wstrItem.IsEmpty())
1251 wstrItem += static_cast<wchar_t>(item.m_CharCode);
1252 wchar_t curChar = wstrItem[0];
1253 if (WritingMode == TextOrientation::Horizontal) {
1254 if (this_rect.Height() > 4.5 && prev_rect.Height() > 4.5) {
1255 float top = this_rect.top < prev_rect.top ? this_rect.top : prev_rect.top;
1256 float bottom = this_rect.bottom > prev_rect.bottom ? this_rect.bottom
1257 : prev_rect.bottom;
1258 if (bottom >= top) {
1259 return IsHyphen(curChar) ? GenerateCharacter::Hyphen
1260 : GenerateCharacter::LineBreak;
1261 }
1262 }
1263 } else if (WritingMode == TextOrientation::Vertical) {
1264 if (this_rect.Width() > pObj->GetFontSize() * 0.1f &&
1265 prev_rect.Width() > m_pPreTextObj->GetFontSize() * 0.1f) {
1266 float left = this_rect.left > m_CurlineRect.left ? this_rect.left
1267 : m_CurlineRect.left;
1268 float right = this_rect.right < m_CurlineRect.right ? this_rect.right
1269 : m_CurlineRect.right;
1270 if (right <= left) {
1271 return IsHyphen(curChar) ? GenerateCharacter::Hyphen
1272 : GenerateCharacter::LineBreak;
1273 }
1274 }
1275 }
1276
1277 float last_pos = PrevItem.m_Origin.x;
1278 int nLastWidth = GetCharWidth(PrevItem.m_CharCode, m_pPreTextObj->GetFont());
1279 float last_width = nLastWidth * m_pPreTextObj->GetFontSize() / 1000;
1280 last_width = fabs(last_width);
1281 int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont());
1282 float this_width = nThisWidth * pObj->GetFontSize() / 1000;
1283 this_width = fabs(this_width);
1284 float threshold = last_width > this_width ? last_width / 4 : this_width / 4;
1285
1286 CFX_Matrix prev_matrix = m_pPreTextObj->GetTextMatrix();
1287 prev_matrix.Concat(m_perMatrix);
1288
1289 CFX_Matrix prev_reverse = prev_matrix.GetInverse();
1290
1291 CFX_PointF pos = prev_reverse.Transform(formMatrix.Transform(pObj->GetPos()));
1292 if (last_width < this_width)
1293 threshold = prev_reverse.TransformDistance(threshold);
1294
1295 bool bNewline = false;
1296 if (WritingMode == TextOrientation::Horizontal) {
1297 CFX_FloatRect rect1(m_pPreTextObj->m_Left, pObj->m_Bottom,
1298 m_pPreTextObj->m_Right, pObj->m_Top);
1299 CFX_FloatRect rect2 = m_pPreTextObj->GetRect();
1300 CFX_FloatRect rect3 = rect1;
1301 rect1.Intersect(rect2);
1302 if ((rect1.IsEmpty() && rect2.Height() > 5 && rect3.Height() > 5) ||
1303 ((pos.y > threshold * 2 || pos.y < threshold * -3) &&
1304 (fabs(pos.y) < 1 ? fabs(pos.x) < fabs(pos.y) : true))) {
1305 bNewline = true;
1306 if (nItem > 1) {
1307 CPDF_TextObjectItem tempItem;
1308 m_pPreTextObj->GetItemInfo(0, &tempItem);
1309 CFX_Matrix m = m_pPreTextObj->GetTextMatrix();
1310 if (PrevItem.m_Origin.x > tempItem.m_Origin.x &&
1311 m_DisplayMatrix.a > 0.9 && m_DisplayMatrix.b < 0.1 &&
1312 m_DisplayMatrix.c < 0.1 && m_DisplayMatrix.d < -0.9 && m.b < 0.1 &&
1313 m.c < 0.1) {
1314 CFX_FloatRect re(0, m_pPreTextObj->m_Bottom, 1000,
1315 m_pPreTextObj->m_Top);
1316 if (re.Contains(pObj->GetPos())) {
1317 bNewline = false;
1318 } else {
1319 CFX_FloatRect rect(0, pObj->m_Bottom, 1000, pObj->m_Top);
1320 if (rect.Contains(m_pPreTextObj->GetPos()))
1321 bNewline = false;
1322 }
1323 }
1324 }
1325 }
1326 }
1327 if (bNewline) {
1328 return IsHyphen(curChar) ? GenerateCharacter::Hyphen
1329 : GenerateCharacter::LineBreak;
1330 }
1331
1332 if (pObj->CountChars() == 1 && (0x2D == curChar || 0xAD == curChar) &&
1333 IsHyphen(curChar)) {
1334 return GenerateCharacter::Hyphen;
1335 }
1336 WideString PrevStr =
1337 m_pPreTextObj->GetFont()->UnicodeFromCharCode(PrevItem.m_CharCode);
1338 wchar_t preChar = PrevStr.Last();
1339 CFX_Matrix matrix = pObj->GetTextMatrix();
1340 matrix.Concat(formMatrix);
1341
1342 threshold = (float)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth);
1343 threshold = threshold > 400
1344 ? (threshold < 700
1345 ? threshold / 4
1346 : (threshold > 800 ? threshold / 6 : threshold / 5))
1347 : (threshold / 2);
1348 if (nLastWidth >= nThisWidth) {
1349 threshold *= fabs(m_pPreTextObj->GetFontSize());
1350 } else {
1351 threshold *= fabs(pObj->GetFontSize());
1352 threshold = matrix.TransformDistance(threshold);
1353 threshold = prev_reverse.TransformDistance(threshold);
1354 }
1355 threshold /= 1000;
1356 if ((threshold < 1.4881 && threshold > 1.4879) ||
1357 (threshold < 1.39001 && threshold > 1.38999)) {
1358 threshold *= 1.5;
1359 }
1360 if (fabs(last_pos + last_width - pos.x) > threshold && curChar != L' ' &&
1361 preChar != L' ') {
1362 if (curChar != L' ' && preChar != L' ') {
1363 if ((pos.x - last_pos - last_width) > threshold ||
1364 (last_pos - pos.x - last_width) > threshold) {
1365 return GenerateCharacter::Space;
1366 }
1367 if (pos.x < 0 && (last_pos - pos.x - last_width) > threshold)
1368 return GenerateCharacter::Space;
1369 if ((pos.x - last_pos - last_width) > this_width ||
1370 (pos.x - last_pos - this_width) > last_width) {
1371 return GenerateCharacter::Space;
1372 }
1373 }
1374 }
1375 return GenerateCharacter::None;
1376 }
1377
IsSameTextObject(CPDF_TextObject * pTextObj1,CPDF_TextObject * pTextObj2)1378 bool CPDF_TextPage::IsSameTextObject(CPDF_TextObject* pTextObj1,
1379 CPDF_TextObject* pTextObj2) {
1380 if (!pTextObj1 || !pTextObj2)
1381 return false;
1382
1383 CFX_FloatRect rcPreObj = pTextObj2->GetRect();
1384 CFX_FloatRect rcCurObj = pTextObj1->GetRect();
1385 if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty()) {
1386 float dbXdif = fabs(rcPreObj.left - rcCurObj.left);
1387 size_t nCount = m_CharList.size();
1388 if (nCount >= 2) {
1389 PAGECHAR_INFO perCharTemp = m_CharList[nCount - 2];
1390 float dbSpace = perCharTemp.m_CharBox.Width();
1391 if (dbXdif > dbSpace)
1392 return false;
1393 }
1394 }
1395 if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) {
1396 rcPreObj.Intersect(rcCurObj);
1397 if (rcPreObj.IsEmpty())
1398 return false;
1399 if (fabs(rcPreObj.Width() - rcCurObj.Width()) > rcCurObj.Width() / 2) {
1400 return false;
1401 }
1402 if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize())
1403 return false;
1404 }
1405
1406 size_t nPreCount = pTextObj2->CountItems();
1407 if (nPreCount != pTextObj1->CountItems())
1408 return false;
1409
1410 // If both objects have no items, consider them same.
1411 if (nPreCount == 0)
1412 return true;
1413
1414 CPDF_TextObjectItem itemPer;
1415 CPDF_TextObjectItem itemCur;
1416 for (size_t i = 0; i < nPreCount; ++i) {
1417 pTextObj2->GetItemInfo(i, &itemPer);
1418 pTextObj1->GetItemInfo(i, &itemCur);
1419 if (itemCur.m_CharCode != itemPer.m_CharCode)
1420 return false;
1421 }
1422
1423 CFX_PointF diff = pTextObj1->GetPos() - pTextObj2->GetPos();
1424 float font_size = pTextObj2->GetFontSize();
1425 float char_size = GetCharWidth(itemPer.m_CharCode, pTextObj2->GetFont());
1426 float max_pre_size =
1427 std::max(std::max(rcPreObj.Height(), rcPreObj.Width()), font_size);
1428 if (fabs(diff.x) > char_size * font_size / 1000 * 0.9 ||
1429 fabs(diff.y) > max_pre_size / 8) {
1430 return false;
1431 }
1432 return true;
1433 }
1434
IsSameAsPreTextObject(CPDF_TextObject * pTextObj,const CPDF_PageObjectList * pObjList,CPDF_PageObjectList::const_iterator iter)1435 bool CPDF_TextPage::IsSameAsPreTextObject(
1436 CPDF_TextObject* pTextObj,
1437 const CPDF_PageObjectList* pObjList,
1438 CPDF_PageObjectList::const_iterator iter) {
1439 int i = 0;
1440 while (i < 5 && iter != pObjList->begin()) {
1441 --iter;
1442 CPDF_PageObject* pOtherObj = iter->get();
1443 if (pOtherObj == pTextObj || !pOtherObj->IsText())
1444 continue;
1445 if (IsSameTextObject(pOtherObj->AsText(), pTextObj))
1446 return true;
1447 ++i;
1448 }
1449 return false;
1450 }
1451
GenerateCharInfo(wchar_t unicode,PAGECHAR_INFO & info)1452 bool CPDF_TextPage::GenerateCharInfo(wchar_t unicode, PAGECHAR_INFO& info) {
1453 const PAGECHAR_INFO* preChar;
1454 if (!m_TempCharList.empty())
1455 preChar = &m_TempCharList.back();
1456 else if (!m_CharList.empty())
1457 preChar = &m_CharList.back();
1458 else
1459 return false;
1460
1461 info.m_Index = m_TextBuf.GetLength();
1462 info.m_Unicode = unicode;
1463 info.m_pTextObj = nullptr;
1464 info.m_CharCode = CPDF_Font::kInvalidCharCode;
1465 info.m_Flag = FPDFTEXT_CHAR_GENERATED;
1466
1467 int preWidth = 0;
1468 if (preChar->m_pTextObj && preChar->m_CharCode != -1) {
1469 preWidth =
1470 GetCharWidth(preChar->m_CharCode, preChar->m_pTextObj->GetFont());
1471 }
1472
1473 float fFontSize = preChar->m_pTextObj ? preChar->m_pTextObj->GetFontSize()
1474 : preChar->m_CharBox.Height();
1475 if (!fFontSize)
1476 fFontSize = kDefaultFontSize;
1477
1478 info.m_Origin = CFX_PointF(
1479 preChar->m_Origin.x + preWidth * (fFontSize) / 1000, preChar->m_Origin.y);
1480 info.m_CharBox = CFX_FloatRect(info.m_Origin.x, info.m_Origin.y,
1481 info.m_Origin.x, info.m_Origin.y);
1482 return true;
1483 }
1484
IsRectIntersect(const CFX_FloatRect & rect1,const CFX_FloatRect & rect2)1485 bool CPDF_TextPage::IsRectIntersect(const CFX_FloatRect& rect1,
1486 const CFX_FloatRect& rect2) {
1487 CFX_FloatRect rect = rect1;
1488 rect.Intersect(rect2);
1489 return !rect.IsEmpty();
1490 }
1491