1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fpdftext/cpdf_textpage.h"
8
9 #include <algorithm>
10 #include <utility>
11 #include <vector>
12
13 #include "core/fpdfapi/font/cpdf_font.h"
14 #include "core/fpdfapi/page/cpdf_form.h"
15 #include "core/fpdfapi/page/cpdf_formobject.h"
16 #include "core/fpdfapi/page/cpdf_page.h"
17 #include "core/fpdfapi/page/cpdf_pageobject.h"
18 #include "core/fpdfapi/page/cpdf_textobject.h"
19 #include "core/fpdfapi/parser/cpdf_dictionary.h"
20 #include "core/fpdfapi/parser/cpdf_string.h"
21 #include "core/fpdftext/unicodenormalizationdata.h"
22 #include "core/fxcrt/fx_bidi.h"
23 #include "core/fxcrt/fx_ext.h"
24 #include "core/fxcrt/fx_ucd.h"
25 #include "third_party/base/stl_util.h"
26
27 namespace {
28
29 const FX_FLOAT kDefaultFontSize = 1.0f;
30 const uint16_t* const g_UnicodeData_Normalization_Maps[5] = {
31 nullptr, g_UnicodeData_Normalization_Map1, g_UnicodeData_Normalization_Map2,
32 g_UnicodeData_Normalization_Map3, g_UnicodeData_Normalization_Map4};
33
NormalizeThreshold(FX_FLOAT threshold)34 FX_FLOAT NormalizeThreshold(FX_FLOAT threshold) {
35 if (threshold < 300)
36 return threshold / 2.0f;
37 if (threshold < 500)
38 return threshold / 4.0f;
39 if (threshold < 700)
40 return threshold / 5.0f;
41 return threshold / 6.0f;
42 }
43
CalculateBaseSpace(const CPDF_TextObject * pTextObj,const CFX_Matrix & matrix)44 FX_FLOAT CalculateBaseSpace(const CPDF_TextObject* pTextObj,
45 const CFX_Matrix& matrix) {
46 FX_FLOAT baseSpace = 0.0;
47 const int nItems = pTextObj->CountItems();
48 if (pTextObj->m_TextState.GetCharSpace() && nItems >= 3) {
49 bool bAllChar = true;
50 FX_FLOAT spacing =
51 matrix.TransformDistance(pTextObj->m_TextState.GetCharSpace());
52 baseSpace = spacing;
53 for (int i = 0; i < nItems; i++) {
54 CPDF_TextObjectItem item;
55 pTextObj->GetItemInfo(i, &item);
56 if (item.m_CharCode == static_cast<uint32_t>(-1)) {
57 FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
58 FX_FLOAT kerning = -fontsize_h * item.m_Origin.x / 1000;
59 baseSpace = std::min(baseSpace, kerning + spacing);
60 bAllChar = false;
61 }
62 }
63 if (baseSpace < 0.0 || (nItems == 3 && !bAllChar))
64 baseSpace = 0.0;
65 }
66 return baseSpace;
67 }
68
Unicode_GetNormalization(FX_WCHAR wch,FX_WCHAR * pDst)69 FX_STRSIZE Unicode_GetNormalization(FX_WCHAR wch, FX_WCHAR* pDst) {
70 wch = wch & 0xFFFF;
71 FX_WCHAR wFind = g_UnicodeData_Normalization[wch];
72 if (!wFind) {
73 if (pDst)
74 *pDst = wch;
75 return 1;
76 }
77 if (wFind >= 0x8000) {
78 wch = wFind - 0x8000;
79 wFind = 1;
80 } else {
81 wch = wFind & 0x0FFF;
82 wFind >>= 12;
83 }
84 const uint16_t* pMap = g_UnicodeData_Normalization_Maps[wFind];
85 if (pMap == g_UnicodeData_Normalization_Map4) {
86 pMap = g_UnicodeData_Normalization_Map4 + wch;
87 wFind = (FX_WCHAR)(*pMap++);
88 } else {
89 pMap += wch;
90 }
91 if (pDst) {
92 FX_WCHAR n = wFind;
93 while (n--)
94 *pDst++ = *pMap++;
95 }
96 return (FX_STRSIZE)wFind;
97 }
98
// Returns the fraction (0..1) of entries set to true in mask[start, end).
// The window is clamped to the bounds of |mask| so that out-of-range arguments
// never form invalid iterators; an empty window yields 0.
float MaskPercentFilled(const std::vector<bool>& mask,
                        int32_t start,
                        int32_t end) {
  if (start >= end)
    return 0;

  const int64_t nSize = static_cast<int64_t>(mask.size());
  const int64_t first = std::max<int64_t>(start, 0);
  const int64_t last = std::min<int64_t>(end, nSize);
  if (first >= last)
    return 0;

  const float count = static_cast<float>(
      std::count(mask.begin() + first, mask.begin() + last, true));
  return count / static_cast<float>(last - first);
}
108
109 } // namespace
110
FPDF_CHAR_INFO()111 FPDF_CHAR_INFO::FPDF_CHAR_INFO()
112 : m_Unicode(0),
113 m_Charcode(0),
114 m_Flag(0),
115 m_FontSize(0),
116 m_pTextObj(nullptr) {}
117
~FPDF_CHAR_INFO()118 FPDF_CHAR_INFO::~FPDF_CHAR_INFO() {}
119
PAGECHAR_INFO()120 PAGECHAR_INFO::PAGECHAR_INFO()
121 : m_Index(0), m_CharCode(0), m_Unicode(0), m_Flag(0), m_pTextObj(nullptr) {}
122
123 PAGECHAR_INFO::PAGECHAR_INFO(const PAGECHAR_INFO&) = default;
124
~PAGECHAR_INFO()125 PAGECHAR_INFO::~PAGECHAR_INFO() {}
126
CPDF_TextPage(const CPDF_Page * pPage,FPDFText_Direction flags)127 CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage, FPDFText_Direction flags)
128 : m_pPage(pPage),
129 m_parserflag(flags),
130 m_pPreTextObj(nullptr),
131 m_bIsParsed(false),
132 m_TextlineDir(TextOrientation::Unknown) {
133 m_TextBuf.EstimateSize(0, 10240);
134 m_DisplayMatrix =
135 pPage->GetDisplayMatrix(0, 0, static_cast<int>(pPage->GetPageWidth()),
136 static_cast<int>(pPage->GetPageHeight()), 0);
137 }
138
~CPDF_TextPage()139 CPDF_TextPage::~CPDF_TextPage() {}
140
IsControlChar(const PAGECHAR_INFO & charInfo)141 bool CPDF_TextPage::IsControlChar(const PAGECHAR_INFO& charInfo) {
142 switch (charInfo.m_Unicode) {
143 case 0x2:
144 case 0x3:
145 case 0x93:
146 case 0x94:
147 case 0x96:
148 case 0x97:
149 case 0x98:
150 case 0xfffe:
151 return charInfo.m_Flag != FPDFTEXT_CHAR_HYPHEN;
152 default:
153 return false;
154 }
155 }
156
ParseTextPage()157 void CPDF_TextPage::ParseTextPage() {
158 m_bIsParsed = false;
159 m_TextBuf.Clear();
160 m_CharList.clear();
161 m_pPreTextObj = nullptr;
162 ProcessObject();
163
164 m_bIsParsed = true;
165 m_CharIndex.clear();
166 int nCount = pdfium::CollectionSize<int>(m_CharList);
167 if (nCount)
168 m_CharIndex.push_back(0);
169
170 for (int i = 0; i < nCount; i++) {
171 int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
172 const PAGECHAR_INFO& charinfo = m_CharList[i];
173 if (charinfo.m_Flag == FPDFTEXT_CHAR_GENERATED ||
174 (charinfo.m_Unicode != 0 && !IsControlChar(charinfo))) {
175 if (indexSize % 2) {
176 m_CharIndex.push_back(1);
177 } else {
178 if (indexSize <= 0)
179 continue;
180 m_CharIndex[indexSize - 1] += 1;
181 }
182 } else {
183 if (indexSize % 2) {
184 if (indexSize <= 0)
185 continue;
186 m_CharIndex[indexSize - 1] = i + 1;
187 } else {
188 m_CharIndex.push_back(i + 1);
189 }
190 }
191 }
192 int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
193 if (indexSize % 2)
194 m_CharIndex.erase(m_CharIndex.begin() + indexSize - 1);
195 }
196
CountChars() const197 int CPDF_TextPage::CountChars() const {
198 return pdfium::CollectionSize<int>(m_CharList);
199 }
200
CharIndexFromTextIndex(int TextIndex) const201 int CPDF_TextPage::CharIndexFromTextIndex(int TextIndex) const {
202 int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
203 int count = 0;
204 for (int i = 0; i < indexSize; i += 2) {
205 count += m_CharIndex[i + 1];
206 if (count > TextIndex)
207 return TextIndex - count + m_CharIndex[i + 1] + m_CharIndex[i];
208 }
209 return -1;
210 }
211
TextIndexFromCharIndex(int CharIndex) const212 int CPDF_TextPage::TextIndexFromCharIndex(int CharIndex) const {
213 int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
214 int count = 0;
215 for (int i = 0; i < indexSize; i += 2) {
216 count += m_CharIndex[i + 1];
217 if (m_CharIndex[i + 1] + m_CharIndex[i] > CharIndex) {
218 if (CharIndex - m_CharIndex[i] < 0)
219 return -1;
220
221 return CharIndex - m_CharIndex[i] + count - m_CharIndex[i + 1];
222 }
223 }
224 return -1;
225 }
226
GetRectArray(int start,int nCount) const227 std::vector<CFX_FloatRect> CPDF_TextPage::GetRectArray(int start,
228 int nCount) const {
229 if (start < 0 || nCount == 0 || !m_bIsParsed)
230 return std::vector<CFX_FloatRect>();
231
232 if (nCount + start > pdfium::CollectionSize<int>(m_CharList) ||
233 nCount == -1) {
234 nCount = pdfium::CollectionSize<int>(m_CharList) - start;
235 }
236
237 std::vector<CFX_FloatRect> rectArray;
238 CPDF_TextObject* pCurObj = nullptr;
239 CFX_FloatRect rect;
240 int curPos = start;
241 bool bFlagNewRect = true;
242 while (nCount--) {
243 PAGECHAR_INFO info_curchar = m_CharList[curPos++];
244 if (info_curchar.m_Flag == FPDFTEXT_CHAR_GENERATED)
245 continue;
246 if (info_curchar.m_CharBox.Width() < 0.01 ||
247 info_curchar.m_CharBox.Height() < 0.01) {
248 continue;
249 }
250 if (!pCurObj)
251 pCurObj = info_curchar.m_pTextObj;
252 if (pCurObj != info_curchar.m_pTextObj) {
253 rectArray.push_back(rect);
254 pCurObj = info_curchar.m_pTextObj;
255 bFlagNewRect = true;
256 }
257 if (bFlagNewRect) {
258 CFX_Matrix matrix = info_curchar.m_pTextObj->GetTextMatrix();
259 matrix.Concat(info_curchar.m_Matrix);
260
261 CFX_Matrix matrix_reverse;
262 matrix_reverse.SetReverse(matrix);
263
264 CFX_PointF origin = matrix_reverse.Transform(info_curchar.m_Origin);
265 rect.left = info_curchar.m_CharBox.left;
266 rect.right = info_curchar.m_CharBox.right;
267 if (pCurObj->GetFont()->GetTypeDescent()) {
268 rect.bottom = origin.y +
269 pCurObj->GetFont()->GetTypeDescent() *
270 pCurObj->GetFontSize() / 1000;
271
272 rect.bottom = matrix.Transform(CFX_PointF(origin.x, rect.bottom)).y;
273 } else {
274 rect.bottom = info_curchar.m_CharBox.bottom;
275 }
276 if (pCurObj->GetFont()->GetTypeAscent()) {
277 rect.top =
278 origin.y +
279 pCurObj->GetFont()->GetTypeAscent() * pCurObj->GetFontSize() / 1000;
280 FX_FLOAT xPosTemp =
281 origin.x +
282 GetCharWidth(info_curchar.m_CharCode, pCurObj->GetFont()) *
283 pCurObj->GetFontSize() / 1000;
284 rect.top = matrix.Transform(CFX_PointF(xPosTemp, rect.top)).y;
285 } else {
286 rect.top = info_curchar.m_CharBox.top;
287 }
288 bFlagNewRect = false;
289 rect = info_curchar.m_CharBox;
290 rect.Normalize();
291 } else {
292 info_curchar.m_CharBox.Normalize();
293 rect.left = std::min(rect.left, info_curchar.m_CharBox.left);
294 rect.right = std::max(rect.right, info_curchar.m_CharBox.right);
295 rect.top = std::max(rect.top, info_curchar.m_CharBox.top);
296 rect.bottom = std::min(rect.bottom, info_curchar.m_CharBox.bottom);
297 }
298 }
299 rectArray.push_back(rect);
300 return rectArray;
301 }
302
GetIndexAtPos(const CFX_PointF & point,const CFX_SizeF & tolerance) const303 int CPDF_TextPage::GetIndexAtPos(const CFX_PointF& point,
304 const CFX_SizeF& tolerance) const {
305 if (!m_bIsParsed)
306 return -3;
307
308 int pos = 0;
309 int NearPos = -1;
310 double xdif = 5000;
311 double ydif = 5000;
312 while (pos < pdfium::CollectionSize<int>(m_CharList)) {
313 PAGECHAR_INFO charinfo = m_CharList[pos];
314 CFX_FloatRect charrect = charinfo.m_CharBox;
315 if (charrect.Contains(point))
316 break;
317 if (tolerance.width > 0 || tolerance.height > 0) {
318 CFX_FloatRect charRectExt;
319 charrect.Normalize();
320 charRectExt.left = charrect.left - tolerance.width / 2;
321 charRectExt.right = charrect.right + tolerance.width / 2;
322 charRectExt.top = charrect.top + tolerance.height / 2;
323 charRectExt.bottom = charrect.bottom - tolerance.height / 2;
324 if (charRectExt.Contains(point)) {
325 double curXdif, curYdif;
326 curXdif = FXSYS_fabs(point.x - charrect.left) <
327 FXSYS_fabs(point.x - charrect.right)
328 ? FXSYS_fabs(point.x - charrect.left)
329 : FXSYS_fabs(point.x - charrect.right);
330 curYdif = FXSYS_fabs(point.y - charrect.bottom) <
331 FXSYS_fabs(point.y - charrect.top)
332 ? FXSYS_fabs(point.y - charrect.bottom)
333 : FXSYS_fabs(point.y - charrect.top);
334 if (curYdif + curXdif < xdif + ydif) {
335 ydif = curYdif;
336 xdif = curXdif;
337 NearPos = pos;
338 }
339 }
340 }
341 ++pos;
342 }
343 return pos < pdfium::CollectionSize<int>(m_CharList) ? pos : NearPos;
344 }
345
GetTextByRect(const CFX_FloatRect & rect) const346 CFX_WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const {
347 if (!m_bIsParsed)
348 return CFX_WideString();
349
350 FX_FLOAT posy = 0;
351 bool IsContainPreChar = false;
352 bool IsAddLineFeed = false;
353 CFX_WideString strText;
354 for (const auto& charinfo : m_CharList) {
355 if (IsRectIntersect(rect, charinfo.m_CharBox)) {
356 if (FXSYS_fabs(posy - charinfo.m_Origin.y) > 0 && !IsContainPreChar &&
357 IsAddLineFeed) {
358 posy = charinfo.m_Origin.y;
359 if (!strText.IsEmpty())
360 strText += L"\r\n";
361 }
362 IsContainPreChar = true;
363 IsAddLineFeed = false;
364 if (charinfo.m_Unicode)
365 strText += charinfo.m_Unicode;
366 } else if (charinfo.m_Unicode == 32) {
367 if (IsContainPreChar && charinfo.m_Unicode) {
368 strText += charinfo.m_Unicode;
369 IsContainPreChar = false;
370 IsAddLineFeed = false;
371 }
372 } else {
373 IsContainPreChar = false;
374 IsAddLineFeed = true;
375 }
376 }
377 return strText;
378 }
379
GetCharInfo(int index,FPDF_CHAR_INFO * info) const380 void CPDF_TextPage::GetCharInfo(int index, FPDF_CHAR_INFO* info) const {
381 if (!m_bIsParsed)
382 return;
383
384 if (index < 0 || index >= pdfium::CollectionSize<int>(m_CharList))
385 return;
386
387 const PAGECHAR_INFO& charinfo = m_CharList[index];
388 info->m_Charcode = charinfo.m_CharCode;
389 info->m_Origin = charinfo.m_Origin;
390 info->m_Unicode = charinfo.m_Unicode;
391 info->m_Flag = charinfo.m_Flag;
392 info->m_CharBox = charinfo.m_CharBox;
393 info->m_pTextObj = charinfo.m_pTextObj;
394 if (charinfo.m_pTextObj && charinfo.m_pTextObj->GetFont())
395 info->m_FontSize = charinfo.m_pTextObj->GetFontSize();
396 else
397 info->m_FontSize = kDefaultFontSize;
398 info->m_Matrix = charinfo.m_Matrix;
399 }
400
CheckMarkedContentObject(int32_t & start,int32_t & nCount) const401 void CPDF_TextPage::CheckMarkedContentObject(int32_t& start,
402 int32_t& nCount) const {
403 PAGECHAR_INFO charinfo = m_CharList[start];
404 PAGECHAR_INFO charinfo2 = m_CharList[start + nCount - 1];
405 if (FPDFTEXT_CHAR_PIECE != charinfo.m_Flag &&
406 FPDFTEXT_CHAR_PIECE != charinfo2.m_Flag) {
407 return;
408 }
409 if (FPDFTEXT_CHAR_PIECE == charinfo.m_Flag) {
410 PAGECHAR_INFO charinfo1 = charinfo;
411 int startIndex = start;
412 while (FPDFTEXT_CHAR_PIECE == charinfo1.m_Flag &&
413 charinfo1.m_Index == charinfo.m_Index) {
414 startIndex--;
415 if (startIndex < 0)
416 break;
417 charinfo1 = m_CharList[startIndex];
418 }
419 startIndex++;
420 start = startIndex;
421 }
422 if (FPDFTEXT_CHAR_PIECE == charinfo2.m_Flag) {
423 PAGECHAR_INFO charinfo3 = charinfo2;
424 int endIndex = start + nCount - 1;
425 while (FPDFTEXT_CHAR_PIECE == charinfo3.m_Flag &&
426 charinfo3.m_Index == charinfo2.m_Index) {
427 endIndex++;
428 if (endIndex >= pdfium::CollectionSize<int>(m_CharList))
429 break;
430 charinfo3 = m_CharList[endIndex];
431 }
432 endIndex--;
433 nCount = endIndex - start + 1;
434 }
435 }
436
GetPageText(int start,int nCount) const437 CFX_WideString CPDF_TextPage::GetPageText(int start, int nCount) const {
438 if (!m_bIsParsed || nCount == 0)
439 return L"";
440
441 if (start < 0)
442 start = 0;
443
444 if (nCount == -1) {
445 nCount = pdfium::CollectionSize<int>(m_CharList) - start;
446 return CFX_WideString(
447 m_TextBuf.AsStringC().Mid(start, m_TextBuf.AsStringC().GetLength()));
448 }
449 if (nCount <= 0 || m_CharList.empty())
450 return L"";
451 if (nCount + start > pdfium::CollectionSize<int>(m_CharList) - 1)
452 nCount = pdfium::CollectionSize<int>(m_CharList) - start;
453 if (nCount <= 0)
454 return L"";
455 CheckMarkedContentObject(start, nCount);
456 int startindex = 0;
457 PAGECHAR_INFO charinfo = m_CharList[start];
458 int startOffset = 0;
459 while (charinfo.m_Index == -1) {
460 startOffset++;
461 if (startOffset > nCount ||
462 start + startOffset >= pdfium::CollectionSize<int>(m_CharList)) {
463 return L"";
464 }
465 charinfo = m_CharList[start + startOffset];
466 }
467 startindex = charinfo.m_Index;
468 charinfo = m_CharList[start + nCount - 1];
469 int nCountOffset = 0;
470 while (charinfo.m_Index == -1) {
471 nCountOffset++;
472 if (nCountOffset >= nCount)
473 return L"";
474 charinfo = m_CharList[start + nCount - nCountOffset - 1];
475 }
476 nCount = start + nCount - nCountOffset - startindex;
477 if (nCount <= 0)
478 return L"";
479 return CFX_WideString(m_TextBuf.AsStringC().Mid(startindex, nCount));
480 }
481
CountRects(int start,int nCount)482 int CPDF_TextPage::CountRects(int start, int nCount) {
483 if (!m_bIsParsed || start < 0)
484 return -1;
485
486 if (nCount == -1 ||
487 nCount + start > pdfium::CollectionSize<int>(m_CharList)) {
488 nCount = pdfium::CollectionSize<int>(m_CharList) - start;
489 }
490 m_SelRects = GetRectArray(start, nCount);
491 return pdfium::CollectionSize<int>(m_SelRects);
492 }
493
GetRect(int rectIndex,FX_FLOAT & left,FX_FLOAT & top,FX_FLOAT & right,FX_FLOAT & bottom) const494 void CPDF_TextPage::GetRect(int rectIndex,
495 FX_FLOAT& left,
496 FX_FLOAT& top,
497 FX_FLOAT& right,
498 FX_FLOAT& bottom) const {
499 if (!m_bIsParsed)
500 return;
501
502 if (rectIndex < 0 || rectIndex >= pdfium::CollectionSize<int>(m_SelRects))
503 return;
504
505 left = m_SelRects[rectIndex].left;
506 top = m_SelRects[rectIndex].top;
507 right = m_SelRects[rectIndex].right;
508 bottom = m_SelRects[rectIndex].bottom;
509 }
510
FindTextlineFlowOrientation() const511 CPDF_TextPage::TextOrientation CPDF_TextPage::FindTextlineFlowOrientation()
512 const {
513 if (m_pPage->GetPageObjectList()->empty())
514 return TextOrientation::Unknown;
515
516 const int32_t nPageWidth = static_cast<int32_t>(m_pPage->GetPageWidth());
517 const int32_t nPageHeight = static_cast<int32_t>(m_pPage->GetPageHeight());
518 if (nPageWidth <= 0 || nPageHeight <= 0)
519 return TextOrientation::Unknown;
520
521 std::vector<bool> nHorizontalMask(nPageWidth);
522 std::vector<bool> nVerticalMask(nPageHeight);
523 FX_FLOAT fLineHeight = 0.0f;
524 int32_t nStartH = nPageWidth;
525 int32_t nEndH = 0;
526 int32_t nStartV = nPageHeight;
527 int32_t nEndV = 0;
528 for (const auto& pPageObj : *m_pPage->GetPageObjectList()) {
529 if (!pPageObj->IsText())
530 continue;
531
532 int32_t minH = std::max(static_cast<int32_t>(pPageObj->m_Left), 0);
533 int32_t maxH =
534 std::min(static_cast<int32_t>(pPageObj->m_Right), nPageWidth);
535 int32_t minV = std::max(static_cast<int32_t>(pPageObj->m_Bottom), 0);
536 int32_t maxV = std::min(static_cast<int32_t>(pPageObj->m_Top), nPageHeight);
537 if (minH >= maxH || minV >= maxV)
538 continue;
539
540 for (int32_t i = minH; i < maxH; ++i)
541 nHorizontalMask[i] = true;
542 for (int32_t i = minV; i < maxV; ++i)
543 nVerticalMask[i] = true;
544
545 nStartH = std::min(nStartH, minH);
546 nEndH = std::max(nEndH, maxH);
547 nStartV = std::min(nStartV, minV);
548 nEndV = std::max(nEndV, maxV);
549
550 if (fLineHeight <= 0.0f)
551 fLineHeight = pPageObj->m_Top - pPageObj->m_Bottom;
552 }
553 const int32_t nDoubleLineHeight = 2 * fLineHeight;
554 if ((nEndV - nStartV) < nDoubleLineHeight)
555 return TextOrientation::Horizontal;
556 if ((nEndH - nStartH) < nDoubleLineHeight)
557 return TextOrientation::Vertical;
558
559 const FX_FLOAT nSumH = MaskPercentFilled(nHorizontalMask, nStartH, nEndH);
560 if (nSumH > 0.8f)
561 return TextOrientation::Horizontal;
562
563 const FX_FLOAT nSumV = MaskPercentFilled(nVerticalMask, nStartV, nEndV);
564 if (nSumH > nSumV)
565 return TextOrientation::Horizontal;
566 if (nSumH < nSumV)
567 return TextOrientation::Vertical;
568 return TextOrientation::Unknown;
569 }
570
AppendGeneratedCharacter(FX_WCHAR unicode,const CFX_Matrix & formMatrix)571 void CPDF_TextPage::AppendGeneratedCharacter(FX_WCHAR unicode,
572 const CFX_Matrix& formMatrix) {
573 PAGECHAR_INFO generateChar;
574 if (!GenerateCharInfo(unicode, generateChar))
575 return;
576
577 m_TextBuf.AppendChar(unicode);
578 if (!formMatrix.IsIdentity())
579 generateChar.m_Matrix = formMatrix;
580 m_CharList.push_back(generateChar);
581 }
582
ProcessObject()583 void CPDF_TextPage::ProcessObject() {
584 if (m_pPage->GetPageObjectList()->empty())
585 return;
586
587 m_TextlineDir = FindTextlineFlowOrientation();
588 const CPDF_PageObjectList* pObjList = m_pPage->GetPageObjectList();
589 for (auto it = pObjList->begin(); it != pObjList->end(); ++it) {
590 if (CPDF_PageObject* pObj = it->get()) {
591 if (pObj->IsText()) {
592 CFX_Matrix matrix;
593 ProcessTextObject(pObj->AsText(), matrix, pObjList, it);
594 } else if (pObj->IsForm()) {
595 CFX_Matrix formMatrix(1, 0, 0, 1, 0, 0);
596 ProcessFormObject(pObj->AsForm(), formMatrix);
597 }
598 }
599 }
600 for (const auto& obj : m_LineObj)
601 ProcessTextObject(obj);
602
603 m_LineObj.clear();
604 CloseTempLine();
605 }
606
ProcessFormObject(CPDF_FormObject * pFormObj,const CFX_Matrix & formMatrix)607 void CPDF_TextPage::ProcessFormObject(CPDF_FormObject* pFormObj,
608 const CFX_Matrix& formMatrix) {
609 CPDF_PageObjectList* pObjectList = pFormObj->m_pForm->GetPageObjectList();
610 if (pObjectList->empty())
611 return;
612
613 CFX_Matrix curFormMatrix;
614 curFormMatrix = pFormObj->m_FormMatrix;
615 curFormMatrix.Concat(formMatrix);
616
617 for (auto it = pObjectList->begin(); it != pObjectList->end(); ++it) {
618 if (CPDF_PageObject* pPageObj = it->get()) {
619 if (pPageObj->IsText())
620 ProcessTextObject(pPageObj->AsText(), curFormMatrix, pObjectList, it);
621 else if (pPageObj->IsForm())
622 ProcessFormObject(pPageObj->AsForm(), curFormMatrix);
623 }
624 }
625 }
626
GetCharWidth(uint32_t charCode,CPDF_Font * pFont) const627 int CPDF_TextPage::GetCharWidth(uint32_t charCode, CPDF_Font* pFont) const {
628 if (charCode == CPDF_Font::kInvalidCharCode)
629 return 0;
630
631 if (int w = pFont->GetCharWidthF(charCode))
632 return w;
633
634 CFX_ByteString str;
635 pFont->AppendChar(str, charCode);
636 if (int w = pFont->GetStringWidth(str.c_str(), 1))
637 return w;
638
639 return pFont->GetCharBBox(charCode).Width();
640 }
641
AddCharInfoByLRDirection(FX_WCHAR wChar,PAGECHAR_INFO info)642 void CPDF_TextPage::AddCharInfoByLRDirection(FX_WCHAR wChar,
643 PAGECHAR_INFO info) {
644 if (IsControlChar(info)) {
645 info.m_Index = -1;
646 m_CharList.push_back(info);
647 return;
648 }
649
650 info.m_Index = m_TextBuf.GetLength();
651 if (wChar >= 0xFB00 && wChar <= 0xFB06) {
652 FX_WCHAR* pDst = nullptr;
653 FX_STRSIZE nCount = Unicode_GetNormalization(wChar, pDst);
654 if (nCount >= 1) {
655 pDst = FX_Alloc(FX_WCHAR, nCount);
656 Unicode_GetNormalization(wChar, pDst);
657 for (int nIndex = 0; nIndex < nCount; nIndex++) {
658 PAGECHAR_INFO info2 = info;
659 info2.m_Unicode = pDst[nIndex];
660 info2.m_Flag = FPDFTEXT_CHAR_PIECE;
661 m_TextBuf.AppendChar(info2.m_Unicode);
662 m_CharList.push_back(info2);
663 }
664 FX_Free(pDst);
665 return;
666 }
667 }
668 m_TextBuf.AppendChar(wChar);
669 m_CharList.push_back(info);
670 }
671
AddCharInfoByRLDirection(FX_WCHAR wChar,PAGECHAR_INFO info)672 void CPDF_TextPage::AddCharInfoByRLDirection(FX_WCHAR wChar,
673 PAGECHAR_INFO info) {
674 if (IsControlChar(info)) {
675 info.m_Index = -1;
676 m_CharList.push_back(info);
677 return;
678 }
679
680 info.m_Index = m_TextBuf.GetLength();
681 wChar = FX_GetMirrorChar(wChar, true, false);
682 FX_WCHAR* pDst = nullptr;
683 FX_STRSIZE nCount = Unicode_GetNormalization(wChar, pDst);
684 if (nCount >= 1) {
685 pDst = FX_Alloc(FX_WCHAR, nCount);
686 Unicode_GetNormalization(wChar, pDst);
687 for (int nIndex = 0; nIndex < nCount; nIndex++) {
688 PAGECHAR_INFO info2 = info;
689 info2.m_Unicode = pDst[nIndex];
690 info2.m_Flag = FPDFTEXT_CHAR_PIECE;
691 m_TextBuf.AppendChar(info2.m_Unicode);
692 m_CharList.push_back(info2);
693 }
694 FX_Free(pDst);
695 return;
696 }
697 info.m_Unicode = wChar;
698 m_TextBuf.AppendChar(info.m_Unicode);
699 m_CharList.push_back(info);
700 }
701
CloseTempLine()702 void CPDF_TextPage::CloseTempLine() {
703 if (m_TempCharList.empty())
704 return;
705
706 CFX_WideString str = m_TempTextBuf.MakeString();
707 bool bPrevSpace = false;
708 for (int i = 0; i < str.GetLength(); i++) {
709 if (str.GetAt(i) != ' ') {
710 bPrevSpace = false;
711 continue;
712 }
713 if (bPrevSpace) {
714 m_TempTextBuf.Delete(i, 1);
715 m_TempCharList.erase(m_TempCharList.begin() + i);
716 str.Delete(i);
717 i--;
718 }
719 bPrevSpace = true;
720 }
721 CFX_BidiString bidi(str);
722 if (m_parserflag == FPDFText_Direction::Right)
723 bidi.SetOverallDirectionRight();
724 CFX_BidiChar::Direction eCurrentDirection = bidi.OverallDirection();
725 for (const auto& segment : bidi) {
726 if (segment.direction == CFX_BidiChar::RIGHT ||
727 (segment.direction == CFX_BidiChar::NEUTRAL &&
728 eCurrentDirection == CFX_BidiChar::RIGHT)) {
729 eCurrentDirection = CFX_BidiChar::RIGHT;
730 for (int m = segment.start + segment.count; m > segment.start; --m)
731 AddCharInfoByRLDirection(bidi.CharAt(m - 1), m_TempCharList[m - 1]);
732 } else {
733 eCurrentDirection = CFX_BidiChar::LEFT;
734 for (int m = segment.start; m < segment.start + segment.count; m++)
735 AddCharInfoByLRDirection(bidi.CharAt(m), m_TempCharList[m]);
736 }
737 }
738 m_TempCharList.clear();
739 m_TempTextBuf.Delete(0, m_TempTextBuf.GetLength());
740 }
741
ProcessTextObject(CPDF_TextObject * pTextObj,const CFX_Matrix & formMatrix,const CPDF_PageObjectList * pObjList,CPDF_PageObjectList::const_iterator ObjPos)742 void CPDF_TextPage::ProcessTextObject(
743 CPDF_TextObject* pTextObj,
744 const CFX_Matrix& formMatrix,
745 const CPDF_PageObjectList* pObjList,
746 CPDF_PageObjectList::const_iterator ObjPos) {
747 if (FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f)
748 return;
749
750 size_t count = m_LineObj.size();
751 PDFTEXT_Obj Obj;
752 Obj.m_pTextObj = pTextObj;
753 Obj.m_formMatrix = formMatrix;
754 if (count == 0) {
755 m_LineObj.push_back(Obj);
756 return;
757 }
758 if (IsSameAsPreTextObject(pTextObj, pObjList, ObjPos))
759 return;
760
761 PDFTEXT_Obj prev_Obj = m_LineObj[count - 1];
762 CPDF_TextObjectItem item;
763 int nItem = prev_Obj.m_pTextObj->CountItems();
764 prev_Obj.m_pTextObj->GetItemInfo(nItem - 1, &item);
765 FX_FLOAT prev_width =
766 GetCharWidth(item.m_CharCode, prev_Obj.m_pTextObj->GetFont()) *
767 prev_Obj.m_pTextObj->GetFontSize() / 1000;
768
769 CFX_Matrix prev_matrix = prev_Obj.m_pTextObj->GetTextMatrix();
770 prev_width = FXSYS_fabs(prev_width);
771 prev_matrix.Concat(prev_Obj.m_formMatrix);
772 prev_width = prev_matrix.TransformDistance(prev_width);
773 pTextObj->GetItemInfo(0, &item);
774 FX_FLOAT this_width = GetCharWidth(item.m_CharCode, pTextObj->GetFont()) *
775 pTextObj->GetFontSize() / 1000;
776 this_width = FXSYS_fabs(this_width);
777
778 CFX_Matrix this_matrix = pTextObj->GetTextMatrix();
779 this_width = FXSYS_fabs(this_width);
780 this_matrix.Concat(formMatrix);
781 this_width = this_matrix.TransformDistance(this_width);
782
783 FX_FLOAT threshold =
784 prev_width > this_width ? prev_width / 4 : this_width / 4;
785 CFX_PointF prev_pos = m_DisplayMatrix.Transform(
786 prev_Obj.m_formMatrix.Transform(prev_Obj.m_pTextObj->GetPos()));
787 CFX_PointF this_pos =
788 m_DisplayMatrix.Transform(formMatrix.Transform(pTextObj->GetPos()));
789 if (FXSYS_fabs(this_pos.y - prev_pos.y) > threshold * 2) {
790 for (size_t i = 0; i < count; i++)
791 ProcessTextObject(m_LineObj[i]);
792 m_LineObj.clear();
793 m_LineObj.push_back(Obj);
794 return;
795 }
796
797 for (size_t i = count; i > 0; --i) {
798 PDFTEXT_Obj prev_text_obj = m_LineObj[i - 1];
799 CFX_PointF new_prev_pos =
800 m_DisplayMatrix.Transform(prev_text_obj.m_formMatrix.Transform(
801 prev_text_obj.m_pTextObj->GetPos()));
802 if (this_pos.x >= new_prev_pos.x) {
803 m_LineObj.insert(m_LineObj.begin() + i, Obj);
804 return;
805 }
806 }
807 m_LineObj.insert(m_LineObj.begin(), Obj);
808 }
809
PreMarkedContent(PDFTEXT_Obj Obj)810 FPDFText_MarkedContent CPDF_TextPage::PreMarkedContent(PDFTEXT_Obj Obj) {
811 CPDF_TextObject* pTextObj = Obj.m_pTextObj;
812 if (!pTextObj->m_ContentMark)
813 return FPDFText_MarkedContent::Pass;
814
815 int nContentMark = pTextObj->m_ContentMark.CountItems();
816 if (nContentMark < 1)
817 return FPDFText_MarkedContent::Pass;
818
819 CFX_WideString actText;
820 bool bExist = false;
821 CPDF_Dictionary* pDict = nullptr;
822 int n = 0;
823 for (n = 0; n < nContentMark; n++) {
824 const CPDF_ContentMarkItem& item = pTextObj->m_ContentMark.GetItem(n);
825 pDict = item.GetParam();
826 if (!pDict)
827 continue;
828 CPDF_String* temp = ToString(pDict->GetObjectFor("ActualText"));
829 if (temp) {
830 bExist = true;
831 actText = temp->GetUnicodeText();
832 }
833 }
834 if (!bExist)
835 return FPDFText_MarkedContent::Pass;
836
837 if (m_pPreTextObj && m_pPreTextObj->m_ContentMark &&
838 m_pPreTextObj->m_ContentMark.CountItems() == n &&
839 pDict == m_pPreTextObj->m_ContentMark.GetItem(n - 1).GetParam()) {
840 return FPDFText_MarkedContent::Done;
841 }
842
843 FX_STRSIZE nItems = actText.GetLength();
844 if (nItems < 1)
845 return FPDFText_MarkedContent::Pass;
846
847 CPDF_Font* pFont = pTextObj->GetFont();
848 bExist = false;
849 for (FX_STRSIZE i = 0; i < nItems; i++) {
850 if (pFont->CharCodeFromUnicode(actText.GetAt(i)) !=
851 CPDF_Font::kInvalidCharCode) {
852 bExist = true;
853 break;
854 }
855 }
856 if (!bExist)
857 return FPDFText_MarkedContent::Pass;
858
859 bExist = false;
860 for (FX_STRSIZE i = 0; i < nItems; i++) {
861 FX_WCHAR wChar = actText.GetAt(i);
862 if ((wChar > 0x80 && wChar < 0xFFFD) || (wChar <= 0x80 && isprint(wChar))) {
863 bExist = true;
864 break;
865 }
866 }
867 if (!bExist)
868 return FPDFText_MarkedContent::Done;
869
870 return FPDFText_MarkedContent::Delay;
871 }
872
ProcessMarkedContent(PDFTEXT_Obj Obj)873 void CPDF_TextPage::ProcessMarkedContent(PDFTEXT_Obj Obj) {
874 CPDF_TextObject* pTextObj = Obj.m_pTextObj;
875 if (!pTextObj->m_ContentMark)
876 return;
877
878 int nContentMark = pTextObj->m_ContentMark.CountItems();
879 if (nContentMark < 1)
880 return;
881
882 CFX_WideString actText;
883 for (int n = 0; n < nContentMark; n++) {
884 const CPDF_ContentMarkItem& item = pTextObj->m_ContentMark.GetItem(n);
885 CPDF_Dictionary* pDict = item.GetParam();
886 if (pDict)
887 actText = pDict->GetUnicodeTextFor("ActualText");
888 }
889 FX_STRSIZE nItems = actText.GetLength();
890 if (nItems < 1)
891 return;
892
893 CPDF_Font* pFont = pTextObj->GetFont();
894 CFX_Matrix matrix = pTextObj->GetTextMatrix();
895 matrix.Concat(Obj.m_formMatrix);
896
897 for (FX_STRSIZE k = 0; k < nItems; k++) {
898 FX_WCHAR wChar = actText.GetAt(k);
899 if (wChar <= 0x80 && !isprint(wChar))
900 wChar = 0x20;
901 if (wChar >= 0xFFFD)
902 continue;
903
904 PAGECHAR_INFO charinfo;
905 charinfo.m_Origin = pTextObj->GetPos();
906 charinfo.m_Index = m_TextBuf.GetLength();
907 charinfo.m_Unicode = wChar;
908 charinfo.m_CharCode = pFont->CharCodeFromUnicode(wChar);
909 charinfo.m_Flag = FPDFTEXT_CHAR_PIECE;
910 charinfo.m_pTextObj = pTextObj;
911 charinfo.m_CharBox = pTextObj->GetRect();
912 charinfo.m_Matrix = matrix;
913 m_TempTextBuf.AppendChar(wChar);
914 m_TempCharList.push_back(charinfo);
915 }
916 }
917
FindPreviousTextObject()918 void CPDF_TextPage::FindPreviousTextObject() {
919 if (m_TempCharList.empty() && m_CharList.empty())
920 return;
921
922 PAGECHAR_INFO preChar =
923 m_TempCharList.empty() ? m_CharList.back() : m_TempCharList.back();
924
925 if (preChar.m_pTextObj)
926 m_pPreTextObj = preChar.m_pTextObj;
927 }
928
SwapTempTextBuf(int32_t iCharListStartAppend,int32_t iBufStartAppend)929 void CPDF_TextPage::SwapTempTextBuf(int32_t iCharListStartAppend,
930 int32_t iBufStartAppend) {
931 int32_t i = iCharListStartAppend;
932 int32_t j = pdfium::CollectionSize<int32_t>(m_TempCharList) - 1;
933 for (; i < j; i++, j--) {
934 std::swap(m_TempCharList[i], m_TempCharList[j]);
935 std::swap(m_TempCharList[i].m_Index, m_TempCharList[j].m_Index);
936 }
937 FX_WCHAR* pTempBuffer = m_TempTextBuf.GetBuffer();
938 i = iBufStartAppend;
939 j = m_TempTextBuf.GetLength() - 1;
940 for (; i < j; i++, j--)
941 std::swap(pTempBuffer[i], pTempBuffer[j]);
942 }
943
IsRightToLeft(const CPDF_TextObject * pTextObj,const CPDF_Font * pFont,int nItems) const944 bool CPDF_TextPage::IsRightToLeft(const CPDF_TextObject* pTextObj,
945 const CPDF_Font* pFont,
946 int nItems) const {
947 CFX_WideString str;
948 for (int32_t i = 0; i < nItems; i++) {
949 CPDF_TextObjectItem item;
950 pTextObj->GetItemInfo(i, &item);
951 if (item.m_CharCode == static_cast<uint32_t>(-1))
952 continue;
953 CFX_WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
954 FX_WCHAR wChar = wstrItem.GetAt(0);
955 if ((wstrItem.IsEmpty() || wChar == 0) && item.m_CharCode)
956 wChar = (FX_WCHAR)item.m_CharCode;
957 if (wChar)
958 str += wChar;
959 }
960 return CFX_BidiString(str).OverallDirection() == CFX_BidiChar::RIGHT;
961 }
962
ProcessTextObject(PDFTEXT_Obj Obj)963 void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj) {
964 CPDF_TextObject* pTextObj = Obj.m_pTextObj;
965 if (FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f)
966 return;
967 CFX_Matrix formMatrix = Obj.m_formMatrix;
968 CPDF_Font* pFont = pTextObj->GetFont();
969 CFX_Matrix matrix = pTextObj->GetTextMatrix();
970 matrix.Concat(formMatrix);
971
972 FPDFText_MarkedContent ePreMKC = PreMarkedContent(Obj);
973 if (ePreMKC == FPDFText_MarkedContent::Done) {
974 m_pPreTextObj = pTextObj;
975 m_perMatrix = formMatrix;
976 return;
977 }
978 GenerateCharacter result = GenerateCharacter::None;
979 if (m_pPreTextObj) {
980 result = ProcessInsertObject(pTextObj, formMatrix);
981 if (result == GenerateCharacter::LineBreak)
982 m_CurlineRect = Obj.m_pTextObj->GetRect();
983 else
984 m_CurlineRect.Union(Obj.m_pTextObj->GetRect());
985
986 switch (result) {
987 case GenerateCharacter::None:
988 break;
989 case GenerateCharacter::Space: {
990 PAGECHAR_INFO generateChar;
991 if (GenerateCharInfo(TEXT_SPACE_CHAR, generateChar)) {
992 if (!formMatrix.IsIdentity())
993 generateChar.m_Matrix = formMatrix;
994 m_TempTextBuf.AppendChar(TEXT_SPACE_CHAR);
995 m_TempCharList.push_back(generateChar);
996 }
997 break;
998 }
999 case GenerateCharacter::LineBreak:
1000 CloseTempLine();
1001 if (m_TextBuf.GetSize()) {
1002 AppendGeneratedCharacter(TEXT_RETURN_CHAR, formMatrix);
1003 AppendGeneratedCharacter(TEXT_LINEFEED_CHAR, formMatrix);
1004 }
1005 break;
1006 case GenerateCharacter::Hyphen:
1007 if (pTextObj->CountChars() == 1) {
1008 CPDF_TextObjectItem item;
1009 pTextObj->GetCharInfo(0, &item);
1010 CFX_WideString wstrItem =
1011 pTextObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
1012 if (wstrItem.IsEmpty())
1013 wstrItem += (FX_WCHAR)item.m_CharCode;
1014 FX_WCHAR curChar = wstrItem.GetAt(0);
1015 if (curChar == 0x2D || curChar == 0xAD)
1016 return;
1017 }
1018 while (m_TempTextBuf.GetSize() > 0 &&
1019 m_TempTextBuf.AsStringC().GetAt(m_TempTextBuf.GetLength() - 1) ==
1020 0x20) {
1021 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1022 m_TempCharList.pop_back();
1023 }
1024 PAGECHAR_INFO* charinfo = &m_TempCharList.back();
1025 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1026 charinfo->m_Unicode = 0x2;
1027 charinfo->m_Flag = FPDFTEXT_CHAR_HYPHEN;
1028 m_TempTextBuf.AppendChar(0xfffe);
1029 break;
1030 }
1031 } else {
1032 m_CurlineRect = Obj.m_pTextObj->GetRect();
1033 }
1034
1035 if (ePreMKC == FPDFText_MarkedContent::Delay) {
1036 ProcessMarkedContent(Obj);
1037 m_pPreTextObj = pTextObj;
1038 m_perMatrix = formMatrix;
1039 return;
1040 }
1041 m_pPreTextObj = pTextObj;
1042 m_perMatrix = formMatrix;
1043 int nItems = pTextObj->CountItems();
1044 FX_FLOAT baseSpace = CalculateBaseSpace(pTextObj, matrix);
1045
1046 const bool bR2L = IsRightToLeft(pTextObj, pFont, nItems);
1047 const bool bIsBidiAndMirrorInverse =
1048 bR2L && (matrix.a * matrix.d - matrix.b * matrix.c) < 0;
1049 int32_t iBufStartAppend = m_TempTextBuf.GetLength();
1050 int32_t iCharListStartAppend =
1051 pdfium::CollectionSize<int32_t>(m_TempCharList);
1052
1053 FX_FLOAT spacing = 0;
1054 for (int i = 0; i < nItems; i++) {
1055 CPDF_TextObjectItem item;
1056 PAGECHAR_INFO charinfo;
1057 pTextObj->GetItemInfo(i, &item);
1058 if (item.m_CharCode == static_cast<uint32_t>(-1)) {
1059 CFX_WideString str = m_TempTextBuf.MakeString();
1060 if (str.IsEmpty())
1061 str = m_TextBuf.AsStringC();
1062 if (str.IsEmpty() || str.GetAt(str.GetLength() - 1) == TEXT_SPACE_CHAR)
1063 continue;
1064
1065 FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
1066 spacing = -fontsize_h * item.m_Origin.x / 1000;
1067 continue;
1068 }
1069 FX_FLOAT charSpace = pTextObj->m_TextState.GetCharSpace();
1070 if (charSpace > 0.001)
1071 spacing += matrix.TransformDistance(charSpace);
1072 else if (charSpace < -0.001)
1073 spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace));
1074 spacing -= baseSpace;
1075 if (spacing && i > 0) {
1076 int last_width = 0;
1077 FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
1078 uint32_t space_charcode = pFont->CharCodeFromUnicode(' ');
1079 FX_FLOAT threshold = 0;
1080 if (space_charcode != CPDF_Font::kInvalidCharCode)
1081 threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
1082 if (threshold > fontsize_h / 3)
1083 threshold = 0;
1084 else
1085 threshold /= 2;
1086 if (threshold == 0) {
1087 threshold = fontsize_h;
1088 int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont));
1089 threshold = this_width > last_width ? (FX_FLOAT)this_width
1090 : (FX_FLOAT)last_width;
1091 threshold = NormalizeThreshold(threshold);
1092 threshold = fontsize_h * threshold / 1000;
1093 }
1094 if (threshold && (spacing && spacing >= threshold)) {
1095 charinfo.m_Unicode = TEXT_SPACE_CHAR;
1096 charinfo.m_Flag = FPDFTEXT_CHAR_GENERATED;
1097 charinfo.m_pTextObj = pTextObj;
1098 charinfo.m_Index = m_TextBuf.GetLength();
1099 m_TempTextBuf.AppendChar(TEXT_SPACE_CHAR);
1100 charinfo.m_CharCode = CPDF_Font::kInvalidCharCode;
1101 charinfo.m_Matrix = formMatrix;
1102 charinfo.m_Origin = matrix.Transform(item.m_Origin);
1103 charinfo.m_CharBox =
1104 CFX_FloatRect(charinfo.m_Origin.x, charinfo.m_Origin.y,
1105 charinfo.m_Origin.x, charinfo.m_Origin.y);
1106 m_TempCharList.push_back(charinfo);
1107 }
1108 if (item.m_CharCode == CPDF_Font::kInvalidCharCode)
1109 continue;
1110 }
1111 spacing = 0;
1112 CFX_WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
1113 bool bNoUnicode = false;
1114 if (wstrItem.IsEmpty() && item.m_CharCode) {
1115 wstrItem += static_cast<FX_WCHAR>(item.m_CharCode);
1116 bNoUnicode = true;
1117 }
1118 charinfo.m_Index = -1;
1119 charinfo.m_CharCode = item.m_CharCode;
1120 if (bNoUnicode)
1121 charinfo.m_Flag = FPDFTEXT_CHAR_UNUNICODE;
1122 else
1123 charinfo.m_Flag = FPDFTEXT_CHAR_NORMAL;
1124
1125 charinfo.m_pTextObj = pTextObj;
1126 charinfo.m_Origin = matrix.Transform(item.m_Origin);
1127
1128 FX_RECT rect =
1129 charinfo.m_pTextObj->GetFont()->GetCharBBox(charinfo.m_CharCode);
1130 charinfo.m_CharBox.top =
1131 rect.top * pTextObj->GetFontSize() / 1000 + item.m_Origin.y;
1132 charinfo.m_CharBox.left =
1133 rect.left * pTextObj->GetFontSize() / 1000 + item.m_Origin.x;
1134 charinfo.m_CharBox.right =
1135 rect.right * pTextObj->GetFontSize() / 1000 + item.m_Origin.x;
1136 charinfo.m_CharBox.bottom =
1137 rect.bottom * pTextObj->GetFontSize() / 1000 + item.m_Origin.y;
1138 if (fabsf(charinfo.m_CharBox.top - charinfo.m_CharBox.bottom) < 0.01f) {
1139 charinfo.m_CharBox.top =
1140 charinfo.m_CharBox.bottom + pTextObj->GetFontSize();
1141 }
1142 if (fabsf(charinfo.m_CharBox.right - charinfo.m_CharBox.left) < 0.01f) {
1143 charinfo.m_CharBox.right =
1144 charinfo.m_CharBox.left + pTextObj->GetCharWidth(charinfo.m_CharCode);
1145 }
1146 matrix.TransformRect(charinfo.m_CharBox);
1147 charinfo.m_Matrix = matrix;
1148 if (wstrItem.IsEmpty()) {
1149 charinfo.m_Unicode = 0;
1150 m_TempCharList.push_back(charinfo);
1151 m_TempTextBuf.AppendChar(0xfffe);
1152 continue;
1153 } else {
1154 int nTotal = wstrItem.GetLength();
1155 bool bDel = false;
1156 const int count =
1157 std::min(pdfium::CollectionSize<int>(m_TempCharList), 7);
1158 FX_FLOAT threshold = charinfo.m_Matrix.TransformXDistance(
1159 (FX_FLOAT)TEXT_CHARRATIO_GAPDELTA * pTextObj->GetFontSize());
1160 for (int n = pdfium::CollectionSize<int>(m_TempCharList);
1161 n > pdfium::CollectionSize<int>(m_TempCharList) - count; n--) {
1162 const PAGECHAR_INFO& charinfo1 = m_TempCharList[n - 1];
1163 CFX_PointF diff = charinfo1.m_Origin - charinfo.m_Origin;
1164 if (charinfo1.m_CharCode == charinfo.m_CharCode &&
1165 charinfo1.m_pTextObj->GetFont() == charinfo.m_pTextObj->GetFont() &&
1166 FXSYS_fabs(diff.x) < threshold && FXSYS_fabs(diff.y) < threshold) {
1167 bDel = true;
1168 break;
1169 }
1170 }
1171 if (!bDel) {
1172 for (int nIndex = 0; nIndex < nTotal; nIndex++) {
1173 charinfo.m_Unicode = wstrItem.GetAt(nIndex);
1174 if (charinfo.m_Unicode) {
1175 charinfo.m_Index = m_TextBuf.GetLength();
1176 m_TempTextBuf.AppendChar(charinfo.m_Unicode);
1177 } else {
1178 m_TempTextBuf.AppendChar(0xfffe);
1179 }
1180 m_TempCharList.push_back(charinfo);
1181 }
1182 } else if (i == 0) {
1183 CFX_WideString str = m_TempTextBuf.MakeString();
1184 if (!str.IsEmpty() &&
1185 str.GetAt(str.GetLength() - 1) == TEXT_SPACE_CHAR) {
1186 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1187 m_TempCharList.pop_back();
1188 }
1189 }
1190 }
1191 }
1192 if (bIsBidiAndMirrorInverse)
1193 SwapTempTextBuf(iCharListStartAppend, iBufStartAppend);
1194 }
1195
GetTextObjectWritingMode(const CPDF_TextObject * pTextObj) const1196 CPDF_TextPage::TextOrientation CPDF_TextPage::GetTextObjectWritingMode(
1197 const CPDF_TextObject* pTextObj) const {
1198 int32_t nChars = pTextObj->CountChars();
1199 if (nChars == 1)
1200 return m_TextlineDir;
1201
1202 CPDF_TextObjectItem first, last;
1203 pTextObj->GetCharInfo(0, &first);
1204 pTextObj->GetCharInfo(nChars - 1, &last);
1205
1206 CFX_Matrix textMatrix = pTextObj->GetTextMatrix();
1207 first.m_Origin = textMatrix.Transform(first.m_Origin);
1208 last.m_Origin = textMatrix.Transform(last.m_Origin);
1209
1210 FX_FLOAT dX = FXSYS_fabs(last.m_Origin.x - first.m_Origin.x);
1211 FX_FLOAT dY = FXSYS_fabs(last.m_Origin.y - first.m_Origin.y);
1212 if (dX <= 0.0001f && dY <= 0.0001f)
1213 return TextOrientation::Unknown;
1214
1215 CFX_VectorF v(dX, dY);
1216 v.Normalize();
1217 if (v.y <= 0.0872f)
1218 return v.x <= 0.0872f ? m_TextlineDir : TextOrientation::Horizontal;
1219
1220 if (v.x <= 0.0872f)
1221 return TextOrientation::Vertical;
1222
1223 return m_TextlineDir;
1224 }
1225
IsHyphen(FX_WCHAR curChar)1226 bool CPDF_TextPage::IsHyphen(FX_WCHAR curChar) {
1227 CFX_WideString strCurText = m_TempTextBuf.MakeString();
1228 if (strCurText.IsEmpty())
1229 strCurText = m_TextBuf.AsStringC();
1230 FX_STRSIZE nCount = strCurText.GetLength();
1231 int nIndex = nCount - 1;
1232 FX_WCHAR wcTmp = strCurText.GetAt(nIndex);
1233 while (wcTmp == 0x20 && nIndex <= nCount - 1 && nIndex >= 0)
1234 wcTmp = strCurText.GetAt(--nIndex);
1235 if (0x2D == wcTmp || 0xAD == wcTmp) {
1236 if (--nIndex > 0) {
1237 FX_WCHAR preChar = strCurText.GetAt((nIndex));
1238 if (((preChar >= L'A' && preChar <= L'Z') ||
1239 (preChar >= L'a' && preChar <= L'z')) &&
1240 ((curChar >= L'A' && curChar <= L'Z') ||
1241 (curChar >= L'a' && curChar <= L'z'))) {
1242 return true;
1243 }
1244 }
1245 const PAGECHAR_INFO* preInfo;
1246 if (!m_TempCharList.empty())
1247 preInfo = &m_TempCharList.back();
1248 else if (!m_CharList.empty())
1249 preInfo = &m_CharList.back();
1250 else
1251 return false;
1252 if (FPDFTEXT_CHAR_PIECE == preInfo->m_Flag &&
1253 (0xAD == preInfo->m_Unicode || 0x2D == preInfo->m_Unicode)) {
1254 return true;
1255 }
1256 }
1257 return false;
1258 }
1259
ProcessInsertObject(const CPDF_TextObject * pObj,const CFX_Matrix & formMatrix)1260 CPDF_TextPage::GenerateCharacter CPDF_TextPage::ProcessInsertObject(
1261 const CPDF_TextObject* pObj,
1262 const CFX_Matrix& formMatrix) {
1263 FindPreviousTextObject();
1264 TextOrientation WritingMode = GetTextObjectWritingMode(pObj);
1265 if (WritingMode == TextOrientation::Unknown)
1266 WritingMode = GetTextObjectWritingMode(m_pPreTextObj);
1267
1268 CFX_FloatRect this_rect = pObj->GetRect();
1269 CFX_FloatRect prev_rect = m_pPreTextObj->GetRect();
1270 CPDF_TextObjectItem PrevItem;
1271 CPDF_TextObjectItem item;
1272 int nItem = m_pPreTextObj->CountItems();
1273 m_pPreTextObj->GetItemInfo(nItem - 1, &PrevItem);
1274 pObj->GetItemInfo(0, &item);
1275 CFX_WideString wstrItem =
1276 pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
1277 if (wstrItem.IsEmpty())
1278 wstrItem += static_cast<FX_WCHAR>(item.m_CharCode);
1279 FX_WCHAR curChar = wstrItem.GetAt(0);
1280 if (WritingMode == TextOrientation::Horizontal) {
1281 if (this_rect.Height() > 4.5 && prev_rect.Height() > 4.5) {
1282 FX_FLOAT top =
1283 this_rect.top < prev_rect.top ? this_rect.top : prev_rect.top;
1284 FX_FLOAT bottom = this_rect.bottom > prev_rect.bottom ? this_rect.bottom
1285 : prev_rect.bottom;
1286 if (bottom >= top) {
1287 return IsHyphen(curChar) ? GenerateCharacter::Hyphen
1288 : GenerateCharacter::LineBreak;
1289 }
1290 }
1291 } else if (WritingMode == TextOrientation::Vertical) {
1292 if (this_rect.Width() > pObj->GetFontSize() * 0.1f &&
1293 prev_rect.Width() > m_pPreTextObj->GetFontSize() * 0.1f) {
1294 FX_FLOAT left = this_rect.left > m_CurlineRect.left ? this_rect.left
1295 : m_CurlineRect.left;
1296 FX_FLOAT right = this_rect.right < m_CurlineRect.right
1297 ? this_rect.right
1298 : m_CurlineRect.right;
1299 if (right <= left) {
1300 return IsHyphen(curChar) ? GenerateCharacter::Hyphen
1301 : GenerateCharacter::LineBreak;
1302 }
1303 }
1304 }
1305
1306 FX_FLOAT last_pos = PrevItem.m_Origin.x;
1307 int nLastWidth = GetCharWidth(PrevItem.m_CharCode, m_pPreTextObj->GetFont());
1308 FX_FLOAT last_width = nLastWidth * m_pPreTextObj->GetFontSize() / 1000;
1309 last_width = FXSYS_fabs(last_width);
1310 int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont());
1311 FX_FLOAT this_width = nThisWidth * pObj->GetFontSize() / 1000;
1312 this_width = FXSYS_fabs(this_width);
1313 FX_FLOAT threshold =
1314 last_width > this_width ? last_width / 4 : this_width / 4;
1315
1316 CFX_Matrix prev_matrix = m_pPreTextObj->GetTextMatrix();
1317 prev_matrix.Concat(m_perMatrix);
1318
1319 CFX_Matrix prev_reverse;
1320 prev_reverse.SetReverse(prev_matrix);
1321
1322 CFX_PointF pos = prev_reverse.Transform(formMatrix.Transform(pObj->GetPos()));
1323 if (last_width < this_width)
1324 threshold = prev_reverse.TransformDistance(threshold);
1325
1326 bool bNewline = false;
1327 if (WritingMode == TextOrientation::Horizontal) {
1328 CFX_FloatRect rect1(m_pPreTextObj->m_Left, pObj->m_Bottom,
1329 m_pPreTextObj->m_Right, pObj->m_Top);
1330 CFX_FloatRect rect2 = m_pPreTextObj->GetRect();
1331 CFX_FloatRect rect3 = rect1;
1332 rect1.Intersect(rect2);
1333 if ((rect1.IsEmpty() && rect2.Height() > 5 && rect3.Height() > 5) ||
1334 ((pos.y > threshold * 2 || pos.y < threshold * -3) &&
1335 (FXSYS_fabs(pos.y) < 1 ? FXSYS_fabs(pos.x) < FXSYS_fabs(pos.y)
1336 : true))) {
1337 bNewline = true;
1338 if (nItem > 1) {
1339 CPDF_TextObjectItem tempItem;
1340 m_pPreTextObj->GetItemInfo(0, &tempItem);
1341 CFX_Matrix m = m_pPreTextObj->GetTextMatrix();
1342 if (PrevItem.m_Origin.x > tempItem.m_Origin.x &&
1343 m_DisplayMatrix.a > 0.9 && m_DisplayMatrix.b < 0.1 &&
1344 m_DisplayMatrix.c < 0.1 && m_DisplayMatrix.d < -0.9 && m.b < 0.1 &&
1345 m.c < 0.1) {
1346 CFX_FloatRect re(0, m_pPreTextObj->m_Bottom, 1000,
1347 m_pPreTextObj->m_Top);
1348 if (re.Contains(pObj->GetPos())) {
1349 bNewline = false;
1350 } else {
1351 CFX_FloatRect rect(0, pObj->m_Bottom, 1000, pObj->m_Top);
1352 if (rect.Contains(m_pPreTextObj->GetPos()))
1353 bNewline = false;
1354 }
1355 }
1356 }
1357 }
1358 }
1359 if (bNewline) {
1360 return IsHyphen(curChar) ? GenerateCharacter::Hyphen
1361 : GenerateCharacter::LineBreak;
1362 }
1363
1364 int32_t nChars = pObj->CountChars();
1365 if (nChars == 1 && (0x2D == curChar || 0xAD == curChar) &&
1366 IsHyphen(curChar)) {
1367 return GenerateCharacter::Hyphen;
1368 }
1369 CFX_WideString PrevStr =
1370 m_pPreTextObj->GetFont()->UnicodeFromCharCode(PrevItem.m_CharCode);
1371 FX_WCHAR preChar = PrevStr.GetAt(PrevStr.GetLength() - 1);
1372 CFX_Matrix matrix = pObj->GetTextMatrix();
1373 matrix.Concat(formMatrix);
1374
1375 threshold = (FX_FLOAT)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth);
1376 threshold = threshold > 400
1377 ? (threshold < 700
1378 ? threshold / 4
1379 : (threshold > 800 ? threshold / 6 : threshold / 5))
1380 : (threshold / 2);
1381 if (nLastWidth >= nThisWidth) {
1382 threshold *= FXSYS_fabs(m_pPreTextObj->GetFontSize());
1383 } else {
1384 threshold *= FXSYS_fabs(pObj->GetFontSize());
1385 threshold = matrix.TransformDistance(threshold);
1386 threshold = prev_reverse.TransformDistance(threshold);
1387 }
1388 threshold /= 1000;
1389 if ((threshold < 1.4881 && threshold > 1.4879) ||
1390 (threshold < 1.39001 && threshold > 1.38999)) {
1391 threshold *= 1.5;
1392 }
1393 if (FXSYS_fabs(last_pos + last_width - pos.x) > threshold &&
1394 curChar != L' ' && preChar != L' ') {
1395 if (curChar != L' ' && preChar != L' ') {
1396 if ((pos.x - last_pos - last_width) > threshold ||
1397 (last_pos - pos.x - last_width) > threshold) {
1398 return GenerateCharacter::Space;
1399 }
1400 if (pos.x < 0 && (last_pos - pos.x - last_width) > threshold)
1401 return GenerateCharacter::Space;
1402 if ((pos.x - last_pos - last_width) > this_width ||
1403 (pos.x - last_pos - this_width) > last_width) {
1404 return GenerateCharacter::Space;
1405 }
1406 }
1407 }
1408 return GenerateCharacter::None;
1409 }
1410
IsSameTextObject(CPDF_TextObject * pTextObj1,CPDF_TextObject * pTextObj2)1411 bool CPDF_TextPage::IsSameTextObject(CPDF_TextObject* pTextObj1,
1412 CPDF_TextObject* pTextObj2) {
1413 if (!pTextObj1 || !pTextObj2)
1414 return false;
1415
1416 CFX_FloatRect rcPreObj = pTextObj2->GetRect();
1417 CFX_FloatRect rcCurObj = pTextObj1->GetRect();
1418 if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty()) {
1419 FX_FLOAT dbXdif = FXSYS_fabs(rcPreObj.left - rcCurObj.left);
1420 size_t nCount = m_CharList.size();
1421 if (nCount >= 2) {
1422 PAGECHAR_INFO perCharTemp = m_CharList[nCount - 2];
1423 FX_FLOAT dbSpace = perCharTemp.m_CharBox.Width();
1424 if (dbXdif > dbSpace)
1425 return false;
1426 }
1427 }
1428 if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) {
1429 rcPreObj.Intersect(rcCurObj);
1430 if (rcPreObj.IsEmpty())
1431 return false;
1432 if (FXSYS_fabs(rcPreObj.Width() - rcCurObj.Width()) >
1433 rcCurObj.Width() / 2) {
1434 return false;
1435 }
1436 if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize())
1437 return false;
1438 }
1439 int nPreCount = pTextObj2->CountItems();
1440 int nCurCount = pTextObj1->CountItems();
1441 if (nPreCount != nCurCount)
1442 return false;
1443 // If both objects have no items, consider them same.
1444 if (!nPreCount)
1445 return true;
1446
1447 CPDF_TextObjectItem itemPer;
1448 CPDF_TextObjectItem itemCur;
1449 for (int i = 0; i < nPreCount; i++) {
1450 pTextObj2->GetItemInfo(i, &itemPer);
1451 pTextObj1->GetItemInfo(i, &itemCur);
1452 if (itemCur.m_CharCode != itemPer.m_CharCode)
1453 return false;
1454 }
1455
1456 CFX_PointF diff = pTextObj1->GetPos() - pTextObj2->GetPos();
1457 FX_FLOAT font_size = pTextObj2->GetFontSize();
1458 FX_FLOAT char_size = GetCharWidth(itemPer.m_CharCode, pTextObj2->GetFont());
1459 FX_FLOAT max_pre_size =
1460 std::max(std::max(rcPreObj.Height(), rcPreObj.Width()), font_size);
1461 if (FXSYS_fabs(diff.x) > char_size * font_size / 1000 * 0.9 ||
1462 FXSYS_fabs(diff.y) > max_pre_size / 8) {
1463 return false;
1464 }
1465 return true;
1466 }
1467
IsSameAsPreTextObject(CPDF_TextObject * pTextObj,const CPDF_PageObjectList * pObjList,CPDF_PageObjectList::const_iterator iter)1468 bool CPDF_TextPage::IsSameAsPreTextObject(
1469 CPDF_TextObject* pTextObj,
1470 const CPDF_PageObjectList* pObjList,
1471 CPDF_PageObjectList::const_iterator iter) {
1472 int i = 0;
1473 while (i < 5 && iter != pObjList->begin()) {
1474 --iter;
1475 CPDF_PageObject* pOtherObj = iter->get();
1476 if (pOtherObj == pTextObj || !pOtherObj->IsText())
1477 continue;
1478 if (IsSameTextObject(pOtherObj->AsText(), pTextObj))
1479 return true;
1480 ++i;
1481 }
1482 return false;
1483 }
1484
GenerateCharInfo(FX_WCHAR unicode,PAGECHAR_INFO & info)1485 bool CPDF_TextPage::GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info) {
1486 const PAGECHAR_INFO* preChar;
1487 if (!m_TempCharList.empty())
1488 preChar = &m_TempCharList.back();
1489 else if (!m_CharList.empty())
1490 preChar = &m_CharList.back();
1491 else
1492 return false;
1493
1494 info.m_Index = m_TextBuf.GetLength();
1495 info.m_Unicode = unicode;
1496 info.m_pTextObj = nullptr;
1497 info.m_CharCode = CPDF_Font::kInvalidCharCode;
1498 info.m_Flag = FPDFTEXT_CHAR_GENERATED;
1499
1500 int preWidth = 0;
1501 if (preChar->m_pTextObj && preChar->m_CharCode != -1) {
1502 preWidth =
1503 GetCharWidth(preChar->m_CharCode, preChar->m_pTextObj->GetFont());
1504 }
1505
1506 FX_FLOAT fFontSize = preChar->m_pTextObj ? preChar->m_pTextObj->GetFontSize()
1507 : preChar->m_CharBox.Height();
1508 if (!fFontSize)
1509 fFontSize = kDefaultFontSize;
1510
1511 info.m_Origin = CFX_PointF(
1512 preChar->m_Origin.x + preWidth * (fFontSize) / 1000, preChar->m_Origin.y);
1513 info.m_CharBox = CFX_FloatRect(info.m_Origin.x, info.m_Origin.y,
1514 info.m_Origin.x, info.m_Origin.y);
1515 return true;
1516 }
1517
IsRectIntersect(const CFX_FloatRect & rect1,const CFX_FloatRect & rect2)1518 bool CPDF_TextPage::IsRectIntersect(const CFX_FloatRect& rect1,
1519 const CFX_FloatRect& rect2) {
1520 CFX_FloatRect rect = rect1;
1521 rect.Intersect(rect2);
1522 return !rect.IsEmpty();
1523 }
1524