• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include <algorithm>
8 #include <cctype>
9 #include <cwctype>
10 #include <memory>
11 
12 #include "core/include/fpdfapi/fpdf_module.h"
13 #include "core/include/fpdfapi/fpdf_page.h"
14 #include "core/include/fpdfapi/fpdf_pageobj.h"
15 #include "core/include/fpdfapi/fpdf_resource.h"
16 #include "core/include/fpdftext/fpdf_text.h"
17 #include "core/include/fxcrt/fx_bidi.h"
18 #include "core/include/fxcrt/fx_ext.h"
19 #include "core/include/fxcrt/fx_ucd.h"
20 #include "text_int.h"
21 
22 namespace {
23 
_IsIgnoreSpaceCharacter(FX_WCHAR curChar)24 FX_BOOL _IsIgnoreSpaceCharacter(FX_WCHAR curChar) {
25   if (curChar < 255) {
26     return FALSE;
27   }
28   if ((curChar >= 0x0600 && curChar <= 0x06FF) ||
29       (curChar >= 0xFE70 && curChar <= 0xFEFF) ||
30       (curChar >= 0xFB50 && curChar <= 0xFDFF) ||
31       (curChar >= 0x0400 && curChar <= 0x04FF) ||
32       (curChar >= 0x0500 && curChar <= 0x052F) ||
33       (curChar >= 0xA640 && curChar <= 0xA69F) ||
34       (curChar >= 0x2DE0 && curChar <= 0x2DFF) || curChar == 8467 ||
35       (curChar >= 0x2000 && curChar <= 0x206F)) {
36     return FALSE;
37   }
38   return TRUE;
39 }
40 
_NormalizeThreshold(FX_FLOAT threshold)41 FX_FLOAT _NormalizeThreshold(FX_FLOAT threshold) {
42   if (threshold < 300) {
43     return threshold / 2.0f;
44   }
45   if (threshold < 500) {
46     return threshold / 4.0f;
47   }
48   if (threshold < 700) {
49     return threshold / 5.0f;
50   }
51   return threshold / 6.0f;
52 }
53 
_CalculateBaseSpace(const CPDF_TextObject * pTextObj,const CFX_Matrix & matrix)54 FX_FLOAT _CalculateBaseSpace(const CPDF_TextObject* pTextObj,
55                              const CFX_Matrix& matrix) {
56   FX_FLOAT baseSpace = 0.0;
57   const int nItems = pTextObj->CountItems();
58   if (pTextObj->m_TextState.GetObject()->m_CharSpace && nItems >= 3) {
59     FX_BOOL bAllChar = TRUE;
60     FX_FLOAT spacing = matrix.TransformDistance(
61         pTextObj->m_TextState.GetObject()->m_CharSpace);
62     baseSpace = spacing;
63     for (int i = 0; i < nItems; i++) {
64       CPDF_TextObjectItem item;
65       pTextObj->GetItemInfo(i, &item);
66       if (item.m_CharCode == (FX_DWORD)-1) {
67         FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
68         FX_FLOAT kerning = -fontsize_h * item.m_OriginX / 1000;
69         baseSpace = std::min(baseSpace, kerning + spacing);
70         bAllChar = FALSE;
71       }
72     }
73     if (baseSpace < 0.0 || (nItems == 3 && !bAllChar)) {
74       baseSpace = 0.0;
75     }
76   }
77   return baseSpace;
78 }
79 
80 const FX_FLOAT kDefaultFontSize = 1.0f;
81 
82 }  // namespace
83 
CPDFText_ParseOptions()84 CPDFText_ParseOptions::CPDFText_ParseOptions()
85     : m_bGetCharCodeOnly(FALSE),
86       m_bNormalizeObjs(TRUE),
87       m_bOutputHyphen(FALSE) {}
88 
CreateTextPage(const CPDF_Page * pPage,int flags)89 IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_Page* pPage,
90                                              int flags) {
91   return new CPDF_TextPage(pPage, flags);
92 }
93 
CreatePageFind(const IPDF_TextPage * pTextPage)94 IPDF_TextPageFind* IPDF_TextPageFind::CreatePageFind(
95     const IPDF_TextPage* pTextPage) {
96   return pTextPage ? new CPDF_TextPageFind(pTextPage) : nullptr;
97 }
98 
CreateLinkExtract()99 IPDF_LinkExtract* IPDF_LinkExtract::CreateLinkExtract() {
100   return new CPDF_LinkExtract();
101 }
102 
// Wide-character constants for whitespace handling.
#define TEXT_BLANK_CHAR L' '
#define TEXT_LINEFEED_CHAR L'\n'
#define TEXT_RETURN_CHAR L'\r'

// Wide-string constants for whitespace handling.
#define TEXT_EMPTY L""
#define TEXT_BLANK L" "
#define TEXT_RETURN_LINEFEED L"\r\n"
#define TEXT_LINEFEED L"\n"

// Numeric tolerance; its use is not visible in this part of the file.
#define TEXT_CHARRATIO_GAPDELTA 0.070
111 
CPDF_TextPage(const CPDF_Page * pPage,int flags)112 CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage, int flags)
113     : m_pPage(pPage),
114       m_charList(512),
115       m_TempCharList(50),
116       m_parserflag(flags),
117       m_pPreTextObj(nullptr),
118       m_bIsParsed(false),
119       m_TextlineDir(-1),
120       m_CurlineRect(0, 0, 0, 0) {
121   m_TextBuf.EstimateSize(0, 10240);
122   pPage->GetDisplayMatrix(m_DisplayMatrix, 0, 0, (int)pPage->GetPageWidth(),
123                           (int)pPage->GetPageHeight(), 0);
124 }
125 
NormalizeObjects(FX_BOOL bNormalize)126 void CPDF_TextPage::NormalizeObjects(FX_BOOL bNormalize) {
127   m_ParseOptions.m_bNormalizeObjs = bNormalize;
128 }
IsControlChar(const PAGECHAR_INFO & charInfo)129 bool CPDF_TextPage::IsControlChar(const PAGECHAR_INFO& charInfo) {
130   switch (charInfo.m_Unicode) {
131     case 0x2:
132     case 0x3:
133     case 0x93:
134     case 0x94:
135     case 0x96:
136     case 0x97:
137     case 0x98:
138     case 0xfffe:
139       return charInfo.m_Flag != FPDFTEXT_CHAR_HYPHEN;
140     default:
141       return false;
142   }
143 }
ParseTextPage()144 FX_BOOL CPDF_TextPage::ParseTextPage() {
145   m_bIsParsed = false;
146   if (!m_pPage)
147     return FALSE;
148 
149   m_TextBuf.Clear();
150   m_charList.RemoveAll();
151   m_pPreTextObj = NULL;
152   ProcessObject();
153   m_bIsParsed = true;
154   if (!m_ParseOptions.m_bGetCharCodeOnly) {
155     m_CharIndex.RemoveAll();
156     int nCount = m_charList.GetSize();
157     if (nCount) {
158       m_CharIndex.Add(0);
159     }
160     for (int i = 0; i < nCount; i++) {
161       int indexSize = m_CharIndex.GetSize();
162       FX_BOOL bNormal = FALSE;
163       PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(i);
164       if (charinfo.m_Flag == FPDFTEXT_CHAR_GENERATED) {
165         bNormal = TRUE;
166       } else if (charinfo.m_Unicode == 0 || IsControlChar(charinfo)) {
167         bNormal = FALSE;
168       } else {
169         bNormal = TRUE;
170       }
171       if (bNormal) {
172         if (indexSize % 2) {
173           m_CharIndex.Add(1);
174         } else {
175           if (indexSize <= 0) {
176             continue;
177           }
178           m_CharIndex.SetAt(indexSize - 1,
179                             m_CharIndex.GetAt(indexSize - 1) + 1);
180         }
181       } else {
182         if (indexSize % 2) {
183           if (indexSize <= 0) {
184             continue;
185           }
186           m_CharIndex.SetAt(indexSize - 1, i + 1);
187         } else {
188           m_CharIndex.Add(i + 1);
189         }
190       }
191     }
192     int indexSize = m_CharIndex.GetSize();
193     if (indexSize % 2) {
194       m_CharIndex.RemoveAt(indexSize - 1);
195     }
196   }
197   return TRUE;
198 }
CountChars() const199 int CPDF_TextPage::CountChars() const {
200   if (m_ParseOptions.m_bGetCharCodeOnly) {
201     return m_TextBuf.GetSize();
202   }
203   return m_charList.GetSize();
204 }
CharIndexFromTextIndex(int TextIndex) const205 int CPDF_TextPage::CharIndexFromTextIndex(int TextIndex) const {
206   int indexSize = m_CharIndex.GetSize();
207   int count = 0;
208   for (int i = 0; i < indexSize; i += 2) {
209     count += m_CharIndex.GetAt(i + 1);
210     if (count > TextIndex) {
211       return TextIndex - count + m_CharIndex.GetAt(i + 1) +
212              m_CharIndex.GetAt(i);
213     }
214   }
215   return -1;
216 }
TextIndexFromCharIndex(int CharIndex) const217 int CPDF_TextPage::TextIndexFromCharIndex(int CharIndex) const {
218   int indexSize = m_CharIndex.GetSize();
219   int count = 0;
220   for (int i = 0; i < indexSize; i += 2) {
221     count += m_CharIndex.GetAt(i + 1);
222     if (m_CharIndex.GetAt(i + 1) + m_CharIndex.GetAt(i) > CharIndex) {
223       if (CharIndex - m_CharIndex.GetAt(i) < 0) {
224         return -1;
225       }
226       return CharIndex - m_CharIndex.GetAt(i) + count -
227              m_CharIndex.GetAt(i + 1);
228     }
229   }
230   return -1;
231 }
GetRectArray(int start,int nCount,CFX_RectArray & rectArray) const232 void CPDF_TextPage::GetRectArray(int start,
233                                  int nCount,
234                                  CFX_RectArray& rectArray) const {
235   if (m_ParseOptions.m_bGetCharCodeOnly) {
236     return;
237   }
238   if (start < 0 || nCount == 0) {
239     return;
240   }
241   if (!m_bIsParsed) {
242     return;
243   }
244   PAGECHAR_INFO info_curchar;
245   CPDF_TextObject* pCurObj = NULL;
246   CFX_FloatRect rect;
247   int curPos = start;
248   FX_BOOL flagNewRect = TRUE;
249   if (nCount + start > m_charList.GetSize() || nCount == -1) {
250     nCount = m_charList.GetSize() - start;
251   }
252   while (nCount--) {
253     info_curchar = *(PAGECHAR_INFO*)m_charList.GetAt(curPos++);
254     if (info_curchar.m_Flag == FPDFTEXT_CHAR_GENERATED) {
255       continue;
256     }
257     if (info_curchar.m_CharBox.Width() < 0.01 ||
258         info_curchar.m_CharBox.Height() < 0.01) {
259       continue;
260     }
261     if (!pCurObj) {
262       pCurObj = info_curchar.m_pTextObj;
263     }
264     if (pCurObj != info_curchar.m_pTextObj) {
265       rectArray.Add(rect);
266       pCurObj = info_curchar.m_pTextObj;
267       flagNewRect = TRUE;
268     }
269     if (flagNewRect) {
270       FX_FLOAT orgX = info_curchar.m_OriginX, orgY = info_curchar.m_OriginY;
271       CFX_Matrix matrix, matrix_reverse;
272       info_curchar.m_pTextObj->GetTextMatrix(&matrix);
273       matrix.Concat(info_curchar.m_Matrix);
274       matrix_reverse.SetReverse(matrix);
275       matrix_reverse.Transform(orgX, orgY);
276       rect.left = info_curchar.m_CharBox.left;
277       rect.right = info_curchar.m_CharBox.right;
278       if (pCurObj->GetFont()->GetTypeDescent()) {
279         rect.bottom = orgY +
280                       pCurObj->GetFont()->GetTypeDescent() *
281                           pCurObj->GetFontSize() / 1000;
282         FX_FLOAT xPosTemp = orgX;
283         matrix.Transform(xPosTemp, rect.bottom);
284       } else {
285         rect.bottom = info_curchar.m_CharBox.bottom;
286       }
287       if (pCurObj->GetFont()->GetTypeAscent()) {
288         rect.top =
289             orgY +
290             pCurObj->GetFont()->GetTypeAscent() * pCurObj->GetFontSize() / 1000;
291         FX_FLOAT xPosTemp =
292             orgX +
293             GetCharWidth(info_curchar.m_CharCode, pCurObj->GetFont()) *
294                 pCurObj->GetFontSize() / 1000;
295         matrix.Transform(xPosTemp, rect.top);
296       } else {
297         rect.top = info_curchar.m_CharBox.top;
298       }
299       flagNewRect = FALSE;
300       rect = info_curchar.m_CharBox;
301       rect.Normalize();
302     } else {
303       info_curchar.m_CharBox.Normalize();
304       if (rect.left > info_curchar.m_CharBox.left) {
305         rect.left = info_curchar.m_CharBox.left;
306       }
307       if (rect.right < info_curchar.m_CharBox.right) {
308         rect.right = info_curchar.m_CharBox.right;
309       }
310       if (rect.top < info_curchar.m_CharBox.top) {
311         rect.top = info_curchar.m_CharBox.top;
312       }
313       if (rect.bottom > info_curchar.m_CharBox.bottom) {
314         rect.bottom = info_curchar.m_CharBox.bottom;
315       }
316     }
317   }
318   rectArray.Add(rect);
319   return;
320 }
GetIndexAtPos(CPDF_Point point,FX_FLOAT xTolerance,FX_FLOAT yTolerance) const321 int CPDF_TextPage::GetIndexAtPos(CPDF_Point point,
322                                  FX_FLOAT xTolerance,
323                                  FX_FLOAT yTolerance) const {
324   if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
325     return -3;
326 
327   int pos = 0;
328   int NearPos = -1;
329   double xdif = 5000, ydif = 5000;
330   while (pos < m_charList.GetSize()) {
331     PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)(m_charList.GetAt(pos));
332     CFX_FloatRect charrect = charinfo.m_CharBox;
333     if (charrect.Contains(point.x, point.y)) {
334       break;
335     }
336     if (xTolerance > 0 || yTolerance > 0) {
337       CFX_FloatRect charRectExt;
338       charrect.Normalize();
339       charRectExt.left = charrect.left - xTolerance / 2;
340       charRectExt.right = charrect.right + xTolerance / 2;
341       charRectExt.top = charrect.top + yTolerance / 2;
342       charRectExt.bottom = charrect.bottom - yTolerance / 2;
343       if (charRectExt.Contains(point.x, point.y)) {
344         double curXdif, curYdif;
345         curXdif = FXSYS_fabs(point.x - charrect.left) <
346                           FXSYS_fabs(point.x - charrect.right)
347                       ? FXSYS_fabs(point.x - charrect.left)
348                       : FXSYS_fabs(point.x - charrect.right);
349         curYdif = FXSYS_fabs(point.y - charrect.bottom) <
350                           FXSYS_fabs(point.y - charrect.top)
351                       ? FXSYS_fabs(point.y - charrect.bottom)
352                       : FXSYS_fabs(point.y - charrect.top);
353         if (curYdif + curXdif < xdif + ydif) {
354           ydif = curYdif;
355           xdif = curXdif;
356           NearPos = pos;
357         }
358       }
359     }
360     ++pos;
361   }
362   if (pos >= m_charList.GetSize()) {
363     pos = NearPos;
364   }
365   return pos;
366 }
GetTextByRect(const CFX_FloatRect & rect) const367 CFX_WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const {
368   CFX_WideString strText;
369   if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
370     return strText;
371 
372   int nCount = m_charList.GetSize();
373   int pos = 0;
374   FX_FLOAT posy = 0;
375   FX_BOOL IsContainPreChar = FALSE;
376   FX_BOOL ISAddLineFeed = FALSE;
377   while (pos < nCount) {
378     PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(pos++);
379     if (IsRectIntersect(rect, charinfo.m_CharBox)) {
380       if (FXSYS_fabs(posy - charinfo.m_OriginY) > 0 && !IsContainPreChar &&
381           ISAddLineFeed) {
382         posy = charinfo.m_OriginY;
383         if (strText.GetLength() > 0) {
384           strText += L"\r\n";
385         }
386       }
387       IsContainPreChar = TRUE;
388       ISAddLineFeed = FALSE;
389       if (charinfo.m_Unicode) {
390         strText += charinfo.m_Unicode;
391       }
392     } else if (charinfo.m_Unicode == 32) {
393       if (IsContainPreChar && charinfo.m_Unicode) {
394         strText += charinfo.m_Unicode;
395         IsContainPreChar = FALSE;
396         ISAddLineFeed = FALSE;
397       }
398     } else {
399       IsContainPreChar = FALSE;
400       ISAddLineFeed = TRUE;
401     }
402   }
403   return strText;
404 }
GetRectsArrayByRect(const CFX_FloatRect & rect,CFX_RectArray & resRectArray) const405 void CPDF_TextPage::GetRectsArrayByRect(const CFX_FloatRect& rect,
406                                         CFX_RectArray& resRectArray) const {
407   if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
408     return;
409 
410   CFX_FloatRect curRect;
411   FX_BOOL flagNewRect = TRUE;
412   CPDF_TextObject* pCurObj = NULL;
413   int nCount = m_charList.GetSize();
414   int pos = 0;
415   while (pos < nCount) {
416     PAGECHAR_INFO info_curchar = *(PAGECHAR_INFO*)m_charList.GetAt(pos++);
417     if (info_curchar.m_Flag == FPDFTEXT_CHAR_GENERATED) {
418       continue;
419     }
420     if (IsRectIntersect(rect, info_curchar.m_CharBox)) {
421       if (!pCurObj) {
422         pCurObj = info_curchar.m_pTextObj;
423       }
424       if (pCurObj != info_curchar.m_pTextObj) {
425         resRectArray.Add(curRect);
426         pCurObj = info_curchar.m_pTextObj;
427         flagNewRect = TRUE;
428       }
429       if (flagNewRect) {
430         curRect = info_curchar.m_CharBox;
431         flagNewRect = FALSE;
432         curRect.Normalize();
433       } else {
434         info_curchar.m_CharBox.Normalize();
435         if (curRect.left > info_curchar.m_CharBox.left) {
436           curRect.left = info_curchar.m_CharBox.left;
437         }
438         if (curRect.right < info_curchar.m_CharBox.right) {
439           curRect.right = info_curchar.m_CharBox.right;
440         }
441         if (curRect.top < info_curchar.m_CharBox.top) {
442           curRect.top = info_curchar.m_CharBox.top;
443         }
444         if (curRect.bottom > info_curchar.m_CharBox.bottom) {
445           curRect.bottom = info_curchar.m_CharBox.bottom;
446         }
447       }
448     }
449   }
450   resRectArray.Add(curRect);
451   return;
452 }
GetIndexAtPos(FX_FLOAT x,FX_FLOAT y,FX_FLOAT xTolerance,FX_FLOAT yTolerance) const453 int CPDF_TextPage::GetIndexAtPos(FX_FLOAT x,
454                                  FX_FLOAT y,
455                                  FX_FLOAT xTolerance,
456                                  FX_FLOAT yTolerance) const {
457   if (m_ParseOptions.m_bGetCharCodeOnly) {
458     return -3;
459   }
460   CPDF_Point point(x, y);
461   return GetIndexAtPos(point, xTolerance, yTolerance);
462 }
463 
GetCharInfo(int index,FPDF_CHAR_INFO * info) const464 void CPDF_TextPage::GetCharInfo(int index, FPDF_CHAR_INFO* info) const {
465   if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
466     return;
467 
468   if (index < 0 || index >= m_charList.GetSize())
469     return;
470 
471   const PAGECHAR_INFO* charinfo =
472       static_cast<PAGECHAR_INFO*>(m_charList.GetAt(index));
473   info->m_Charcode = charinfo->m_CharCode;
474   info->m_OriginX = charinfo->m_OriginX;
475   info->m_OriginY = charinfo->m_OriginY;
476   info->m_Unicode = charinfo->m_Unicode;
477   info->m_Flag = charinfo->m_Flag;
478   info->m_CharBox = charinfo->m_CharBox;
479   info->m_pTextObj = charinfo->m_pTextObj;
480   if (charinfo->m_pTextObj && charinfo->m_pTextObj->GetFont()) {
481     info->m_FontSize = charinfo->m_pTextObj->GetFontSize();
482   } else {
483     info->m_FontSize = kDefaultFontSize;
484   }
485   info->m_Matrix.Copy(charinfo->m_Matrix);
486 }
487 
CheckMarkedContentObject(int32_t & start,int32_t & nCount) const488 void CPDF_TextPage::CheckMarkedContentObject(int32_t& start,
489                                              int32_t& nCount) const {
490   PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start);
491   PAGECHAR_INFO charinfo2 =
492       *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - 1);
493   if (FPDFTEXT_CHAR_PIECE != charinfo.m_Flag &&
494       FPDFTEXT_CHAR_PIECE != charinfo2.m_Flag) {
495     return;
496   }
497   if (FPDFTEXT_CHAR_PIECE == charinfo.m_Flag) {
498     PAGECHAR_INFO charinfo1 = charinfo;
499     int startIndex = start;
500     while (FPDFTEXT_CHAR_PIECE == charinfo1.m_Flag &&
501            charinfo1.m_Index == charinfo.m_Index) {
502       startIndex--;
503       if (startIndex < 0) {
504         break;
505       }
506       charinfo1 = *(PAGECHAR_INFO*)m_charList.GetAt(startIndex);
507     }
508     startIndex++;
509     start = startIndex;
510   }
511   if (FPDFTEXT_CHAR_PIECE == charinfo2.m_Flag) {
512     PAGECHAR_INFO charinfo3 = charinfo2;
513     int endIndex = start + nCount - 1;
514     while (FPDFTEXT_CHAR_PIECE == charinfo3.m_Flag &&
515            charinfo3.m_Index == charinfo2.m_Index) {
516       endIndex++;
517       if (endIndex >= m_charList.GetSize()) {
518         break;
519       }
520       charinfo3 = *(PAGECHAR_INFO*)m_charList.GetAt(endIndex);
521     }
522     endIndex--;
523     nCount = endIndex - start + 1;
524   }
525 }
GetPageText(int start,int nCount) const526 CFX_WideString CPDF_TextPage::GetPageText(int start, int nCount) const {
527   if (!m_bIsParsed || nCount == 0)
528     return L"";
529 
530   if (start < 0)
531     start = 0;
532 
533   if (nCount == -1) {
534     nCount = m_charList.GetSize() - start;
535     return m_TextBuf.GetWideString().Mid(start,
536                                          m_TextBuf.GetWideString().GetLength());
537   }
538   if (nCount <= 0 || m_charList.GetSize() <= 0) {
539     return L"";
540   }
541   if (nCount + start > m_charList.GetSize() - 1) {
542     nCount = m_charList.GetSize() - start;
543   }
544   if (nCount <= 0) {
545     return L"";
546   }
547   CheckMarkedContentObject(start, nCount);
548   int startindex = 0;
549   PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start);
550   int startOffset = 0;
551   while (charinfo.m_Index == -1) {
552     startOffset++;
553     if (startOffset > nCount || start + startOffset >= m_charList.GetSize()) {
554       return L"";
555     }
556     charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start + startOffset);
557   }
558   startindex = charinfo.m_Index;
559   charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - 1);
560   int nCountOffset = 0;
561   while (charinfo.m_Index == -1) {
562     nCountOffset++;
563     if (nCountOffset >= nCount) {
564       return L"";
565     }
566     charinfo =
567         *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - nCountOffset - 1);
568   }
569   nCount = start + nCount - nCountOffset - startindex;
570   if (nCount <= 0) {
571     return L"";
572   }
573   return m_TextBuf.GetWideString().Mid(startindex, nCount);
574 }
CountRects(int start,int nCount)575 int CPDF_TextPage::CountRects(int start, int nCount) {
576   if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed || start < 0)
577     return -1;
578 
579   if (nCount == -1 || nCount + start > m_charList.GetSize()) {
580     nCount = m_charList.GetSize() - start;
581   }
582   m_SelRects.RemoveAll();
583   GetRectArray(start, nCount, m_SelRects);
584   return m_SelRects.GetSize();
585 }
GetRect(int rectIndex,FX_FLOAT & left,FX_FLOAT & top,FX_FLOAT & right,FX_FLOAT & bottom) const586 void CPDF_TextPage::GetRect(int rectIndex,
587                             FX_FLOAT& left,
588                             FX_FLOAT& top,
589                             FX_FLOAT& right,
590                             FX_FLOAT& bottom) const {
591   if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
592     return;
593 
594   if (rectIndex < 0 || rectIndex >= m_SelRects.GetSize())
595     return;
596 
597   left = m_SelRects.GetAt(rectIndex).left;
598   top = m_SelRects.GetAt(rectIndex).top;
599   right = m_SelRects.GetAt(rectIndex).right;
600   bottom = m_SelRects.GetAt(rectIndex).bottom;
601 }
602 
GetBaselineRotate(int start,int end,int & Rotate)603 FX_BOOL CPDF_TextPage::GetBaselineRotate(int start, int end, int& Rotate) {
604   if (m_ParseOptions.m_bGetCharCodeOnly) {
605     return FALSE;
606   }
607   if (end == start) {
608     return FALSE;
609   }
610   FPDF_CHAR_INFO info_start;
611   FPDF_CHAR_INFO info_end;
612   GetCharInfo(start, &info_start);
613   GetCharInfo(end, &info_end);
614   while (info_end.m_CharBox.Width() == 0 || info_end.m_CharBox.Height() == 0) {
615     if (--end <= start)
616       return FALSE;
617 
618     GetCharInfo(end, &info_end);
619   }
620   FX_FLOAT dx = (info_end.m_OriginX - info_start.m_OriginX);
621   FX_FLOAT dy = (info_end.m_OriginY - info_start.m_OriginY);
622   if (dx == 0) {
623     if (dy > 0) {
624       Rotate = 90;
625     } else if (dy < 0) {
626       Rotate = 270;
627     } else {
628       Rotate = 0;
629     }
630   } else {
631     float a = FXSYS_atan2(dy, dx);
632     Rotate = (int)(a * 180 / FX_PI + 0.5);
633   }
634   if (Rotate < 0) {
635     Rotate = -Rotate;
636   } else if (Rotate > 0) {
637     Rotate = 360 - Rotate;
638   }
639   return TRUE;
640 }
641 
GetBaselineRotate(const CFX_FloatRect & rect,int & Rotate)642 FX_BOOL CPDF_TextPage::GetBaselineRotate(const CFX_FloatRect& rect,
643                                          int& Rotate) {
644   if (m_ParseOptions.m_bGetCharCodeOnly) {
645     return FALSE;
646   }
647   int start, end, count,
648       n = CountBoundedSegments(rect.left, rect.top, rect.right, rect.bottom,
649                                TRUE);
650   if (n < 1) {
651     return FALSE;
652   }
653   if (n > 1) {
654     GetBoundedSegment(n - 1, start, count);
655     end = start + count - 1;
656     GetBoundedSegment(0, start, count);
657   } else {
658     GetBoundedSegment(0, start, count);
659     end = start + count - 1;
660   }
661   return GetBaselineRotate(start, end, Rotate);
662 }
GetBaselineRotate(int rectIndex,int & Rotate)663 FX_BOOL CPDF_TextPage::GetBaselineRotate(int rectIndex, int& Rotate) {
664   if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
665     return FALSE;
666 
667   if (rectIndex < 0 || rectIndex > m_SelRects.GetSize())
668     return FALSE;
669 
670   CFX_FloatRect rect = m_SelRects.GetAt(rectIndex);
671   return GetBaselineRotate(rect, Rotate);
672 }
CountBoundedSegments(FX_FLOAT left,FX_FLOAT top,FX_FLOAT right,FX_FLOAT bottom,FX_BOOL bContains)673 int CPDF_TextPage::CountBoundedSegments(FX_FLOAT left,
674                                         FX_FLOAT top,
675                                         FX_FLOAT right,
676                                         FX_FLOAT bottom,
677                                         FX_BOOL bContains) {
678   if (m_ParseOptions.m_bGetCharCodeOnly)
679     return -1;
680 
681   m_Segment.RemoveAll();
682   if (!m_bIsParsed)
683     return -1;
684 
685   CFX_FloatRect rect(left, bottom, right, top);
686   rect.Normalize();
687   int nCount = m_charList.GetSize();
688   int pos = 0;
689   FPDF_SEGMENT segment;
690   segment.m_Start = 0;
691   segment.m_nCount = 0;
692   int segmentStatus = 0;
693   FX_BOOL IsContainPreChar = FALSE;
694   while (pos < nCount) {
695     PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(pos);
696     if (bContains && rect.Contains(charinfo.m_CharBox)) {
697       if (segmentStatus == 0 || segmentStatus == 2) {
698         segment.m_Start = pos;
699         segment.m_nCount = 1;
700         segmentStatus = 1;
701       } else if (segmentStatus == 1) {
702         segment.m_nCount++;
703       }
704       IsContainPreChar = TRUE;
705     } else if (!bContains &&
706                (IsRectIntersect(rect, charinfo.m_CharBox) ||
707                 rect.Contains(charinfo.m_OriginX, charinfo.m_OriginY))) {
708       if (segmentStatus == 0 || segmentStatus == 2) {
709         segment.m_Start = pos;
710         segment.m_nCount = 1;
711         segmentStatus = 1;
712       } else if (segmentStatus == 1) {
713         segment.m_nCount++;
714       }
715       IsContainPreChar = TRUE;
716     } else if (charinfo.m_Unicode == 32) {
717       if (IsContainPreChar == TRUE) {
718         if (segmentStatus == 0 || segmentStatus == 2) {
719           segment.m_Start = pos;
720           segment.m_nCount = 1;
721           segmentStatus = 1;
722         } else if (segmentStatus == 1) {
723           segment.m_nCount++;
724         }
725         IsContainPreChar = FALSE;
726       } else {
727         if (segmentStatus == 1) {
728           segmentStatus = 2;
729           m_Segment.Add(segment);
730           segment.m_Start = 0;
731           segment.m_nCount = 0;
732         }
733       }
734     } else {
735       if (segmentStatus == 1) {
736         segmentStatus = 2;
737         m_Segment.Add(segment);
738         segment.m_Start = 0;
739         segment.m_nCount = 0;
740       }
741       IsContainPreChar = FALSE;
742     }
743     pos++;
744   }
745   if (segmentStatus == 1) {
746     segmentStatus = 2;
747     m_Segment.Add(segment);
748     segment.m_Start = 0;
749     segment.m_nCount = 0;
750   }
751   return m_Segment.GetSize();
752 }
GetBoundedSegment(int index,int & start,int & count) const753 void CPDF_TextPage::GetBoundedSegment(int index, int& start, int& count) const {
754   if (m_ParseOptions.m_bGetCharCodeOnly) {
755     return;
756   }
757   if (index < 0 || index >= m_Segment.GetSize()) {
758     return;
759   }
760   start = m_Segment.GetAt(index).m_Start;
761   count = m_Segment.GetAt(index).m_nCount;
762 }
GetWordBreak(int index,int direction) const763 int CPDF_TextPage::GetWordBreak(int index, int direction) const {
764   if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
765     return -1;
766 
767   if (direction != FPDFTEXT_LEFT && direction != FPDFTEXT_RIGHT)
768     return -1;
769 
770   if (index < 0 || index >= m_charList.GetSize())
771     return -1;
772 
773   PAGECHAR_INFO charinfo;
774   charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index);
775   if (charinfo.m_Index == -1 || charinfo.m_Flag == FPDFTEXT_CHAR_GENERATED) {
776     return index;
777   }
778   if (!IsLetter(charinfo.m_Unicode)) {
779     return index;
780   }
781   int breakPos = index;
782   if (direction == FPDFTEXT_LEFT) {
783     while (--breakPos > 0) {
784       charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(breakPos);
785       if (!IsLetter(charinfo.m_Unicode)) {
786         return breakPos;
787       }
788     }
789   } else if (direction == FPDFTEXT_RIGHT) {
790     while (++breakPos < m_charList.GetSize()) {
791       charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(breakPos);
792       if (!IsLetter(charinfo.m_Unicode)) {
793         return breakPos;
794       }
795     }
796   }
797   return breakPos;
798 }
FindTextlineFlowDirection()799 int32_t CPDF_TextPage::FindTextlineFlowDirection() {
800   if (!m_pPage) {
801     return -1;
802   }
803   const int32_t nPageWidth = (int32_t)((CPDF_Page*)m_pPage)->GetPageWidth();
804   const int32_t nPageHeight = (int32_t)((CPDF_Page*)m_pPage)->GetPageHeight();
805   CFX_ByteArray nHorizontalMask;
806   if (!nHorizontalMask.SetSize(nPageWidth)) {
807     return -1;
808   }
809   uint8_t* pDataH = nHorizontalMask.GetData();
810   CFX_ByteArray nVerticalMask;
811   if (!nVerticalMask.SetSize(nPageHeight)) {
812     return -1;
813   }
814   uint8_t* pDataV = nVerticalMask.GetData();
815   int32_t index = 0;
816   FX_FLOAT fLineHeight = 0.0f;
817   CPDF_PageObject* pPageObj = NULL;
818   FX_POSITION pos = NULL;
819   pos = m_pPage->GetFirstObjectPosition();
820   if (!pos) {
821     return -1;
822   }
823   while (pos) {
824     pPageObj = m_pPage->GetNextObject(pos);
825     if (NULL == pPageObj) {
826       continue;
827     }
828     if (PDFPAGE_TEXT != pPageObj->m_Type) {
829       continue;
830     }
831     int32_t minH =
832         (int32_t)pPageObj->m_Left < 0 ? 0 : (int32_t)pPageObj->m_Left;
833     int32_t maxH = (int32_t)pPageObj->m_Right > nPageWidth
834                        ? nPageWidth
835                        : (int32_t)pPageObj->m_Right;
836     int32_t minV =
837         (int32_t)pPageObj->m_Bottom < 0 ? 0 : (int32_t)pPageObj->m_Bottom;
838     int32_t maxV = (int32_t)pPageObj->m_Top > nPageHeight
839                        ? nPageHeight
840                        : (int32_t)pPageObj->m_Top;
841     if (minH >= maxH || minV >= maxV) {
842       continue;
843     }
844     FXSYS_memset(pDataH + minH, 1, maxH - minH);
845     FXSYS_memset(pDataV + minV, 1, maxV - minV);
846     if (fLineHeight <= 0.0f) {
847       fLineHeight = pPageObj->m_Top - pPageObj->m_Bottom;
848     }
849     pPageObj = NULL;
850   }
851   int32_t nStartH = 0;
852   int32_t nEndH = 0;
853   FX_FLOAT nSumH = 0.0f;
854   for (index = 0; index < nPageWidth; index++)
855     if (1 == nHorizontalMask[index]) {
856       break;
857     }
858   nStartH = index;
859   for (index = nPageWidth; index > 0; index--)
860     if (1 == nHorizontalMask[index - 1]) {
861       break;
862     }
863   nEndH = index;
864   for (index = nStartH; index < nEndH; index++) {
865     nSumH += nHorizontalMask[index];
866   }
867   nSumH /= nEndH - nStartH;
868   int32_t nStartV = 0;
869   int32_t nEndV = 0;
870   FX_FLOAT nSumV = 0.0f;
871   for (index = 0; index < nPageHeight; index++)
872     if (1 == nVerticalMask[index]) {
873       break;
874     }
875   nStartV = index;
876   for (index = nPageHeight; index > 0; index--)
877     if (1 == nVerticalMask[index - 1]) {
878       break;
879     }
880   nEndV = index;
881   for (index = nStartV; index < nEndV; index++) {
882     nSumV += nVerticalMask[index];
883   }
884   nSumV /= nEndV - nStartV;
885   if ((nEndV - nStartV) < (int32_t)(2 * fLineHeight)) {
886     return 0;
887   }
888   if ((nEndH - nStartH) < (int32_t)(2 * fLineHeight)) {
889     return 1;
890   }
891   if (nSumH > 0.8f) {
892     return 0;
893   }
894   if (nSumH - nSumV > 0.0f) {
895     return 0;
896   }
897   if (nSumV - nSumH > 0.0f) {
898     return 1;
899   }
900   return -1;
901 }
ProcessObject()902 void CPDF_TextPage::ProcessObject() {
903   CPDF_PageObject* pPageObj = NULL;
904   if (!m_pPage) {
905     return;
906   }
907   FX_POSITION pos;
908   pos = m_pPage->GetFirstObjectPosition();
909   if (!pos) {
910     return;
911   }
912   m_TextlineDir = FindTextlineFlowDirection();
913   int nCount = 0;
914   while (pos) {
915     pPageObj = m_pPage->GetNextObject(pos);
916     if (pPageObj) {
917       if (pPageObj->m_Type == PDFPAGE_TEXT) {
918         CFX_Matrix matrix;
919         ProcessTextObject((CPDF_TextObject*)pPageObj, matrix, pos);
920         nCount++;
921       } else if (pPageObj->m_Type == PDFPAGE_FORM) {
922         CFX_Matrix formMatrix(1, 0, 0, 1, 0, 0);
923         ProcessFormObject((CPDF_FormObject*)pPageObj, formMatrix);
924       }
925     }
926     pPageObj = NULL;
927   }
928   int count = m_LineObj.GetSize();
929   for (int i = 0; i < count; i++) {
930     ProcessTextObject(m_LineObj.GetAt(i));
931   }
932   m_LineObj.RemoveAll();
933   CloseTempLine();
934 }
ProcessFormObject(CPDF_FormObject * pFormObj,const CFX_Matrix & formMatrix)935 void CPDF_TextPage::ProcessFormObject(CPDF_FormObject* pFormObj,
936                                       const CFX_Matrix& formMatrix) {
937   CPDF_PageObject* pPageObj = NULL;
938   FX_POSITION pos;
939   if (!pFormObj) {
940     return;
941   }
942   pos = pFormObj->m_pForm->GetFirstObjectPosition();
943   if (!pos) {
944     return;
945   }
946   CFX_Matrix curFormMatrix;
947   curFormMatrix.Copy(pFormObj->m_FormMatrix);
948   curFormMatrix.Concat(formMatrix);
949   while (pos) {
950     pPageObj = pFormObj->m_pForm->GetNextObject(pos);
951     if (pPageObj) {
952       if (pPageObj->m_Type == PDFPAGE_TEXT) {
953         ProcessTextObject((CPDF_TextObject*)pPageObj, curFormMatrix, pos);
954       } else if (pPageObj->m_Type == PDFPAGE_FORM) {
955         ProcessFormObject((CPDF_FormObject*)pPageObj, curFormMatrix);
956       }
957     }
958     pPageObj = NULL;
959   }
960 }
GetCharWidth(FX_DWORD charCode,CPDF_Font * pFont) const961 int CPDF_TextPage::GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const {
962   if (charCode == -1) {
963     return 0;
964   }
965   int w = pFont->GetCharWidthF(charCode);
966   if (w == 0) {
967     CFX_ByteString str;
968     pFont->AppendChar(str, charCode);
969     w = pFont->GetStringWidth(str, 1);
970     if (w == 0) {
971       FX_RECT BBox;
972       pFont->GetCharBBox(charCode, BBox);
973       w = BBox.right - BBox.left;
974     }
975   }
976   return w;
977 }
OnPiece(CFX_BidiChar * pBidi,CFX_WideString & str)978 void CPDF_TextPage::OnPiece(CFX_BidiChar* pBidi, CFX_WideString& str) {
979   int32_t start, count;
980   CFX_BidiChar::Direction ret = pBidi->GetBidiInfo(&start, &count);
981   if (ret == CFX_BidiChar::RIGHT) {
982     for (int i = start + count - 1; i >= start; i--) {
983       m_TextBuf.AppendChar(str.GetAt(i));
984       m_charList.Add(*(PAGECHAR_INFO*)m_TempCharList.GetAt(i));
985     }
986   } else {
987     int end = start + count;
988     for (int i = start; i < end; i++) {
989       m_TextBuf.AppendChar(str.GetAt(i));
990       m_charList.Add(*(PAGECHAR_INFO*)m_TempCharList.GetAt(i));
991     }
992   }
993 }
AddCharInfoByLRDirection(CFX_WideString & str,int i)994 void CPDF_TextPage::AddCharInfoByLRDirection(CFX_WideString& str, int i) {
995   PAGECHAR_INFO Info = *(PAGECHAR_INFO*)m_TempCharList.GetAt(i);
996   FX_WCHAR wChar = str.GetAt(i);
997   if (!IsControlChar(Info)) {
998     Info.m_Index = m_TextBuf.GetLength();
999     if (wChar >= 0xFB00 && wChar <= 0xFB06) {
1000       FX_WCHAR* pDst = NULL;
1001       FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
1002       if (nCount >= 1) {
1003         pDst = FX_Alloc(FX_WCHAR, nCount);
1004         FX_Unicode_GetNormalization(wChar, pDst);
1005         for (int nIndex = 0; nIndex < nCount; nIndex++) {
1006           PAGECHAR_INFO Info2 = Info;
1007           Info2.m_Unicode = pDst[nIndex];
1008           Info2.m_Flag = FPDFTEXT_CHAR_PIECE;
1009           m_TextBuf.AppendChar(Info2.m_Unicode);
1010           if (!m_ParseOptions.m_bGetCharCodeOnly) {
1011             m_charList.Add(Info2);
1012           }
1013         }
1014         FX_Free(pDst);
1015         return;
1016       }
1017     }
1018     m_TextBuf.AppendChar(wChar);
1019   } else {
1020     Info.m_Index = -1;
1021   }
1022   if (!m_ParseOptions.m_bGetCharCodeOnly) {
1023     m_charList.Add(Info);
1024   }
1025 }
AddCharInfoByRLDirection(CFX_WideString & str,int i)1026 void CPDF_TextPage::AddCharInfoByRLDirection(CFX_WideString& str, int i) {
1027   PAGECHAR_INFO Info = *(PAGECHAR_INFO*)m_TempCharList.GetAt(i);
1028   if (!IsControlChar(Info)) {
1029     Info.m_Index = m_TextBuf.GetLength();
1030     FX_WCHAR wChar = FX_GetMirrorChar(str.GetAt(i), TRUE, FALSE);
1031     FX_WCHAR* pDst = NULL;
1032     FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
1033     if (nCount >= 1) {
1034       pDst = FX_Alloc(FX_WCHAR, nCount);
1035       FX_Unicode_GetNormalization(wChar, pDst);
1036       for (int nIndex = 0; nIndex < nCount; nIndex++) {
1037         PAGECHAR_INFO Info2 = Info;
1038         Info2.m_Unicode = pDst[nIndex];
1039         Info2.m_Flag = FPDFTEXT_CHAR_PIECE;
1040         m_TextBuf.AppendChar(Info2.m_Unicode);
1041         if (!m_ParseOptions.m_bGetCharCodeOnly) {
1042           m_charList.Add(Info2);
1043         }
1044       }
1045       FX_Free(pDst);
1046       return;
1047     }
1048     Info.m_Unicode = wChar;
1049     m_TextBuf.AppendChar(Info.m_Unicode);
1050   } else {
1051     Info.m_Index = -1;
1052   }
1053   if (!m_ParseOptions.m_bGetCharCodeOnly) {
1054     m_charList.Add(Info);
1055   }
1056 }
CloseTempLine()1057 void CPDF_TextPage::CloseTempLine() {
1058   int count1 = m_TempCharList.GetSize();
1059   if (count1 <= 0) {
1060     return;
1061   }
1062   std::unique_ptr<CFX_BidiChar> pBidiChar(new CFX_BidiChar);
1063   CFX_WideString str = m_TempTextBuf.GetWideString();
1064   CFX_WordArray order;
1065   FX_BOOL bR2L = FALSE;
1066   int32_t start = 0, count = 0;
1067   int nR2L = 0, nL2R = 0;
1068   FX_BOOL bPrevSpace = FALSE;
1069   for (int i = 0; i < str.GetLength(); i++) {
1070     if (str.GetAt(i) == 32) {
1071       if (bPrevSpace) {
1072         m_TempTextBuf.Delete(i, 1);
1073         m_TempCharList.Delete(i);
1074         str.Delete(i);
1075         count1--;
1076         i--;
1077         continue;
1078       }
1079       bPrevSpace = TRUE;
1080     } else {
1081       bPrevSpace = FALSE;
1082     }
1083     if (pBidiChar->AppendChar(str.GetAt(i))) {
1084       CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count);
1085       order.Add(start);
1086       order.Add(count);
1087       order.Add(ret);
1088       if (!bR2L) {
1089         if (ret == CFX_BidiChar::RIGHT) {
1090           nR2L++;
1091         } else if (ret == CFX_BidiChar::LEFT) {
1092           nL2R++;
1093         }
1094       }
1095     }
1096   }
1097   if (pBidiChar->EndChar()) {
1098     CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count);
1099     order.Add(start);
1100     order.Add(count);
1101     order.Add(ret);
1102     if (!bR2L) {
1103       if (ret == CFX_BidiChar::RIGHT) {
1104         nR2L++;
1105       } else if (ret == CFX_BidiChar::LEFT) {
1106         nL2R++;
1107       }
1108     }
1109   }
1110   if (nR2L > 0 && nR2L >= nL2R) {
1111     bR2L = TRUE;
1112   }
1113   if (m_parserflag == FPDFTEXT_RLTB || bR2L) {
1114     int count = order.GetSize();
1115     for (int i = count - 1; i > 0; i -= 3) {
1116       int ret = order.GetAt(i);
1117       int start = order.GetAt(i - 2);
1118       int count1 = order.GetAt(i - 1);
1119       if (ret == 2 || ret == 0) {
1120         for (int j = start + count1 - 1; j >= start; j--) {
1121           AddCharInfoByRLDirection(str, j);
1122         }
1123       } else {
1124         int j = i;
1125         FX_BOOL bSymbol = FALSE;
1126         while (j > 0 && order.GetAt(j) != 2) {
1127           bSymbol = !order.GetAt(j);
1128           j -= 3;
1129         }
1130         int end = start + count1;
1131         int n = 0;
1132         if (bSymbol) {
1133           n = j + 6;
1134         } else {
1135           n = j + 3;
1136         }
1137         if (n >= i) {
1138           for (int m = start; m < end; m++) {
1139             AddCharInfoByLRDirection(str, m);
1140           }
1141         } else {
1142           j = i;
1143           i = n;
1144           for (; n <= j; n += 3) {
1145             int start = order.GetAt(n - 2);
1146             int count1 = order.GetAt(n - 1);
1147             int end = start + count1;
1148             for (int m = start; m < end; m++) {
1149               AddCharInfoByLRDirection(str, m);
1150             }
1151           }
1152         }
1153       }
1154     }
1155   } else {
1156     int count = order.GetSize();
1157     FX_BOOL bL2R = FALSE;
1158     for (int i = 0; i < count; i += 3) {
1159       int ret = order.GetAt(i + 2);
1160       int start = order.GetAt(i);
1161       int count1 = order.GetAt(i + 1);
1162       if (ret == 2 || (i == 0 && ret == 0 && !bL2R)) {
1163         int j = i + 3;
1164         while (bR2L && j < count) {
1165           if (order.GetAt(j + 2) == 1) {
1166             break;
1167           } else {
1168             j += 3;
1169           }
1170         }
1171         if (j == 3) {
1172           i = -3;
1173           bL2R = TRUE;
1174           continue;
1175         }
1176         int end = m_TempCharList.GetSize() - 1;
1177         if (j < count) {
1178           end = order.GetAt(j) - 1;
1179         }
1180         i = j - 3;
1181         for (int n = end; n >= start; n--) {
1182           AddCharInfoByRLDirection(str, n);
1183         }
1184       } else {
1185         int end = start + count1;
1186         for (int n = start; n < end; n++) {
1187           AddCharInfoByLRDirection(str, n);
1188         }
1189       }
1190     }
1191   }
1192   order.RemoveAll();
1193   m_TempCharList.RemoveAll();
1194   m_TempTextBuf.Delete(0, m_TempTextBuf.GetLength());
1195 }
ProcessTextObject(CPDF_TextObject * pTextObj,const CFX_Matrix & formMatrix,FX_POSITION ObjPos)1196 void CPDF_TextPage::ProcessTextObject(CPDF_TextObject* pTextObj,
1197                                       const CFX_Matrix& formMatrix,
1198                                       FX_POSITION ObjPos) {
1199   CFX_FloatRect re(pTextObj->m_Left, pTextObj->m_Bottom, pTextObj->m_Right,
1200                    pTextObj->m_Top);
1201   if (FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f) {
1202     return;
1203   }
1204   int count = m_LineObj.GetSize();
1205   PDFTEXT_Obj Obj;
1206   Obj.m_pTextObj = pTextObj;
1207   Obj.m_formMatrix = formMatrix;
1208   if (count == 0) {
1209     m_LineObj.Add(Obj);
1210     return;
1211   }
1212   if (IsSameAsPreTextObject(pTextObj, ObjPos)) {
1213     return;
1214   }
1215   PDFTEXT_Obj prev_Obj = m_LineObj.GetAt(count - 1);
1216   CPDF_TextObjectItem item;
1217   int nItem = prev_Obj.m_pTextObj->CountItems();
1218   prev_Obj.m_pTextObj->GetItemInfo(nItem - 1, &item);
1219   FX_FLOAT prev_width =
1220       GetCharWidth(item.m_CharCode, prev_Obj.m_pTextObj->GetFont()) *
1221       prev_Obj.m_pTextObj->GetFontSize() / 1000;
1222   CFX_Matrix prev_matrix;
1223   prev_Obj.m_pTextObj->GetTextMatrix(&prev_matrix);
1224   prev_width = FXSYS_fabs(prev_width);
1225   prev_matrix.Concat(prev_Obj.m_formMatrix);
1226   prev_width = prev_matrix.TransformDistance(prev_width);
1227   pTextObj->GetItemInfo(0, &item);
1228   FX_FLOAT this_width = GetCharWidth(item.m_CharCode, pTextObj->GetFont()) *
1229                         pTextObj->GetFontSize() / 1000;
1230   this_width = FXSYS_fabs(this_width);
1231   CFX_Matrix this_matrix;
1232   pTextObj->GetTextMatrix(&this_matrix);
1233   this_width = FXSYS_fabs(this_width);
1234   this_matrix.Concat(formMatrix);
1235   this_width = this_matrix.TransformDistance(this_width);
1236   FX_FLOAT threshold =
1237       prev_width > this_width ? prev_width / 4 : this_width / 4;
1238   FX_FLOAT prev_x = prev_Obj.m_pTextObj->GetPosX(),
1239            prev_y = prev_Obj.m_pTextObj->GetPosY();
1240   prev_Obj.m_formMatrix.Transform(prev_x, prev_y);
1241   m_DisplayMatrix.Transform(prev_x, prev_y);
1242   FX_FLOAT this_x = pTextObj->GetPosX(), this_y = pTextObj->GetPosY();
1243   formMatrix.Transform(this_x, this_y);
1244   m_DisplayMatrix.Transform(this_x, this_y);
1245   if (FXSYS_fabs(this_y - prev_y) > threshold * 2) {
1246     for (int i = 0; i < count; i++) {
1247       ProcessTextObject(m_LineObj.GetAt(i));
1248     }
1249     m_LineObj.RemoveAll();
1250     m_LineObj.Add(Obj);
1251     return;
1252   }
1253   int i = 0;
1254   if (m_ParseOptions.m_bNormalizeObjs) {
1255     for (i = count - 1; i >= 0; i--) {
1256       PDFTEXT_Obj prev_Obj = m_LineObj.GetAt(i);
1257       CFX_Matrix prev_matrix;
1258       prev_Obj.m_pTextObj->GetTextMatrix(&prev_matrix);
1259       FX_FLOAT Prev_x = prev_Obj.m_pTextObj->GetPosX(),
1260                Prev_y = prev_Obj.m_pTextObj->GetPosY();
1261       prev_Obj.m_formMatrix.Transform(Prev_x, Prev_y);
1262       m_DisplayMatrix.Transform(Prev_x, Prev_y);
1263       if (this_x >= Prev_x) {
1264         if (i == count - 1) {
1265           m_LineObj.Add(Obj);
1266         } else {
1267           m_LineObj.InsertAt(i + 1, Obj);
1268         }
1269         break;
1270       }
1271     }
1272     if (i < 0) {
1273       m_LineObj.InsertAt(0, Obj);
1274     }
1275   } else {
1276     m_LineObj.Add(Obj);
1277   }
1278 }
PreMarkedContent(PDFTEXT_Obj Obj)1279 int32_t CPDF_TextPage::PreMarkedContent(PDFTEXT_Obj Obj) {
1280   CPDF_TextObject* pTextObj = Obj.m_pTextObj;
1281   CPDF_ContentMarkData* pMarkData =
1282       (CPDF_ContentMarkData*)pTextObj->m_ContentMark.GetObject();
1283   if (!pMarkData) {
1284     return FPDFTEXT_MC_PASS;
1285   }
1286   int nContentMark = pMarkData->CountItems();
1287   if (nContentMark < 1) {
1288     return FPDFTEXT_MC_PASS;
1289   }
1290   CFX_WideString actText;
1291   FX_BOOL bExist = FALSE;
1292   CPDF_Dictionary* pDict = NULL;
1293   int n = 0;
1294   for (n = 0; n < nContentMark; n++) {
1295     CPDF_ContentMarkItem& item = pMarkData->GetItem(n);
1296     CFX_ByteString tagStr = (CFX_ByteString)item.GetName();
1297     pDict = ToDictionary(static_cast<CPDF_Object*>(item.GetParam()));
1298     CPDF_String* temp =
1299         ToString(pDict ? pDict->GetElement("ActualText") : nullptr);
1300     if (temp) {
1301       bExist = TRUE;
1302       actText = temp->GetUnicodeText();
1303     }
1304   }
1305   if (!bExist) {
1306     return FPDFTEXT_MC_PASS;
1307   }
1308   if (m_pPreTextObj) {
1309     if (CPDF_ContentMarkData* pPreMarkData =
1310             (CPDF_ContentMarkData*)m_pPreTextObj->m_ContentMark.GetObject()) {
1311       if (pPreMarkData->CountItems() == n) {
1312         CPDF_ContentMarkItem& item = pPreMarkData->GetItem(n - 1);
1313         if (pDict == item.GetParam()) {
1314           return FPDFTEXT_MC_DONE;
1315         }
1316       }
1317     }
1318   }
1319   CPDF_Font* pFont = pTextObj->GetFont();
1320   FX_STRSIZE nItems = actText.GetLength();
1321   if (nItems < 1) {
1322     return FPDFTEXT_MC_PASS;
1323   }
1324   bExist = FALSE;
1325   for (FX_STRSIZE i = 0; i < nItems; i++) {
1326     FX_WCHAR wChar = actText.GetAt(i);
1327     if (-1 == pFont->CharCodeFromUnicode(wChar)) {
1328       continue;
1329     } else {
1330       bExist = TRUE;
1331       break;
1332     }
1333   }
1334   if (!bExist) {
1335     return FPDFTEXT_MC_PASS;
1336   }
1337   bExist = FALSE;
1338   for (FX_STRSIZE i = 0; i < nItems; i++) {
1339     FX_WCHAR wChar = actText.GetAt(i);
1340     if ((wChar > 0x80 && wChar < 0xFFFD) || (wChar <= 0x80 && isprint(wChar))) {
1341       bExist = TRUE;
1342       break;
1343     }
1344   }
1345   if (!bExist) {
1346     return FPDFTEXT_MC_DONE;
1347   }
1348   return FPDFTEXT_MC_DELAY;
1349 }
ProcessMarkedContent(PDFTEXT_Obj Obj)1350 void CPDF_TextPage::ProcessMarkedContent(PDFTEXT_Obj Obj) {
1351   CPDF_TextObject* pTextObj = Obj.m_pTextObj;
1352   CPDF_ContentMarkData* pMarkData =
1353       (CPDF_ContentMarkData*)pTextObj->m_ContentMark.GetObject();
1354   if (!pMarkData) {
1355     return;
1356   }
1357   int nContentMark = pMarkData->CountItems();
1358   if (nContentMark < 1) {
1359     return;
1360   }
1361   CFX_WideString actText;
1362   CPDF_Dictionary* pDict = NULL;
1363   int n = 0;
1364   for (n = 0; n < nContentMark; n++) {
1365     CPDF_ContentMarkItem& item = pMarkData->GetItem(n);
1366     CFX_ByteString tagStr = (CFX_ByteString)item.GetName();
1367     pDict = ToDictionary(static_cast<CPDF_Object*>(item.GetParam()));
1368     CPDF_String* temp =
1369         ToString(pDict ? pDict->GetElement("ActualText") : nullptr);
1370     if (temp) {
1371       actText = temp->GetUnicodeText();
1372     }
1373   }
1374   FX_STRSIZE nItems = actText.GetLength();
1375   if (nItems < 1) {
1376     return;
1377   }
1378   CPDF_Font* pFont = pTextObj->GetFont();
1379   CFX_Matrix formMatrix = Obj.m_formMatrix;
1380   CFX_Matrix matrix;
1381   pTextObj->GetTextMatrix(&matrix);
1382   matrix.Concat(formMatrix);
1383   FX_FLOAT fPosX = pTextObj->GetPosX();
1384   FX_FLOAT fPosY = pTextObj->GetPosY();
1385   int nCharInfoIndex = m_TextBuf.GetLength();
1386   CFX_FloatRect charBox;
1387   charBox.top = pTextObj->m_Top;
1388   charBox.left = pTextObj->m_Left;
1389   charBox.right = pTextObj->m_Right;
1390   charBox.bottom = pTextObj->m_Bottom;
1391   for (FX_STRSIZE k = 0; k < nItems; k++) {
1392     FX_WCHAR wChar = actText.GetAt(k);
1393     if (wChar <= 0x80 && !isprint(wChar)) {
1394       wChar = 0x20;
1395     }
1396     if (wChar >= 0xFFFD) {
1397       continue;
1398     }
1399     PAGECHAR_INFO charinfo;
1400     charinfo.m_OriginX = fPosX;
1401     charinfo.m_OriginY = fPosY;
1402     charinfo.m_Index = nCharInfoIndex;
1403     charinfo.m_Unicode = wChar;
1404     charinfo.m_CharCode = pFont->CharCodeFromUnicode(wChar);
1405     charinfo.m_Flag = FPDFTEXT_CHAR_PIECE;
1406     charinfo.m_pTextObj = pTextObj;
1407     charinfo.m_CharBox.top = charBox.top;
1408     charinfo.m_CharBox.left = charBox.left;
1409     charinfo.m_CharBox.right = charBox.right;
1410     charinfo.m_CharBox.bottom = charBox.bottom;
1411     charinfo.m_Matrix.Copy(matrix);
1412     m_TempTextBuf.AppendChar(wChar);
1413     m_TempCharList.Add(charinfo);
1414   }
1415 }
FindPreviousTextObject(void)1416 void CPDF_TextPage::FindPreviousTextObject(void) {
1417   if (m_TempCharList.GetSize() < 1 && m_charList.GetSize() < 1) {
1418     return;
1419   }
1420   PAGECHAR_INFO preChar;
1421   if (m_TempCharList.GetSize() >= 1) {
1422     preChar =
1423         *(PAGECHAR_INFO*)m_TempCharList.GetAt(m_TempCharList.GetSize() - 1);
1424   } else {
1425     preChar = *(PAGECHAR_INFO*)m_charList.GetAt(m_charList.GetSize() - 1);
1426   }
1427   if (preChar.m_pTextObj) {
1428     m_pPreTextObj = preChar.m_pTextObj;
1429   }
1430 }
SwapTempTextBuf(int32_t iCharListStartAppend,int32_t iBufStartAppend)1431 void CPDF_TextPage::SwapTempTextBuf(int32_t iCharListStartAppend,
1432                                     int32_t iBufStartAppend) {
1433   int32_t i, j;
1434   i = iCharListStartAppend;
1435   j = m_TempCharList.GetSize() - 1;
1436   for (; i < j; i++, j--) {
1437     std::swap(m_TempCharList[i], m_TempCharList[j]);
1438     std::swap(m_TempCharList[i].m_Index, m_TempCharList[j].m_Index);
1439   }
1440   FX_WCHAR* pTempBuffer = m_TempTextBuf.GetBuffer();
1441   i = iBufStartAppend;
1442   j = m_TempTextBuf.GetLength() - 1;
1443   for (; i < j; i++, j--) {
1444     std::swap(pTempBuffer[i], pTempBuffer[j]);
1445   }
1446 }
IsRightToLeft(const CPDF_TextObject * pTextObj,const CPDF_Font * pFont,int nItems) const1447 FX_BOOL CPDF_TextPage::IsRightToLeft(const CPDF_TextObject* pTextObj,
1448                                      const CPDF_Font* pFont,
1449                                      int nItems) const {
1450   std::unique_ptr<CFX_BidiChar> pBidiChar(new CFX_BidiChar);
1451   int32_t nR2L = 0;
1452   int32_t nL2R = 0;
1453   int32_t start = 0, count = 0;
1454   CPDF_TextObjectItem item;
1455   for (int32_t i = 0; i < nItems; i++) {
1456     pTextObj->GetItemInfo(i, &item);
1457     if (item.m_CharCode == (FX_DWORD)-1) {
1458       continue;
1459     }
1460     CFX_WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
1461     FX_WCHAR wChar = wstrItem.GetAt(0);
1462     if ((wstrItem.IsEmpty() || wChar == 0) && item.m_CharCode) {
1463       wChar = (FX_WCHAR)item.m_CharCode;
1464     }
1465     if (!wChar) {
1466       continue;
1467     }
1468     if (pBidiChar->AppendChar(wChar)) {
1469       CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count);
1470       if (ret == CFX_BidiChar::RIGHT) {
1471         nR2L++;
1472       } else if (ret == CFX_BidiChar::LEFT) {
1473         nL2R++;
1474       }
1475     }
1476   }
1477   if (pBidiChar->EndChar()) {
1478     CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count);
1479     if (ret == CFX_BidiChar::RIGHT) {
1480       nR2L++;
1481     } else if (ret == CFX_BidiChar::LEFT) {
1482       nL2R++;
1483     }
1484   }
1485   return (nR2L > 0 && nR2L >= nL2R);
1486 }
ProcessTextObject(PDFTEXT_Obj Obj)1487 void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj) {
1488   CPDF_TextObject* pTextObj = Obj.m_pTextObj;
1489   if (FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f) {
1490     return;
1491   }
1492   CFX_Matrix formMatrix = Obj.m_formMatrix;
1493   CPDF_Font* pFont = pTextObj->GetFont();
1494   CFX_Matrix matrix;
1495   pTextObj->GetTextMatrix(&matrix);
1496   matrix.Concat(formMatrix);
1497   int32_t bPreMKC = PreMarkedContent(Obj);
1498   if (FPDFTEXT_MC_DONE == bPreMKC) {
1499     m_pPreTextObj = pTextObj;
1500     m_perMatrix.Copy(formMatrix);
1501     return;
1502   }
1503   int result = 0;
1504   if (m_pPreTextObj) {
1505     result = ProcessInsertObject(pTextObj, formMatrix);
1506     if (2 == result) {
1507       m_CurlineRect =
1508           CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom,
1509                         Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top);
1510     } else {
1511       m_CurlineRect.Union(
1512           CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom,
1513                         Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top));
1514     }
1515     PAGECHAR_INFO generateChar;
1516     if (result == 1) {
1517       if (GenerateCharInfo(TEXT_BLANK_CHAR, generateChar)) {
1518         if (!formMatrix.IsIdentity()) {
1519           generateChar.m_Matrix.Copy(formMatrix);
1520         }
1521         m_TempTextBuf.AppendChar(TEXT_BLANK_CHAR);
1522         m_TempCharList.Add(generateChar);
1523       }
1524     } else if (result == 2) {
1525       CloseTempLine();
1526       if (m_TextBuf.GetSize()) {
1527         if (m_ParseOptions.m_bGetCharCodeOnly) {
1528           m_TextBuf.AppendChar(TEXT_RETURN_CHAR);
1529           m_TextBuf.AppendChar(TEXT_LINEFEED_CHAR);
1530         } else {
1531           if (GenerateCharInfo(TEXT_RETURN_CHAR, generateChar)) {
1532             m_TextBuf.AppendChar(TEXT_RETURN_CHAR);
1533             if (!formMatrix.IsIdentity()) {
1534               generateChar.m_Matrix.Copy(formMatrix);
1535             }
1536             m_charList.Add(generateChar);
1537           }
1538           if (GenerateCharInfo(TEXT_LINEFEED_CHAR, generateChar)) {
1539             m_TextBuf.AppendChar(TEXT_LINEFEED_CHAR);
1540             if (!formMatrix.IsIdentity()) {
1541               generateChar.m_Matrix.Copy(formMatrix);
1542             }
1543             m_charList.Add(generateChar);
1544           }
1545         }
1546       }
1547     } else if (result == 3 && !m_ParseOptions.m_bOutputHyphen) {
1548       int32_t nChars = pTextObj->CountChars();
1549       if (nChars == 1) {
1550         CPDF_TextObjectItem item;
1551         pTextObj->GetCharInfo(0, &item);
1552         CFX_WideString wstrItem =
1553             pTextObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
1554         if (wstrItem.IsEmpty()) {
1555           wstrItem += (FX_WCHAR)item.m_CharCode;
1556         }
1557         FX_WCHAR curChar = wstrItem.GetAt(0);
1558         if (0x2D == curChar || 0xAD == curChar) {
1559           return;
1560         }
1561       }
1562       while (m_TempTextBuf.GetSize() > 0 &&
1563              m_TempTextBuf.GetWideString().GetAt(m_TempTextBuf.GetLength() -
1564                                                  1) == 0x20) {
1565         m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1566         m_TempCharList.Delete(m_TempCharList.GetSize() - 1);
1567       }
1568       PAGECHAR_INFO* cha =
1569           (PAGECHAR_INFO*)m_TempCharList.GetAt(m_TempCharList.GetSize() - 1);
1570       m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1571       cha->m_Unicode = 0x2;
1572       cha->m_Flag = FPDFTEXT_CHAR_HYPHEN;
1573       m_TempTextBuf.AppendChar(0xfffe);
1574     }
1575   } else {
1576     m_CurlineRect =
1577         CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom,
1578                       Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top);
1579   }
1580   if (FPDFTEXT_MC_DELAY == bPreMKC) {
1581     ProcessMarkedContent(Obj);
1582     m_pPreTextObj = pTextObj;
1583     m_perMatrix.Copy(formMatrix);
1584     return;
1585   }
1586   m_pPreTextObj = pTextObj;
1587   m_perMatrix.Copy(formMatrix);
1588   int nItems = pTextObj->CountItems();
1589   FX_FLOAT baseSpace = _CalculateBaseSpace(pTextObj, matrix);
1590 
1591   const FX_BOOL bR2L = IsRightToLeft(pTextObj, pFont, nItems);
1592   const FX_BOOL bIsBidiAndMirrorInverse =
1593       bR2L && (matrix.a * matrix.d - matrix.b * matrix.c) < 0;
1594   int32_t iBufStartAppend = m_TempTextBuf.GetLength();
1595   int32_t iCharListStartAppend = m_TempCharList.GetSize();
1596 
1597   FX_FLOAT spacing = 0;
1598   for (int i = 0; i < nItems; i++) {
1599     CPDF_TextObjectItem item;
1600     PAGECHAR_INFO charinfo;
1601     charinfo.m_OriginX = 0;
1602     charinfo.m_OriginY = 0;
1603     pTextObj->GetItemInfo(i, &item);
1604     if (item.m_CharCode == (FX_DWORD)-1) {
1605       CFX_WideString str = m_TempTextBuf.GetWideString();
1606       if (str.IsEmpty()) {
1607         str = m_TextBuf.GetWideString();
1608       }
1609       if (str.IsEmpty() || str.GetAt(str.GetLength() - 1) == TEXT_BLANK_CHAR) {
1610         continue;
1611       }
1612       FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
1613       spacing = -fontsize_h * item.m_OriginX / 1000;
1614       continue;
1615     }
1616     FX_FLOAT charSpace = pTextObj->m_TextState.GetObject()->m_CharSpace;
1617     if (charSpace > 0.001) {
1618       spacing += matrix.TransformDistance(charSpace);
1619     } else if (charSpace < -0.001) {
1620       spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace));
1621     }
1622     spacing -= baseSpace;
1623     if (spacing && i > 0) {
1624       int last_width = 0;
1625       FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
1626       FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
1627       FX_FLOAT threshold = 0;
1628       if (space_charcode != -1) {
1629         threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
1630       }
1631       if (threshold > fontsize_h / 3) {
1632         threshold = 0;
1633       } else {
1634         threshold /= 2;
1635       }
1636       if (threshold == 0) {
1637         threshold = fontsize_h;
1638         int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont));
1639         threshold = this_width > last_width ? (FX_FLOAT)this_width
1640                                             : (FX_FLOAT)last_width;
1641         threshold = _NormalizeThreshold(threshold);
1642         threshold = fontsize_h * threshold / 1000;
1643       }
1644       if (threshold && (spacing && spacing >= threshold)) {
1645         charinfo.m_Unicode = TEXT_BLANK_CHAR;
1646         charinfo.m_Flag = FPDFTEXT_CHAR_GENERATED;
1647         charinfo.m_pTextObj = pTextObj;
1648         charinfo.m_Index = m_TextBuf.GetLength();
1649         m_TempTextBuf.AppendChar(TEXT_BLANK_CHAR);
1650         charinfo.m_CharCode = -1;
1651         charinfo.m_Matrix.Copy(formMatrix);
1652         matrix.Transform(item.m_OriginX, item.m_OriginY, charinfo.m_OriginX,
1653                          charinfo.m_OriginY);
1654         charinfo.m_CharBox =
1655             CFX_FloatRect(charinfo.m_OriginX, charinfo.m_OriginY,
1656                           charinfo.m_OriginX, charinfo.m_OriginY);
1657         m_TempCharList.Add(charinfo);
1658       }
1659       if (item.m_CharCode == (FX_DWORD)-1) {
1660         continue;
1661       }
1662     }
1663     spacing = 0;
1664     CFX_WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
1665     FX_BOOL bNoUnicode = FALSE;
1666     FX_WCHAR wChar = wstrItem.GetAt(0);
1667     if ((wstrItem.IsEmpty() || wChar == 0) && item.m_CharCode) {
1668       if (wstrItem.IsEmpty()) {
1669         wstrItem += (FX_WCHAR)item.m_CharCode;
1670       } else {
1671         wstrItem.SetAt(0, (FX_WCHAR)item.m_CharCode);
1672       }
1673       bNoUnicode = TRUE;
1674     }
1675     charinfo.m_Index = -1;
1676     charinfo.m_CharCode = item.m_CharCode;
1677     if (bNoUnicode) {
1678       charinfo.m_Flag = FPDFTEXT_CHAR_UNUNICODE;
1679     } else {
1680       charinfo.m_Flag = FPDFTEXT_CHAR_NORMAL;
1681     }
1682     charinfo.m_pTextObj = pTextObj;
1683     charinfo.m_OriginX = 0, charinfo.m_OriginY = 0;
1684     matrix.Transform(item.m_OriginX, item.m_OriginY, charinfo.m_OriginX,
1685                      charinfo.m_OriginY);
1686     FX_RECT rect(0, 0, 0, 0);
1687     rect.Intersect(0, 0, 0, 0);
1688     charinfo.m_pTextObj->GetFont()->GetCharBBox(charinfo.m_CharCode, rect);
1689     charinfo.m_CharBox.top =
1690         rect.top * pTextObj->GetFontSize() / 1000 + item.m_OriginY;
1691     charinfo.m_CharBox.left =
1692         rect.left * pTextObj->GetFontSize() / 1000 + item.m_OriginX;
1693     charinfo.m_CharBox.right =
1694         rect.right * pTextObj->GetFontSize() / 1000 + item.m_OriginX;
1695     charinfo.m_CharBox.bottom =
1696         rect.bottom * pTextObj->GetFontSize() / 1000 + item.m_OriginY;
1697     if (fabsf(charinfo.m_CharBox.top - charinfo.m_CharBox.bottom) < 0.01f) {
1698       charinfo.m_CharBox.top =
1699           charinfo.m_CharBox.bottom + pTextObj->GetFontSize();
1700     }
1701     if (fabsf(charinfo.m_CharBox.right - charinfo.m_CharBox.left) < 0.01f) {
1702       charinfo.m_CharBox.right =
1703           charinfo.m_CharBox.left + pTextObj->GetCharWidth(charinfo.m_CharCode);
1704     }
1705     matrix.TransformRect(charinfo.m_CharBox);
1706     charinfo.m_Matrix.Copy(matrix);
1707     if (wstrItem.IsEmpty()) {
1708       charinfo.m_Unicode = 0;
1709       m_TempCharList.Add(charinfo);
1710       m_TempTextBuf.AppendChar(0xfffe);
1711       continue;
1712     } else {
1713       int nTotal = wstrItem.GetLength();
1714       FX_BOOL bDel = FALSE;
1715       const int count = std::min(m_TempCharList.GetSize(), 7);
1716       FX_FLOAT threshold = charinfo.m_Matrix.TransformXDistance(
1717           (FX_FLOAT)TEXT_CHARRATIO_GAPDELTA * pTextObj->GetFontSize());
1718       for (int n = m_TempCharList.GetSize();
1719            n > m_TempCharList.GetSize() - count; n--) {
1720         PAGECHAR_INFO* charinfo1 = (PAGECHAR_INFO*)m_TempCharList.GetAt(n - 1);
1721         if (charinfo1->m_CharCode == charinfo.m_CharCode &&
1722             charinfo1->m_pTextObj->GetFont() ==
1723                 charinfo.m_pTextObj->GetFont() &&
1724             FXSYS_fabs(charinfo1->m_OriginX - charinfo.m_OriginX) < threshold &&
1725             FXSYS_fabs(charinfo1->m_OriginY - charinfo.m_OriginY) < threshold) {
1726           bDel = TRUE;
1727           break;
1728         }
1729       }
1730       if (!bDel) {
1731         for (int nIndex = 0; nIndex < nTotal; nIndex++) {
1732           charinfo.m_Unicode = wstrItem.GetAt(nIndex);
1733           if (charinfo.m_Unicode) {
1734             charinfo.m_Index = m_TextBuf.GetLength();
1735             m_TempTextBuf.AppendChar(charinfo.m_Unicode);
1736           } else {
1737             m_TempTextBuf.AppendChar(0xfffe);
1738           }
1739           m_TempCharList.Add(charinfo);
1740         }
1741       } else if (i == 0) {
1742         CFX_WideString str = m_TempTextBuf.GetWideString();
1743         if (!str.IsEmpty() &&
1744             str.GetAt(str.GetLength() - 1) == TEXT_BLANK_CHAR) {
1745           m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1746           m_TempCharList.Delete(m_TempCharList.GetSize() - 1);
1747         }
1748       }
1749     }
1750   }
1751   if (bIsBidiAndMirrorInverse) {
1752     SwapTempTextBuf(iCharListStartAppend, iBufStartAppend);
1753   }
1754 }
GetTextObjectWritingMode(const CPDF_TextObject * pTextObj)1755 int32_t CPDF_TextPage::GetTextObjectWritingMode(
1756     const CPDF_TextObject* pTextObj) {
1757   int32_t nChars = pTextObj->CountChars();
1758   if (nChars == 1) {
1759     return m_TextlineDir;
1760   }
1761   CPDF_TextObjectItem first, last;
1762   pTextObj->GetCharInfo(0, &first);
1763   pTextObj->GetCharInfo(nChars - 1, &last);
1764   CFX_Matrix textMatrix;
1765   pTextObj->GetTextMatrix(&textMatrix);
1766   textMatrix.TransformPoint(first.m_OriginX, first.m_OriginY);
1767   textMatrix.TransformPoint(last.m_OriginX, last.m_OriginY);
1768   FX_FLOAT dX = FXSYS_fabs(last.m_OriginX - first.m_OriginX);
1769   FX_FLOAT dY = FXSYS_fabs(last.m_OriginY - first.m_OriginY);
1770   if (dX <= 0.0001f && dY <= 0.0001f) {
1771     return -1;
1772   }
1773   CFX_VectorF v;
1774   v.Set(dX, dY);
1775   v.Normalize();
1776   if (v.y <= 0.0872f) {
1777     return v.x <= 0.0872f ? m_TextlineDir : 0;
1778   }
1779   if (v.x <= 0.0872f) {
1780     return 1;
1781   }
1782   return m_TextlineDir;
1783 }
IsHyphen(FX_WCHAR curChar)1784 FX_BOOL CPDF_TextPage::IsHyphen(FX_WCHAR curChar) {
1785   CFX_WideString strCurText = m_TempTextBuf.GetWideString();
1786   if (strCurText.GetLength() == 0) {
1787     strCurText = m_TextBuf.GetWideString();
1788   }
1789   FX_STRSIZE nCount = strCurText.GetLength();
1790   int nIndex = nCount - 1;
1791   FX_WCHAR wcTmp = strCurText.GetAt(nIndex);
1792   while (wcTmp == 0x20 && nIndex <= nCount - 1 && nIndex >= 0) {
1793     wcTmp = strCurText.GetAt(--nIndex);
1794   }
1795   if (0x2D == wcTmp || 0xAD == wcTmp) {
1796     if (--nIndex > 0) {
1797       FX_WCHAR preChar = strCurText.GetAt((nIndex));
1798       if (((preChar >= L'A' && preChar <= L'Z') ||
1799            (preChar >= L'a' && preChar <= L'z')) &&
1800           ((curChar >= L'A' && curChar <= L'Z') ||
1801            (curChar >= L'a' && curChar <= L'z'))) {
1802         return TRUE;
1803       }
1804     }
1805     int size = m_TempCharList.GetSize();
1806     PAGECHAR_INFO preChar;
1807     if (size) {
1808       preChar = (PAGECHAR_INFO)m_TempCharList[size - 1];
1809     } else {
1810       size = m_charList.GetSize();
1811       if (size == 0) {
1812         return FALSE;
1813       }
1814       preChar = (PAGECHAR_INFO)m_charList[size - 1];
1815     }
1816     if (FPDFTEXT_CHAR_PIECE == preChar.m_Flag &&
1817         (0xAD == preChar.m_Unicode || 0x2D == preChar.m_Unicode)) {
1818       return TRUE;
1819     }
1820   }
1821   return FALSE;
1822 }
// Classifies how |pObj| is positioned relative to the previously processed
// text object (m_pPreTextObj, refreshed here through FindPreviousTextObject()).
// |formMatrix| is applied to |pObj|'s position and text matrix before the
// comparison.
//
// Return codes, as produced by the code below:
//   0 - none of the separation tests fired.
//   1 - the horizontal gap between the end of the previous object's last item
//       and the start of this object exceeds the computed threshold, and
//       neither boundary character is a blank.
//   2 - the two objects are judged to lie on different lines.
//   3 - same as 2, but IsHyphen() reports that the preceding text ends in a
//       joining hyphen.
//
// NOTE(review): m_pPreTextObj is dereferenced without a null check; presumably
// callers only invoke this once a previous text object exists - confirm.
ProcessInsertObject(const CPDF_TextObject * pObj,const CFX_Matrix & formMatrix)1823 int CPDF_TextPage::ProcessInsertObject(const CPDF_TextObject* pObj,
1824                                        const CFX_Matrix& formMatrix) {
1825   FindPreviousTextObject();
1826   FX_BOOL bNewline = FALSE;
// -1 means no direction could be derived from |pObj| itself; fall back to the
// direction of the previous object.
1827   int WritingMode = GetTextObjectWritingMode(pObj);
1828   if (WritingMode == -1) {
1829     WritingMode = GetTextObjectWritingMode(m_pPreTextObj);
1830   }
1831   CFX_FloatRect this_rect(pObj->m_Left, pObj->m_Bottom, pObj->m_Right,
1832                           pObj->m_Top);
1833   CFX_FloatRect prev_rect(m_pPreTextObj->m_Left, m_pPreTextObj->m_Bottom,
1834                           m_pPreTextObj->m_Right, m_pPreTextObj->m_Top);
// PrevItem is the last item of the previous object, item the first item of
// this object.
1835   CPDF_TextObjectItem PrevItem, item;
1836   int nItem = m_pPreTextObj->CountItems();
1837   m_pPreTextObj->GetItemInfo(nItem - 1, &PrevItem);
1838   pObj->GetItemInfo(0, &item);
1839   CFX_WideString wstrItem =
1840       pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
// When the font yields no Unicode mapping, the raw character code is used as
// the character.
1841   if (wstrItem.IsEmpty()) {
1842     wstrItem += (FX_WCHAR)item.m_CharCode;
1843   }
1844   FX_WCHAR curChar = wstrItem.GetAt(0);
// Quick bounding-box test. For writing mode 0 the two boxes are compared
// vertically: |top| is the lower of the two tops and |bottom| the higher of
// the two bottoms, so bottom >= top means the boxes do not overlap vertically.
1845   if (WritingMode == 0) {
1846     if (this_rect.Height() > 4.5 && prev_rect.Height() > 4.5) {
1847       FX_FLOAT top =
1848           this_rect.top < prev_rect.top ? this_rect.top : prev_rect.top;
1849       FX_FLOAT bottom = this_rect.bottom > prev_rect.bottom ? this_rect.bottom
1850                                                             : prev_rect.bottom;
1851       if (bottom >= top) {
1852         if (IsHyphen(curChar)) {
1853           return 3;
1854         }
1855         return 2;
1856       }
1857     }
// For writing mode 1 the horizontal extent of this object is compared against
// m_CurlineRect (not prev_rect); right <= left means no horizontal overlap.
1858   } else if (WritingMode == 1) {
1859     if (this_rect.Width() > pObj->GetFontSize() * 0.1f &&
1860         prev_rect.Width() > m_pPreTextObj->GetFontSize() * 0.1f) {
1861       FX_FLOAT left = this_rect.left > m_CurlineRect.left ? this_rect.left
1862                                                           : m_CurlineRect.left;
1863       FX_FLOAT right = this_rect.right < m_CurlineRect.right
1864                            ? this_rect.right
1865                            : m_CurlineRect.right;
1866       if (right <= left) {
1867         if (IsHyphen(curChar)) {
1868           return 3;
1869         }
1870         return 2;
1871       }
1872     }
1873   }
// Widths of the two boundary glyphs: GetCharWidth() result scaled by the font
// size and divided by 1000. The initial threshold is a quarter of the larger
// of the two.
1874   FX_FLOAT last_pos = PrevItem.m_OriginX;
1875   int nLastWidth = GetCharWidth(PrevItem.m_CharCode, m_pPreTextObj->GetFont());
1876   FX_FLOAT last_width = nLastWidth * m_pPreTextObj->GetFontSize() / 1000;
1877   last_width = FXSYS_fabs(last_width);
1878   int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont());
1879   FX_FLOAT this_width = nThisWidth * pObj->GetFontSize() / 1000;
1880   this_width = FXSYS_fabs(this_width);
1881   FX_FLOAT threshold =
1882       last_width > this_width ? last_width / 4 : this_width / 4;
// (x, y) is this object's position run through |formMatrix| and then through
// prev_reverse, which SetReverse() derives from the previous object's text
// matrix concatenated with m_perMatrix (presumably its inverse - confirm), so
// that it can be compared with PrevItem's origin.
1883   CFX_Matrix prev_matrix, prev_reverse;
1884   m_pPreTextObj->GetTextMatrix(&prev_matrix);
1885   prev_matrix.Concat(m_perMatrix);
1886   prev_reverse.SetReverse(prev_matrix);
1887   FX_FLOAT x = pObj->GetPosX();
1888   FX_FLOAT y = pObj->GetPosY();
1889   formMatrix.Transform(x, y);
1890   prev_reverse.Transform(x, y);
1891   if (last_width < this_width) {
1892     threshold = prev_reverse.TransformDistance(threshold);
1893   }
// rect1 combines the previous object's horizontal extent with this object's
// vertical extent; rect3 keeps that rectangle before rect1 is intersected with
// the previous object's box (rect2).
1894   CFX_FloatRect rect1(m_pPreTextObj->m_Left, pObj->m_Bottom,
1895                       m_pPreTextObj->m_Right, pObj->m_Top);
1896   CFX_FloatRect rect2(m_pPreTextObj->m_Left, m_pPreTextObj->m_Bottom,
1897                       m_pPreTextObj->m_Right, m_pPreTextObj->m_Top);
1898   CFX_FloatRect rect3 = rect1;
1899   rect1.Intersect(rect2);
1900   if (WritingMode == 0) {
// A new line is assumed when the vertical extents do not intersect (and both
// are taller than 5), or when y lies outside (-3 * threshold, 2 * threshold);
// for |y| < 1 the latter additionally requires |x| < |y|.
1901     if ((rect1.IsEmpty() && rect2.Height() > 5 && rect3.Height() > 5) ||
1902         ((y > threshold * 2 || y < threshold * -3) &&
1903          (FXSYS_fabs(y) < 1 ? FXSYS_fabs(x) < FXSYS_fabs(y) : TRUE))) {
1904       bNewline = TRUE;
// The new-line verdict is withdrawn when the previous object advances to the
// right, m_DisplayMatrix and the previous text matrix satisfy the component
// bounds below, and the position of one object falls inside a 0..1000 wide
// band spanning the other object's vertical extent.
1905       if (nItem > 1) {
1906         CPDF_TextObjectItem tempItem;
1907         m_pPreTextObj->GetItemInfo(0, &tempItem);
1908         CFX_Matrix m;
1909         m_pPreTextObj->GetTextMatrix(&m);
1910         if (PrevItem.m_OriginX > tempItem.m_OriginX &&
1911             m_DisplayMatrix.a > 0.9 && m_DisplayMatrix.b < 0.1 &&
1912             m_DisplayMatrix.c < 0.1 && m_DisplayMatrix.d < -0.9 && m.b < 0.1 &&
1913             m.c < 0.1) {
1914           CFX_FloatRect re(0, m_pPreTextObj->m_Bottom, 1000,
1915                            m_pPreTextObj->m_Top);
1916           if (re.Contains(pObj->GetPosX(), pObj->GetPosY())) {
1917             bNewline = FALSE;
1918           } else {
1919             CFX_FloatRect re(0, pObj->m_Bottom, 1000, pObj->m_Top);
1920             if (re.Contains(m_pPreTextObj->GetPosX(),
1921                             m_pPreTextObj->GetPosY())) {
1922               bNewline = FALSE;
1923             }
1924           }
1925         }
1926       }
1927     }
1928   }
1929   if (bNewline)
1930     return IsHyphen(curChar) ? 3 : 2;
1931 
// A single-character object that is itself a hyphen is reported as 3 when
// IsHyphen() confirms it against the preceding text.
1932   int32_t nChars = pObj->CountChars();
1933   if (nChars == 1 && (0x2D == curChar || 0xAD == curChar) &&
1934       IsHyphen(curChar)) {
1935     return 3;
1936   }
1937   CFX_WideString PrevStr =
1938       m_pPreTextObj->GetFont()->UnicodeFromCharCode(PrevItem.m_CharCode);
// NOTE(review): when PrevStr is empty this evaluates GetAt(-1); presumably
// CFX_WideString tolerates that for an empty string - confirm.
1939   FX_WCHAR preChar = PrevStr.GetAt(PrevStr.GetLength() - 1);
1940   CFX_Matrix matrix;
1941   pObj->GetTextMatrix(&matrix);
1942   matrix.Concat(formMatrix);
// Recompute the threshold from the wider of the two raw glyph widths: divided
// by 2 up to 400, by 4 below 700, by 5 from 700 to 800 and by 6 above 800,
// then scaled by the matching font size (and, when this glyph is the wider
// one, carried through |matrix| and prev_reverse) and divided by 1000.
1943   threshold = (FX_FLOAT)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth);
1944   threshold = threshold > 400
1945                   ? (threshold < 700
1946                          ? threshold / 4
1947                          : (threshold > 800 ? threshold / 6 : threshold / 5))
1948                   : (threshold / 2);
1949   if (nLastWidth >= nThisWidth) {
1950     threshold *= FXSYS_fabs(m_pPreTextObj->GetFontSize());
1951   } else {
1952     threshold *= FXSYS_fabs(pObj->GetFontSize());
1953     threshold = matrix.TransformDistance(threshold);
1954     threshold = prev_reverse.TransformDistance(threshold);
1955   }
1956   threshold /= 1000;
// NOTE(review): two narrow threshold windows are widened by 50%; the reason
// for these specific values is not evident from this file.
1957   if ((threshold < 1.4881 && threshold > 1.4879) ||
1958       (threshold < 1.39001 && threshold > 1.38999)) {
1959     threshold *= 1.5;
1960   }
// Gap test between the end of the previous glyph (last_pos + last_width) and
// x. The inner blank check repeats the outer one, and the "x < 0" test repeats
// an inequality that has already returned 1 just above it.
1961   if (FXSYS_fabs(last_pos + last_width - x) > threshold && curChar != L' ' &&
1962       preChar != L' ') {
1963     if (curChar != L' ' && preChar != L' ') {
1964       if ((x - last_pos - last_width) > threshold ||
1965           (last_pos - x - last_width) > threshold) {
1966         return 1;
1967       }
1968       if (x < 0 && (last_pos - x - last_width) > threshold) {
1969         return 1;
1970       }
1971       if ((x - last_pos - last_width) > this_width ||
1972           (x - last_pos - this_width) > last_width) {
1973         return 1;
1974       }
1975     }
1976   }
1977   return 0;
1978 }
IsSameTextObject(CPDF_TextObject * pTextObj1,CPDF_TextObject * pTextObj2)1979 FX_BOOL CPDF_TextPage::IsSameTextObject(CPDF_TextObject* pTextObj1,
1980                                         CPDF_TextObject* pTextObj2) {
1981   if (!pTextObj1 || !pTextObj2) {
1982     return FALSE;
1983   }
1984   CFX_FloatRect rcPreObj(pTextObj2->m_Left, pTextObj2->m_Bottom,
1985                          pTextObj2->m_Right, pTextObj2->m_Top);
1986   CFX_FloatRect rcCurObj(pTextObj1->m_Left, pTextObj1->m_Bottom,
1987                          pTextObj1->m_Right, pTextObj1->m_Top);
1988   if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty() &&
1989       !m_ParseOptions.m_bGetCharCodeOnly) {
1990     FX_FLOAT dbXdif = FXSYS_fabs(rcPreObj.left - rcCurObj.left);
1991     int nCount = m_charList.GetSize();
1992     if (nCount >= 2) {
1993       PAGECHAR_INFO perCharTemp = (PAGECHAR_INFO)m_charList[nCount - 2];
1994       FX_FLOAT dbSpace = perCharTemp.m_CharBox.Width();
1995       if (dbXdif > dbSpace) {
1996         return FALSE;
1997       }
1998     }
1999   }
2000   if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) {
2001     rcPreObj.Intersect(rcCurObj);
2002     if (rcPreObj.IsEmpty()) {
2003       return FALSE;
2004     }
2005     if (FXSYS_fabs(rcPreObj.Width() - rcCurObj.Width()) >
2006         rcCurObj.Width() / 2) {
2007       return FALSE;
2008     }
2009     if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize()) {
2010       return FALSE;
2011     }
2012   }
2013   int nPreCount = pTextObj2->CountItems();
2014   int nCurCount = pTextObj1->CountItems();
2015   if (nPreCount != nCurCount) {
2016     return FALSE;
2017   }
2018   CPDF_TextObjectItem itemPer, itemCur;
2019   for (int i = 0; i < nPreCount; i++) {
2020     pTextObj2->GetItemInfo(i, &itemPer);
2021     pTextObj1->GetItemInfo(i, &itemCur);
2022     if (itemCur.m_CharCode != itemPer.m_CharCode) {
2023       return FALSE;
2024     }
2025   }
2026   if (FXSYS_fabs(pTextObj1->GetPosX() - pTextObj2->GetPosX()) >
2027           GetCharWidth(itemPer.m_CharCode, pTextObj2->GetFont()) *
2028               pTextObj2->GetFontSize() / 1000 * 0.9 ||
2029       FXSYS_fabs(pTextObj1->GetPosY() - pTextObj2->GetPosY()) >
2030           std::max(std::max(rcPreObj.Height(), rcPreObj.Width()),
2031                    pTextObj2->GetFontSize()) /
2032               8) {
2033     return FALSE;
2034   }
2035   return TRUE;
2036 }
IsSameAsPreTextObject(CPDF_TextObject * pTextObj,FX_POSITION ObjPos)2037 FX_BOOL CPDF_TextPage::IsSameAsPreTextObject(CPDF_TextObject* pTextObj,
2038                                              FX_POSITION ObjPos) {
2039   if (!pTextObj) {
2040     return FALSE;
2041   }
2042   int i = 0;
2043   if (!ObjPos) {
2044     ObjPos = m_pPage->GetLastObjectPosition();
2045   }
2046   CPDF_PageObject* pObj = m_pPage->GetPrevObject(ObjPos);
2047   while (i < 5 && ObjPos) {
2048     pObj = m_pPage->GetPrevObject(ObjPos);
2049     if (pObj == pTextObj) {
2050       continue;
2051     }
2052     if (pObj->m_Type != PDFPAGE_TEXT) {
2053       continue;
2054     }
2055     if (IsSameTextObject((CPDF_TextObject*)pObj, pTextObj)) {
2056       return TRUE;
2057     }
2058     i++;
2059   }
2060   return FALSE;
2061 }
2062 
GenerateCharInfo(FX_WCHAR unicode,PAGECHAR_INFO & info)2063 FX_BOOL CPDF_TextPage::GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info) {
2064   int size = m_TempCharList.GetSize();
2065   PAGECHAR_INFO preChar;
2066   if (size) {
2067     preChar = (PAGECHAR_INFO)m_TempCharList[size - 1];
2068   } else {
2069     size = m_charList.GetSize();
2070     if (size == 0) {
2071       return FALSE;
2072     }
2073     preChar = (PAGECHAR_INFO)m_charList[size - 1];
2074   }
2075   info.m_Index = m_TextBuf.GetLength();
2076   info.m_Unicode = unicode;
2077   info.m_pTextObj = NULL;
2078   info.m_CharCode = -1;
2079   info.m_Flag = FPDFTEXT_CHAR_GENERATED;
2080   int preWidth = 0;
2081   if (preChar.m_pTextObj && preChar.m_CharCode != (FX_DWORD)-1)
2082     preWidth = GetCharWidth(preChar.m_CharCode, preChar.m_pTextObj->GetFont());
2083 
2084   FX_FLOAT fFontSize = preChar.m_pTextObj ? preChar.m_pTextObj->GetFontSize()
2085                                           : preChar.m_CharBox.Height();
2086   if (!fFontSize)
2087     fFontSize = kDefaultFontSize;
2088 
2089   info.m_OriginX = preChar.m_OriginX + preWidth * (fFontSize) / 1000;
2090   info.m_OriginY = preChar.m_OriginY;
2091   info.m_CharBox = CFX_FloatRect(info.m_OriginX, info.m_OriginY, info.m_OriginX,
2092                                  info.m_OriginY);
2093   return TRUE;
2094 }
2095 
IsRectIntersect(const CFX_FloatRect & rect1,const CFX_FloatRect & rect2)2096 FX_BOOL CPDF_TextPage::IsRectIntersect(const CFX_FloatRect& rect1,
2097                                        const CFX_FloatRect& rect2) {
2098   CFX_FloatRect rect = rect1;
2099   rect.Intersect(rect2);
2100   return !rect.IsEmpty();
2101 }
IsLetter(FX_WCHAR unicode)2102 FX_BOOL CPDF_TextPage::IsLetter(FX_WCHAR unicode) {
2103   if (unicode < L'A') {
2104     return FALSE;
2105   }
2106   if (unicode > L'Z' && unicode < L'a') {
2107     return FALSE;
2108   }
2109   if (unicode > L'z') {
2110     return FALSE;
2111   }
2112   return TRUE;
2113 }
CPDF_TextPageFind(const IPDF_TextPage * pTextPage)2114 CPDF_TextPageFind::CPDF_TextPageFind(const IPDF_TextPage* pTextPage)
2115     : m_pTextPage(pTextPage),
2116       m_flags(0),
2117       m_findNextStart(-1),
2118       m_findPreStart(-1),
2119       m_bMatchCase(FALSE),
2120       m_bMatchWholeWord(FALSE),
2121       m_resStart(0),
2122       m_resEnd(-1),
2123       m_IsFind(FALSE) {
2124   m_strText = m_pTextPage->GetPageText();
2125   int nCount = pTextPage->CountChars();
2126   if (nCount) {
2127     m_CharIndex.Add(0);
2128   }
2129   for (int i = 0; i < nCount; i++) {
2130     FPDF_CHAR_INFO info;
2131     pTextPage->GetCharInfo(i, &info);
2132     int indexSize = m_CharIndex.GetSize();
2133     if (info.m_Flag == CHAR_NORMAL || info.m_Flag == CHAR_GENERATED) {
2134       if (indexSize % 2) {
2135         m_CharIndex.Add(1);
2136       } else {
2137         if (indexSize <= 0) {
2138           continue;
2139         }
2140         m_CharIndex.SetAt(indexSize - 1, m_CharIndex.GetAt(indexSize - 1) + 1);
2141       }
2142     } else {
2143       if (indexSize % 2) {
2144         if (indexSize <= 0) {
2145           continue;
2146         }
2147         m_CharIndex.SetAt(indexSize - 1, i + 1);
2148       } else {
2149         m_CharIndex.Add(i + 1);
2150       }
2151     }
2152   }
2153   int indexSize = m_CharIndex.GetSize();
2154   if (indexSize % 2) {
2155     m_CharIndex.RemoveAt(indexSize - 1);
2156   }
2157 }
GetCharIndex(int index) const2158 int CPDF_TextPageFind::GetCharIndex(int index) const {
2159   return m_pTextPage->CharIndexFromTextIndex(index);
2160   int indexSize = m_CharIndex.GetSize();
2161   int count = 0;
2162   for (int i = 0; i < indexSize; i += 2) {
2163     count += m_CharIndex.GetAt(i + 1);
2164     if (count > index) {
2165       return index - count + m_CharIndex.GetAt(i + 1) + m_CharIndex.GetAt(i);
2166     }
2167   }
2168   return -1;
2169 }
FindFirst(const CFX_WideString & findwhat,int flags,int startPos)2170 FX_BOOL CPDF_TextPageFind::FindFirst(const CFX_WideString& findwhat,
2171                                      int flags,
2172                                      int startPos) {
2173   if (!m_pTextPage) {
2174     return FALSE;
2175   }
2176   if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE)) {
2177     m_strText = m_pTextPage->GetPageText();
2178   }
2179   CFX_WideString findwhatStr = findwhat;
2180   m_findWhat = findwhatStr;
2181   m_flags = flags;
2182   m_bMatchCase = flags & FPDFTEXT_MATCHCASE;
2183   if (m_strText.IsEmpty()) {
2184     m_IsFind = FALSE;
2185     return TRUE;
2186   }
2187   FX_STRSIZE len = findwhatStr.GetLength();
2188   if (!m_bMatchCase) {
2189     findwhatStr.MakeLower();
2190     m_strText.MakeLower();
2191   }
2192   m_bMatchWholeWord = flags & FPDFTEXT_MATCHWHOLEWORD;
2193   m_findNextStart = startPos;
2194   if (startPos == -1) {
2195     m_findPreStart = m_strText.GetLength() - 1;
2196   } else {
2197     m_findPreStart = startPos;
2198   }
2199   m_csFindWhatArray.RemoveAll();
2200   int i = 0;
2201   while (i < len) {
2202     if (findwhatStr.GetAt(i) != ' ') {
2203       break;
2204     }
2205     i++;
2206   }
2207   if (i < len) {
2208     ExtractFindWhat(findwhatStr);
2209   } else {
2210     m_csFindWhatArray.Add(findwhatStr);
2211   }
2212   if (m_csFindWhatArray.GetSize() <= 0) {
2213     return FALSE;
2214   }
2215   m_IsFind = TRUE;
2216   m_resStart = 0;
2217   m_resEnd = -1;
2218   return TRUE;
2219 }
// Searches forward from m_findNextStart for the next occurrence of the
// pattern pieces stored in m_csFindWhatArray.
//
// On success m_resStart / m_resEnd receive the text indices of the match,
// m_resArray is filled through the text page's GetRectArray(), both search
// cursors are advanced and TRUE is returned. Returns FALSE when no text page
// is attached, when m_findNextStart is -1, when the page text is empty, when
// the cursor is past the end of the text, or when a piece cannot be found;
// all but the first two of these also clear m_IsFind.
FindNext()2220 FX_BOOL CPDF_TextPageFind::FindNext() {
2221   if (!m_pTextPage) {
2222     return FALSE;
2223   }
2224   m_resArray.RemoveAll();
2225   if (m_findNextStart == -1) {
2226     return FALSE;
2227   }
2228   if (m_strText.IsEmpty()) {
2229     m_IsFind = FALSE;
2230     return m_IsFind;
2231   }
2232   int strLen = m_strText.GetLength();
2233   if (m_findNextStart > strLen - 1) {
2234     m_IsFind = FALSE;
2235     return m_IsFind;
2236   }
2237   int nCount = m_csFindWhatArray.GetSize();
2238   int nResultPos = 0;
2239   int nStartPos = 0;
2240   nStartPos = m_findNextStart;
2241   FX_BOOL bSpaceStart = FALSE;
// Match the pieces one after another. Setting iWord to -1 restarts the whole
// sequence from the first piece at the current nStartPos.
2242   for (int iWord = 0; iWord < nCount; iWord++) {
2243     CFX_WideString csWord = m_csFindWhatArray[iWord];
// Empty pieces carry no text of their own. A trailing one requires the
// character at nStartPos to be white space (line feed, blank, return or 160);
// a leading one only sets bSpaceStart.
2244     if (csWord.IsEmpty()) {
2245       if (iWord == nCount - 1) {
2246         FX_WCHAR strInsert = m_strText.GetAt(nStartPos);
2247         if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_BLANK_CHAR ||
2248             strInsert == TEXT_RETURN_CHAR || strInsert == 160) {
2249           nResultPos = nStartPos + 1;
2250           break;
2251         }
2252         iWord = -1;
2253       } else if (iWord == 0) {
2254         bSpaceStart = TRUE;
2255       }
2256       continue;
2257     }
2258     int endIndex;
2259     nResultPos = m_strText.Find(csWord.c_str(), nStartPos);
2260     if (nResultPos == -1) {
2261       m_IsFind = FALSE;
2262       return m_IsFind;
2263     }
2264     endIndex = nResultPos + csWord.GetLength() - 1;
2265     if (iWord == 0) {
2266       m_resStart = nResultPos;
2267     }
2268     FX_BOOL bMatch = TRUE;
// For every piece after the first: it may touch the previous piece directly
// only when one of the two boundary characters satisfies
// _IsIgnoreSpaceCharacter(); otherwise everything between the pieces must be
// white space.
2269     if (iWord != 0 && !bSpaceStart) {
2270       int PreResEndPos = nStartPos;
2271       int curChar = csWord.GetAt(0);
2272       CFX_WideString lastWord = m_csFindWhatArray[iWord - 1];
2273       int lastChar = lastWord.GetAt(lastWord.GetLength() - 1);
2274       if (nStartPos == nResultPos &&
2275           !(_IsIgnoreSpaceCharacter(lastChar) ||
2276             _IsIgnoreSpaceCharacter(curChar))) {
2277         bMatch = FALSE;
2278       }
2279       for (int d = PreResEndPos; d < nResultPos; d++) {
2280         FX_WCHAR strInsert = m_strText.GetAt(d);
2281         if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_BLANK_CHAR &&
2282             strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
2283           bMatch = FALSE;
2284           break;
2285         }
2286       }
// With a leading empty piece the character in front of the hit must be white
// space, and the match start is moved onto it.
// NOTE(review): this branch runs for every non-empty piece while bSpaceStart
// is set, so later pieces overwrite m_resStart and skip the gap check above -
// confirm this is intended.
2287     } else if (bSpaceStart) {
2288       if (nResultPos > 0) {
2289         FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1);
2290         if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_BLANK_CHAR &&
2291             strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
2292           bMatch = FALSE;
2293           m_resStart = nResultPos;
2294         } else {
2295           m_resStart = nResultPos - 1;
2296         }
2297       }
2298     }
2299     if (m_bMatchWholeWord && bMatch) {
2300       bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex);
2301     }
2302     nStartPos = endIndex + 1;
// On a mismatch restart with the first piece, resuming the scan just past
// where the first real piece (index 1 when the pattern starts with an empty
// piece, index 0 otherwise) was last placed.
2303     if (!bMatch) {
2304       iWord = -1;
2305       if (bSpaceStart) {
2306         nStartPos = m_resStart + m_csFindWhatArray[1].GetLength();
2307       } else {
2308         nStartPos = m_resStart + m_csFindWhatArray[0].GetLength();
2309       }
2310     }
2311   }
// The match ends at the last character of the final piece; the rectangles are
// requested by character index, hence the conversion through GetCharIndex().
2312   m_resEnd = nResultPos +
2313              m_csFindWhatArray[m_csFindWhatArray.GetSize() - 1].GetLength() - 1;
2314   m_IsFind = TRUE;
2315   int resStart = GetCharIndex(m_resStart);
2316   int resEnd = GetCharIndex(m_resEnd);
2317   m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1, m_resArray);
// With FPDFTEXT_CONSECUTIVE the next search resumes one position after the
// start of this match; otherwise it resumes after its end.
2318   if (m_flags & FPDFTEXT_CONSECUTIVE) {
2319     m_findNextStart = m_resStart + 1;
2320     m_findPreStart = m_resEnd - 1;
2321   } else {
2322     m_findNextStart = m_resEnd + 1;
2323     m_findPreStart = m_resStart - 1;
2324   }
2325   return m_IsFind;
2326 }
FindPrev()2327 FX_BOOL CPDF_TextPageFind::FindPrev() {
2328   if (!m_pTextPage) {
2329     return FALSE;
2330   }
2331   m_resArray.RemoveAll();
2332   if (m_strText.IsEmpty() || m_findPreStart < 0) {
2333     m_IsFind = FALSE;
2334     return m_IsFind;
2335   }
2336   CPDF_TextPageFind findEngine(m_pTextPage);
2337   FX_BOOL ret = findEngine.FindFirst(m_findWhat, m_flags);
2338   if (!ret) {
2339     m_IsFind = FALSE;
2340     return m_IsFind;
2341   }
2342   int order = -1, MatchedCount = 0;
2343   while (ret) {
2344     ret = findEngine.FindNext();
2345     if (ret) {
2346       int order1 = findEngine.GetCurOrder();
2347       int MatchedCount1 = findEngine.GetMatchedCount();
2348       if (((order1 + MatchedCount1) - 1) > m_findPreStart) {
2349         break;
2350       }
2351       order = order1;
2352       MatchedCount = MatchedCount1;
2353     }
2354   }
2355   if (order == -1) {
2356     m_IsFind = FALSE;
2357     return m_IsFind;
2358   }
2359   m_resStart = m_pTextPage->TextIndexFromCharIndex(order);
2360   m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1);
2361   m_IsFind = TRUE;
2362   m_pTextPage->GetRectArray(order, MatchedCount, m_resArray);
2363   if (m_flags & FPDFTEXT_CONSECUTIVE) {
2364     m_findNextStart = m_resStart + 1;
2365     m_findPreStart = m_resEnd - 1;
2366   } else {
2367     m_findNextStart = m_resEnd + 1;
2368     m_findPreStart = m_resStart - 1;
2369   }
2370   return m_IsFind;
2371 }
ExtractFindWhat(const CFX_WideString & findwhat)2372 void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) {
2373   if (findwhat.IsEmpty()) {
2374     return;
2375   }
2376   int index = 0;
2377   while (1) {
2378     CFX_WideString csWord = TEXT_EMPTY;
2379     int ret =
2380         ExtractSubString(csWord, findwhat.c_str(), index, TEXT_BLANK_CHAR);
2381     if (csWord.IsEmpty()) {
2382       if (ret) {
2383         m_csFindWhatArray.Add(CFX_WideString(L""));
2384         index++;
2385         continue;
2386       } else {
2387         break;
2388       }
2389     }
2390     int pos = 0;
2391     while (pos < csWord.GetLength()) {
2392       CFX_WideString curStr = csWord.Mid(pos, 1);
2393       FX_WCHAR curChar = csWord.GetAt(pos);
2394       if (_IsIgnoreSpaceCharacter(curChar)) {
2395         if (pos > 0 && curChar == 0x2019) {
2396           pos++;
2397           continue;
2398         }
2399         if (pos > 0) {
2400           CFX_WideString preStr = csWord.Mid(0, pos);
2401           m_csFindWhatArray.Add(preStr);
2402         }
2403         m_csFindWhatArray.Add(curStr);
2404         if (pos == csWord.GetLength() - 1) {
2405           csWord.Empty();
2406           break;
2407         }
2408         csWord = csWord.Right(csWord.GetLength() - pos - 1);
2409         pos = 0;
2410         continue;
2411       }
2412       pos++;
2413     }
2414     if (!csWord.IsEmpty()) {
2415       m_csFindWhatArray.Add(csWord);
2416     }
2417     index++;
2418   }
2419 }
IsMatchWholeWord(const CFX_WideString & csPageText,int startPos,int endPos)2420 FX_BOOL CPDF_TextPageFind::IsMatchWholeWord(const CFX_WideString& csPageText,
2421                                             int startPos,
2422                                             int endPos) {
2423   FX_WCHAR char_left = 0;
2424   FX_WCHAR char_right = 0;
2425   int char_count = endPos - startPos + 1;
2426   if (char_count < 1) {
2427     return FALSE;
2428   }
2429   if (char_count == 1 && csPageText.GetAt(startPos) > 255) {
2430     return TRUE;
2431   }
2432   if (startPos - 1 >= 0) {
2433     char_left = csPageText.GetAt(startPos - 1);
2434   }
2435   if (startPos + char_count < csPageText.GetLength()) {
2436     char_right = csPageText.GetAt(startPos + char_count);
2437   }
2438   if ((char_left > 'A' && char_left < 'a') ||
2439       (char_left > 'a' && char_left < 'z') ||
2440       (char_left > 0xfb00 && char_left < 0xfb06) || std::iswdigit(char_left) ||
2441       (char_right > 'A' && char_right < 'a') ||
2442       (char_right > 'a' && char_right < 'z') ||
2443       (char_right > 0xfb00 && char_right < 0xfb06) ||
2444       std::iswdigit(char_right)) {
2445     return FALSE;
2446   }
2447   if (!(('A' > char_left || char_left > 'Z') &&
2448         ('a' > char_left || char_left > 'z') &&
2449         ('A' > char_right || char_right > 'Z') &&
2450         ('a' > char_right || char_right > 'z'))) {
2451     return FALSE;
2452   }
2453   if (char_count > 0) {
2454     if (csPageText.GetAt(startPos) >= L'0' &&
2455         csPageText.GetAt(startPos) <= L'9' && char_left >= L'0' &&
2456         char_left <= L'9') {
2457       return FALSE;
2458     }
2459     if (csPageText.GetAt(endPos) >= L'0' && csPageText.GetAt(endPos) <= L'9' &&
2460         char_right >= L'0' && char_right <= L'9') {
2461       return FALSE;
2462     }
2463   }
2464   return TRUE;
2465 }
ExtractSubString(CFX_WideString & rString,const FX_WCHAR * lpszFullString,int iSubString,FX_WCHAR chSep)2466 FX_BOOL CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString,
2467                                             const FX_WCHAR* lpszFullString,
2468                                             int iSubString,
2469                                             FX_WCHAR chSep) {
2470   if (!lpszFullString) {
2471     return FALSE;
2472   }
2473   while (iSubString--) {
2474     lpszFullString = FXSYS_wcschr(lpszFullString, chSep);
2475     if (!lpszFullString) {
2476       rString.Empty();
2477       return FALSE;
2478     }
2479     lpszFullString++;
2480     while (*lpszFullString == chSep) {
2481       lpszFullString++;
2482     }
2483   }
2484   const FX_WCHAR* lpchEnd = FXSYS_wcschr(lpszFullString, chSep);
2485   int nLen = lpchEnd ? (int)(lpchEnd - lpszFullString)
2486                      : (int)FXSYS_wcslen(lpszFullString);
2487   ASSERT(nLen >= 0);
2488   FXSYS_memcpy(rString.GetBuffer(nLen), lpszFullString,
2489                nLen * sizeof(FX_WCHAR));
2490   rString.ReleaseBuffer();
2491   return TRUE;
2492 }
// Returns a copy of |str| with its characters in reverse order.
CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString& str) {
  CFX_WideString reversed;
  for (int remaining = str.GetLength(); remaining > 0; remaining--)
    reversed += str.GetAt(remaining - 1);
  return reversed;
}
GetRectArray(CFX_RectArray & rects) const2502 void CPDF_TextPageFind::GetRectArray(CFX_RectArray& rects) const {
2503   rects.Copy(m_resArray);
2504 }
GetCurOrder() const2505 int CPDF_TextPageFind::GetCurOrder() const {
2506   return GetCharIndex(m_resStart);
2507 }
GetMatchedCount() const2508 int CPDF_TextPageFind::GetMatchedCount() const {
2509   int resStart = GetCharIndex(m_resStart);
2510   int resEnd = GetCharIndex(m_resEnd);
2511   return resEnd - resStart + 1;
2512 }
2513 
CPDF_LinkExtract()2514 CPDF_LinkExtract::CPDF_LinkExtract()
2515     : m_pTextPage(nullptr), m_bIsParsed(false) {
2516 }
2517 
~CPDF_LinkExtract()2518 CPDF_LinkExtract::~CPDF_LinkExtract() {
2519   DeleteLinkList();
2520 }
2521 
ExtractLinks(const IPDF_TextPage * pTextPage)2522 FX_BOOL CPDF_LinkExtract::ExtractLinks(const IPDF_TextPage* pTextPage) {
2523   if (!pTextPage || !pTextPage->IsParsed())
2524     return FALSE;
2525 
2526   m_pTextPage = (const CPDF_TextPage*)pTextPage;
2527   m_strPageText = m_pTextPage->GetPageText(0, -1);
2528   DeleteLinkList();
2529   if (m_strPageText.IsEmpty()) {
2530     return FALSE;
2531   }
2532   ParseLink();
2533   m_bIsParsed = true;
2534   return TRUE;
2535 }
2536 
DeleteLinkList()2537 void CPDF_LinkExtract::DeleteLinkList() {
2538   while (m_LinkList.GetSize()) {
2539     CPDF_LinkExt* linkinfo = NULL;
2540     linkinfo = m_LinkList.GetAt(0);
2541     m_LinkList.RemoveAt(0);
2542     delete linkinfo;
2543   }
2544   m_LinkList.RemoveAll();
2545 }
CountLinks() const2546 int CPDF_LinkExtract::CountLinks() const {
2547   if (!m_bIsParsed) {
2548     return -1;
2549   }
2550   return m_LinkList.GetSize();
2551 }
ParseLink()2552 void CPDF_LinkExtract::ParseLink() {
2553   int start = 0, pos = 0;
2554   int TotalChar = m_pTextPage->CountChars();
2555   while (pos < TotalChar) {
2556     FPDF_CHAR_INFO pageChar;
2557     m_pTextPage->GetCharInfo(pos, &pageChar);
2558     if (pageChar.m_Flag == CHAR_GENERATED || pageChar.m_Unicode == 0x20 ||
2559         pos == TotalChar - 1) {
2560       int nCount = pos - start;
2561       if (pos == TotalChar - 1) {
2562         nCount++;
2563       }
2564       CFX_WideString strBeCheck;
2565       strBeCheck = m_pTextPage->GetPageText(start, nCount);
2566       if (strBeCheck.GetLength() > 5) {
2567         while (strBeCheck.GetLength() > 0) {
2568           FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1);
2569           if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') {
2570             strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1);
2571             nCount--;
2572           } else {
2573             break;
2574           }
2575         }
2576         if (nCount > 5 &&
2577             (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) {
2578           AppendToLinkList(start, nCount, strBeCheck);
2579         }
2580       }
2581       start = ++pos;
2582     } else {
2583       pos++;
2584     }
2585   }
2586 }
CheckWebLink(CFX_WideString & strBeCheck)2587 FX_BOOL CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {
2588   CFX_WideString str = strBeCheck;
2589   str.MakeLower();
2590   if (str.Find(L"http://www.") != -1) {
2591     strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www."));
2592     return TRUE;
2593   }
2594   if (str.Find(L"http://") != -1) {
2595     strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://"));
2596     return TRUE;
2597   }
2598   if (str.Find(L"https://www.") != -1) {
2599     strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www."));
2600     return TRUE;
2601   }
2602   if (str.Find(L"https://") != -1) {
2603     strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://"));
2604     return TRUE;
2605   }
2606   if (str.Find(L"www.") != -1) {
2607     strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));
2608     strBeCheck = L"http://" + strBeCheck;
2609     return TRUE;
2610   }
2611   return FALSE;
2612 }
CheckMailLink(CFX_WideString & str)2613 bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
2614   int aPos = str.Find(L'@');
2615   // Invalid when no '@'.
2616   if (aPos < 1) {
2617     return FALSE;
2618   }
2619 
2620   // Check the local part.
2621   int pPos = aPos;  // Used to track the position of '@' or '.'.
2622   for (int i = aPos - 1; i >= 0; i--) {
2623     FX_WCHAR ch = str.GetAt(i);
2624     if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch)) {
2625       continue;
2626     }
2627     if (ch != L'.' || i == pPos - 1 || i == 0) {
2628       if (i == aPos - 1) {
2629         // There is '.' or invalid char before '@'.
2630         return FALSE;
2631       }
2632       // End extracting for other invalid chars, '.' at the beginning, or
2633       // consecutive '.'.
2634       int removed_len = i == pPos - 1 ? i + 2 : i + 1;
2635       str = str.Right(str.GetLength() - removed_len);
2636       break;
2637     }
2638     // Found a valid '.'.
2639     pPos = i;
2640   }
2641 
2642   // Check the domain name part.
2643   aPos = str.Find(L'@');
2644   if (aPos < 1) {
2645     return FALSE;
2646   }
2647   str.TrimRight(L'.');
2648   // At least one '.' in domain name, but not at the beginning.
2649   // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
2650   // Check whether we should remove this check.
2651   int ePos = str.Find(L'.', aPos + 1);
2652   if (ePos == -1 || ePos == aPos + 1) {
2653     return FALSE;
2654   }
2655   // Validate all other chars in domain name.
2656   int nLen = str.GetLength();
2657   pPos = 0;  // Used to track the position of '.'.
2658   for (int i = aPos + 1; i < nLen; i++) {
2659     FX_WCHAR wch = str.GetAt(i);
2660     if (wch == L'-' || FXSYS_iswalnum(wch)) {
2661       continue;
2662     }
2663     if (wch != L'.' || i == pPos + 1) {
2664       // Domain name should end before invalid char.
2665       int host_end = i == pPos + 1 ? i - 2 : i - 1;
2666       if (pPos > 0 && host_end - aPos >= 3) {
2667         // Trim the ending invalid chars if there is at least one '.' and name.
2668         str = str.Left(host_end + 1);
2669         break;
2670       }
2671       return FALSE;
2672     }
2673     pPos = i;
2674   }
2675 
2676   if (str.Find(L"mailto:") == -1) {
2677     str = L"mailto:" + str;
2678   }
2679   return TRUE;
2680 }
2681 
AppendToLinkList(int start,int count,const CFX_WideString & strUrl)2682 void CPDF_LinkExtract::AppendToLinkList(int start,
2683                                         int count,
2684                                         const CFX_WideString& strUrl) {
2685   CPDF_LinkExt* linkInfo = new CPDF_LinkExt;
2686   linkInfo->m_strUrl = strUrl;
2687   linkInfo->m_Start = start;
2688   linkInfo->m_Count = count;
2689   m_LinkList.Add(linkInfo);
2690 }
2691 
GetURL(int index) const2692 CFX_WideString CPDF_LinkExtract::GetURL(int index) const {
2693   if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) {
2694     return L"";
2695   }
2696   CPDF_LinkExt* link = NULL;
2697   link = m_LinkList.GetAt(index);
2698   if (!link) {
2699     return L"";
2700   }
2701   return link->m_strUrl;
2702 }
GetBoundedSegment(int index,int & start,int & count) const2703 void CPDF_LinkExtract::GetBoundedSegment(int index,
2704                                          int& start,
2705                                          int& count) const {
2706   if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) {
2707     return;
2708   }
2709   CPDF_LinkExt* link = NULL;
2710   link = m_LinkList.GetAt(index);
2711   if (!link) {
2712     return;
2713   }
2714   start = link->m_Start;
2715   count = link->m_Count;
2716 }
GetRects(int index,CFX_RectArray & rects) const2717 void CPDF_LinkExtract::GetRects(int index, CFX_RectArray& rects) const {
2718   if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) {
2719     return;
2720   }
2721   CPDF_LinkExt* link = NULL;
2722   link = m_LinkList.GetAt(index);
2723   if (!link) {
2724     return;
2725   }
2726   m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects);
2727 }
2728