1 // Copyright 2016 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #ifndef CORE_FPDFTEXT_CPDF_TEXTPAGE_H_ 8 #define CORE_FPDFTEXT_CPDF_TEXTPAGE_H_ 9 10 #include <deque> 11 #include <vector> 12 13 #include "core/fpdfapi/page/cpdf_pageobjectlist.h" 14 #include "core/fxcrt/fx_basic.h" 15 #include "core/fxcrt/fx_coordinates.h" 16 #include "core/fxcrt/fx_string.h" 17 18 class CPDF_Font; 19 class CPDF_FormObject; 20 class CPDF_Page; 21 class CPDF_TextObject; 22 23 #define FPDFTEXT_MATCHCASE 0x00000001 24 #define FPDFTEXT_MATCHWHOLEWORD 0x00000002 25 #define FPDFTEXT_CONSECUTIVE 0x00000004 26 27 #define FPDFTEXT_CHAR_ERROR -1 28 #define FPDFTEXT_CHAR_NORMAL 0 29 #define FPDFTEXT_CHAR_GENERATED 1 30 #define FPDFTEXT_CHAR_UNUNICODE 2 31 #define FPDFTEXT_CHAR_HYPHEN 3 32 #define FPDFTEXT_CHAR_PIECE 4 33 34 #define TEXT_SPACE_CHAR L' ' 35 #define TEXT_LINEFEED_CHAR L'\n' 36 #define TEXT_RETURN_CHAR L'\r' 37 #define TEXT_EMPTY L"" 38 #define TEXT_SPACE L" " 39 #define TEXT_RETURN_LINEFEED L"\r\n" 40 #define TEXT_LINEFEED L"\n" 41 #define TEXT_CHARRATIO_GAPDELTA 0.070 42 43 enum class FPDFText_MarkedContent { Pass = 0, Done, Delay }; 44 45 enum class FPDFText_Direction { Left = -1, Right = 1 }; 46 47 class FPDF_CHAR_INFO { 48 public: 49 FPDF_CHAR_INFO(); 50 ~FPDF_CHAR_INFO(); 51 52 FX_WCHAR m_Unicode; 53 FX_WCHAR m_Charcode; 54 int32_t m_Flag; 55 FX_FLOAT m_FontSize; 56 CFX_PointF m_Origin; 57 CFX_FloatRect m_CharBox; 58 CPDF_TextObject* m_pTextObj; 59 CFX_Matrix m_Matrix; 60 }; 61 62 struct FPDF_SEGMENT { 63 int m_Start; 64 int m_nCount; 65 }; 66 67 class PAGECHAR_INFO { 68 public: 69 PAGECHAR_INFO(); 70 PAGECHAR_INFO(const PAGECHAR_INFO&); 71 ~PAGECHAR_INFO(); 72 73 int m_Index; 74 int m_CharCode; 75 FX_WCHAR m_Unicode; 76 int32_t m_Flag; 77 CFX_PointF m_Origin; 78 CFX_FloatRect m_CharBox; 79 CPDF_TextObject* m_pTextObj; 80 CFX_Matrix m_Matrix; 81 }; 82 83 struct PDFTEXT_Obj { 84 CPDF_TextObject* m_pTextObj; 85 CFX_Matrix m_formMatrix; 86 }; 87 88 class CPDF_TextPage { 89 public: 90 CPDF_TextPage(const CPDF_Page* pPage, FPDFText_Direction flags); 91 ~CPDF_TextPage(); 92 93 // IPDF_TextPage: 94 void ParseTextPage(); IsParsed()95 bool IsParsed() const { return m_bIsParsed; } 96 int CharIndexFromTextIndex(int TextIndex) const; 97 int TextIndexFromCharIndex(int CharIndex) const; 98 int CountChars() const; 99 void GetCharInfo(int index, FPDF_CHAR_INFO* info) const; 100 std::vector<CFX_FloatRect> GetRectArray(int start, int nCount) const; 101 int GetIndexAtPos(const CFX_PointF& point, const CFX_SizeF& tolerance) const; 102 CFX_WideString GetTextByRect(const CFX_FloatRect& rect) const; 103 CFX_WideString GetPageText(int start = 0, int nCount = -1) const; 104 int CountRects(int start, int nCount); 105 void GetRect(int rectIndex, 106 FX_FLOAT& left, 107 FX_FLOAT& top, 108 FX_FLOAT& right, 109 FX_FLOAT& bottom) const; 110 111 static bool IsRectIntersect(const CFX_FloatRect& rect1, 112 const CFX_FloatRect& rect2); 113 114 private: 115 enum class TextOrientation { 116 Unknown, 117 Horizontal, 118 Vertical, 119 }; 120 121 enum class GenerateCharacter { 122 None, 123 Space, 124 LineBreak, 125 Hyphen, 126 }; 127 128 bool IsHyphen(FX_WCHAR curChar); 129 bool IsControlChar(const PAGECHAR_INFO& charInfo); 130 void ProcessObject(); 131 void ProcessFormObject(CPDF_FormObject* pFormObj, 132 const CFX_Matrix& formMatrix); 133 void ProcessTextObject(PDFTEXT_Obj pObj); 134 void ProcessTextObject(CPDF_TextObject* pTextObj, 135 const CFX_Matrix& formMatrix, 136 const CPDF_PageObjectList* pObjList, 137 CPDF_PageObjectList::const_iterator ObjPos); 138 GenerateCharacter ProcessInsertObject(const CPDF_TextObject* pObj, 139 const CFX_Matrix& formMatrix); 140 bool GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info); 141 bool IsSameAsPreTextObject(CPDF_TextObject* pTextObj, 142 const CPDF_PageObjectList* pObjList, 143 CPDF_PageObjectList::const_iterator ObjPos); 144 bool IsSameTextObject(CPDF_TextObject* pTextObj1, CPDF_TextObject* pTextObj2); 145 int GetCharWidth(uint32_t charCode, CPDF_Font* pFont) const; 146 void CloseTempLine(); 147 FPDFText_MarkedContent PreMarkedContent(PDFTEXT_Obj pObj); 148 void ProcessMarkedContent(PDFTEXT_Obj pObj); 149 void CheckMarkedContentObject(int32_t& start, int32_t& nCount) const; 150 void FindPreviousTextObject(); 151 void AddCharInfoByLRDirection(FX_WCHAR wChar, PAGECHAR_INFO info); 152 void AddCharInfoByRLDirection(FX_WCHAR wChar, PAGECHAR_INFO info); 153 TextOrientation GetTextObjectWritingMode( 154 const CPDF_TextObject* pTextObj) const; 155 TextOrientation FindTextlineFlowOrientation() const; 156 void AppendGeneratedCharacter(FX_WCHAR unicode, const CFX_Matrix& formMatrix); 157 158 void SwapTempTextBuf(int32_t iCharListStartAppend, int32_t iBufStartAppend); 159 bool IsRightToLeft(const CPDF_TextObject* pTextObj, 160 const CPDF_Font* pFont, 161 int nItems) const; 162 163 const CPDF_Page* const m_pPage; 164 std::vector<uint16_t> m_CharIndex; 165 std::deque<PAGECHAR_INFO> m_CharList; 166 std::deque<PAGECHAR_INFO> m_TempCharList; 167 CFX_WideTextBuf m_TextBuf; 168 CFX_WideTextBuf m_TempTextBuf; 169 const FPDFText_Direction m_parserflag; 170 CPDF_TextObject* m_pPreTextObj; 171 CFX_Matrix m_perMatrix; 172 bool m_bIsParsed; 173 CFX_Matrix m_DisplayMatrix; 174 std::vector<CFX_FloatRect> m_SelRects; 175 std::vector<PDFTEXT_Obj> m_LineObj; 176 TextOrientation m_TextlineDir; 177 CFX_FloatRect m_CurlineRect; 178 }; 179 180 #endif // CORE_FPDFTEXT_CPDF_TEXTPAGE_H_ 181