1 // Copyright 2016 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #ifndef CORE_FPDFTEXT_CPDF_TEXTPAGE_H_ 8 #define CORE_FPDFTEXT_CPDF_TEXTPAGE_H_ 9 10 #include <deque> 11 #include <vector> 12 13 #include "core/fpdfapi/page/cpdf_pageobjectlist.h" 14 #include "core/fxcrt/cfx_widetextbuf.h" 15 #include "core/fxcrt/fx_coordinates.h" 16 #include "core/fxcrt/fx_string.h" 17 #include "core/fxcrt/unowned_ptr.h" 18 19 class CPDF_Font; 20 class CPDF_FormObject; 21 class CPDF_Page; 22 class CPDF_TextObject; 23 24 #define FPDFTEXT_MATCHCASE 0x00000001 25 #define FPDFTEXT_MATCHWHOLEWORD 0x00000002 26 #define FPDFTEXT_CONSECUTIVE 0x00000004 27 28 #define FPDFTEXT_CHAR_NORMAL 0 29 #define FPDFTEXT_CHAR_GENERATED 1 30 #define FPDFTEXT_CHAR_UNUNICODE 2 31 #define FPDFTEXT_CHAR_HYPHEN 3 32 #define FPDFTEXT_CHAR_PIECE 4 33 34 #define TEXT_SPACE_CHAR L' ' 35 #define TEXT_LINEFEED_CHAR L'\n' 36 #define TEXT_RETURN_CHAR L'\r' 37 #define TEXT_HYPHEN_CHAR L'-' 38 #define TEXT_EMPTY L"" 39 #define TEXT_HYPHEN L"-" 40 #define TEXT_CHARRATIO_GAPDELTA 0.070 41 42 enum class FPDFText_MarkedContent { Pass = 0, Done, Delay }; 43 44 enum class FPDFText_Direction { Left = -1, Right = 1 }; 45 46 class FPDF_CHAR_INFO { 47 public: 48 FPDF_CHAR_INFO(); 49 ~FPDF_CHAR_INFO(); 50 51 wchar_t m_Unicode; 52 wchar_t m_Charcode; 53 int32_t m_Flag; 54 float m_FontSize; 55 CFX_PointF m_Origin; 56 CFX_FloatRect m_CharBox; 57 UnownedPtr<CPDF_TextObject> m_pTextObj; 58 CFX_Matrix m_Matrix; 59 }; 60 61 struct FPDF_SEGMENT { 62 int m_Start; 63 int m_nCount; 64 }; 65 66 class PAGECHAR_INFO { 67 public: 68 PAGECHAR_INFO(); 69 PAGECHAR_INFO(const PAGECHAR_INFO&); 70 ~PAGECHAR_INFO(); 71 72 int m_Index; 73 int m_CharCode; 74 wchar_t m_Unicode; 75 int32_t m_Flag; 76 CFX_PointF m_Origin; 77 CFX_FloatRect m_CharBox; 78 UnownedPtr<CPDF_TextObject> m_pTextObj; 79 CFX_Matrix m_Matrix; 80 }; 81 82 struct PDFTEXT_Obj { 83 PDFTEXT_Obj(); 84 PDFTEXT_Obj(const PDFTEXT_Obj& that); 85 ~PDFTEXT_Obj(); 86 87 UnownedPtr<CPDF_TextObject> m_pTextObj; 88 CFX_Matrix m_formMatrix; 89 }; 90 91 class CPDF_TextPage { 92 public: 93 CPDF_TextPage(const CPDF_Page* pPage, FPDFText_Direction flags); 94 ~CPDF_TextPage(); 95 96 // IPDF_TextPage: 97 void ParseTextPage(); IsParsed()98 bool IsParsed() const { return m_bIsParsed; } 99 int CharIndexFromTextIndex(int TextIndex) const; 100 int TextIndexFromCharIndex(int CharIndex) const; 101 int CountChars() const; 102 void GetCharInfo(int index, FPDF_CHAR_INFO* info) const; 103 std::vector<CFX_FloatRect> GetRectArray(int start, int nCount) const; 104 int GetIndexAtPos(const CFX_PointF& point, const CFX_SizeF& tolerance) const; 105 WideString GetTextByRect(const CFX_FloatRect& rect) const; 106 107 // Returns string with the text from |m_TextBuf| that are covered by the input 108 // range. |start| and |count| are in terms of the m_CharIndex, so the range 109 // will be converted into appropriate indices. 110 WideString GetPageText(int start, int count) const; GetAllPageText()111 WideString GetAllPageText() const { return GetPageText(0, CountChars()); } 112 113 int CountRects(int start, int nCount); 114 bool GetRect(int rectIndex, CFX_FloatRect* pRect) const; 115 116 static bool IsRectIntersect(const CFX_FloatRect& rect1, 117 const CFX_FloatRect& rect2); 118 119 private: 120 enum class TextOrientation { 121 Unknown, 122 Horizontal, 123 Vertical, 124 }; 125 126 enum class GenerateCharacter { 127 None, 128 Space, 129 LineBreak, 130 Hyphen, 131 }; 132 133 bool IsHyphen(wchar_t curChar) const; 134 bool IsControlChar(const PAGECHAR_INFO& charInfo); 135 void ProcessObject(); 136 void ProcessFormObject(CPDF_FormObject* pFormObj, 137 const CFX_Matrix& formMatrix); 138 void ProcessTextObject(PDFTEXT_Obj pObj); 139 void ProcessTextObject(CPDF_TextObject* pTextObj, 140 const CFX_Matrix& formMatrix, 141 const CPDF_PageObjectList* pObjList, 142 CPDF_PageObjectList::const_iterator ObjPos); 143 GenerateCharacter ProcessInsertObject(const CPDF_TextObject* pObj, 144 const CFX_Matrix& formMatrix); 145 bool GenerateCharInfo(wchar_t unicode, PAGECHAR_INFO& info); 146 bool IsSameAsPreTextObject(CPDF_TextObject* pTextObj, 147 const CPDF_PageObjectList* pObjList, 148 CPDF_PageObjectList::const_iterator ObjPos); 149 bool IsSameTextObject(CPDF_TextObject* pTextObj1, CPDF_TextObject* pTextObj2); 150 int GetCharWidth(uint32_t charCode, CPDF_Font* pFont) const; 151 void CloseTempLine(); 152 FPDFText_MarkedContent PreMarkedContent(PDFTEXT_Obj pObj); 153 void ProcessMarkedContent(PDFTEXT_Obj pObj); 154 void CheckMarkedContentObject(int32_t& start, int32_t& nCount) const; 155 void FindPreviousTextObject(); 156 void AddCharInfoByLRDirection(wchar_t wChar, PAGECHAR_INFO info); 157 void AddCharInfoByRLDirection(wchar_t wChar, PAGECHAR_INFO info); 158 TextOrientation GetTextObjectWritingMode( 159 const CPDF_TextObject* pTextObj) const; 160 TextOrientation FindTextlineFlowOrientation() const; 161 void AppendGeneratedCharacter(wchar_t unicode, const CFX_Matrix& formMatrix); 162 163 void SwapTempTextBuf(int32_t iCharListStartAppend, int32_t iBufStartAppend); 164 bool IsRightToLeft(const CPDF_TextObject* pTextObj, 165 const CPDF_Font* pFont, 166 size_t nItems) const; 167 168 UnownedPtr<const CPDF_Page> const m_pPage; 169 std::vector<uint16_t> m_CharIndex; 170 std::deque<PAGECHAR_INFO> m_CharList; 171 std::deque<PAGECHAR_INFO> m_TempCharList; 172 CFX_WideTextBuf m_TextBuf; 173 CFX_WideTextBuf m_TempTextBuf; 174 const FPDFText_Direction m_parserflag; 175 UnownedPtr<CPDF_TextObject> m_pPreTextObj; 176 CFX_Matrix m_perMatrix; 177 bool m_bIsParsed; 178 CFX_Matrix m_DisplayMatrix; 179 std::vector<CFX_FloatRect> m_SelRects; 180 std::vector<PDFTEXT_Obj> m_LineObj; 181 TextOrientation m_TextlineDir; 182 CFX_FloatRect m_CurlineRect; 183 }; 184 185 #endif // CORE_FPDFTEXT_CPDF_TEXTPAGE_H_ 186