1 // Copyright 2016 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #ifndef CORE_FPDFTEXT_CPDF_TEXTPAGE_H_ 8 #define CORE_FPDFTEXT_CPDF_TEXTPAGE_H_ 9 10 #include <deque> 11 #include <functional> 12 #include <vector> 13 14 #include "core/fpdfapi/page/cpdf_pageobjectholder.h" 15 #include "core/fxcrt/cfx_widetextbuf.h" 16 #include "core/fxcrt/fx_coordinates.h" 17 #include "core/fxcrt/fx_string.h" 18 #include "core/fxcrt/unowned_ptr.h" 19 #include "third_party/base/optional.h" 20 21 class CPDF_Font; 22 class CPDF_FormObject; 23 class CPDF_Page; 24 class CPDF_TextObject; 25 26 struct PDFTEXT_Obj { 27 PDFTEXT_Obj(); 28 PDFTEXT_Obj(const PDFTEXT_Obj& that); 29 ~PDFTEXT_Obj(); 30 31 UnownedPtr<CPDF_TextObject> m_pTextObj; 32 CFX_Matrix m_formMatrix; 33 }; 34 35 class CPDF_TextPage { 36 public: 37 enum class CharType : uint8_t { 38 kNormal, 39 kGenerated, 40 kNotUnicode, 41 kHyphen, 42 kPiece, 43 }; 44 45 class CharInfo { 46 public: 47 CharInfo(); 48 CharInfo(const CharInfo&); 49 ~CharInfo(); 50 51 int m_Index = 0; 52 uint32_t m_CharCode = 0; 53 wchar_t m_Unicode = 0; 54 CharType m_CharType = CharType::kNormal; 55 CFX_PointF m_Origin; 56 CFX_FloatRect m_CharBox; 57 UnownedPtr<CPDF_TextObject> m_pTextObj; 58 CFX_Matrix m_Matrix; 59 }; 60 61 CPDF_TextPage(const CPDF_Page* pPage, bool rtl); 62 ~CPDF_TextPage(); 63 64 int CharIndexFromTextIndex(int text_index) const; 65 int TextIndexFromCharIndex(int char_index) const; size()66 size_t size() const { return m_CharList.size(); } 67 int CountChars() const; 68 69 // These methods CHECK() to make sure |index| is within bounds. 70 const CharInfo& GetCharInfo(size_t index) const; 71 float GetCharFontSize(size_t index) const; 72 73 std::vector<CFX_FloatRect> GetRectArray(int start, int nCount) const; 74 int GetIndexAtPos(const CFX_PointF& point, const CFX_SizeF& tolerance) const; 75 WideString GetTextByRect(const CFX_FloatRect& rect) const; 76 WideString GetTextByObject(const CPDF_TextObject* pTextObj) const; 77 78 // Returns string with the text from |m_TextBuf| that are covered by the input 79 // range. |start| and |count| are in terms of the |m_CharIndices|, so the 80 // range will be converted into appropriate indices. 81 WideString GetPageText(int start, int count) const; GetAllPageText()82 WideString GetAllPageText() const { return GetPageText(0, CountChars()); } 83 84 int CountRects(int start, int nCount); 85 bool GetRect(int rectIndex, CFX_FloatRect* pRect) const; 86 87 private: 88 enum class TextOrientation { 89 kUnknown, 90 kHorizontal, 91 kVertical, 92 }; 93 94 enum class GenerateCharacter { 95 kNone, 96 kSpace, 97 kLineBreak, 98 kHyphen, 99 }; 100 101 enum class MarkedContentState { kPass = 0, kDone, kDelay }; 102 103 void Init(); 104 bool IsHyphen(wchar_t curChar) const; 105 void ProcessObject(); 106 void ProcessFormObject(CPDF_FormObject* pFormObj, 107 const CFX_Matrix& formMatrix); 108 void ProcessTextObject(PDFTEXT_Obj pObj); 109 void ProcessTextObject(CPDF_TextObject* pTextObj, 110 const CFX_Matrix& formMatrix, 111 const CPDF_PageObjectHolder* pObjList, 112 CPDF_PageObjectHolder::const_iterator ObjPos); 113 GenerateCharacter ProcessInsertObject(const CPDF_TextObject* pObj, 114 const CFX_Matrix& formMatrix); 115 const CharInfo* GetPrevCharInfo() const; 116 Optional<CharInfo> GenerateCharInfo(wchar_t unicode); 117 bool IsSameAsPreTextObject(CPDF_TextObject* pTextObj, 118 const CPDF_PageObjectHolder* pObjList, 119 CPDF_PageObjectHolder::const_iterator iter) const; 120 bool IsSameTextObject(CPDF_TextObject* pTextObj1, 121 CPDF_TextObject* pTextObj2) const; 122 void CloseTempLine(); 123 MarkedContentState PreMarkedContent(PDFTEXT_Obj pObj); 124 void ProcessMarkedContent(PDFTEXT_Obj pObj); 125 void FindPreviousTextObject(); 126 void AddCharInfoByLRDirection(wchar_t wChar, const CharInfo& info); 127 void AddCharInfoByRLDirection(wchar_t wChar, const CharInfo& info); 128 TextOrientation GetTextObjectWritingMode( 129 const CPDF_TextObject* pTextObj) const; 130 TextOrientation FindTextlineFlowOrientation() const; 131 void AppendGeneratedCharacter(wchar_t unicode, const CFX_Matrix& formMatrix); 132 void SwapTempTextBuf(int32_t iCharListStartAppend, int32_t iBufStartAppend); 133 WideString GetTextByPredicate( 134 const std::function<bool(const CharInfo&)>& predicate) const; 135 136 UnownedPtr<const CPDF_Page> const m_pPage; 137 std::vector<uint16_t> m_CharIndices; 138 std::deque<CharInfo> m_CharList; 139 std::deque<CharInfo> m_TempCharList; 140 CFX_WideTextBuf m_TextBuf; 141 CFX_WideTextBuf m_TempTextBuf; 142 UnownedPtr<CPDF_TextObject> m_pPrevTextObj; 143 CFX_Matrix m_PrevMatrix; 144 const bool m_rtl; 145 const CFX_Matrix m_DisplayMatrix; 146 std::vector<CFX_FloatRect> m_SelRects; 147 std::vector<PDFTEXT_Obj> m_LineObj; 148 TextOrientation m_TextlineDir = TextOrientation::kUnknown; 149 CFX_FloatRect m_CurlineRect; 150 }; 151 152 #endif // CORE_FPDFTEXT_CPDF_TEXTPAGE_H_ 153