1 // Copyright 2016 The PDFium Authors 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #ifndef CORE_FPDFTEXT_CPDF_TEXTPAGE_H_ 8 #define CORE_FPDFTEXT_CPDF_TEXTPAGE_H_ 9 10 #include <stdint.h> 11 12 #include <deque> 13 #include <functional> 14 #include <optional> 15 #include <vector> 16 17 #include "core/fpdfapi/page/cpdf_pageobjectholder.h" 18 #include "core/fxcrt/data_vector.h" 19 #include "core/fxcrt/fx_coordinates.h" 20 #include "core/fxcrt/fx_memory_wrappers.h" 21 #include "core/fxcrt/unowned_ptr.h" 22 #include "core/fxcrt/widestring.h" 23 #include "core/fxcrt/widetext_buffer.h" 24 25 class CPDF_FormObject; 26 class CPDF_Page; 27 class CPDF_TextObject; 28 29 struct TextPageCharSegment { 30 int index; 31 int count; 32 }; 33 34 FX_DATA_PARTITION_EXCEPTION(TextPageCharSegment); 35 36 class CPDF_TextPage { 37 public: 38 enum class CharType : uint8_t { 39 kNormal, 40 kGenerated, 41 kNotUnicode, 42 kHyphen, 43 kPiece, 44 }; 45 46 class CharInfo { 47 public: 48 CharInfo(); 49 CharInfo(CharType char_type, 50 uint32_t char_code, 51 wchar_t unicode, 52 CFX_PointF origin, 53 CFX_FloatRect char_box, 54 CFX_Matrix matrix, 55 CPDF_TextObject* text_object); 56 CharInfo(const CharInfo&); 57 ~CharInfo(); 58 char_type()59 CharType char_type() const { return char_type_; } set_char_type(CharType char_type)60 void set_char_type(CharType char_type) { char_type_ = char_type; } 61 char_code()62 uint32_t char_code() const { return char_code_; } 63 unicode()64 wchar_t unicode() const { return unicode_; } set_unicode(wchar_t unicode)65 void set_unicode(wchar_t unicode) { unicode_ = unicode; } 66 origin()67 const CFX_PointF& origin() const { return origin_; } 68 char_box()69 const CFX_FloatRect& char_box() const { return char_box_; } 70 matrix()71 const CFX_Matrix& matrix() const { return matrix_; } 72 text_object()73 const CPDF_TextObject* text_object() const { return text_object_; } text_object()74 CPDF_TextObject* text_object() { return text_object_; } 75 76 private: 77 CharType char_type_ = CharType::kNormal; 78 wchar_t unicode_ = 0; // Above `char_code_` to potentially pack tighter. 79 uint32_t char_code_ = 0; 80 CFX_PointF origin_; 81 CFX_FloatRect char_box_; 82 CFX_Matrix matrix_; 83 UnownedPtr<CPDF_TextObject> text_object_; 84 }; 85 86 CPDF_TextPage(const CPDF_Page* pPage, bool rtl); 87 ~CPDF_TextPage(); 88 89 int CharIndexFromTextIndex(int text_index) const; 90 int TextIndexFromCharIndex(int char_index) const; size()91 size_t size() const { return m_CharList.size(); } 92 int CountChars() const; 93 94 // These methods CHECK() to make sure |index| is within bounds. 95 const CharInfo& GetCharInfo(size_t index) const; 96 CharInfo& GetCharInfo(size_t index); 97 float GetCharFontSize(size_t index) const; 98 CFX_FloatRect GetCharLooseBounds(size_t index) const; 99 100 std::vector<CFX_FloatRect> GetRectArray(int start, int count) const; 101 int GetIndexAtPos(const CFX_PointF& point, const CFX_SizeF& tolerance) const; 102 WideString GetTextByRect(const CFX_FloatRect& rect) const; 103 WideString GetTextByObject(const CPDF_TextObject* pTextObj) const; 104 105 // Returns string with the text from |m_TextBuf| that are covered by the input 106 // range. |start| and |count| are in terms of the |m_CharIndices|, so the 107 // range will be converted into appropriate indices. 108 WideString GetPageText(int start, int count) const; GetAllPageText()109 WideString GetAllPageText() const { return GetPageText(0, CountChars()); } 110 111 int CountRects(int start, int nCount); 112 bool GetRect(int rectIndex, CFX_FloatRect* pRect) const; 113 114 private: 115 enum class TextOrientation { 116 kUnknown, 117 kHorizontal, 118 kVertical, 119 }; 120 121 enum class GenerateCharacter { 122 kNone, 123 kSpace, 124 kLineBreak, 125 kHyphen, 126 }; 127 128 enum class MarkedContentState { kPass = 0, kDone, kDelay }; 129 130 struct TransformedTextObject { 131 TransformedTextObject(); 132 TransformedTextObject(const TransformedTextObject& that); 133 ~TransformedTextObject(); 134 135 UnownedPtr<CPDF_TextObject> m_pTextObj; 136 CFX_Matrix m_formMatrix; 137 }; 138 139 void Init(); 140 bool IsHyphen(wchar_t curChar) const; 141 void ProcessObject(); 142 void ProcessFormObject(CPDF_FormObject* pFormObj, 143 const CFX_Matrix& form_matrix); 144 void ProcessTextObject(const TransformedTextObject& obj); 145 void ProcessTextObject(CPDF_TextObject* pTextObj, 146 const CFX_Matrix& form_matrix, 147 const CPDF_PageObjectHolder* pObjList, 148 CPDF_PageObjectHolder::const_iterator ObjPos); 149 GenerateCharacter ProcessInsertObject(const CPDF_TextObject* pObj, 150 const CFX_Matrix& form_matrix); 151 // Returns whether to continue or not. 152 bool ProcessGenerateCharacter(GenerateCharacter type, 153 const CPDF_TextObject* text_object, 154 const CFX_Matrix& form_matrix); 155 void ProcessTextObjectItems(CPDF_TextObject* text_object, 156 const CFX_Matrix& form_matrix, 157 const CFX_Matrix& matrix); 158 const CharInfo* GetPrevCharInfo() const; 159 std::optional<CharInfo> GenerateCharInfo(wchar_t unicode, 160 const CFX_Matrix& form_matrix); 161 bool IsSameAsPreTextObject(CPDF_TextObject* pTextObj, 162 const CPDF_PageObjectHolder* pObjList, 163 CPDF_PageObjectHolder::const_iterator iter) const; 164 bool IsSameTextObject(CPDF_TextObject* pTextObj1, 165 CPDF_TextObject* pTextObj2) const; 166 void CloseTempLine(); 167 MarkedContentState PreMarkedContent(const CPDF_TextObject* pTextObj); 168 void ProcessMarkedContent(const TransformedTextObject& obj); 169 void FindPreviousTextObject(); 170 void AddCharInfoByLRDirection(wchar_t wChar, const CharInfo& info); 171 void AddCharInfoByRLDirection(wchar_t wChar, const CharInfo& info); 172 TextOrientation GetTextObjectWritingMode( 173 const CPDF_TextObject* pTextObj) const; 174 TextOrientation FindTextlineFlowOrientation() const; 175 void AppendGeneratedCharacter(wchar_t unicode, 176 const CFX_Matrix& form_matrix, 177 bool use_temp_buffer); 178 void SwapTempTextBuf(size_t iCharListStartAppend, size_t iBufStartAppend); 179 WideString GetTextByPredicate( 180 const std::function<bool(const CharInfo&)>& predicate) const; 181 182 UnownedPtr<const CPDF_Page> const m_pPage; 183 DataVector<TextPageCharSegment> m_CharIndices; 184 std::deque<CharInfo> m_CharList; 185 std::deque<CharInfo> m_TempCharList; 186 WideTextBuffer m_TextBuf; 187 WideTextBuffer m_TempTextBuf; 188 UnownedPtr<const CPDF_TextObject> m_pPrevTextObj; 189 CFX_Matrix m_PrevMatrix; 190 const bool m_rtl; 191 const CFX_Matrix m_DisplayMatrix; 192 std::vector<CFX_FloatRect> m_SelRects; 193 std::vector<TransformedTextObject> mTextObjects; 194 TextOrientation m_TextlineDir = TextOrientation::kUnknown; 195 CFX_FloatRect m_CurlineRect; 196 }; 197 198 #endif // CORE_FPDFTEXT_CPDF_TEXTPAGE_H_ 199