// Copyright 2016 The PDFium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #ifndef CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
8 #define CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
9 
10 #include <stdint.h>
11 
12 #include <deque>
13 #include <functional>
14 #include <optional>
15 #include <vector>
16 
17 #include "core/fpdfapi/page/cpdf_pageobjectholder.h"
18 #include "core/fxcrt/data_vector.h"
19 #include "core/fxcrt/fx_coordinates.h"
20 #include "core/fxcrt/fx_memory_wrappers.h"
21 #include "core/fxcrt/unowned_ptr.h"
22 #include "core/fxcrt/widestring.h"
23 #include "core/fxcrt/widetext_buffer.h"
24 
// Forward declarations: these types are only used through pointers and
// references in this header, so their full definitions are not required here.
class CPDF_FormObject;
class CPDF_Page;
class CPDF_TextObject;
28 
// Plain (index, count) pair; the element type of CPDF_TextPage::m_CharIndices.
// NOTE(review): presumably describes a run of `count` consecutive characters
// beginning at `index` -- confirm against the implementation file.
struct TextPageCharSegment {
  int index;  // Presumably the first position of the segment.
  int count;  // Presumably the number of entries in the segment.
};
33 
34 FX_DATA_PARTITION_EXCEPTION(TextPageCharSegment);
35 
36 class CPDF_TextPage {
37  public:
38   enum class CharType : uint8_t {
39     kNormal,
40     kGenerated,
41     kNotUnicode,
42     kHyphen,
43     kPiece,
44   };
45 
46   class CharInfo {
47    public:
48     CharInfo();
49     CharInfo(CharType char_type,
50              uint32_t char_code,
51              wchar_t unicode,
52              CFX_PointF origin,
53              CFX_FloatRect char_box,
54              CFX_Matrix matrix,
55              CPDF_TextObject* text_object);
56     CharInfo(const CharInfo&);
57     ~CharInfo();
58 
char_type()59     CharType char_type() const { return char_type_; }
set_char_type(CharType char_type)60     void set_char_type(CharType char_type) { char_type_ = char_type; }
61 
char_code()62     uint32_t char_code() const { return char_code_; }
63 
unicode()64     wchar_t unicode() const { return unicode_; }
set_unicode(wchar_t unicode)65     void set_unicode(wchar_t unicode) { unicode_ = unicode; }
66 
origin()67     const CFX_PointF& origin() const { return origin_; }
68 
char_box()69     const CFX_FloatRect& char_box() const { return char_box_; }
70 
matrix()71     const CFX_Matrix& matrix() const { return matrix_; }
72 
text_object()73     const CPDF_TextObject* text_object() const { return text_object_; }
text_object()74     CPDF_TextObject* text_object() { return text_object_; }
75 
76    private:
77     CharType char_type_ = CharType::kNormal;
78     wchar_t unicode_ = 0;  // Above `char_code_` to potentially pack tighter.
79     uint32_t char_code_ = 0;
80     CFX_PointF origin_;
81     CFX_FloatRect char_box_;
82     CFX_Matrix matrix_;
83     UnownedPtr<CPDF_TextObject> text_object_;
84   };
85 
86   CPDF_TextPage(const CPDF_Page* pPage, bool rtl);
87   ~CPDF_TextPage();
88 
89   int CharIndexFromTextIndex(int text_index) const;
90   int TextIndexFromCharIndex(int char_index) const;
size()91   size_t size() const { return m_CharList.size(); }
92   int CountChars() const;
93 
94   // These methods CHECK() to make sure |index| is within bounds.
95   const CharInfo& GetCharInfo(size_t index) const;
96   CharInfo& GetCharInfo(size_t index);
97   float GetCharFontSize(size_t index) const;
98   CFX_FloatRect GetCharLooseBounds(size_t index) const;
99 
100   std::vector<CFX_FloatRect> GetRectArray(int start, int count) const;
101   int GetIndexAtPos(const CFX_PointF& point, const CFX_SizeF& tolerance) const;
102   WideString GetTextByRect(const CFX_FloatRect& rect) const;
103   WideString GetTextByObject(const CPDF_TextObject* pTextObj) const;
104 
105   // Returns string with the text from |m_TextBuf| that are covered by the input
106   // range. |start| and |count| are in terms of the |m_CharIndices|, so the
107   // range will be converted into appropriate indices.
108   WideString GetPageText(int start, int count) const;
GetAllPageText()109   WideString GetAllPageText() const { return GetPageText(0, CountChars()); }
110 
111   int CountRects(int start, int nCount);
112   bool GetRect(int rectIndex, CFX_FloatRect* pRect) const;
113 
114  private:
115   enum class TextOrientation {
116     kUnknown,
117     kHorizontal,
118     kVertical,
119   };
120 
121   enum class GenerateCharacter {
122     kNone,
123     kSpace,
124     kLineBreak,
125     kHyphen,
126   };
127 
128   enum class MarkedContentState { kPass = 0, kDone, kDelay };
129 
130   struct TransformedTextObject {
131     TransformedTextObject();
132     TransformedTextObject(const TransformedTextObject& that);
133     ~TransformedTextObject();
134 
135     UnownedPtr<CPDF_TextObject> m_pTextObj;
136     CFX_Matrix m_formMatrix;
137   };
138 
139   void Init();
140   bool IsHyphen(wchar_t curChar) const;
141   void ProcessObject();
142   void ProcessFormObject(CPDF_FormObject* pFormObj,
143                          const CFX_Matrix& form_matrix);
144   void ProcessTextObject(const TransformedTextObject& obj);
145   void ProcessTextObject(CPDF_TextObject* pTextObj,
146                          const CFX_Matrix& form_matrix,
147                          const CPDF_PageObjectHolder* pObjList,
148                          CPDF_PageObjectHolder::const_iterator ObjPos);
149   GenerateCharacter ProcessInsertObject(const CPDF_TextObject* pObj,
150                                         const CFX_Matrix& form_matrix);
151   // Returns whether to continue or not.
152   bool ProcessGenerateCharacter(GenerateCharacter type,
153                                 const CPDF_TextObject* text_object,
154                                 const CFX_Matrix& form_matrix);
155   void ProcessTextObjectItems(CPDF_TextObject* text_object,
156                               const CFX_Matrix& form_matrix,
157                               const CFX_Matrix& matrix);
158   const CharInfo* GetPrevCharInfo() const;
159   std::optional<CharInfo> GenerateCharInfo(wchar_t unicode,
160                                            const CFX_Matrix& form_matrix);
161   bool IsSameAsPreTextObject(CPDF_TextObject* pTextObj,
162                              const CPDF_PageObjectHolder* pObjList,
163                              CPDF_PageObjectHolder::const_iterator iter) const;
164   bool IsSameTextObject(CPDF_TextObject* pTextObj1,
165                         CPDF_TextObject* pTextObj2) const;
166   void CloseTempLine();
167   MarkedContentState PreMarkedContent(const CPDF_TextObject* pTextObj);
168   void ProcessMarkedContent(const TransformedTextObject& obj);
169   void FindPreviousTextObject();
170   void AddCharInfoByLRDirection(wchar_t wChar, const CharInfo& info);
171   void AddCharInfoByRLDirection(wchar_t wChar, const CharInfo& info);
172   TextOrientation GetTextObjectWritingMode(
173       const CPDF_TextObject* pTextObj) const;
174   TextOrientation FindTextlineFlowOrientation() const;
175   void AppendGeneratedCharacter(wchar_t unicode,
176                                 const CFX_Matrix& form_matrix,
177                                 bool use_temp_buffer);
178   void SwapTempTextBuf(size_t iCharListStartAppend, size_t iBufStartAppend);
179   WideString GetTextByPredicate(
180       const std::function<bool(const CharInfo&)>& predicate) const;
181 
182   UnownedPtr<const CPDF_Page> const m_pPage;
183   DataVector<TextPageCharSegment> m_CharIndices;
184   std::deque<CharInfo> m_CharList;
185   std::deque<CharInfo> m_TempCharList;
186   WideTextBuffer m_TextBuf;
187   WideTextBuffer m_TempTextBuf;
188   UnownedPtr<const CPDF_TextObject> m_pPrevTextObj;
189   CFX_Matrix m_PrevMatrix;
190   const bool m_rtl;
191   const CFX_Matrix m_DisplayMatrix;
192   std::vector<CFX_FloatRect> m_SelRects;
193   std::vector<TransformedTextObject> mTextObjects;
194   TextOrientation m_TextlineDir = TextOrientation::kUnknown;
195   CFX_FloatRect m_CurlineRect;
196 };
197 
198 #endif  // CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
199