• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #ifndef _PDF_TEXT_INT_H_
8 #define _PDF_TEXT_INT_H_
9 class CPDF_TextParseOptions : public CFX_Object
10 {
11 public:
12     CPDF_TextParseOptions();
13     FX_BOOL			m_bCheckObjectOrder;
14     FX_BOOL			m_bCheckDirection;
15     int				m_nCheckSameObject;
16 };
// Forward declarations (alphabetical). CPDF_DocProgressiveSearch is only
// named here; its definition is not visible in this header.
class CPDF_DocProgressiveSearch;
class CPDF_LinkExtract;
class CPDF_TextPage;
class CPDF_TextPageFind;
// FPDFTEXT_CHAR_*: character classification codes.
// NOTE(review): presumably the values stored in PAGECHAR_INFO::m_Flag --
// confirm against the parser implementation.
#define FPDFTEXT_CHAR_ERROR     -1
#define FPDFTEXT_CHAR_NORMAL    0
#define FPDFTEXT_CHAR_GENERATED 1
#define FPDFTEXT_CHAR_UNUNICODE 2
#define FPDFTEXT_CHAR_HYPHEN    3
#define FPDFTEXT_CHAR_PIECE     4

// FPDFTEXT_MC_*: marked-content status codes.
// NOTE(review): presumably the result of CPDF_TextPage::PreMarkedContent() --
// confirm against the implementation.
#define FPDFTEXT_MC_PASS        0
#define FPDFTEXT_MC_DONE        1
#define FPDFTEXT_MC_DELAY       2
30 typedef struct _PAGECHAR_INFO: public CFX_Object {
31     int					m_CharCode;
32     FX_WCHAR			m_Unicode;
33     FX_FLOAT			m_OriginX;
34     FX_FLOAT			m_OriginY;
35     FX_INT32			m_Flag;
36     CFX_FloatRect		m_CharBox;
37     CPDF_TextObject*	m_pTextObj;
38     CFX_AffineMatrix	m_Matrix;
39     int					m_Index;
40 } PAGECHAR_INFO;
41 typedef	CFX_SegmentedArray<PAGECHAR_INFO> PAGECHAR_InfoArray;
// A run described by a start value and a count.
struct FPDF_SEGMENT {
    int m_Start;
    int m_nCount;
};
46 typedef CFX_ArrayTemplate<FPDF_SEGMENT> SEGMENT_Array;
47 typedef struct {
48     CPDF_TextObject*	m_pTextObj;
49     CFX_AffineMatrix	m_formMatrix;
50 } PDFTEXT_Obj;
51 typedef CFX_ArrayTemplate<PDFTEXT_Obj> LINEOBJ;
52 class CPDF_TextPage: public IPDF_TextPage
53 {
54 public:
55     CPDF_TextPage(const CPDF_Page* pPage, int flags = 0);
56     CPDF_TextPage(const CPDF_PageObjects* pPage, int flags = 0);
57     CPDF_TextPage(const CPDF_Page* pPage, CPDFText_ParseOptions ParserOptions);
58     virtual FX_BOOL					ParseTextPage();
59     virtual void					NormalizeObjects(FX_BOOL bNormalize);
IsParsered()60     virtual	FX_BOOL					IsParsered() const
61     {
62         return m_IsParsered;
63     }
~CPDF_TextPage()64     virtual ~CPDF_TextPage() {};
65 public:
66     virtual int CharIndexFromTextIndex(int TextIndex)const ;
67     virtual int TextIndexFromCharIndex(int CharIndex)const;
68     virtual int						CountChars() const;
69     virtual	void					GetCharInfo(int index, FPDF_CHAR_INFO & info) const;
70     virtual void					GetRectArray(int start, int nCount, CFX_RectArray& rectArray) const;
71     virtual int						GetIndexAtPos(CPDF_Point point, FX_FLOAT xTorelance, FX_FLOAT yTorelance) const;
72     virtual int						GetIndexAtPos(FX_FLOAT x, FX_FLOAT y, FX_FLOAT xTorelance,
73             FX_FLOAT yTorelance) const;
74     virtual CFX_WideString			GetTextByRect(CFX_FloatRect rect) const;
75     virtual void					GetRectsArrayByRect(CFX_FloatRect rect, CFX_RectArray& resRectArray) const;
76     virtual	int						GetOrderByDirection(int order, int direction) const;
77     virtual	CFX_WideString			GetPageText(int start = 0, int nCount = -1) const;
78 
79     virtual int						CountRects(int start, int nCount);
80     virtual	void					GetRect(int rectIndex, FX_FLOAT& left, FX_FLOAT& top
81                                             , FX_FLOAT& right, FX_FLOAT &bottom) const;
82     virtual FX_BOOL					GetBaselineRotate(int rectIndex, int& Rotate);
83     virtual FX_BOOL					GetBaselineRotate(CFX_FloatRect rect, int& Rotate);
84     virtual	int						CountBoundedSegments(FX_FLOAT left, FX_FLOAT top,
85             FX_FLOAT right, FX_FLOAT bottom, FX_BOOL bContains = FALSE);
86     virtual	void					GetBoundedSegment(int index, int& start, int& count) const;
87     virtual int						GetWordBreak(int index, int direction) const;
88 public:
GetCharList()89     const	PAGECHAR_InfoArray*		GetCharList() const
90     {
91         return &m_charList;
92     }
93     static	FX_BOOL					IsRectIntersect(CFX_FloatRect rect1, CFX_FloatRect rect2);
94     static	FX_BOOL					IsLetter(FX_WCHAR unicode);
95 private:
96     FX_BOOL							IsHyphen(FX_WCHAR curChar);
97     FX_BOOL							IsControlChar(PAGECHAR_INFO* pCharInfo);
98     FX_BOOL							GetBaselineRotate(int start, int end, int& Rotate);
99     void							ProcessObject();
100     void							ProcessFormObject(CPDF_FormObject*	pFormObj, CFX_AffineMatrix formMatrix);
101     void							ProcessTextObject(PDFTEXT_Obj pObj);
102     void							ProcessTextObject(CPDF_TextObject*	pTextObj, CFX_AffineMatrix formMatrix, FX_POSITION ObjPos);
103     int								ProcessInsertObject(const CPDF_TextObject* pObj, CFX_AffineMatrix formMatrix);
104     FX_BOOL							GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info);
105     FX_BOOL							IsSameAsPreTextObject(CPDF_TextObject* pTextObj, FX_POSITION ObjPos);
106     FX_BOOL							IsSameTextObject(CPDF_TextObject* pTextObj1, CPDF_TextObject* pTextObj2);
107     int								GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const;
108     void							CloseTempLine();
109     void							OnPiece(IFX_BidiChar* pBidi, CFX_WideString& str);
110     FX_INT32	PreMarkedContent(PDFTEXT_Obj pObj);
111     void		ProcessMarkedContent(PDFTEXT_Obj pObj);
112     void		CheckMarkedContentObject(FX_INT32& start, FX_INT32& nCount) const;
113     void		FindPreviousTextObject(void);
114     void		AddCharInfoByLRDirection(CFX_WideString& str, int i);
115     void		AddCharInfoByRLDirection(CFX_WideString& str, int i);
116     FX_INT32	GetTextObjectWritingMode(const CPDF_TextObject* pTextObj);
117     FX_INT32	FindTextlineFlowDirection();
118 protected:
119     CPDFText_ParseOptions			m_ParseOptions;
120     CFX_WordArray					m_CharIndex;
121     const CPDF_PageObjects*			m_pPage;
122     PAGECHAR_InfoArray				m_charList;
123     CFX_WideTextBuf					m_TextBuf;
124     PAGECHAR_InfoArray				m_TempCharList;
125     CFX_WideTextBuf					m_TempTextBuf;
126     int								m_parserflag;
127     CPDF_TextObject*				m_pPreTextObj;
128     CFX_AffineMatrix				m_perMatrix;
129     FX_BOOL							m_IsParsered;
130     CFX_AffineMatrix				m_DisplayMatrix;
131 
132     SEGMENT_Array					m_Segment;
133     CFX_RectArray					m_SelRects;
134     LINEOBJ							m_LineObj;
135     FX_BOOL							m_TextlineDir;
136     CFX_FloatRect					m_CurlineRect;
137 };
138 class CPDF_TextPageFind: public IPDF_TextPageFind
139 {
140 public:
141     CPDF_TextPageFind(const IPDF_TextPage* pTextPage);
~CPDF_TextPageFind()142     virtual							~CPDF_TextPageFind() {};
143 public:
144     virtual	FX_BOOL					FindFirst(CFX_WideString findwhat, int flags, int startPos = 0);
145     virtual	FX_BOOL					FindNext();
146     virtual	FX_BOOL					FindPrev();
147 
148     virtual void					GetRectArray(CFX_RectArray& rects) const;
149     virtual int						GetCurOrder() const;
150     virtual int						GetMatchedCount()const;
151 protected:
152     void							ExtractFindWhat(CFX_WideString findwhat);
153     FX_BOOL							IsMatchWholeWord(CFX_WideString csPageText, int startPos, int endPos);
154     FX_BOOL							ExtractSubString(CFX_WideString& rString, FX_LPCWSTR lpszFullString,
155             int iSubString, FX_WCHAR chSep);
156     CFX_WideString					MakeReverse(const CFX_WideString str);
157     int								ReverseFind(CFX_WideString csPageText, CFX_WideString csWord, int nStartPos, int& WordLength);
158     int								GetCharIndex(int index) const;
159 private:
160     CFX_WordArray					m_CharIndex;
161     const IPDF_TextPage*			m_pTextPage;
162     CFX_WideString					m_strText;
163     CFX_WideString					m_findWhat;
164     int								m_flags;
165     CFX_WideStringArray				m_csFindWhatArray;
166     int								m_findNextStart;
167     int								m_findPreStart;
168     FX_BOOL							m_bMatchCase;
169     FX_BOOL							m_bMatchWholeWord;
170     int								m_resStart;
171     int								m_resEnd;
172     CFX_RectArray					m_resArray;
173     FX_BOOL							m_IsFind;
174 };
175 class CPDF_LinkExt: public CFX_Object
176 {
177 public:
CPDF_LinkExt()178     CPDF_LinkExt() {};
179     int								m_Start;
180     int								m_Count;
181     CFX_WideString					m_strUrl;
~CPDF_LinkExt()182     virtual							~CPDF_LinkExt() {};
183 };
184 typedef CFX_ArrayTemplate<CPDF_LinkExt*> LINK_InfoArray;
185 class CPDF_LinkExtract: public IPDF_LinkExtract
186 {
187 public:
188     CPDF_LinkExtract();
189     virtual							~CPDF_LinkExtract();
190     virtual FX_BOOL					ExtractLinks(const IPDF_TextPage* pTextPage);
IsExtract()191     virtual	FX_BOOL					IsExtract() const
192     {
193         return m_IsParserd;
194     }
195 public:
196     virtual int						CountLinks() const;
197     virtual	CFX_WideString			GetURL(int index) const;
198     virtual	void					GetBoundedSegment(int index, int& start, int& count) const;
199     virtual	void					GetRects(int index, CFX_RectArray& rects)const;
200 protected:
201     void							parserLink();
202     void							DeleteLinkList();
203     FX_BOOL							CheckWebLink(CFX_WideString& strBeCheck);
204     FX_BOOL							CheckMailLink(CFX_WideString& str);
205     FX_BOOL							AppendToLinkList(int start, int count, CFX_WideString strUrl);
206 private:
207     LINK_InfoArray					m_LinkList;
208     const CPDF_TextPage*			m_pTextPage;
209     CFX_WideString					m_strPageText;
210     FX_BOOL							m_IsParserd;
211 };
212 FX_STRSIZE FX_Unicode_GetNormalization(FX_WCHAR wch, FX_LPWSTR pDst);
213 void NormalizeString(CFX_WideString& str);
214 void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest);
215 #endif
216