1 // Copyright 2014 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #ifndef _FPDF_TEXT_H_ 8 #define _FPDF_TEXT_H_ 9 #ifndef _FPDF_PARSER_ 10 #include "../fpdfapi/fpdf_parser.h" 11 #endif 12 #ifndef _FPDF_PAGEOBJ_H_ 13 #include "../fpdfapi/fpdf_pageobj.h" 14 #endif 15 #ifndef _FPDF_PAGE_ 16 #include "../fpdfapi/fpdf_page.h" 17 #endif 18 class CPDF_PageObjects; 19 #define PDF2TXT_AUTO_ROTATE 1 20 #define PDF2TXT_AUTO_WIDTH 2 21 #define PDF2TXT_KEEP_COLUMN 4 22 #define PDF2TXT_USE_OCR 8 23 #define PDF2TXT_INCLUDE_INVISIBLE 16 24 void PDF_GetPageText(CFX_ByteStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage, 25 int iMinWidth, FX_DWORD flags); 26 void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage, 27 int iMinWidth, FX_DWORD flags); 28 void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_Document* pDoc, CPDF_Dictionary* pPage, 29 FX_DWORD flags); 30 CFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc, CPDF_Dictionary* pPage); 31 class IPDF_TextPage; 32 class IPDF_LinkExtract; 33 class IPDF_TextPageFind; 34 #define CHAR_ERROR -1 35 #define CHAR_NORMAL 0 36 #define CHAR_GENERATED 1 37 #define CHAR_UNUNICODE 2 38 typedef struct { 39 FX_WCHAR m_Unicode; 40 FX_WCHAR m_Charcode; 41 FX_INT32 m_Flag; 42 FX_FLOAT m_FontSize; 43 FX_FLOAT m_OriginX; 44 FX_FLOAT m_OriginY; 45 CFX_FloatRect m_CharBox; 46 CPDF_TextObject* m_pTextObj; 47 CFX_AffineMatrix m_Matrix; 48 } FPDF_CHAR_INFO; 49 typedef CFX_ArrayTemplate<CFX_FloatRect> CFX_RectArray; 50 #define FPDFTEXT_LRTB 0 51 #define FPDFTEXT_RLTB 1 52 #define FPDFTEXT_TBRL 2 53 #define FPDFTEXT_LEFT -1 54 #define FPDFTEXT_RIGHT 1 55 #define FPDFTEXT_UP -2 56 #define FPDFTEXT_DOWN 2 57 class IPDF_ReflowedPage; 58 #define FPDFTEXT_WRITINGMODE_UNKNOW 0 59 #define FPDFTEXT_WRITINGMODE_LRTB 1 60 #define FPDFTEXT_WRITINGMODE_RLTB 2 61 #define FPDFTEXT_WRITINGMODE_TBRL 3 62 class CPDFText_ParseOptions : public CFX_Object 63 { 64 public: 65 66 CPDFText_ParseOptions(); 67 FX_BOOL m_bGetCharCodeOnly; 68 FX_BOOL m_bNormalizeObjs; 69 FX_BOOL m_bOutputHyphen; 70 }; 71 class IPDF_TextPage : public CFX_Object 72 { 73 public: 74 ~IPDF_TextPage()75 virtual ~IPDF_TextPage() {} 76 static IPDF_TextPage* CreateTextPage(const CPDF_Page* pPage, CPDFText_ParseOptions ParserOptions); 77 static IPDF_TextPage* CreateTextPage(const CPDF_Page* pPage, int flags = 0); 78 static IPDF_TextPage* CreateTextPage(const CPDF_PageObjects* pObjs, int flags = 0); 79 static IPDF_TextPage* CreateReflowTextPage(IPDF_ReflowedPage* pRefPage); 80 81 virtual void NormalizeObjects(FX_BOOL bNormalize) = 0; 82 83 virtual FX_BOOL ParseTextPage() = 0; 84 85 86 virtual FX_BOOL IsParsered() const = 0; 87 public: 88 89 virtual int CharIndexFromTextIndex(int TextIndex) const = 0; 90 91 virtual int TextIndexFromCharIndex(int CharIndex) const = 0; 92 93 94 virtual int CountChars() const = 0; 95 96 virtual void GetCharInfo(int index, FPDF_CHAR_INFO & info) const = 0; 97 98 virtual void GetRectArray(int start, int nCount, CFX_RectArray& rectArray) const = 0; 99 100 101 102 virtual int GetIndexAtPos(CPDF_Point point, FX_FLOAT xTorelance, FX_FLOAT yTorelance) const = 0; 103 104 virtual int GetIndexAtPos(FX_FLOAT x, FX_FLOAT y, FX_FLOAT xTorelance, FX_FLOAT yTorelance) const = 0; 105 106 virtual int GetOrderByDirection(int index, int direction) const = 0; 107 108 virtual CFX_WideString GetTextByRect(CFX_FloatRect rect) const = 0; 109 110 virtual void GetRectsArrayByRect(CFX_FloatRect rect, CFX_RectArray& resRectArray) const = 0; 111 112 113 virtual int CountRects(int start, int nCount) = 0; 114 115 virtual void GetRect(int rectIndex, FX_FLOAT& left, FX_FLOAT& top, FX_FLOAT& right, FX_FLOAT &bottom) const = 0; 116 117 virtual FX_BOOL GetBaselineRotate(int rectIndex, int& Rotate) = 0; 118 119 virtual FX_BOOL GetBaselineRotate(CFX_FloatRect rect, int& Rotate) = 0; 120 121 virtual int CountBoundedSegments(FX_FLOAT left, FX_FLOAT top, FX_FLOAT right, FX_FLOAT bottom, FX_BOOL bContains = FALSE) = 0; 122 123 virtual void GetBoundedSegment(int index, int& start, int& count) const = 0; 124 125 126 virtual int GetWordBreak(int index, int direction) const = 0; 127 128 virtual CFX_WideString GetPageText(int start = 0, int nCount = -1 ) const = 0; 129 }; 130 #define FPDFTEXT_MATCHCASE 0x00000001 131 #define FPDFTEXT_MATCHWHOLEWORD 0x00000002 132 #define FPDFTEXT_CONSECUTIVE 0x00000004 133 class IPDF_TextPageFind : public CFX_Object 134 { 135 public: 136 ~IPDF_TextPageFind()137 virtual ~IPDF_TextPageFind() {} 138 139 static IPDF_TextPageFind* CreatePageFind(const IPDF_TextPage* pTextPage); 140 public: 141 142 virtual FX_BOOL FindFirst(CFX_WideString findwhat, int flags, int startPos = 0) = 0; 143 144 virtual FX_BOOL FindNext() = 0; 145 146 virtual FX_BOOL FindPrev() = 0; 147 148 virtual void GetRectArray(CFX_RectArray& rects) const = 0; 149 150 virtual int GetCurOrder() const = 0; 151 152 virtual int GetMatchedCount() const = 0; 153 }; 154 class IPDF_LinkExtract : public CFX_Object 155 { 156 public: 157 ~IPDF_LinkExtract()158 virtual ~IPDF_LinkExtract() {} 159 160 static IPDF_LinkExtract* CreateLinkExtract(); 161 162 virtual FX_BOOL ExtractLinks(const IPDF_TextPage* pTextPage) = 0; 163 public: 164 165 virtual int CountLinks() const = 0; 166 167 virtual CFX_WideString GetURL(int index) const = 0; 168 169 virtual void GetBoundedSegment(int index, int& start, int& count) const = 0; 170 171 virtual void GetRects(int index, CFX_RectArray& rects) const = 0; 172 }; 173 #endif 174