1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "../../include/fpdfapi/fpdf_pageobj.h"
8 #include "../../include/fpdftext/fpdf_text.h"
9 #include "../../include/fpdfapi/fpdf_page.h"
10 class CPDF_TextStream : public CFX_Object
11 {
12 public:
13 CPDF_TextStream(CFX_WideTextBuf& buffer, FX_BOOL bUseLF, CFX_PtrArray* pObjArray);
~CPDF_TextStream()14 ~CPDF_TextStream() {}
15 FX_BOOL ProcessObject(const CPDF_TextObject* pObj, FX_BOOL bFirstLine);
16 CFX_WideTextBuf& m_Buffer;
17 FX_BOOL m_bUseLF;
18 CFX_PtrArray* m_pObjArray;
19 const CPDF_TextObject* m_pLastObj;
20 };
CPDF_TextStream(CFX_WideTextBuf & buffer,FX_BOOL bUseLF,CFX_PtrArray * pObjArray)21 CPDF_TextStream::CPDF_TextStream(CFX_WideTextBuf& buffer, FX_BOOL bUseLF, CFX_PtrArray* pObjArray) : m_Buffer(buffer)
22 {
23 m_pLastObj = NULL;
24 m_bUseLF = bUseLF;
25 m_pObjArray = pObjArray;
26 }
FPDFText_IsSameTextObject(const CPDF_TextObject * pTextObj1,const CPDF_TextObject * pTextObj2)27 FX_BOOL FPDFText_IsSameTextObject(const CPDF_TextObject* pTextObj1, const CPDF_TextObject* pTextObj2)
28 {
29 if (!pTextObj1 || !pTextObj2) {
30 return FALSE;
31 }
32 CFX_FloatRect rcPreObj(pTextObj2->m_Left, pTextObj2->m_Bottom, pTextObj2->m_Right, pTextObj2->m_Top);
33 CFX_FloatRect rcCurObj(pTextObj1->m_Left, pTextObj1->m_Bottom, pTextObj1->m_Right, pTextObj1->m_Top);
34 if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty()) {
35 return TRUE;
36 }
37 if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) {
38 rcPreObj.Intersect(rcCurObj);
39 if (rcPreObj.IsEmpty()) {
40 return FALSE;
41 }
42 if (FXSYS_fabs(rcPreObj.Width() - rcCurObj.Width()) > rcCurObj.Width() / 2) {
43 return FALSE;
44 }
45 if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize()) {
46 return FALSE;
47 }
48 }
49 int nPreCount = pTextObj2->CountItems();
50 int nCurCount = pTextObj1->CountItems();
51 if (nPreCount != nCurCount) {
52 return FALSE;
53 }
54 for (int i = 0; i < nPreCount; i++) {
55 CPDF_TextObjectItem itemPer, itemCur;
56 pTextObj2->GetItemInfo(i, &itemPer);
57 pTextObj1->GetItemInfo(i, &itemCur);
58 if (itemCur.m_CharCode != itemPer.m_CharCode) {
59 return FALSE;
60 }
61 }
62 return TRUE;
63 }
GetCharWidth(FX_DWORD charCode,CPDF_Font * pFont)64 int GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont)
65 {
66 if(charCode == -1) {
67 return 0;
68 }
69 int w = pFont->GetCharWidthF(charCode);
70 if(w == 0) {
71 CFX_ByteString str;
72 pFont->AppendChar(str, charCode);
73 w = pFont->GetStringWidth(str, 1);
74 if(w == 0) {
75 FX_RECT BBox;
76 pFont->GetCharBBox(charCode, BBox);
77 w = BBox.right - BBox.left;
78 }
79 }
80 return w;
81 }
FPDFText_ProcessInterObj(const CPDF_TextObject * pPrevObj,const CPDF_TextObject * pObj)82 int FPDFText_ProcessInterObj(const CPDF_TextObject* pPrevObj, const CPDF_TextObject* pObj)
83 {
84 if(FPDFText_IsSameTextObject(pPrevObj, pObj)) {
85 return -1;
86 }
87 CPDF_TextObjectItem item;
88 int nItem = pPrevObj->CountItems();
89 pPrevObj->GetItemInfo(nItem - 1, &item);
90 FX_WCHAR preChar = 0, curChar = 0;
91 CFX_WideString wstr = pPrevObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
92 if(wstr.GetLength()) {
93 preChar = wstr.GetAt(0);
94 }
95 FX_FLOAT last_pos = item.m_OriginX;
96 int nLastWidth = GetCharWidth(item.m_CharCode, pPrevObj->GetFont());
97 FX_FLOAT last_width = nLastWidth * pPrevObj->GetFontSize() / 1000;
98 last_width = FXSYS_fabs(last_width);
99 pObj->GetItemInfo(0, &item);
100 wstr = pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
101 if(wstr.GetLength()) {
102 curChar = wstr.GetAt(0);
103 }
104 int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont());
105 FX_FLOAT this_width = nThisWidth * pObj->GetFontSize() / 1000;
106 this_width = FXSYS_fabs(this_width);
107 FX_FLOAT threshold = last_width > this_width ? last_width / 4 : this_width / 4;
108 CFX_AffineMatrix prev_matrix, prev_reverse;
109 pPrevObj->GetTextMatrix(&prev_matrix);
110 prev_reverse.SetReverse(prev_matrix);
111 FX_FLOAT x = pObj->GetPosX(), y = pObj->GetPosY();
112 prev_reverse.Transform(x, y);
113 if (FXSYS_fabs(y) > threshold * 2) {
114 return 2;
115 }
116 threshold = (FX_FLOAT)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth);
117 threshold = threshold > 400 ? (threshold < 700 ? threshold / 4 : threshold / 5) : (threshold / 2);
118 threshold *= nLastWidth > nThisWidth ? FXSYS_fabs(pPrevObj->GetFontSize()) : FXSYS_fabs(pObj->GetFontSize());
119 threshold /= 1000;
120 if (FXSYS_fabs(last_pos + last_width - x) > threshold && curChar != L' ' && preChar != L' ')
121 if(curChar != L' ' && preChar != L' ') {
122 if((x - last_pos - last_width) > threshold || (last_pos - x - last_width) > threshold) {
123 return 1;
124 }
125 if(x < 0 && (last_pos - x - last_width) > threshold) {
126 return 1;
127 }
128 if((x - last_pos - last_width) > this_width || (x - last_pos - this_width) > last_width ) {
129 return 1;
130 }
131 }
132 if(last_pos + last_width > x + this_width && curChar == L' ') {
133 return 3;
134 }
135 return 0;
136 }
ProcessObject(const CPDF_TextObject * pObj,FX_BOOL bFirstLine)137 FX_BOOL CPDF_TextStream::ProcessObject(const CPDF_TextObject* pObj, FX_BOOL bFirstLine)
138 {
139 if(pObj->m_Bottom > 380 && pObj->m_Left < 45 && pObj->m_Top < 402) {
140 int i = 0;
141 }
142 CPDF_Font* pFont = pObj->GetFont();
143 CFX_AffineMatrix matrix;
144 pObj->GetTextMatrix(&matrix);
145 FX_FLOAT fs = pObj->GetFontSize();
146 int item_index = 0;
147 if (m_pLastObj) {
148 int result = FPDFText_ProcessInterObj(m_pLastObj, pObj);
149 if (result == 2) {
150 int len = m_Buffer.GetLength();
151 if (len && m_bUseLF && m_Buffer.GetBuffer()[len - 1] == L'-') {
152 m_Buffer.Delete(len - 1, 1);
153 if (m_pObjArray) {
154 m_pObjArray->RemoveAt((len - 1) * 2, 2);
155 }
156 } else {
157 if (bFirstLine) {
158 return TRUE;
159 }
160 if (m_bUseLF) {
161 m_Buffer.AppendChar(L'\r');
162 m_Buffer.AppendChar(L'\n');
163 if (m_pObjArray) {
164 for (int i = 0; i < 4; i ++) {
165 m_pObjArray->Add(NULL);
166 }
167 }
168 } else {
169 m_Buffer.AppendChar(' ');
170 if (m_pObjArray) {
171 m_pObjArray->Add(NULL);
172 m_pObjArray->Add(NULL);
173 }
174 }
175 }
176 } else if (result == 1) {
177 m_Buffer.AppendChar(L' ');
178 if (m_pObjArray) {
179 m_pObjArray->Add(NULL);
180 m_pObjArray->Add(NULL);
181 }
182 } else if (result == -1) {
183 m_pLastObj = pObj;
184 return FALSE;
185 } else if (result == 3) {
186 item_index = 1;
187 }
188 }
189 m_pLastObj = pObj;
190 int nItems = pObj->CountItems();
191 FX_FLOAT Ignorekerning = 0;
192 for(int i = 1; i < nItems - 1; i += 2) {
193 CPDF_TextObjectItem item;
194 pObj->GetItemInfo(i, &item);
195 if (item.m_CharCode == (FX_DWORD) - 1) {
196 if(i == 1) {
197 Ignorekerning = item.m_OriginX;
198 } else if(Ignorekerning > item.m_OriginX) {
199 Ignorekerning = item.m_OriginX;
200 }
201 } else {
202 Ignorekerning = 0;
203 break;
204 }
205 }
206 FX_FLOAT spacing = 0;
207 for (; item_index < nItems; item_index ++) {
208 CPDF_TextObjectItem item;
209 pObj->GetItemInfo(item_index, &item);
210 if (item.m_CharCode == (FX_DWORD) - 1) {
211 CFX_WideString wstr = m_Buffer.GetWideString();
212 if (wstr.IsEmpty() || wstr.GetAt(wstr.GetLength() - 1) == L' ') {
213 continue;
214 }
215 FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH();
216 spacing = -fontsize_h * (item.m_OriginX - Ignorekerning) / 1000;
217 continue;
218 }
219 FX_FLOAT charSpace = pObj->m_TextState.GetObject()->m_CharSpace;
220 if(nItems > 3 && !spacing) {
221 charSpace = 0;
222 }
223 if((spacing || charSpace) && item_index > 0) {
224 int last_width = 0;
225 FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH();
226 FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
227 FX_FLOAT threshold = 0;
228 if (space_charcode != -1) {
229 threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000 ;
230 }
231 if(threshold > fontsize_h / 3) {
232 threshold = 0;
233 } else {
234 threshold /= 2;
235 }
236 if (threshold == 0) {
237 threshold = fontsize_h;
238 int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont));
239 threshold = this_width > last_width ? (FX_FLOAT)this_width : (FX_FLOAT)last_width;
240 int nDivide = 6;
241 if (threshold < 300) {
242 nDivide = 2;
243 } else if (threshold < 500) {
244 nDivide = 4;
245 } else if (threshold < 700) {
246 nDivide = 5;
247 }
248 threshold = threshold / nDivide;
249 threshold = fontsize_h * threshold / 1000;
250 }
251 if(charSpace > 0.001) {
252 spacing += matrix.TransformDistance(charSpace);
253 } else if(charSpace < -0.001) {
254 spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace));
255 }
256 if (threshold && (spacing && spacing >= threshold) ) {
257 m_Buffer.AppendChar(L' ');
258 if (m_pObjArray) {
259 m_pObjArray->Add(NULL);
260 m_pObjArray->Add(NULL);
261 }
262 }
263 if (item.m_CharCode == (FX_DWORD) - 1) {
264 continue;
265 }
266 spacing = 0;
267 }
268 CFX_WideString unicode_str = pFont->UnicodeFromCharCode(item.m_CharCode);
269 if (unicode_str.IsEmpty()) {
270 m_Buffer.AppendChar((FX_WCHAR)item.m_CharCode);
271 if (m_pObjArray) {
272 m_pObjArray->Add((void*)pObj);
273 m_pObjArray->Add((void*)(FX_INTPTR)item_index);
274 }
275 } else {
276 m_Buffer << unicode_str;
277 if (m_pObjArray) {
278 for (int i = 0; i < unicode_str.GetLength(); i ++) {
279 m_pObjArray->Add((void*)pObj);
280 m_pObjArray->Add((void*)(FX_INTPTR)item_index);
281 }
282 }
283 }
284 }
285 return FALSE;
286 }
_PDF_GetTextStream_Unicode(CFX_WideTextBuf & buffer,CPDF_PageObjects * pPage,FX_BOOL bUseLF,CFX_PtrArray * pObjArray)287 void _PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_PageObjects* pPage, FX_BOOL bUseLF,
288 CFX_PtrArray* pObjArray)
289 {
290 CPDF_TextStream textstream(buffer, bUseLF, pObjArray);
291 FX_POSITION pos = pPage->GetFirstObjectPosition();
292 while (pos) {
293 CPDF_PageObject* pObject = pPage->GetNextObject(pos);
294 if (pObject == NULL) {
295 continue;
296 }
297 if (pObject->m_Type != PDFPAGE_TEXT) {
298 continue;
299 }
300 textstream.ProcessObject((CPDF_TextObject*)pObject, FALSE);
301 }
302 }
PDF_GetFirstTextLine_Unicode(CPDF_Document * pDoc,CPDF_Dictionary * pPage)303 CFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc, CPDF_Dictionary* pPage)
304 {
305 CFX_WideTextBuf buffer;
306 buffer.EstimateSize(0, 1024);
307 CPDF_Page page;
308 page.Load(pDoc, pPage);
309 CPDF_ParseOptions options;
310 options.m_bTextOnly = TRUE;
311 options.m_bSeparateForm = FALSE;
312 page.ParseContent(&options);
313 CPDF_TextStream textstream(buffer, FALSE, NULL);
314 FX_POSITION pos = page.GetFirstObjectPosition();
315 while (pos) {
316 CPDF_PageObject* pObject = page.GetNextObject(pos);
317 if (pObject->m_Type != PDFPAGE_TEXT) {
318 continue;
319 }
320 if (textstream.ProcessObject((CPDF_TextObject*)pObject, TRUE)) {
321 break;
322 }
323 }
324 return buffer.GetWideString();
325 }
326