1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/include/fpdfapi/fpdf_page.h"
8 #include "core/include/fpdfapi/fpdf_pageobj.h"
9 #include "text_int.h"
10
11 class CPDF_TextStream {
12 public:
13 CPDF_TextStream(CFX_WideTextBuf& buffer,
14 FX_BOOL bUseLF,
15 CFX_PtrArray* pObjArray);
~CPDF_TextStream()16 ~CPDF_TextStream() {}
17 FX_BOOL ProcessObject(const CPDF_TextObject* pObj, FX_BOOL bFirstLine);
18 CFX_WideTextBuf& m_Buffer;
19 FX_BOOL m_bUseLF;
20 CFX_PtrArray* m_pObjArray;
21 const CPDF_TextObject* m_pLastObj;
22 };
CPDF_TextStream(CFX_WideTextBuf & buffer,FX_BOOL bUseLF,CFX_PtrArray * pObjArray)23 CPDF_TextStream::CPDF_TextStream(CFX_WideTextBuf& buffer,
24 FX_BOOL bUseLF,
25 CFX_PtrArray* pObjArray)
26 : m_Buffer(buffer) {
27 m_pLastObj = NULL;
28 m_bUseLF = bUseLF;
29 m_pObjArray = pObjArray;
30 }
FPDFText_IsSameTextObject(const CPDF_TextObject * pTextObj1,const CPDF_TextObject * pTextObj2)31 FX_BOOL FPDFText_IsSameTextObject(const CPDF_TextObject* pTextObj1,
32 const CPDF_TextObject* pTextObj2) {
33 if (!pTextObj1 || !pTextObj2) {
34 return FALSE;
35 }
36 CFX_FloatRect rcPreObj(pTextObj2->m_Left, pTextObj2->m_Bottom,
37 pTextObj2->m_Right, pTextObj2->m_Top);
38 CFX_FloatRect rcCurObj(pTextObj1->m_Left, pTextObj1->m_Bottom,
39 pTextObj1->m_Right, pTextObj1->m_Top);
40 if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty()) {
41 return TRUE;
42 }
43 if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) {
44 rcPreObj.Intersect(rcCurObj);
45 if (rcPreObj.IsEmpty()) {
46 return FALSE;
47 }
48 if (FXSYS_fabs(rcPreObj.Width() - rcCurObj.Width()) >
49 rcCurObj.Width() / 2) {
50 return FALSE;
51 }
52 if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize()) {
53 return FALSE;
54 }
55 }
56 int nPreCount = pTextObj2->CountItems();
57 int nCurCount = pTextObj1->CountItems();
58 if (nPreCount != nCurCount) {
59 return FALSE;
60 }
61 for (int i = 0; i < nPreCount; i++) {
62 CPDF_TextObjectItem itemPer, itemCur;
63 pTextObj2->GetItemInfo(i, &itemPer);
64 pTextObj1->GetItemInfo(i, &itemCur);
65 if (itemCur.m_CharCode != itemPer.m_CharCode) {
66 return FALSE;
67 }
68 }
69 return TRUE;
70 }
GetCharWidth(FX_DWORD charCode,CPDF_Font * pFont)71 int GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) {
72 if (charCode == -1) {
73 return 0;
74 }
75 int w = pFont->GetCharWidthF(charCode);
76 if (w == 0) {
77 CFX_ByteString str;
78 pFont->AppendChar(str, charCode);
79 w = pFont->GetStringWidth(str, 1);
80 if (w == 0) {
81 FX_RECT BBox;
82 pFont->GetCharBBox(charCode, BBox);
83 w = BBox.right - BBox.left;
84 }
85 }
86 return w;
87 }
FPDFText_ProcessInterObj(const CPDF_TextObject * pPrevObj,const CPDF_TextObject * pObj)88 int FPDFText_ProcessInterObj(const CPDF_TextObject* pPrevObj,
89 const CPDF_TextObject* pObj) {
90 if (FPDFText_IsSameTextObject(pPrevObj, pObj)) {
91 return -1;
92 }
93 CPDF_TextObjectItem item;
94 int nItem = pPrevObj->CountItems();
95 pPrevObj->GetItemInfo(nItem - 1, &item);
96 FX_WCHAR preChar = 0, curChar = 0;
97 CFX_WideString wstr =
98 pPrevObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
99 if (wstr.GetLength()) {
100 preChar = wstr.GetAt(0);
101 }
102 FX_FLOAT last_pos = item.m_OriginX;
103 int nLastWidth = GetCharWidth(item.m_CharCode, pPrevObj->GetFont());
104 FX_FLOAT last_width = nLastWidth * pPrevObj->GetFontSize() / 1000;
105 last_width = FXSYS_fabs(last_width);
106 pObj->GetItemInfo(0, &item);
107 wstr = pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
108 if (wstr.GetLength()) {
109 curChar = wstr.GetAt(0);
110 }
111 int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont());
112 FX_FLOAT this_width = nThisWidth * pObj->GetFontSize() / 1000;
113 this_width = FXSYS_fabs(this_width);
114 FX_FLOAT threshold =
115 last_width > this_width ? last_width / 4 : this_width / 4;
116 CFX_Matrix prev_matrix, prev_reverse;
117 pPrevObj->GetTextMatrix(&prev_matrix);
118 prev_reverse.SetReverse(prev_matrix);
119 FX_FLOAT x = pObj->GetPosX(), y = pObj->GetPosY();
120 prev_reverse.Transform(x, y);
121 if (FXSYS_fabs(y) > threshold * 2) {
122 return 2;
123 }
124 threshold = (FX_FLOAT)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth);
125 threshold = threshold > 400
126 ? (threshold < 700 ? threshold / 4 : threshold / 5)
127 : (threshold / 2);
128 threshold *= nLastWidth > nThisWidth ? FXSYS_fabs(pPrevObj->GetFontSize())
129 : FXSYS_fabs(pObj->GetFontSize());
130 threshold /= 1000;
131 if (FXSYS_fabs(last_pos + last_width - x) > threshold && curChar != L' ' &&
132 preChar != L' ')
133 if (curChar != L' ' && preChar != L' ') {
134 if ((x - last_pos - last_width) > threshold ||
135 (last_pos - x - last_width) > threshold) {
136 return 1;
137 }
138 if (x < 0 && (last_pos - x - last_width) > threshold) {
139 return 1;
140 }
141 if ((x - last_pos - last_width) > this_width ||
142 (x - last_pos - this_width) > last_width) {
143 return 1;
144 }
145 }
146 if (last_pos + last_width > x + this_width && curChar == L' ') {
147 return 3;
148 }
149 return 0;
150 }
ProcessObject(const CPDF_TextObject * pObj,FX_BOOL bFirstLine)151 FX_BOOL CPDF_TextStream::ProcessObject(const CPDF_TextObject* pObj,
152 FX_BOOL bFirstLine) {
153 CPDF_Font* pFont = pObj->GetFont();
154 CFX_Matrix matrix;
155 pObj->GetTextMatrix(&matrix);
156 int item_index = 0;
157 if (m_pLastObj) {
158 int result = FPDFText_ProcessInterObj(m_pLastObj, pObj);
159 if (result == 2) {
160 int len = m_Buffer.GetLength();
161 if (len && m_bUseLF && m_Buffer.GetBuffer()[len - 1] == L'-') {
162 m_Buffer.Delete(len - 1, 1);
163 if (m_pObjArray) {
164 m_pObjArray->RemoveAt((len - 1) * 2, 2);
165 }
166 } else {
167 if (bFirstLine) {
168 return TRUE;
169 }
170 if (m_bUseLF) {
171 m_Buffer.AppendChar(L'\r');
172 m_Buffer.AppendChar(L'\n');
173 if (m_pObjArray) {
174 for (int i = 0; i < 4; i++) {
175 m_pObjArray->Add(NULL);
176 }
177 }
178 } else {
179 m_Buffer.AppendChar(' ');
180 if (m_pObjArray) {
181 m_pObjArray->Add(NULL);
182 m_pObjArray->Add(NULL);
183 }
184 }
185 }
186 } else if (result == 1) {
187 m_Buffer.AppendChar(L' ');
188 if (m_pObjArray) {
189 m_pObjArray->Add(NULL);
190 m_pObjArray->Add(NULL);
191 }
192 } else if (result == -1) {
193 m_pLastObj = pObj;
194 return FALSE;
195 } else if (result == 3) {
196 item_index = 1;
197 }
198 }
199 m_pLastObj = pObj;
200 int nItems = pObj->CountItems();
201 FX_FLOAT Ignorekerning = 0;
202 for (int i = 1; i < nItems - 1; i += 2) {
203 CPDF_TextObjectItem item;
204 pObj->GetItemInfo(i, &item);
205 if (item.m_CharCode == (FX_DWORD)-1) {
206 if (i == 1) {
207 Ignorekerning = item.m_OriginX;
208 } else if (Ignorekerning > item.m_OriginX) {
209 Ignorekerning = item.m_OriginX;
210 }
211 } else {
212 Ignorekerning = 0;
213 break;
214 }
215 }
216 FX_FLOAT spacing = 0;
217 for (; item_index < nItems; item_index++) {
218 CPDF_TextObjectItem item;
219 pObj->GetItemInfo(item_index, &item);
220 if (item.m_CharCode == (FX_DWORD)-1) {
221 CFX_WideString wstr = m_Buffer.GetWideString();
222 if (wstr.IsEmpty() || wstr.GetAt(wstr.GetLength() - 1) == L' ') {
223 continue;
224 }
225 FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH();
226 spacing = -fontsize_h * (item.m_OriginX - Ignorekerning) / 1000;
227 continue;
228 }
229 FX_FLOAT charSpace = pObj->m_TextState.GetObject()->m_CharSpace;
230 if (nItems > 3 && !spacing) {
231 charSpace = 0;
232 }
233 if ((spacing || charSpace) && item_index > 0) {
234 int last_width = 0;
235 FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH();
236 FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
237 FX_FLOAT threshold = 0;
238 if (space_charcode != -1) {
239 threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
240 }
241 if (threshold > fontsize_h / 3) {
242 threshold = 0;
243 } else {
244 threshold /= 2;
245 }
246 if (threshold == 0) {
247 threshold = fontsize_h;
248 int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont));
249 threshold = this_width > last_width ? (FX_FLOAT)this_width
250 : (FX_FLOAT)last_width;
251 int nDivide = 6;
252 if (threshold < 300) {
253 nDivide = 2;
254 } else if (threshold < 500) {
255 nDivide = 4;
256 } else if (threshold < 700) {
257 nDivide = 5;
258 }
259 threshold = threshold / nDivide;
260 threshold = fontsize_h * threshold / 1000;
261 }
262 if (charSpace > 0.001) {
263 spacing += matrix.TransformDistance(charSpace);
264 } else if (charSpace < -0.001) {
265 spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace));
266 }
267 if (threshold && (spacing && spacing >= threshold)) {
268 m_Buffer.AppendChar(L' ');
269 if (m_pObjArray) {
270 m_pObjArray->Add(NULL);
271 m_pObjArray->Add(NULL);
272 }
273 }
274 if (item.m_CharCode == (FX_DWORD)-1) {
275 continue;
276 }
277 spacing = 0;
278 }
279 CFX_WideString unicode_str = pFont->UnicodeFromCharCode(item.m_CharCode);
280 if (unicode_str.IsEmpty()) {
281 m_Buffer.AppendChar((FX_WCHAR)item.m_CharCode);
282 if (m_pObjArray) {
283 m_pObjArray->Add((void*)pObj);
284 m_pObjArray->Add((void*)(intptr_t)item_index);
285 }
286 } else {
287 m_Buffer << unicode_str;
288 if (m_pObjArray) {
289 for (int i = 0; i < unicode_str.GetLength(); i++) {
290 m_pObjArray->Add((void*)pObj);
291 m_pObjArray->Add((void*)(intptr_t)item_index);
292 }
293 }
294 }
295 }
296 return FALSE;
297 }
GetTextStream_Unicode(CFX_WideTextBuf & buffer,CPDF_PageObjects * pPage,FX_BOOL bUseLF,CFX_PtrArray * pObjArray)298 void GetTextStream_Unicode(CFX_WideTextBuf& buffer,
299 CPDF_PageObjects* pPage,
300 FX_BOOL bUseLF,
301 CFX_PtrArray* pObjArray) {
302 CPDF_TextStream textstream(buffer, bUseLF, pObjArray);
303 FX_POSITION pos = pPage->GetFirstObjectPosition();
304 while (pos) {
305 CPDF_PageObject* pObject = pPage->GetNextObject(pos);
306 if (pObject && pObject->m_Type == PDFPAGE_TEXT)
307 textstream.ProcessObject((CPDF_TextObject*)pObject, FALSE);
308 }
309 }
PDF_GetFirstTextLine_Unicode(CPDF_Document * pDoc,CPDF_Dictionary * pPage)310 CFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc,
311 CPDF_Dictionary* pPage) {
312 CFX_WideTextBuf buffer;
313 buffer.EstimateSize(0, 1024);
314 CPDF_Page page;
315 page.Load(pDoc, pPage);
316 CPDF_ParseOptions options;
317 options.m_bTextOnly = TRUE;
318 options.m_bSeparateForm = FALSE;
319 page.ParseContent(&options);
320 CPDF_TextStream textstream(buffer, FALSE, NULL);
321 FX_POSITION pos = page.GetFirstObjectPosition();
322 while (pos) {
323 CPDF_PageObject* pObject = page.GetNextObject(pos);
324 if (pObject->m_Type != PDFPAGE_TEXT) {
325 continue;
326 }
327 if (textstream.ProcessObject((CPDF_TextObject*)pObject, TRUE)) {
328 break;
329 }
330 }
331 return buffer.GetWideString();
332 }
333