1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "../../include/fpdfapi/fpdf_page.h"
8 #include "../../include/fpdfapi/fpdf_pageobj.h"
9 #include "../../include/fpdftext/fpdf_text.h"
10 #include "txtproc.h"
11 #include "text_int.h"
12 #if !defined(_FPDFAPI_MINI_) || defined(_FXCORE_FEATURE_ALL_)
13 extern FX_LPCSTR FCS_GetAltStr(FX_WCHAR);
CharFromUnicodeAlt(FX_WCHAR unicode,int destcp,FX_LPCSTR defchar)14 CFX_ByteString CharFromUnicodeAlt(FX_WCHAR unicode, int destcp, FX_LPCSTR defchar)
15 {
16 if (destcp == 0) {
17 if (unicode < 0x80) {
18 return CFX_ByteString((char)unicode);
19 }
20 FX_LPCSTR altstr = FCS_GetAltStr(unicode);
21 if (altstr) {
22 return CFX_ByteString(altstr, -1);
23 }
24 return CFX_ByteString(defchar, -1);
25 }
26 FX_BOOL bDef = FALSE;
27 char buf[10];
28 int ret = FXSYS_WideCharToMultiByte(destcp, 0, (wchar_t*)&unicode, 1, buf, 10, NULL, &bDef);
29 if (ret && !bDef) {
30 return CFX_ByteString(buf, ret);
31 }
32 FX_LPCSTR altstr = FCS_GetAltStr(unicode);
33 if (altstr) {
34 return CFX_ByteString(altstr, -1);
35 }
36 return CFX_ByteString(defchar, -1);
37 }
CTextPage()38 CTextPage::CTextPage()
39 {
40 }
~CTextPage()41 CTextPage::~CTextPage()
42 {
43 int i;
44 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
45 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
46 delete pBaseLine;
47 }
48 for (i = 0; i < m_TextColumns.GetSize(); i ++) {
49 CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i);
50 delete pTextColumn;
51 }
52 }
ProcessObject(CPDF_PageObject * pObject)53 void CTextPage::ProcessObject(CPDF_PageObject* pObject)
54 {
55 if (pObject->m_Type != PDFPAGE_TEXT) {
56 return;
57 }
58 CPDF_TextObject* pText = (CPDF_TextObject*)pObject;
59 CPDF_Font* pFont = pText->m_TextState.GetFont();
60 int count = pText->CountItems();
61 FX_FLOAT* pPosArray = FX_Alloc(FX_FLOAT, count * 2);
62 if (pPosArray) {
63 pText->CalcCharPos(pPosArray);
64 }
65 FX_FLOAT fontsize_h = pText->m_TextState.GetFontSizeH();
66 FX_FLOAT fontsize_v = pText->m_TextState.GetFontSizeV();
67 FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
68 FX_FLOAT spacew = 0;
69 if (space_charcode != -1) {
70 spacew = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
71 }
72 if (spacew == 0) {
73 spacew = fontsize_h / 4;
74 }
75 if (pText->m_TextState.GetBaselineAngle() != 0) {
76 int cc = 0;
77 CFX_AffineMatrix matrix;
78 pText->GetTextMatrix(&matrix);
79 for (int i = 0; i < pText->m_nChars; i ++) {
80 FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(FX_UINTPTR)pText->m_pCharCodes : pText->m_pCharCodes[i];
81 if (charcode == (FX_DWORD) - 1) {
82 continue;
83 }
84 FX_RECT char_box;
85 pFont->GetCharBBox(charcode, char_box);
86 FX_FLOAT char_left = pPosArray ? pPosArray[cc * 2] : char_box.left * pText->m_TextState.GetFontSize() / 1000;
87 FX_FLOAT char_right = pPosArray ? pPosArray[cc * 2 + 1] : char_box.right * pText->m_TextState.GetFontSize() / 1000;
88 FX_FLOAT char_top = char_box.top * pText->m_TextState.GetFontSize() / 1000;
89 FX_FLOAT char_bottom = char_box.bottom * pText->m_TextState.GetFontSize() / 1000;
90 cc ++;
91 FX_FLOAT char_origx, char_origy;
92 matrix.Transform(char_left, 0, char_origx, char_origy);
93 matrix.TransformRect(char_left, char_right, char_top, char_bottom);
94 CFX_ByteString str;
95 pFont->AppendChar(str, charcode);
96 InsertTextBox(NULL, char_origy, char_left, char_right, char_top,
97 char_bottom, spacew, fontsize_v, str, pFont);
98 }
99 if (pPosArray) {
100 FX_Free(pPosArray);
101 }
102 return;
103 }
104 FX_FLOAT ratio_h = fontsize_h / pText->m_TextState.GetFontSize();
105 for (int ii = 0; ii < count * 2; ii ++) {
106 pPosArray[ii] *= ratio_h;
107 }
108 FX_FLOAT baseline = pText->m_PosY;
109 CTextBaseLine* pBaseLine = NULL;
110 FX_FLOAT topy = pText->m_Top;
111 FX_FLOAT bottomy = pText->m_Bottom;
112 FX_FLOAT leftx = pText->m_Left;
113 int cc = 0;
114 CFX_ByteString segment;
115 int space_count = 0;
116 FX_FLOAT last_left = 0, last_right = 0, segment_left = 0, segment_right = 0;
117 for (int i = 0; i < pText->m_nChars; i ++) {
118 FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(FX_UINTPTR)pText->m_pCharCodes : pText->m_pCharCodes[i];
119 if (charcode == (FX_DWORD) - 1) {
120 continue;
121 }
122 FX_FLOAT char_left = pPosArray[cc * 2];
123 FX_FLOAT char_right = pPosArray[cc * 2 + 1];
124 cc ++;
125 if (char_left < last_left || (char_left - last_right) > spacew / 2) {
126 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right,
127 topy, bottomy, spacew, fontsize_v, segment, pFont);
128 segment_left = char_left;
129 segment = "";
130 }
131 CFX_WideString wCh = pText->GetFont()->UnicodeFromCharCode(charcode);
132 FX_DWORD ch = wCh.GetLength() > 0 ? wCh.GetAt(0) : charcode;
133 if (space_count > 1) {
134 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right,
135 topy, bottomy, spacew, fontsize_v, segment, pFont);
136 segment = "";
137 } else if (space_count == 1) {
138 pFont->AppendChar(segment, ' ');
139 }
140 if (segment.GetLength() == 0) {
141 segment_left = char_left;
142 }
143 segment_right = char_right;
144 pFont->AppendChar(segment, charcode);
145 space_count = 0;
146 last_left = char_left;
147 last_right = char_right;
148 }
149 if (segment.GetLength())
150 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right,
151 topy, bottomy, spacew, fontsize_v, segment, pFont);
152 FX_Free(pPosArray);
153 }
154 static void ConvertPDFString(CFX_ByteString& result, CFX_ByteString& src, CPDF_Font* pFont);
InsertTextBox(CTextBaseLine * pBaseLine,FX_FLOAT basey,FX_FLOAT leftx,FX_FLOAT rightx,FX_FLOAT topy,FX_FLOAT bottomy,FX_FLOAT spacew,FX_FLOAT fontsize_v,CFX_ByteString & str,CPDF_Font * pFont)155 CTextBaseLine* CTextPage::InsertTextBox(CTextBaseLine* pBaseLine, FX_FLOAT basey, FX_FLOAT leftx,
156 FX_FLOAT rightx, FX_FLOAT topy, FX_FLOAT bottomy, FX_FLOAT spacew, FX_FLOAT fontsize_v,
157 CFX_ByteString& str, CPDF_Font* pFont)
158 {
159 if (str.GetLength() == 0) {
160 return NULL;
161 }
162 if (pBaseLine == NULL) {
163 int i;
164 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
165 CTextBaseLine* pExistLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
166 if (pExistLine->m_BaseLine == basey) {
167 pBaseLine = pExistLine;
168 break;
169 }
170 if (pExistLine->m_BaseLine < basey) {
171 break;
172 }
173 }
174 if (pBaseLine == NULL) {
175 pBaseLine = FX_NEW CTextBaseLine;
176 if (NULL == pBaseLine) {
177 return NULL;
178 }
179 pBaseLine->m_BaseLine = basey;
180 m_BaseLines.InsertAt(i, pBaseLine);
181 }
182 }
183 CFX_WideString text;
184 FX_LPCSTR pStr = str;
185 int len = str.GetLength(), offset = 0;
186 while (offset < len) {
187 FX_DWORD ch = pFont->GetNextChar(pStr, offset);
188 CFX_WideString unicode_str = pFont->UnicodeFromCharCode(ch);
189 text += unicode_str;
190 }
191 pBaseLine->InsertTextBox(leftx, rightx, topy, bottomy, spacew, fontsize_v, text);
192 return pBaseLine;
193 }
WriteOutput(CFX_WideStringArray & lines,int iMinWidth)194 void CTextPage::WriteOutput(CFX_WideStringArray& lines, int iMinWidth)
195 {
196 FX_FLOAT lastheight = -1;
197 FX_FLOAT lastbaseline = -1;
198 FX_FLOAT MinLeftX = 1000000;
199 FX_FLOAT MaxRightX = 0;
200 int i;
201 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
202 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
203 FX_FLOAT leftx, rightx;
204 if (pBaseLine->GetWidth(leftx, rightx)) {
205 if (leftx < MinLeftX) {
206 MinLeftX = leftx;
207 }
208 if (rightx > MaxRightX) {
209 MaxRightX = rightx;
210 }
211 }
212 }
213 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
214 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
215 pBaseLine->MergeBoxes();
216 }
217 for (i = 1; i < m_BaseLines.GetSize(); i ++) {
218 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
219 CTextBaseLine* pPrevLine = (CTextBaseLine*)m_BaseLines.GetAt(i - 1);
220 if (pBaseLine->CanMerge(pPrevLine)) {
221 pPrevLine->Merge(pBaseLine);
222 delete pBaseLine;
223 m_BaseLines.RemoveAt(i);
224 i --;
225 }
226 }
227 if (m_bAutoWidth) {
228 int* widths = FX_Alloc(int, m_BaseLines.GetSize());
229 if (widths) {
230 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
231 widths[i] = 0;
232 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
233 int TotalChars = 0;
234 FX_FLOAT TotalWidth = 0;
235 int minchars;
236 pBaseLine->CountChars(TotalChars, TotalWidth, minchars);
237 if (TotalChars) {
238 FX_FLOAT charwidth = TotalWidth / TotalChars;
239 widths[i] = (int)((MaxRightX - MinLeftX) / charwidth);
240 }
241 if (widths[i] > 1000) {
242 widths[i] = 1000;
243 }
244 if (widths[i] < minchars) {
245 widths[i] = minchars;
246 }
247 }
248 int AvgWidth = 0, widthcount = 0;
249 for (i = 0; i < m_BaseLines.GetSize(); i ++)
250 if (widths[i]) {
251 AvgWidth += widths[i];
252 widthcount ++;
253 }
254 AvgWidth = int((FX_FLOAT)AvgWidth / widthcount + 0.5);
255 int MaxWidth = 0;
256 for (i = 0; i < m_BaseLines.GetSize(); i ++)
257 if (MaxWidth < widths[i]) {
258 MaxWidth = widths[i];
259 }
260 if (MaxWidth > AvgWidth * 6 / 5) {
261 MaxWidth = AvgWidth * 6 / 5;
262 }
263 FX_Free(widths);
264 if (iMinWidth < MaxWidth) {
265 iMinWidth = MaxWidth;
266 }
267 }
268 }
269 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
270 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
271 pBaseLine->MergeBoxes();
272 }
273 if (m_bKeepColumn) {
274 FindColumns();
275 }
276 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
277 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
278 if (lastheight >= 0) {
279 FX_FLOAT dy = lastbaseline - pBaseLine->m_BaseLine;
280 if (dy >= (pBaseLine->m_MaxFontSizeV) * 1.5 || dy >= lastheight * 1.5) {
281 lines.Add(L"");
282 }
283 }
284 lastheight = pBaseLine->m_MaxFontSizeV;
285 lastbaseline = pBaseLine->m_BaseLine;
286 CFX_WideString str;
287 pBaseLine->WriteOutput(str, MinLeftX, MaxRightX - MinLeftX, iMinWidth);
288 lines.Add(str);
289 }
290 }
NormalizeCompositeChar(FX_WCHAR wChar,CFX_WideString & sDest)291 void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest)
292 {
293 wChar = FX_GetMirrorChar(wChar, TRUE, FALSE);
294 FX_LPWSTR pDst = NULL;
295 FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
296 if (nCount < 1 ) {
297 sDest += wChar;
298 return;
299 }
300 pDst = new FX_WCHAR[nCount];
301 FX_Unicode_GetNormalization(wChar, pDst);
302 for (int nIndex = 0; nIndex < nCount; nIndex++) {
303 sDest += pDst[nIndex];
304 }
305 delete[] pDst;
306 }
NormalizeString(CFX_WideString & str)307 void NormalizeString(CFX_WideString& str)
308 {
309 if (str.GetLength() <= 0) {
310 return;
311 }
312 CFX_WideString sBuffer;
313 IFX_BidiChar* BidiChar = IFX_BidiChar::Create();
314 if (NULL == BidiChar) {
315 return;
316 }
317 CFX_WordArray order;
318 FX_BOOL bR2L = FALSE;
319 FX_INT32 start = 0, count = 0, i = 0;
320 int nR2L = 0, nL2R = 0;
321 for (i = 0; i < str.GetLength(); i++) {
322 if(BidiChar->AppendChar(str.GetAt(i))) {
323 FX_INT32 ret = BidiChar->GetBidiInfo(start, count);
324 order.Add(start);
325 order.Add(count);
326 order.Add(ret);
327 if(!bR2L) {
328 if(ret == 2) {
329 nR2L++;
330 } else if (ret == 1) {
331 nL2R++;
332 }
333 }
334 }
335 }
336 if(BidiChar->EndChar()) {
337 FX_INT32 ret = BidiChar->GetBidiInfo(start, count);
338 order.Add(start);
339 order.Add(count);
340 order.Add(ret);
341 if(!bR2L) {
342 if(ret == 2) {
343 nR2L++;
344 } else if(ret == 1) {
345 nL2R++;
346 }
347 }
348 }
349 if(nR2L > 0 && nR2L >= nL2R) {
350 bR2L = TRUE;
351 }
352 if(bR2L) {
353 int count = order.GetSize();
354 for(int j = count - 1; j > 0; j -= 3) {
355 int ret = order.GetAt(j);
356 int start = order.GetAt(j - 2);
357 int count1 = order.GetAt(j - 1);
358 if(ret == 2 || ret == 0) {
359 for(int i = start + count1 - 1; i >= start; i--) {
360 NormalizeCompositeChar(str[i], sBuffer);
361 }
362 } else {
363 i = j;
364 FX_BOOL bSymbol = FALSE;
365 while(i > 0 && order.GetAt(i) != 2) {
366 bSymbol = !order.GetAt(i);
367 i -= 3;
368 }
369 int end = start + count1 ;
370 int n = 0;
371 if(bSymbol) {
372 n = i + 6;
373 } else {
374 n = i + 3;
375 }
376 if(n >= j) {
377 for(int m = start; m < end; m++) {
378 sBuffer += str[m];
379 }
380 } else {
381 i = j;
382 j = n;
383 for(; n <= i; n += 3) {
384 int ret = order.GetAt(n);
385 int start = order.GetAt(n - 2);
386 int count1 = order.GetAt(n - 1);
387 int end = start + count1 ;
388 for(int m = start; m < end; m++) {
389 sBuffer += str[m];
390 }
391 }
392 }
393 }
394 }
395 } else {
396 int count = order.GetSize();
397 FX_BOOL bL2R = FALSE;
398 for(int j = 0; j < count; j += 3) {
399 int ret = order.GetAt(j + 2);
400 int start = order.GetAt(j);
401 int count1 = order.GetAt(j + 1);
402 if(ret == 2 || (j == 0 && ret == 0 && !bL2R)) {
403 int i = j + 3;
404 while(bR2L && i < count) {
405 if(order.GetAt(i + 2) == 1) {
406 break;
407 } else {
408 i += 3;
409 }
410 }
411 if(i == 3) {
412 j = -3;
413 bL2R = TRUE;
414 continue;
415 }
416 int end = str.GetLength() - 1;
417 if(i < count) {
418 end = order.GetAt(i) - 1;
419 }
420 j = i - 3;
421 for(int n = end; n >= start; n--) {
422 NormalizeCompositeChar(str[i], sBuffer);
423 }
424 } else {
425 int end = start + count1 ;
426 for(int i = start; i < end; i++) {
427 sBuffer += str[i];
428 }
429 }
430 }
431 }
432 str.Empty();
433 str += sBuffer;
434 BidiChar->Release();
435 }
IsNumber(CFX_WideString & str)436 static FX_BOOL IsNumber(CFX_WideString& str)
437 {
438 for (int i = 0; i < str.GetLength(); i ++) {
439 FX_WCHAR ch = str[i];
440 if ((ch < '0' || ch > '9') && ch != '-' && ch != '+' && ch != '.' && ch != ' ') {
441 return FALSE;
442 }
443 }
444 return TRUE;
445 }
FindColumns()446 void CTextPage::FindColumns()
447 {
448 int i;
449 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
450 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
451 for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j ++) {
452 CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j);
453 CTextColumn* pColumn = FindColumn(pTextBox->m_Right);
454 if (pColumn == NULL) {
455 pColumn = FX_NEW CTextColumn;
456 if (pColumn) {
457 pColumn->m_Count = 1;
458 pColumn->m_AvgPos = pTextBox->m_Right;
459 pColumn->m_TextPos = -1;
460 m_TextColumns.Add(pColumn);
461 }
462 } else {
463 pColumn->m_AvgPos = (pColumn->m_Count * pColumn->m_AvgPos + pTextBox->m_Right) /
464 (pColumn->m_Count + 1);
465 pColumn->m_Count ++;
466 }
467 }
468 }
469 int mincount = m_BaseLines.GetSize() / 4;
470 for (i = 0; i < m_TextColumns.GetSize(); i ++) {
471 CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i);
472 if (pTextColumn->m_Count >= mincount) {
473 continue;
474 }
475 delete pTextColumn;
476 m_TextColumns.RemoveAt(i);
477 i --;
478 }
479 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
480 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
481 for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j ++) {
482 CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j);
483 if (IsNumber(pTextBox->m_Text)) {
484 pTextBox->m_pColumn = FindColumn(pTextBox->m_Right);
485 }
486 }
487 }
488 }
FindColumn(FX_FLOAT xpos)489 CTextColumn* CTextPage::FindColumn(FX_FLOAT xpos)
490 {
491 for (int i = 0; i < m_TextColumns.GetSize(); i ++) {
492 CTextColumn* pColumn = (CTextColumn*)m_TextColumns.GetAt(i);
493 if (pColumn->m_AvgPos < xpos + 1 && pColumn->m_AvgPos > xpos - 1) {
494 return pColumn;
495 }
496 }
497 return NULL;
498 }
BreakSpace(CPDF_TextObject * pTextObj)499 void CTextPage::BreakSpace(CPDF_TextObject* pTextObj)
500 {
501 }
CTextBaseLine()502 CTextBaseLine::CTextBaseLine()
503 {
504 m_Top = -100000;
505 m_Bottom = 100000;
506 m_MaxFontSizeV = 0;
507 }
~CTextBaseLine()508 CTextBaseLine::~CTextBaseLine()
509 {
510 for (int i = 0; i < m_TextList.GetSize(); i ++) {
511 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
512 delete pText;
513 }
514 }
InsertTextBox(FX_FLOAT leftx,FX_FLOAT rightx,FX_FLOAT topy,FX_FLOAT bottomy,FX_FLOAT spacew,FX_FLOAT fontsize_v,const CFX_WideString & text)515 void CTextBaseLine::InsertTextBox(FX_FLOAT leftx, FX_FLOAT rightx, FX_FLOAT topy, FX_FLOAT bottomy,
516 FX_FLOAT spacew, FX_FLOAT fontsize_v, const CFX_WideString& text)
517 {
518 if (m_Top < topy) {
519 m_Top = topy;
520 }
521 if (m_Bottom > bottomy) {
522 m_Bottom = bottomy;
523 }
524 if (m_MaxFontSizeV < fontsize_v) {
525 m_MaxFontSizeV = fontsize_v;
526 }
527 int i;
528 for (i = 0; i < m_TextList.GetSize(); i ++) {
529 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
530 if (pText->m_Left > leftx) {
531 break;
532 }
533 }
534 CTextBox* pText = FX_NEW CTextBox;
535 if (NULL == pText) {
536 return;
537 }
538 pText->m_Text = text;
539 pText->m_Left = leftx;
540 pText->m_Right = rightx;
541 pText->m_Top = topy;
542 pText->m_Bottom = bottomy;
543 pText->m_SpaceWidth = spacew;
544 pText->m_FontSizeV = fontsize_v;
545 pText->m_pColumn = NULL;
546 m_TextList.InsertAt(i, pText);
547 }
548 FX_BOOL GetIntersection(FX_FLOAT low1, FX_FLOAT high1, FX_FLOAT low2, FX_FLOAT high2,
549 FX_FLOAT& interlow, FX_FLOAT& interhigh);
CanMerge(CTextBaseLine * pOther)550 FX_BOOL CTextBaseLine::CanMerge(CTextBaseLine* pOther)
551 {
552 FX_FLOAT inter_top, inter_bottom;
553 if (!GetIntersection(m_Bottom, m_Top, pOther->m_Bottom, pOther->m_Top,
554 inter_bottom, inter_top)) {
555 return FALSE;
556 }
557 FX_FLOAT inter_h = inter_top - inter_bottom;
558 if (inter_h < (m_Top - m_Bottom) / 2 && inter_h < (pOther->m_Top - pOther->m_Bottom) / 2) {
559 return FALSE;
560 }
561 FX_FLOAT dy = (FX_FLOAT)FXSYS_fabs(m_BaseLine - pOther->m_BaseLine);
562 for (int i = 0; i < m_TextList.GetSize(); i ++) {
563 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
564 FX_FLOAT width = pText->m_Right - pText->m_Left;
565 for (int j = 0; j < pOther->m_TextList.GetSize(); j ++) {
566 CTextBox* pOtherText = (CTextBox*)pOther->m_TextList.GetAt(j);
567 FX_FLOAT inter_left, inter_right;
568 if (!GetIntersection(pText->m_Left, pText->m_Right,
569 pOtherText->m_Left, pOtherText->m_Right, inter_left, inter_right)) {
570 continue;
571 }
572 FX_FLOAT inter_w = inter_right - inter_left;
573 if (inter_w < pText->m_SpaceWidth / 2 && inter_w < pOtherText->m_SpaceWidth / 2) {
574 continue;
575 }
576 if (dy >= (pText->m_Bottom - pText->m_Top) / 2 ||
577 dy >= (pOtherText->m_Bottom - pOtherText->m_Top) / 2) {
578 return FALSE;
579 }
580 }
581 }
582 return TRUE;
583 }
Merge(CTextBaseLine * pOther)584 void CTextBaseLine::Merge(CTextBaseLine* pOther)
585 {
586 for (int i = 0; i < pOther->m_TextList.GetSize(); i ++) {
587 CTextBox* pText = (CTextBox*)pOther->m_TextList.GetAt(i);
588 InsertTextBox(pText->m_Left, pText->m_Right, pText->m_Top, pText->m_Bottom,
589 pText->m_SpaceWidth, pText->m_FontSizeV, pText->m_Text);
590 }
591 }
GetWidth(FX_FLOAT & leftx,FX_FLOAT & rightx)592 FX_BOOL CTextBaseLine::GetWidth(FX_FLOAT& leftx, FX_FLOAT& rightx)
593 {
594 int i;
595 for (i = 0; i < m_TextList.GetSize(); i ++) {
596 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
597 if (pText->m_Text != L" ") {
598 break;
599 }
600 }
601 if (i == m_TextList.GetSize()) {
602 return FALSE;
603 }
604 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
605 leftx = pText->m_Left;
606 for (i = m_TextList.GetSize() - 1; i >= 0; i --) {
607 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
608 if (pText->m_Text != L" ") {
609 break;
610 }
611 }
612 pText = (CTextBox*)m_TextList.GetAt(i);
613 rightx = pText->m_Right;
614 return TRUE;
615 }
MergeBoxes()616 void CTextBaseLine::MergeBoxes()
617 {
618 int i = 0;
619 while (1) {
620 if (i >= m_TextList.GetSize() - 1) {
621 break;
622 }
623 CTextBox* pThisText = (CTextBox*)m_TextList.GetAt(i);
624 CTextBox* pNextText = (CTextBox*)m_TextList.GetAt(i + 1);
625 FX_FLOAT dx = pNextText->m_Left - pThisText->m_Right;
626 FX_FLOAT spacew = (pThisText->m_SpaceWidth == 0.0) ?
627 pNextText->m_SpaceWidth : pThisText->m_SpaceWidth;
628 if (spacew > 0.0 && dx < spacew * 2) {
629 pThisText->m_Right = pNextText->m_Right;
630 if (dx > spacew * 1.5) {
631 pThisText->m_Text += L" ";
632 } else if (dx > spacew / 3) {
633 pThisText->m_Text += L' ';
634 }
635 pThisText->m_Text += pNextText->m_Text;
636 pThisText->m_SpaceWidth = pNextText->m_SpaceWidth == 0.0 ?
637 spacew : pNextText->m_SpaceWidth;
638 m_TextList.RemoveAt(i + 1);
639 delete pNextText;
640 } else {
641 i ++;
642 }
643 }
644 }
WriteOutput(CFX_WideString & str,FX_FLOAT leftx,FX_FLOAT pagewidth,int iTextWidth)645 void CTextBaseLine::WriteOutput(CFX_WideString& str, FX_FLOAT leftx, FX_FLOAT pagewidth,
646 int iTextWidth)
647 {
648 int lastpos = -1;
649 for (int i = 0; i < m_TextList.GetSize(); i ++) {
650 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
651 int xpos;
652 if (pText->m_pColumn) {
653 xpos = (int)((pText->m_pColumn->m_AvgPos - leftx) * iTextWidth / pagewidth + 0.5);
654 xpos -= pText->m_Text.GetLength();
655 } else {
656 xpos = (int)((pText->m_Left - leftx) * iTextWidth / pagewidth + 0.5);
657 }
658 if (xpos <= lastpos) {
659 xpos = lastpos + 1;
660 }
661 for (int j = lastpos + 1; j < xpos; j ++) {
662 str += ' ';
663 }
664 CFX_WideString sSrc(pText->m_Text);
665 NormalizeString(sSrc);
666 str += sSrc;
667 str += ' ';
668 lastpos = xpos + pText->m_Text.GetLength();
669 }
670 }
CountChars(int & count,FX_FLOAT & width,int & minchars)671 void CTextBaseLine::CountChars(int& count, FX_FLOAT& width, int& minchars)
672 {
673 minchars = 0;
674 for (int i = 0; i < m_TextList.GetSize(); i ++) {
675 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
676 if (pText->m_Right - pText->m_Left < 0.002) {
677 continue;
678 }
679 count += pText->m_Text.GetLength();
680 width += pText->m_Right - pText->m_Left;
681 minchars += pText->m_Text.GetLength() + 1;
682 }
683 }
684 #define PI 3.1415926535897932384626433832795
CheckRotate(CPDF_Page & page,CFX_FloatRect & page_bbox)685 static void CheckRotate(CPDF_Page& page, CFX_FloatRect& page_bbox)
686 {
687 int total_count = 0, rotated_count[3] = {0, 0, 0};
688 FX_POSITION pos = page.GetFirstObjectPosition();
689 while (pos) {
690 CPDF_PageObject* pObj = page.GetNextObject(pos);
691 if (pObj->m_Type != PDFPAGE_TEXT) {
692 continue;
693 }
694 total_count ++;
695 CPDF_TextObject* pText = (CPDF_TextObject*)pObj;
696 FX_FLOAT angle = pText->m_TextState.GetBaselineAngle();
697 if (angle == 0.0) {
698 continue;
699 }
700 int degree = (int)(angle * 180 / PI + 0.5);
701 if (degree % 90) {
702 continue;
703 }
704 if (degree < 0) {
705 degree += 360;
706 }
707 int index = degree / 90 % 3 - 1;
708 if (index < 0) {
709 continue;
710 }
711 rotated_count[index] ++;
712 }
713 if (total_count == 0) {
714 return;
715 }
716 CFX_AffineMatrix matrix;
717 if (rotated_count[0] > total_count * 2 / 3) {
718 matrix.Set(0, -1, 1, 0, 0, page.GetPageHeight());
719 } else if (rotated_count[1] > total_count * 2 / 3) {
720 matrix.Set(-1, 0, 0, -1, page.GetPageWidth(), page.GetPageHeight());
721 } else if (rotated_count[2] > total_count * 2 / 3) {
722 matrix.Set(0, 1, -1, 0, page.GetPageWidth(), 0);
723 } else {
724 return;
725 }
726 page.Transform(matrix);
727 page_bbox.Transform(&matrix);
728 }
PDF_GetPageText_Unicode(CFX_WideStringArray & lines,CPDF_Document * pDoc,CPDF_Dictionary * pPage,int iMinWidth,FX_DWORD flags)729 void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage,
730 int iMinWidth, FX_DWORD flags)
731 {
732 lines.RemoveAll();
733 if (pPage == NULL) {
734 return;
735 }
736 CPDF_Page page;
737 page.Load(pDoc, pPage);
738 CPDF_ParseOptions options;
739 options.m_bTextOnly = TRUE;
740 options.m_bSeparateForm = FALSE;
741 page.ParseContent(&options);
742 CFX_FloatRect page_bbox = page.GetPageBBox();
743 if (flags & PDF2TXT_AUTO_ROTATE) {
744 CheckRotate(page, page_bbox);
745 }
746 CTextPage texts;
747 texts.m_bAutoWidth = flags & PDF2TXT_AUTO_WIDTH;
748 texts.m_bKeepColumn = flags & PDF2TXT_KEEP_COLUMN;
749 texts.m_bBreakSpace = TRUE;
750 FX_POSITION pos = page.GetFirstObjectPosition();
751 while (pos) {
752 CPDF_PageObject* pObject = page.GetNextObject(pos);
753 if (!(flags & PDF2TXT_INCLUDE_INVISIBLE)) {
754 CFX_FloatRect rect(pObject->m_Left, pObject->m_Bottom, pObject->m_Right, pObject->m_Top);
755 if (!page_bbox.Contains(rect)) {
756 continue;
757 }
758 }
759 texts.ProcessObject(pObject);
760 }
761 texts.WriteOutput(lines, iMinWidth);
762 }
PDF_GetPageText(CFX_ByteStringArray & lines,CPDF_Document * pDoc,CPDF_Dictionary * pPage,int iMinWidth,FX_DWORD flags)763 void PDF_GetPageText(CFX_ByteStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage,
764 int iMinWidth, FX_DWORD flags)
765 {
766 lines.RemoveAll();
767 CFX_WideStringArray wlines;
768 PDF_GetPageText_Unicode(wlines, pDoc, pPage, iMinWidth, flags);
769 for (int i = 0; i < wlines.GetSize(); i ++) {
770 CFX_WideString wstr = wlines[i];
771 CFX_ByteString str;
772 for (int c = 0; c < wstr.GetLength(); c ++) {
773 str += CharFromUnicodeAlt(wstr[c], FXSYS_GetACP(), "?");
774 }
775 lines.Add(str);
776 }
777 }
778 #endif
779 extern void _PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_PageObjects* pPage, FX_BOOL bUseLF,
780 CFX_PtrArray* pObjArray);
PDF_GetTextStream_Unicode(CFX_WideTextBuf & buffer,CPDF_Document * pDoc,CPDF_Dictionary * pPage,FX_DWORD flags)781 void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_Document* pDoc, CPDF_Dictionary* pPage, FX_DWORD flags)
782 {
783 buffer.EstimateSize(0, 10240);
784 CPDF_Page page;
785 page.Load(pDoc, pPage);
786 CPDF_ParseOptions options;
787 options.m_bTextOnly = TRUE;
788 options.m_bSeparateForm = FALSE;
789 page.ParseContent(&options);
790 _PDF_GetTextStream_Unicode(buffer, &page, TRUE, NULL);
791 }
792