1 // Copyright 2016 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fpdftext/cpdf_textpagefind.h"
8
9 #include <cwchar>
10 #include <cwctype>
11 #include <vector>
12
13 #include "core/fpdftext/cpdf_textpage.h"
14 #include "core/fxcrt/fx_string.h"
15 #include "core/fxcrt/fx_system.h"
16 #include "third_party/base/stl_util.h"
17
18 namespace {
19
IsIgnoreSpaceCharacter(FX_WCHAR curChar)20 bool IsIgnoreSpaceCharacter(FX_WCHAR curChar) {
21 if (curChar < 255 || (curChar >= 0x0600 && curChar <= 0x06FF) ||
22 (curChar >= 0xFE70 && curChar <= 0xFEFF) ||
23 (curChar >= 0xFB50 && curChar <= 0xFDFF) ||
24 (curChar >= 0x0400 && curChar <= 0x04FF) ||
25 (curChar >= 0x0500 && curChar <= 0x052F) ||
26 (curChar >= 0xA640 && curChar <= 0xA69F) ||
27 (curChar >= 0x2DE0 && curChar <= 0x2DFF) || curChar == 8467 ||
28 (curChar >= 0x2000 && curChar <= 0x206F)) {
29 return false;
30 }
31 return true;
32 }
33
34 } // namespace
35
CPDF_TextPageFind(const CPDF_TextPage * pTextPage)36 CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage)
37 : m_pTextPage(pTextPage),
38 m_flags(0),
39 m_findNextStart(-1),
40 m_findPreStart(-1),
41 m_bMatchCase(false),
42 m_bMatchWholeWord(false),
43 m_resStart(0),
44 m_resEnd(-1),
45 m_IsFind(false) {
46 m_strText = m_pTextPage->GetPageText();
47 int nCount = pTextPage->CountChars();
48 if (nCount)
49 m_CharIndex.push_back(0);
50 for (int i = 0; i < nCount; i++) {
51 FPDF_CHAR_INFO info;
52 pTextPage->GetCharInfo(i, &info);
53 int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
54 if (info.m_Flag == FPDFTEXT_CHAR_NORMAL ||
55 info.m_Flag == FPDFTEXT_CHAR_GENERATED) {
56 if (indexSize % 2) {
57 m_CharIndex.push_back(1);
58 } else {
59 if (indexSize <= 0)
60 continue;
61 m_CharIndex[indexSize - 1] += 1;
62 }
63 } else {
64 if (indexSize % 2) {
65 if (indexSize <= 0)
66 continue;
67 m_CharIndex[indexSize - 1] = i + 1;
68 } else {
69 m_CharIndex.push_back(i + 1);
70 }
71 }
72 }
73 int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
74 if (indexSize % 2)
75 m_CharIndex.erase(m_CharIndex.begin() + indexSize - 1);
76 }
77
~CPDF_TextPageFind()78 CPDF_TextPageFind::~CPDF_TextPageFind() {}
79
GetCharIndex(int index) const80 int CPDF_TextPageFind::GetCharIndex(int index) const {
81 return m_pTextPage->CharIndexFromTextIndex(index);
82 }
83
FindFirst(const CFX_WideString & findwhat,int flags,int startPos)84 bool CPDF_TextPageFind::FindFirst(const CFX_WideString& findwhat,
85 int flags,
86 int startPos) {
87 if (!m_pTextPage)
88 return false;
89 if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE))
90 m_strText = m_pTextPage->GetPageText();
91 CFX_WideString findwhatStr = findwhat;
92 m_findWhat = findwhatStr;
93 m_flags = flags;
94 m_bMatchCase = flags & FPDFTEXT_MATCHCASE;
95 if (m_strText.IsEmpty()) {
96 m_IsFind = false;
97 return true;
98 }
99 FX_STRSIZE len = findwhatStr.GetLength();
100 if (!m_bMatchCase) {
101 findwhatStr.MakeLower();
102 m_strText.MakeLower();
103 }
104 m_bMatchWholeWord = !!(flags & FPDFTEXT_MATCHWHOLEWORD);
105 m_findNextStart = startPos;
106 if (startPos == -1)
107 m_findPreStart = m_strText.GetLength() - 1;
108 else
109 m_findPreStart = startPos;
110 m_csFindWhatArray.clear();
111 int i = 0;
112 while (i < len) {
113 if (findwhatStr.GetAt(i) != ' ')
114 break;
115 i++;
116 }
117 if (i < len)
118 ExtractFindWhat(findwhatStr);
119 else
120 m_csFindWhatArray.push_back(findwhatStr);
121 if (m_csFindWhatArray.empty())
122 return false;
123 m_IsFind = true;
124 m_resStart = 0;
125 m_resEnd = -1;
126 return true;
127 }
128
FindNext()129 bool CPDF_TextPageFind::FindNext() {
130 if (!m_pTextPage)
131 return false;
132 m_resArray.clear();
133 if (m_findNextStart == -1)
134 return false;
135 if (m_strText.IsEmpty()) {
136 m_IsFind = false;
137 return m_IsFind;
138 }
139 int strLen = m_strText.GetLength();
140 if (m_findNextStart > strLen - 1) {
141 m_IsFind = false;
142 return m_IsFind;
143 }
144 int nCount = pdfium::CollectionSize<int>(m_csFindWhatArray);
145 int nResultPos = 0;
146 int nStartPos = 0;
147 nStartPos = m_findNextStart;
148 bool bSpaceStart = false;
149 for (int iWord = 0; iWord < nCount; iWord++) {
150 CFX_WideString csWord = m_csFindWhatArray[iWord];
151 if (csWord.IsEmpty()) {
152 if (iWord == nCount - 1) {
153 FX_WCHAR strInsert = m_strText.GetAt(nStartPos);
154 if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_SPACE_CHAR ||
155 strInsert == TEXT_RETURN_CHAR || strInsert == 160) {
156 nResultPos = nStartPos + 1;
157 break;
158 }
159 iWord = -1;
160 } else if (iWord == 0) {
161 bSpaceStart = true;
162 }
163 continue;
164 }
165 int endIndex;
166 nResultPos = m_strText.Find(csWord.c_str(), nStartPos);
167 if (nResultPos == -1) {
168 m_IsFind = false;
169 return m_IsFind;
170 }
171 endIndex = nResultPos + csWord.GetLength() - 1;
172 if (iWord == 0)
173 m_resStart = nResultPos;
174 bool bMatch = true;
175 if (iWord != 0 && !bSpaceStart) {
176 int PreResEndPos = nStartPos;
177 int curChar = csWord.GetAt(0);
178 CFX_WideString lastWord = m_csFindWhatArray[iWord - 1];
179 int lastChar = lastWord.GetAt(lastWord.GetLength() - 1);
180 if (nStartPos == nResultPos &&
181 !(IsIgnoreSpaceCharacter(lastChar) ||
182 IsIgnoreSpaceCharacter(curChar))) {
183 bMatch = false;
184 }
185 for (int d = PreResEndPos; d < nResultPos; d++) {
186 FX_WCHAR strInsert = m_strText.GetAt(d);
187 if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
188 strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
189 bMatch = false;
190 break;
191 }
192 }
193 } else if (bSpaceStart) {
194 if (nResultPos > 0) {
195 FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1);
196 if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
197 strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
198 bMatch = false;
199 m_resStart = nResultPos;
200 } else {
201 m_resStart = nResultPos - 1;
202 }
203 }
204 }
205 if (m_bMatchWholeWord && bMatch) {
206 bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex);
207 }
208 nStartPos = endIndex + 1;
209 if (!bMatch) {
210 iWord = -1;
211 if (bSpaceStart)
212 nStartPos = m_resStart + m_csFindWhatArray[1].GetLength();
213 else
214 nStartPos = m_resStart + m_csFindWhatArray[0].GetLength();
215 }
216 }
217 m_resEnd = nResultPos + m_csFindWhatArray.back().GetLength() - 1;
218 m_IsFind = true;
219 int resStart = GetCharIndex(m_resStart);
220 int resEnd = GetCharIndex(m_resEnd);
221 m_resArray = m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1);
222 if (m_flags & FPDFTEXT_CONSECUTIVE) {
223 m_findNextStart = m_resStart + 1;
224 m_findPreStart = m_resEnd - 1;
225 } else {
226 m_findNextStart = m_resEnd + 1;
227 m_findPreStart = m_resStart - 1;
228 }
229 return m_IsFind;
230 }
231
FindPrev()232 bool CPDF_TextPageFind::FindPrev() {
233 if (!m_pTextPage)
234 return false;
235 m_resArray.clear();
236 if (m_strText.IsEmpty() || m_findPreStart < 0) {
237 m_IsFind = false;
238 return m_IsFind;
239 }
240 CPDF_TextPageFind findEngine(m_pTextPage);
241 bool ret = findEngine.FindFirst(m_findWhat, m_flags);
242 if (!ret) {
243 m_IsFind = false;
244 return m_IsFind;
245 }
246 int order = -1, MatchedCount = 0;
247 while (ret) {
248 ret = findEngine.FindNext();
249 if (ret) {
250 int order1 = findEngine.GetCurOrder();
251 int MatchedCount1 = findEngine.GetMatchedCount();
252 if (((order1 + MatchedCount1) - 1) > m_findPreStart)
253 break;
254 order = order1;
255 MatchedCount = MatchedCount1;
256 }
257 }
258 if (order == -1) {
259 m_IsFind = false;
260 return m_IsFind;
261 }
262 m_resStart = m_pTextPage->TextIndexFromCharIndex(order);
263 m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1);
264 m_IsFind = true;
265 m_resArray = m_pTextPage->GetRectArray(order, MatchedCount);
266 if (m_flags & FPDFTEXT_CONSECUTIVE) {
267 m_findNextStart = m_resStart + 1;
268 m_findPreStart = m_resEnd - 1;
269 } else {
270 m_findNextStart = m_resEnd + 1;
271 m_findPreStart = m_resStart - 1;
272 }
273 return m_IsFind;
274 }
275
ExtractFindWhat(const CFX_WideString & findwhat)276 void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) {
277 if (findwhat.IsEmpty())
278 return;
279 int index = 0;
280 while (1) {
281 CFX_WideString csWord = TEXT_EMPTY;
282 int ret =
283 ExtractSubString(csWord, findwhat.c_str(), index, TEXT_SPACE_CHAR);
284 if (csWord.IsEmpty()) {
285 if (ret) {
286 m_csFindWhatArray.push_back(L"");
287 index++;
288 continue;
289 } else {
290 break;
291 }
292 }
293 int pos = 0;
294 while (pos < csWord.GetLength()) {
295 CFX_WideString curStr = csWord.Mid(pos, 1);
296 FX_WCHAR curChar = csWord.GetAt(pos);
297 if (IsIgnoreSpaceCharacter(curChar)) {
298 if (pos > 0 && curChar == 0x2019) {
299 pos++;
300 continue;
301 }
302 if (pos > 0)
303 m_csFindWhatArray.push_back(csWord.Mid(0, pos));
304 m_csFindWhatArray.push_back(curStr);
305 if (pos == csWord.GetLength() - 1) {
306 csWord.clear();
307 break;
308 }
309 csWord = csWord.Right(csWord.GetLength() - pos - 1);
310 pos = 0;
311 continue;
312 }
313 pos++;
314 }
315 if (!csWord.IsEmpty())
316 m_csFindWhatArray.push_back(csWord);
317 index++;
318 }
319 }
320
IsMatchWholeWord(const CFX_WideString & csPageText,int startPos,int endPos)321 bool CPDF_TextPageFind::IsMatchWholeWord(const CFX_WideString& csPageText,
322 int startPos,
323 int endPos) {
324 FX_WCHAR char_left = 0;
325 FX_WCHAR char_right = 0;
326 int char_count = endPos - startPos + 1;
327 if (char_count < 1)
328 return false;
329 if (char_count == 1 && csPageText.GetAt(startPos) > 255)
330 return true;
331 if (startPos - 1 >= 0)
332 char_left = csPageText.GetAt(startPos - 1);
333 if (startPos + char_count < csPageText.GetLength())
334 char_right = csPageText.GetAt(startPos + char_count);
335 if ((char_left > 'A' && char_left < 'a') ||
336 (char_left > 'a' && char_left < 'z') ||
337 (char_left > 0xfb00 && char_left < 0xfb06) || std::iswdigit(char_left) ||
338 (char_right > 'A' && char_right < 'a') ||
339 (char_right > 'a' && char_right < 'z') ||
340 (char_right > 0xfb00 && char_right < 0xfb06) ||
341 std::iswdigit(char_right)) {
342 return false;
343 }
344 if (!(('A' > char_left || char_left > 'Z') &&
345 ('a' > char_left || char_left > 'z') &&
346 ('A' > char_right || char_right > 'Z') &&
347 ('a' > char_right || char_right > 'z'))) {
348 return false;
349 }
350 if (char_count > 0) {
351 if (csPageText.GetAt(startPos) >= L'0' &&
352 csPageText.GetAt(startPos) <= L'9' && char_left >= L'0' &&
353 char_left <= L'9') {
354 return false;
355 }
356 if (csPageText.GetAt(endPos) >= L'0' && csPageText.GetAt(endPos) <= L'9' &&
357 char_right >= L'0' && char_right <= L'9') {
358 return false;
359 }
360 }
361 return true;
362 }
363
ExtractSubString(CFX_WideString & rString,const FX_WCHAR * lpszFullString,int iSubString,FX_WCHAR chSep)364 bool CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString,
365 const FX_WCHAR* lpszFullString,
366 int iSubString,
367 FX_WCHAR chSep) {
368 if (!lpszFullString)
369 return false;
370 while (iSubString--) {
371 lpszFullString = std::wcschr(lpszFullString, chSep);
372 if (!lpszFullString) {
373 rString.clear();
374 return false;
375 }
376 lpszFullString++;
377 while (*lpszFullString == chSep)
378 lpszFullString++;
379 }
380 const FX_WCHAR* lpchEnd = std::wcschr(lpszFullString, chSep);
381 int nLen = lpchEnd ? (int)(lpchEnd - lpszFullString)
382 : (int)FXSYS_wcslen(lpszFullString);
383 ASSERT(nLen >= 0);
384 FXSYS_memcpy(rString.GetBuffer(nLen), lpszFullString,
385 nLen * sizeof(FX_WCHAR));
386 rString.ReleaseBuffer();
387 return true;
388 }
389
// Returns a copy of |str| with its characters in reverse order.
CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString& str) {
  CFX_WideString csReversed;
  int iSrc = str.GetLength();
  while (iSrc > 0) {
    --iSrc;
    csReversed += str.GetAt(iSrc);
  }
  return csReversed;
}
398
GetCurOrder() const399 int CPDF_TextPageFind::GetCurOrder() const {
400 return GetCharIndex(m_resStart);
401 }
402
GetMatchedCount() const403 int CPDF_TextPageFind::GetMatchedCount() const {
404 int resStart = GetCharIndex(m_resStart);
405 int resEnd = GetCharIndex(m_resEnd);
406 return resEnd - resStart + 1;
407 }
408