1 // Copyright 2016 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fpdftext/cpdf_textpagefind.h"
8
9 #include <cwchar>
10 #include <cwctype>
11 #include <vector>
12
13 #include "core/fpdftext/cpdf_textpage.h"
14 #include "core/fxcrt/fx_string.h"
15 #include "core/fxcrt/fx_system.h"
16 #include "third_party/base/stl_util.h"
17
namespace {

// Inclusive code point range.
struct CodePointRange {
  int first;
  int last;
};

// Ranges for which IsIgnoreSpaceCharacter() answers false.
constexpr CodePointRange kSpaceSeparatedRanges[] = {
    {0x0600, 0x06FF},  // Arabic
    {0xFE70, 0xFEFF},  // Arabic Presentation Forms-B
    {0xFB50, 0xFDFF},  // Arabic Presentation Forms-A
    {0x0400, 0x04FF},  // Cyrillic
    {0x0500, 0x052F},  // Cyrillic Supplement
    {0xA640, 0xA69F},  // Cyrillic Extended-B
    {0x2DE0, 0x2DFF},  // Cyrillic Extended-A
    {0x2000, 0x206F},  // General Punctuation
};

// Returns false for every character below 255, for U+2113 (8467) and for
// any character inside one of |kSpaceSeparatedRanges|; returns true for
// everything else. The search code in this file treats a "true" character
// as one that may directly follow or precede another search word without a
// separating space, and splits it into its own search token.
bool IsIgnoreSpaceCharacter(wchar_t curChar) {
  if (curChar < 255 || curChar == 8467)
    return false;

  for (const CodePointRange& range : kSpaceSeparatedRanges) {
    if (curChar >= range.first && curChar <= range.last)
      return false;
  }
  return true;
}

}  // namespace
35
CPDF_TextPageFind(const CPDF_TextPage * pTextPage)36 CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage)
37 : m_pTextPage(pTextPage),
38 m_flags(0),
39 m_bMatchCase(false),
40 m_bMatchWholeWord(false),
41 m_resStart(0),
42 m_resEnd(-1),
43 m_IsFind(false) {
44 m_strText = m_pTextPage->GetAllPageText();
45 int nCount = pTextPage->CountChars();
46 if (nCount)
47 m_CharIndex.push_back(0);
48 for (int i = 0; i < nCount; i++) {
49 FPDF_CHAR_INFO info;
50 pTextPage->GetCharInfo(i, &info);
51 int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
52 if (info.m_Flag == FPDFTEXT_CHAR_NORMAL ||
53 info.m_Flag == FPDFTEXT_CHAR_GENERATED) {
54 if (indexSize % 2) {
55 m_CharIndex.push_back(1);
56 } else {
57 if (indexSize <= 0)
58 continue;
59 m_CharIndex[indexSize - 1] += 1;
60 }
61 } else {
62 if (indexSize % 2) {
63 if (indexSize <= 0)
64 continue;
65 m_CharIndex[indexSize - 1] = i + 1;
66 } else {
67 m_CharIndex.push_back(i + 1);
68 }
69 }
70 }
71 int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
72 if (indexSize % 2)
73 m_CharIndex.erase(m_CharIndex.begin() + indexSize - 1);
74 }
75
~CPDF_TextPageFind()76 CPDF_TextPageFind::~CPDF_TextPageFind() {}
77
GetCharIndex(int index) const78 int CPDF_TextPageFind::GetCharIndex(int index) const {
79 return m_pTextPage->CharIndexFromTextIndex(index);
80 }
81
FindFirst(const WideString & findwhat,int flags,Optional<size_t> startPos)82 bool CPDF_TextPageFind::FindFirst(const WideString& findwhat,
83 int flags,
84 Optional<size_t> startPos) {
85 if (!m_pTextPage)
86 return false;
87 if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE))
88 m_strText = m_pTextPage->GetAllPageText();
89 WideString findwhatStr = findwhat;
90 m_findWhat = findwhatStr;
91 m_flags = flags;
92 m_bMatchCase = flags & FPDFTEXT_MATCHCASE;
93 if (m_strText.IsEmpty()) {
94 m_IsFind = false;
95 return true;
96 }
97 size_t len = findwhatStr.GetLength();
98 if (!m_bMatchCase) {
99 findwhatStr.MakeLower();
100 m_strText.MakeLower();
101 }
102 m_bMatchWholeWord = !!(flags & FPDFTEXT_MATCHWHOLEWORD);
103 m_findNextStart = startPos;
104 if (!startPos.has_value()) {
105 if (!m_strText.IsEmpty())
106 m_findPreStart = m_strText.GetLength() - 1;
107 } else {
108 m_findPreStart = startPos;
109 }
110
111 m_csFindWhatArray.clear();
112 size_t i = 0;
113 for (i = 0; i < len; ++i)
114 if (findwhatStr[i] != ' ')
115 break;
116 if (i < len)
117 ExtractFindWhat(findwhatStr);
118 else
119 m_csFindWhatArray.push_back(findwhatStr);
120 if (m_csFindWhatArray.empty())
121 return false;
122
123 m_IsFind = true;
124 m_resStart = 0;
125 m_resEnd = -1;
126 return true;
127 }
128
FindNext()129 bool CPDF_TextPageFind::FindNext() {
130 if (!m_pTextPage)
131 return false;
132 m_resArray.clear();
133 if (!m_findNextStart.has_value())
134 return false;
135 if (m_strText.IsEmpty()) {
136 m_IsFind = false;
137 return m_IsFind;
138 }
139 size_t strLen = m_strText.GetLength();
140 if (m_findNextStart.value() > strLen - 1) {
141 m_IsFind = false;
142 return m_IsFind;
143 }
144 int nCount = pdfium::CollectionSize<int>(m_csFindWhatArray);
145 Optional<size_t> nResultPos = 0;
146 size_t nStartPos = m_findNextStart.value();
147 bool bSpaceStart = false;
148 for (int iWord = 0; iWord < nCount; iWord++) {
149 WideString csWord = m_csFindWhatArray[iWord];
150 if (csWord.IsEmpty()) {
151 if (iWord == nCount - 1) {
152 wchar_t strInsert = m_strText[nStartPos];
153 if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_SPACE_CHAR ||
154 strInsert == TEXT_RETURN_CHAR || strInsert == 160) {
155 nResultPos = nStartPos + 1;
156 break;
157 }
158 iWord = -1;
159 } else if (iWord == 0) {
160 bSpaceStart = true;
161 }
162 continue;
163 }
164 nResultPos = m_strText.Find(csWord.c_str(), nStartPos);
165 if (!nResultPos.has_value()) {
166 m_IsFind = false;
167 return m_IsFind;
168 }
169 size_t endIndex = nResultPos.value() + csWord.GetLength() - 1;
170 if (iWord == 0)
171 m_resStart = nResultPos.value();
172 bool bMatch = true;
173 if (iWord != 0 && !bSpaceStart) {
174 size_t PreResEndPos = nStartPos;
175 int curChar = csWord[0];
176 WideString lastWord = m_csFindWhatArray[iWord - 1];
177 int lastChar = lastWord[lastWord.GetLength() - 1];
178 if (nStartPos == nResultPos.value() &&
179 !(IsIgnoreSpaceCharacter(lastChar) ||
180 IsIgnoreSpaceCharacter(curChar))) {
181 bMatch = false;
182 }
183 for (size_t d = PreResEndPos; d < nResultPos.value(); d++) {
184 wchar_t strInsert = m_strText[d];
185 if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
186 strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
187 bMatch = false;
188 break;
189 }
190 }
191 } else if (bSpaceStart) {
192 if (nResultPos.value() > 0) {
193 wchar_t strInsert = m_strText[nResultPos.value() - 1];
194 if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
195 strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
196 bMatch = false;
197 m_resStart = nResultPos.value();
198 } else {
199 m_resStart = nResultPos.value() - 1;
200 }
201 }
202 }
203 if (m_bMatchWholeWord && bMatch) {
204 bMatch = IsMatchWholeWord(m_strText, nResultPos.value(), endIndex);
205 }
206 nStartPos = endIndex + 1;
207 if (!bMatch) {
208 iWord = -1;
209 if (bSpaceStart)
210 nStartPos = m_resStart + m_csFindWhatArray[1].GetLength();
211 else
212 nStartPos = m_resStart + m_csFindWhatArray[0].GetLength();
213 }
214 }
215 m_resEnd = nResultPos.value() + m_csFindWhatArray.back().GetLength() - 1;
216 m_IsFind = true;
217 int resStart = GetCharIndex(m_resStart);
218 int resEnd = GetCharIndex(m_resEnd);
219 m_resArray = m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1);
220 if (m_flags & FPDFTEXT_CONSECUTIVE) {
221 m_findNextStart = m_resStart + 1;
222 m_findPreStart = m_resEnd - 1;
223 } else {
224 m_findNextStart = m_resEnd + 1;
225 m_findPreStart = m_resStart - 1;
226 }
227 return m_IsFind;
228 }
229
FindPrev()230 bool CPDF_TextPageFind::FindPrev() {
231 if (!m_pTextPage)
232 return false;
233 m_resArray.clear();
234 if (m_strText.IsEmpty() || !m_findPreStart.has_value()) {
235 m_IsFind = false;
236 return m_IsFind;
237 }
238 CPDF_TextPageFind findEngine(m_pTextPage.Get());
239 bool ret = findEngine.FindFirst(m_findWhat, m_flags, Optional<size_t>(0));
240 if (!ret) {
241 m_IsFind = false;
242 return m_IsFind;
243 }
244 int order = -1;
245 int MatchedCount = 0;
246 while (ret) {
247 ret = findEngine.FindNext();
248 if (ret) {
249 int order1 = findEngine.GetCurOrder();
250 int MatchedCount1 = findEngine.GetMatchedCount();
251 int temp = order1 + MatchedCount1;
252 if (temp < 0 || static_cast<size_t>(temp) > m_findPreStart.value() + 1)
253 break;
254 order = order1;
255 MatchedCount = MatchedCount1;
256 }
257 }
258 if (order == -1) {
259 m_IsFind = false;
260 return m_IsFind;
261 }
262 m_resStart = m_pTextPage->TextIndexFromCharIndex(order);
263 m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1);
264 m_IsFind = true;
265 m_resArray = m_pTextPage->GetRectArray(order, MatchedCount);
266 if (m_flags & FPDFTEXT_CONSECUTIVE) {
267 m_findNextStart = m_resStart + 1;
268 m_findPreStart = m_resEnd - 1;
269 } else {
270 m_findNextStart = m_resEnd + 1;
271 m_findPreStart = m_resStart - 1;
272 }
273 return m_IsFind;
274 }
275
ExtractFindWhat(const WideString & findwhat)276 void CPDF_TextPageFind::ExtractFindWhat(const WideString& findwhat) {
277 if (findwhat.IsEmpty())
278 return;
279 int index = 0;
280 while (1) {
281 Optional<WideString> word =
282 ExtractSubString(findwhat.c_str(), index, TEXT_SPACE_CHAR);
283 if (!word)
284 break;
285
286 if (word->IsEmpty()) {
287 m_csFindWhatArray.push_back(L"");
288 index++;
289 continue;
290 }
291
292 size_t pos = 0;
293 while (pos < word->GetLength()) {
294 WideString curStr = word->Mid(pos, 1);
295 wchar_t curChar = word->operator[](pos);
296 if (IsIgnoreSpaceCharacter(curChar)) {
297 if (pos > 0 && curChar == 0x2019) {
298 pos++;
299 continue;
300 }
301 if (pos > 0)
302 m_csFindWhatArray.push_back(word->Left(pos));
303 m_csFindWhatArray.push_back(curStr);
304 if (pos == word->GetLength() - 1) {
305 word->clear();
306 break;
307 }
308 word.emplace(word->Right(word->GetLength() - pos - 1));
309 pos = 0;
310 continue;
311 }
312 pos++;
313 }
314
315 if (!word->IsEmpty())
316 m_csFindWhatArray.push_back(word.value());
317 index++;
318 }
319 }
320
IsMatchWholeWord(const WideString & csPageText,size_t startPos,size_t endPos)321 bool CPDF_TextPageFind::IsMatchWholeWord(const WideString& csPageText,
322 size_t startPos,
323 size_t endPos) {
324 if (startPos > endPos)
325 return false;
326 wchar_t char_left = 0;
327 wchar_t char_right = 0;
328 size_t char_count = endPos - startPos + 1;
329 if (char_count == 0)
330 return false;
331 if (char_count == 1 && csPageText[startPos] > 255)
332 return true;
333 if (startPos >= 1)
334 char_left = csPageText[startPos - 1];
335 if (startPos + char_count < csPageText.GetLength())
336 char_right = csPageText[startPos + char_count];
337 if ((char_left > 'A' && char_left < 'a') ||
338 (char_left > 'a' && char_left < 'z') ||
339 (char_left > 0xfb00 && char_left < 0xfb06) || std::iswdigit(char_left) ||
340 (char_right > 'A' && char_right < 'a') ||
341 (char_right > 'a' && char_right < 'z') ||
342 (char_right > 0xfb00 && char_right < 0xfb06) ||
343 std::iswdigit(char_right)) {
344 return false;
345 }
346 if (!(('A' > char_left || char_left > 'Z') &&
347 ('a' > char_left || char_left > 'z') &&
348 ('A' > char_right || char_right > 'Z') &&
349 ('a' > char_right || char_right > 'z'))) {
350 return false;
351 }
352 if (char_count > 0) {
353 if (std::iswdigit(char_left) && std::iswdigit(csPageText[startPos]))
354 return false;
355 if (std::iswdigit(char_right) && std::iswdigit(csPageText[endPos]))
356 return false;
357 }
358 return true;
359 }
360
ExtractSubString(const wchar_t * lpszFullString,int iSubString,wchar_t chSep)361 Optional<WideString> CPDF_TextPageFind::ExtractSubString(
362 const wchar_t* lpszFullString,
363 int iSubString,
364 wchar_t chSep) {
365 if (!lpszFullString)
366 return {};
367
368 while (iSubString--) {
369 lpszFullString = std::wcschr(lpszFullString, chSep);
370 if (!lpszFullString)
371 return {};
372
373 lpszFullString++;
374 while (*lpszFullString == chSep)
375 lpszFullString++;
376 }
377
378 const wchar_t* lpchEnd = std::wcschr(lpszFullString, chSep);
379 int nLen = lpchEnd ? static_cast<int>(lpchEnd - lpszFullString)
380 : static_cast<int>(wcslen(lpszFullString));
381 if (nLen < 0)
382 return {};
383
384 return {WideString(lpszFullString, static_cast<size_t>(nLen))};
385 }
386
GetCurOrder() const387 int CPDF_TextPageFind::GetCurOrder() const {
388 return GetCharIndex(m_resStart);
389 }
390
GetMatchedCount() const391 int CPDF_TextPageFind::GetMatchedCount() const {
392 int resStart = GetCharIndex(m_resStart);
393 int resEnd = GetCharIndex(m_resEnd);
394 return resEnd - resStart + 1;
395 }
396