1 // Copyright 2016 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fpdftext/cpdf_textpagefind.h"
8
9 #include <cwchar>
10 #include <cwctype>
11 #include <vector>
12
13 #include "core/fpdftext/cpdf_textpage.h"
14 #include "core/fxcrt/fx_extension.h"
15 #include "core/fxcrt/fx_string.h"
16 #include "core/fxcrt/fx_system.h"
17 #include "third_party/base/ptr_util.h"
18 #include "third_party/base/stl_util.h"
19
20 namespace {
21
// U+00A0 NO-BREAK SPACE. CPDF_TextPageFind::FindNext() accepts it, together
// with ' ', '\n' and '\r', as whitespace between matched words.
constexpr wchar_t kNonBreakingSpace = 0x00A0;
23
// Returns false for every character below 255, for U+2113, and for characters
// in the Arabic, Cyrillic and General Punctuation ranges listed below; returns
// true for everything else.
//
// ExtractFindWhat() splits a search word around each character this function
// accepts, and FindNext() lets two adjacent search tokens touch without
// whitespace when either boundary character is accepted.
// NOTE(review): the name suggests "characters of scripts that do not separate
// words with spaces" - confirm the intent before extending the table.
bool IsIgnoreSpaceCharacter(wchar_t curChar) {
  struct CodePointRange {
    wchar_t first;
    wchar_t last;
  };
  static constexpr CodePointRange kExcludedRanges[] = {
      {0x0600, 0x06FF},  // Arabic
      {0xFE70, 0xFEFF},  // Arabic Presentation Forms-B
      {0xFB50, 0xFDFF},  // Arabic Presentation Forms-A
      {0x0400, 0x04FF},  // Cyrillic
      {0x0500, 0x052F},  // Cyrillic Supplement
      {0xA640, 0xA69F},  // Cyrillic Extended-B
      {0x2DE0, 0x2DFF},  // Cyrillic Extended-A
      {0x2000, 0x206F},  // General Punctuation
  };

  // 0x2113 (8467) is SCRIPT SMALL L.
  if (curChar < 255 || curChar == 0x2113)
    return false;

  for (const CodePointRange& range : kExcludedRanges) {
    if (curChar >= range.first && curChar <= range.last)
      return false;
  }
  return true;
}
37
IsMatchWholeWord(const WideString & csPageText,size_t startPos,size_t endPos)38 bool IsMatchWholeWord(const WideString& csPageText,
39 size_t startPos,
40 size_t endPos) {
41 if (startPos > endPos)
42 return false;
43 wchar_t char_left = 0;
44 wchar_t char_right = 0;
45 size_t char_count = endPos - startPos + 1;
46 if (char_count == 0)
47 return false;
48 if (char_count == 1 && csPageText[startPos] > 255)
49 return true;
50 if (startPos >= 1)
51 char_left = csPageText[startPos - 1];
52 if (startPos + char_count < csPageText.GetLength())
53 char_right = csPageText[startPos + char_count];
54 if ((char_left > 'A' && char_left < 'a') ||
55 (char_left > 'a' && char_left < 'z') ||
56 (char_left > 0xfb00 && char_left < 0xfb06) ||
57 FXSYS_IsDecimalDigit(char_left) ||
58 (char_right > 'A' && char_right < 'a') ||
59 (char_right > 'a' && char_right < 'z') ||
60 (char_right > 0xfb00 && char_right < 0xfb06) ||
61 FXSYS_IsDecimalDigit(char_right)) {
62 return false;
63 }
64 if (!(('A' > char_left || char_left > 'Z') &&
65 ('a' > char_left || char_left > 'z') &&
66 ('A' > char_right || char_right > 'Z') &&
67 ('a' > char_right || char_right > 'z'))) {
68 return false;
69 }
70 if (char_count > 0) {
71 if (FXSYS_IsDecimalDigit(char_left) &&
72 FXSYS_IsDecimalDigit(csPageText[startPos])) {
73 return false;
74 }
75 if (FXSYS_IsDecimalDigit(char_right) &&
76 FXSYS_IsDecimalDigit(csPageText[endPos])) {
77 return false;
78 }
79 }
80 return true;
81 }
82
// Returns |wsOriginal| unchanged when |bMatchCase| is set, otherwise a copy
// converted with WideString::MakeLower(). Both the page text and the search
// words are passed through this so they compare consistently.
WideString GetStringCase(const WideString& wsOriginal, bool bMatchCase) {
  WideString result = wsOriginal;
  if (!bMatchCase)
    result.MakeLower();
  return result;
}
91
ExtractSubString(const wchar_t * lpszFullString,int iSubString)92 Optional<WideString> ExtractSubString(const wchar_t* lpszFullString,
93 int iSubString) {
94 ASSERT(lpszFullString);
95
96 while (iSubString--) {
97 lpszFullString = std::wcschr(lpszFullString, L' ');
98 if (!lpszFullString)
99 return {};
100
101 lpszFullString++;
102 while (*lpszFullString == L' ')
103 lpszFullString++;
104 }
105
106 const wchar_t* lpchEnd = std::wcschr(lpszFullString, L' ');
107 int nLen = lpchEnd ? static_cast<int>(lpchEnd - lpszFullString)
108 : static_cast<int>(wcslen(lpszFullString));
109 if (nLen < 0)
110 return {};
111
112 return WideString(lpszFullString, static_cast<size_t>(nLen));
113 }
114
ExtractFindWhat(const WideString & findwhat)115 std::vector<WideString> ExtractFindWhat(const WideString& findwhat) {
116 std::vector<WideString> findwhat_array;
117
118 size_t len = findwhat.GetLength();
119 size_t i = 0;
120 for (i = 0; i < len; ++i)
121 if (findwhat[i] != ' ')
122 break;
123 if (i == len) {
124 findwhat_array.push_back(findwhat);
125 return findwhat_array;
126 }
127
128 int index = 0;
129 while (1) {
130 Optional<WideString> word = ExtractSubString(findwhat.c_str(), index);
131 if (!word)
132 break;
133
134 if (word->IsEmpty()) {
135 findwhat_array.push_back(L"");
136 index++;
137 continue;
138 }
139
140 size_t pos = 0;
141 while (pos < word->GetLength()) {
142 WideString curStr = word->Substr(pos, 1);
143 wchar_t curChar = (*word)[pos];
144 if (IsIgnoreSpaceCharacter(curChar)) {
145 if (pos > 0 && curChar == 0x2019) {
146 pos++;
147 continue;
148 }
149 if (pos > 0)
150 findwhat_array.push_back(word->First(pos));
151 findwhat_array.push_back(curStr);
152 if (pos == word->GetLength() - 1) {
153 word->clear();
154 break;
155 }
156 word.emplace(word->Last(word->GetLength() - pos - 1));
157 pos = 0;
158 continue;
159 }
160 pos++;
161 }
162
163 if (!word->IsEmpty())
164 findwhat_array.push_back(word.value());
165 index++;
166 }
167 return findwhat_array;
168 }
169
170 } // namespace
171
172 // static
Create(const CPDF_TextPage * pTextPage,const WideString & findwhat,const Options & options,Optional<size_t> startPos)173 std::unique_ptr<CPDF_TextPageFind> CPDF_TextPageFind::Create(
174 const CPDF_TextPage* pTextPage,
175 const WideString& findwhat,
176 const Options& options,
177 Optional<size_t> startPos) {
178 std::vector<WideString> findwhat_array =
179 ExtractFindWhat(GetStringCase(findwhat, options.bMatchCase));
180 auto find = pdfium::WrapUnique(
181 new CPDF_TextPageFind(pTextPage, findwhat_array, options, startPos));
182 find->FindFirst();
183 return find;
184 }
185
CPDF_TextPageFind(const CPDF_TextPage * pTextPage,const std::vector<WideString> & findwhat_array,const Options & options,Optional<size_t> startPos)186 CPDF_TextPageFind::CPDF_TextPageFind(
187 const CPDF_TextPage* pTextPage,
188 const std::vector<WideString>& findwhat_array,
189 const Options& options,
190 Optional<size_t> startPos)
191 : m_pTextPage(pTextPage),
192 m_strText(GetStringCase(pTextPage->GetAllPageText(), options.bMatchCase)),
193 m_csFindWhatArray(findwhat_array),
194 m_options(options) {
195 if (!m_strText.IsEmpty()) {
196 m_findNextStart = startPos;
197 m_findPreStart = startPos.value_or(m_strText.GetLength() - 1);
198 }
199 }
200
// Defaulted out of line; no custom teardown is performed here.
CPDF_TextPageFind::~CPDF_TextPageFind() = default;
202
GetCharIndex(int index) const203 int CPDF_TextPageFind::GetCharIndex(int index) const {
204 return m_pTextPage->CharIndexFromTextIndex(index);
205 }
206
FindFirst()207 bool CPDF_TextPageFind::FindFirst() {
208 return m_strText.IsEmpty() || !m_csFindWhatArray.empty();
209 }
210
FindNext()211 bool CPDF_TextPageFind::FindNext() {
212 if (m_strText.IsEmpty() || !m_findNextStart.has_value())
213 return false;
214
215 size_t strLen = m_strText.GetLength();
216 if (m_findNextStart.value() > strLen - 1)
217 return false;
218
219 int nCount = pdfium::CollectionSize<int>(m_csFindWhatArray);
220 Optional<size_t> nResultPos = 0;
221 size_t nStartPos = m_findNextStart.value();
222 bool bSpaceStart = false;
223 for (int iWord = 0; iWord < nCount; iWord++) {
224 WideString csWord = m_csFindWhatArray[iWord];
225 if (csWord.IsEmpty()) {
226 if (iWord == nCount - 1) {
227 wchar_t strInsert = m_strText[nStartPos];
228 if (strInsert == L'\n' || strInsert == L' ' || strInsert == L'\r' ||
229 strInsert == kNonBreakingSpace) {
230 nResultPos = nStartPos + 1;
231 break;
232 }
233 iWord = -1;
234 } else if (iWord == 0) {
235 bSpaceStart = true;
236 }
237 continue;
238 }
239 nResultPos = m_strText.Find(csWord.AsStringView(), nStartPos);
240 if (!nResultPos.has_value())
241 return false;
242
243 size_t endIndex = nResultPos.value() + csWord.GetLength() - 1;
244 if (iWord == 0)
245 m_resStart = nResultPos.value();
246 bool bMatch = true;
247 if (iWord != 0 && !bSpaceStart) {
248 size_t PreResEndPos = nStartPos;
249 int curChar = csWord[0];
250 WideString lastWord = m_csFindWhatArray[iWord - 1];
251 int lastChar = lastWord.Back();
252 if (nStartPos == nResultPos.value() &&
253 !(IsIgnoreSpaceCharacter(lastChar) ||
254 IsIgnoreSpaceCharacter(curChar))) {
255 bMatch = false;
256 }
257 for (size_t d = PreResEndPos; d < nResultPos.value(); d++) {
258 wchar_t strInsert = m_strText[d];
259 if (strInsert != L'\n' && strInsert != L' ' && strInsert != L'\r' &&
260 strInsert != kNonBreakingSpace) {
261 bMatch = false;
262 break;
263 }
264 }
265 } else if (bSpaceStart) {
266 if (nResultPos.value() > 0) {
267 wchar_t strInsert = m_strText[nResultPos.value() - 1];
268 if (strInsert != L'\n' && strInsert != L' ' && strInsert != L'\r' &&
269 strInsert != kNonBreakingSpace) {
270 bMatch = false;
271 m_resStart = nResultPos.value();
272 } else {
273 m_resStart = nResultPos.value() - 1;
274 }
275 }
276 }
277 if (m_options.bMatchWholeWord && bMatch)
278 bMatch = IsMatchWholeWord(m_strText, nResultPos.value(), endIndex);
279
280 nStartPos = endIndex + 1;
281 if (!bMatch) {
282 iWord = -1;
283 size_t index = bSpaceStart ? 1 : 0;
284 nStartPos = m_resStart + m_csFindWhatArray[index].GetLength();
285 }
286 }
287 m_resEnd = nResultPos.value() + m_csFindWhatArray.back().GetLength() - 1;
288 if (m_options.bConsecutive) {
289 m_findNextStart = m_resStart + 1;
290 m_findPreStart = m_resEnd - 1;
291 } else {
292 m_findNextStart = m_resEnd + 1;
293 m_findPreStart = m_resStart - 1;
294 }
295 return true;
296 }
297
FindPrev()298 bool CPDF_TextPageFind::FindPrev() {
299 if (m_strText.IsEmpty() || !m_findPreStart.has_value())
300 return false;
301
302 CPDF_TextPageFind find_engine(m_pTextPage.Get(), m_csFindWhatArray, m_options,
303 0);
304 if (!find_engine.FindFirst())
305 return false;
306
307 int order = -1;
308 int matches = 0;
309 while (find_engine.FindNext()) {
310 int cur_order = find_engine.GetCurOrder();
311 int cur_match = find_engine.GetMatchedCount();
312 int temp = cur_order + cur_match;
313 if (temp < 0 || static_cast<size_t>(temp) > m_findPreStart.value() + 1)
314 break;
315
316 order = cur_order;
317 matches = cur_match;
318 }
319 if (order == -1)
320 return false;
321
322 m_resStart = m_pTextPage->TextIndexFromCharIndex(order);
323 m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + matches - 1);
324 if (m_options.bConsecutive) {
325 m_findNextStart = m_resStart + 1;
326 m_findPreStart = m_resEnd - 1;
327 } else {
328 m_findNextStart = m_resEnd + 1;
329 m_findPreStart = m_resStart - 1;
330 }
331 return true;
332 }
333
GetCurOrder() const334 int CPDF_TextPageFind::GetCurOrder() const {
335 return GetCharIndex(m_resStart);
336 }
337
GetMatchedCount() const338 int CPDF_TextPageFind::GetMatchedCount() const {
339 int resStart = GetCharIndex(m_resStart);
340 int resEnd = GetCharIndex(m_resEnd);
341 return resEnd - resStart + 1;
342 }
343