1 // Copyright 2016 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fpdftext/cpdf_textpagefind.h"
8
9 #include <wchar.h>
10
11 #include <vector>
12
13 #include "core/fpdftext/cpdf_textpage.h"
14 #include "core/fxcrt/check.h"
15 #include "core/fxcrt/compiler_specific.h"
16 #include "core/fxcrt/fx_extension.h"
17 #include "core/fxcrt/fx_string.h"
18 #include "core/fxcrt/fx_system.h"
19 #include "core/fxcrt/fx_unicode.h"
20 #include "core/fxcrt/ptr_util.h"
21 #include "core/fxcrt/stl_util.h"
22
23 namespace {
24
// U+00A0 NO-BREAK SPACE (decimal 160). FindNext() accepts it, alongside
// ' ', '\n' and '\r', as a separator between search terms.
constexpr wchar_t kNonBreakingSpace = 0x00A0;
26
// Returns false for every character below 255, for U+2113 (decimal 8467) and
// for characters inside one of the listed code point ranges; returns true for
// everything else.
//
// Callers in this file use a true result to (a) split such a character into
// its own search token and (b) allow two adjacent search terms to match
// without whitespace between them.
bool IsIgnoreSpaceCharacter(wchar_t curChar) {
  struct CodePointRange {
    wchar_t first;
    wchar_t last;  // Inclusive.
  };
  static constexpr CodePointRange kExcludedRanges[] = {
      {0x0600, 0x06FF},  // Arabic
      {0xFE70, 0xFEFF},  // Arabic Presentation Forms-B
      {0xFB50, 0xFDFF},  // Arabic Presentation Forms-A
      {0x0400, 0x04FF},  // Cyrillic
      {0x0500, 0x052F},  // Cyrillic Supplement
      {0xA640, 0xA69F},  // Cyrillic Extended-B
      {0x2DE0, 0x2DFF},  // Cyrillic Extended-A
      {0x2000, 0x206F},  // General Punctuation
  };

  if (curChar < 255 || curChar == 8467)
    return false;

  for (const CodePointRange& range : kExcludedRanges) {
    if (curChar >= range.first && curChar <= range.last)
      return false;
  }
  return true;
}
40
IsMatchWholeWord(const WideString & csPageText,size_t startPos,size_t endPos)41 bool IsMatchWholeWord(const WideString& csPageText,
42 size_t startPos,
43 size_t endPos) {
44 if (startPos > endPos)
45 return false;
46 wchar_t char_left = 0;
47 wchar_t char_right = 0;
48 size_t char_count = endPos - startPos + 1;
49 if (char_count == 0)
50 return false;
51 if (char_count == 1 && csPageText[startPos] > 255)
52 return true;
53 if (startPos >= 1)
54 char_left = csPageText[startPos - 1];
55 if (startPos + char_count < csPageText.GetLength())
56 char_right = csPageText[startPos + char_count];
57 if ((char_left > 'A' && char_left < 'a') ||
58 (char_left > 'a' && char_left < 'z') ||
59 (char_left > 0xfb00 && char_left < 0xfb06) ||
60 FXSYS_IsDecimalDigit(char_left) ||
61 (char_right > 'A' && char_right < 'a') ||
62 (char_right > 'a' && char_right < 'z') ||
63 (char_right > 0xfb00 && char_right < 0xfb06) ||
64 FXSYS_IsDecimalDigit(char_right)) {
65 return false;
66 }
67 if (!(('A' > char_left || char_left > 'Z') &&
68 ('a' > char_left || char_left > 'z') &&
69 ('A' > char_right || char_right > 'Z') &&
70 ('a' > char_right || char_right > 'z'))) {
71 return false;
72 }
73 if (char_count > 0) {
74 if (FXSYS_IsDecimalDigit(char_left) &&
75 FXSYS_IsDecimalDigit(csPageText[startPos])) {
76 return false;
77 }
78 if (FXSYS_IsDecimalDigit(char_right) &&
79 FXSYS_IsDecimalDigit(csPageText[endPos])) {
80 return false;
81 }
82 }
83 return true;
84 }
85
// Returns |wsOriginal| unchanged when |bMatchCase| is set, otherwise a
// lower-cased copy, so that later comparisons can be done on plain strings.
WideString GetStringCase(const WideString& wsOriginal, bool bMatchCase) {
  WideString result = wsOriginal;
  if (!bMatchCase)
    result.MakeLower();
  return result;
}
94
ExtractSubString(const wchar_t * lpszFullString,int iSubString)95 std::optional<WideString> ExtractSubString(const wchar_t* lpszFullString,
96 int iSubString) {
97 DCHECK(lpszFullString);
98 UNSAFE_TODO({
99 while (iSubString--) {
100 lpszFullString = wcschr(lpszFullString, L' ');
101 if (!lpszFullString) {
102 return std::nullopt;
103 }
104
105 lpszFullString++;
106 while (*lpszFullString == L' ') {
107 lpszFullString++;
108 }
109 }
110
111 const wchar_t* lpchEnd = wcschr(lpszFullString, L' ');
112 int nLen = lpchEnd ? static_cast<int>(lpchEnd - lpszFullString)
113 : static_cast<int>(wcslen(lpszFullString));
114 if (nLen < 0) {
115 return std::nullopt;
116 }
117
118 return WideString(lpszFullString, static_cast<size_t>(nLen));
119 });
120 }
121
ExtractFindWhat(const WideString & findwhat)122 std::vector<WideString> ExtractFindWhat(const WideString& findwhat) {
123 std::vector<WideString> findwhat_array;
124
125 size_t len = findwhat.GetLength();
126 size_t i = 0;
127 for (i = 0; i < len; ++i)
128 if (findwhat[i] != ' ')
129 break;
130 if (i == len) {
131 findwhat_array.push_back(findwhat);
132 return findwhat_array;
133 }
134
135 int index = 0;
136 while (true) {
137 std::optional<WideString> word = ExtractSubString(findwhat.c_str(), index);
138 if (!word.has_value())
139 break;
140
141 if (word->IsEmpty()) {
142 findwhat_array.push_back(L"");
143 index++;
144 continue;
145 }
146
147 size_t pos = 0;
148 while (pos < word->GetLength()) {
149 WideString curStr = word->Substr(pos, 1);
150 wchar_t curChar = word.value()[pos];
151 if (IsIgnoreSpaceCharacter(curChar)) {
152 if (pos > 0 && curChar == pdfium::unicode::kRightSingleQuotationMark) {
153 pos++;
154 continue;
155 }
156 if (pos > 0)
157 findwhat_array.push_back(word->First(pos));
158 findwhat_array.push_back(curStr);
159 if (pos == word->GetLength() - 1) {
160 word->clear();
161 break;
162 }
163 word.emplace(word->Last(word->GetLength() - pos - 1));
164 pos = 0;
165 continue;
166 }
167 pos++;
168 }
169
170 if (!word->IsEmpty())
171 findwhat_array.push_back(word.value());
172 index++;
173 }
174 return findwhat_array;
175 }
176
177 } // namespace
178
179 // static
Create(const CPDF_TextPage * pTextPage,const WideString & findwhat,const Options & options,std::optional<size_t> startPos)180 std::unique_ptr<CPDF_TextPageFind> CPDF_TextPageFind::Create(
181 const CPDF_TextPage* pTextPage,
182 const WideString& findwhat,
183 const Options& options,
184 std::optional<size_t> startPos) {
185 std::vector<WideString> findwhat_array =
186 ExtractFindWhat(GetStringCase(findwhat, options.bMatchCase));
187 auto find = pdfium::WrapUnique(
188 new CPDF_TextPageFind(pTextPage, findwhat_array, options, startPos));
189 find->FindFirst();
190 return find;
191 }
192
CPDF_TextPageFind(const CPDF_TextPage * pTextPage,const std::vector<WideString> & findwhat_array,const Options & options,std::optional<size_t> startPos)193 CPDF_TextPageFind::CPDF_TextPageFind(
194 const CPDF_TextPage* pTextPage,
195 const std::vector<WideString>& findwhat_array,
196 const Options& options,
197 std::optional<size_t> startPos)
198 : m_pTextPage(pTextPage),
199 m_strText(GetStringCase(pTextPage->GetAllPageText(), options.bMatchCase)),
200 m_csFindWhatArray(findwhat_array),
201 m_options(options) {
202 if (!m_strText.IsEmpty()) {
203 m_findNextStart = startPos;
204 m_findPreStart = startPos.value_or(m_strText.GetLength() - 1);
205 }
206 }
207
// Defaulted out of line; no explicit cleanup is performed here.
CPDF_TextPageFind::~CPDF_TextPageFind() = default;
209
GetCharIndex(int index) const210 int CPDF_TextPageFind::GetCharIndex(int index) const {
211 return m_pTextPage->CharIndexFromTextIndex(index);
212 }
213
FindFirst()214 bool CPDF_TextPageFind::FindFirst() {
215 return m_strText.IsEmpty() || !m_csFindWhatArray.empty();
216 }
217
FindNext()218 bool CPDF_TextPageFind::FindNext() {
219 if (m_strText.IsEmpty() || !m_findNextStart.has_value())
220 return false;
221
222 const size_t strLen = m_strText.GetLength();
223 size_t nStartPos = m_findNextStart.value();
224 if (nStartPos >= strLen) {
225 return false;
226 }
227
228 int nCount = fxcrt::CollectionSize<int>(m_csFindWhatArray);
229 std::optional<size_t> nResultPos = 0;
230 bool bSpaceStart = false;
231 for (int iWord = 0; iWord < nCount; iWord++) {
232 WideString csWord = m_csFindWhatArray[iWord];
233 if (csWord.IsEmpty()) {
234 if (iWord == nCount - 1) {
235 if (nStartPos >= strLen) {
236 return false;
237 }
238 wchar_t strInsert = m_strText[nStartPos];
239 if (strInsert == L'\n' || strInsert == L' ' || strInsert == L'\r' ||
240 strInsert == kNonBreakingSpace) {
241 nResultPos = nStartPos + 1;
242 break;
243 }
244 iWord = -1;
245 } else if (iWord == 0) {
246 bSpaceStart = true;
247 }
248 continue;
249 }
250 nResultPos = m_strText.Find(csWord.AsStringView(), nStartPos);
251 if (!nResultPos.has_value())
252 return false;
253
254 size_t endIndex = nResultPos.value() + csWord.GetLength() - 1;
255 if (iWord == 0)
256 m_resStart = nResultPos.value();
257 bool bMatch = true;
258 if (iWord != 0 && !bSpaceStart) {
259 size_t PreResEndPos = nStartPos;
260 int curChar = csWord[0];
261 WideString lastWord = m_csFindWhatArray[iWord - 1];
262 int lastChar = lastWord.Back();
263 if (nStartPos == nResultPos.value() &&
264 !(IsIgnoreSpaceCharacter(lastChar) ||
265 IsIgnoreSpaceCharacter(curChar))) {
266 bMatch = false;
267 }
268 for (size_t d = PreResEndPos; d < nResultPos.value(); d++) {
269 wchar_t strInsert = m_strText[d];
270 if (strInsert != L'\n' && strInsert != L' ' && strInsert != L'\r' &&
271 strInsert != kNonBreakingSpace) {
272 bMatch = false;
273 break;
274 }
275 }
276 } else if (bSpaceStart) {
277 if (nResultPos.value() > 0) {
278 wchar_t strInsert = m_strText[nResultPos.value() - 1];
279 if (strInsert != L'\n' && strInsert != L' ' && strInsert != L'\r' &&
280 strInsert != kNonBreakingSpace) {
281 bMatch = false;
282 m_resStart = nResultPos.value();
283 } else {
284 m_resStart = nResultPos.value() - 1;
285 }
286 }
287 }
288 if (m_options.bMatchWholeWord && bMatch)
289 bMatch = IsMatchWholeWord(m_strText, nResultPos.value(), endIndex);
290
291 if (bMatch) {
292 nStartPos = endIndex + 1;
293 } else {
294 iWord = -1;
295 size_t index = bSpaceStart ? 1 : 0;
296 nStartPos = m_resStart + m_csFindWhatArray[index].GetLength();
297 }
298 }
299 m_resEnd = nResultPos.value() + m_csFindWhatArray.back().GetLength() - 1;
300 if (m_options.bConsecutive) {
301 m_findNextStart = m_resStart + 1;
302 m_findPreStart = m_resEnd - 1;
303 } else {
304 m_findNextStart = m_resEnd + 1;
305 m_findPreStart = m_resStart - 1;
306 }
307 return true;
308 }
309
FindPrev()310 bool CPDF_TextPageFind::FindPrev() {
311 if (m_strText.IsEmpty() || !m_findPreStart.has_value())
312 return false;
313
314 CPDF_TextPageFind find_engine(m_pTextPage, m_csFindWhatArray, m_options, 0);
315 if (!find_engine.FindFirst())
316 return false;
317
318 int order = -1;
319 int matches = 0;
320 while (find_engine.FindNext()) {
321 int cur_order = find_engine.GetCurOrder();
322 int cur_match = find_engine.GetMatchedCount();
323 int temp = cur_order + cur_match;
324 if (temp < 0 || static_cast<size_t>(temp) > m_findPreStart.value() + 1)
325 break;
326
327 order = cur_order;
328 matches = cur_match;
329 }
330 if (order == -1)
331 return false;
332
333 m_resStart = m_pTextPage->TextIndexFromCharIndex(order);
334 m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + matches - 1);
335 if (m_options.bConsecutive) {
336 m_findNextStart = m_resStart + 1;
337 m_findPreStart = m_resEnd - 1;
338 } else {
339 m_findNextStart = m_resEnd + 1;
340 m_findPreStart = m_resStart - 1;
341 }
342 return true;
343 }
344
GetCurOrder() const345 int CPDF_TextPageFind::GetCurOrder() const {
346 return GetCharIndex(m_resStart);
347 }
348
GetMatchedCount() const349 int CPDF_TextPageFind::GetMatchedCount() const {
350 int resStart = GetCharIndex(m_resStart);
351 int resEnd = GetCharIndex(m_resEnd);
352 return resEnd - resStart + 1;
353 }
354