• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// Copyright 2016 The PDFium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fpdftext/cpdf_textpagefind.h"
8 
9 #include <wchar.h>
10 
11 #include <vector>
12 
13 #include "core/fpdftext/cpdf_textpage.h"
14 #include "core/fxcrt/check.h"
15 #include "core/fxcrt/compiler_specific.h"
16 #include "core/fxcrt/fx_extension.h"
17 #include "core/fxcrt/fx_string.h"
18 #include "core/fxcrt/fx_system.h"
19 #include "core/fxcrt/fx_unicode.h"
20 #include "core/fxcrt/ptr_util.h"
21 #include "core/fxcrt/stl_util.h"
22 
23 namespace {
24 
// U+00A0 NO-BREAK SPACE (decimal 160).
constexpr wchar_t kNonBreakingSpace = 0x00A0;
26 
// Classifies a single character of the search string.
//
// Returns false for every code point below 255, for U+2113 (decimal 8467) and
// for the code point ranges listed in |kExcludedRanges|; returns true for
// everything else.
bool IsIgnoreSpaceCharacter(wchar_t curChar) {
  struct CodeRange {
    wchar_t first;
    wchar_t last;
  };
  // Inclusive ranges for which this function answers false.
  static constexpr CodeRange kExcludedRanges[] = {
      {0x0600, 0x06FF},  // Arabic
      {0xFE70, 0xFEFF},  // Arabic Presentation Forms-B
      {0xFB50, 0xFDFF},  // Arabic Presentation Forms-A
      {0x0400, 0x04FF},  // Cyrillic
      {0x0500, 0x052F},  // Cyrillic Supplement
      {0xA640, 0xA69F},  // Cyrillic Extended-B
      {0x2DE0, 0x2DFF},  // Cyrillic Extended-A
      {0x2000, 0x206F},  // General Punctuation
  };

  if (curChar < 255 || curChar == 8467)
    return false;

  for (const CodeRange& range : kExcludedRanges) {
    if (curChar >= range.first && curChar <= range.last)
      return false;
  }
  return true;
}
40 
IsMatchWholeWord(const WideString & csPageText,size_t startPos,size_t endPos)41 bool IsMatchWholeWord(const WideString& csPageText,
42                       size_t startPos,
43                       size_t endPos) {
44   if (startPos > endPos)
45     return false;
46   wchar_t char_left = 0;
47   wchar_t char_right = 0;
48   size_t char_count = endPos - startPos + 1;
49   if (char_count == 0)
50     return false;
51   if (char_count == 1 && csPageText[startPos] > 255)
52     return true;
53   if (startPos >= 1)
54     char_left = csPageText[startPos - 1];
55   if (startPos + char_count < csPageText.GetLength())
56     char_right = csPageText[startPos + char_count];
57   if ((char_left > 'A' && char_left < 'a') ||
58       (char_left > 'a' && char_left < 'z') ||
59       (char_left > 0xfb00 && char_left < 0xfb06) ||
60       FXSYS_IsDecimalDigit(char_left) ||
61       (char_right > 'A' && char_right < 'a') ||
62       (char_right > 'a' && char_right < 'z') ||
63       (char_right > 0xfb00 && char_right < 0xfb06) ||
64       FXSYS_IsDecimalDigit(char_right)) {
65     return false;
66   }
67   if (!(('A' > char_left || char_left > 'Z') &&
68         ('a' > char_left || char_left > 'z') &&
69         ('A' > char_right || char_right > 'Z') &&
70         ('a' > char_right || char_right > 'z'))) {
71     return false;
72   }
73   if (char_count > 0) {
74     if (FXSYS_IsDecimalDigit(char_left) &&
75         FXSYS_IsDecimalDigit(csPageText[startPos])) {
76       return false;
77     }
78     if (FXSYS_IsDecimalDigit(char_right) &&
79         FXSYS_IsDecimalDigit(csPageText[endPos])) {
80       return false;
81     }
82   }
83   return true;
84 }
85 
// Returns |wsOriginal| unchanged when |bMatchCase| is set; otherwise returns a
// copy on which MakeLower() has been applied, so that case-insensitive
// searches can compare text directly.
WideString GetStringCase(const WideString& wsOriginal, bool bMatchCase) {
  WideString result = wsOriginal;
  if (!bMatchCase)
    result.MakeLower();
  return result;
}
94 
ExtractSubString(const wchar_t * lpszFullString,int iSubString)95 std::optional<WideString> ExtractSubString(const wchar_t* lpszFullString,
96                                            int iSubString) {
97   DCHECK(lpszFullString);
98   UNSAFE_TODO({
99     while (iSubString--) {
100       lpszFullString = wcschr(lpszFullString, L' ');
101       if (!lpszFullString) {
102         return std::nullopt;
103       }
104 
105       lpszFullString++;
106       while (*lpszFullString == L' ') {
107         lpszFullString++;
108       }
109     }
110 
111     const wchar_t* lpchEnd = wcschr(lpszFullString, L' ');
112     int nLen = lpchEnd ? static_cast<int>(lpchEnd - lpszFullString)
113                        : static_cast<int>(wcslen(lpszFullString));
114     if (nLen < 0) {
115       return std::nullopt;
116     }
117 
118     return WideString(lpszFullString, static_cast<size_t>(nLen));
119   });
120 }
121 
ExtractFindWhat(const WideString & findwhat)122 std::vector<WideString> ExtractFindWhat(const WideString& findwhat) {
123   std::vector<WideString> findwhat_array;
124 
125   size_t len = findwhat.GetLength();
126   size_t i = 0;
127   for (i = 0; i < len; ++i)
128     if (findwhat[i] != ' ')
129       break;
130   if (i == len) {
131     findwhat_array.push_back(findwhat);
132     return findwhat_array;
133   }
134 
135   int index = 0;
136   while (true) {
137     std::optional<WideString> word = ExtractSubString(findwhat.c_str(), index);
138     if (!word.has_value())
139       break;
140 
141     if (word->IsEmpty()) {
142       findwhat_array.push_back(L"");
143       index++;
144       continue;
145     }
146 
147     size_t pos = 0;
148     while (pos < word->GetLength()) {
149       WideString curStr = word->Substr(pos, 1);
150       wchar_t curChar = word.value()[pos];
151       if (IsIgnoreSpaceCharacter(curChar)) {
152         if (pos > 0 && curChar == pdfium::unicode::kRightSingleQuotationMark) {
153           pos++;
154           continue;
155         }
156         if (pos > 0)
157           findwhat_array.push_back(word->First(pos));
158         findwhat_array.push_back(curStr);
159         if (pos == word->GetLength() - 1) {
160           word->clear();
161           break;
162         }
163         word.emplace(word->Last(word->GetLength() - pos - 1));
164         pos = 0;
165         continue;
166       }
167       pos++;
168     }
169 
170     if (!word->IsEmpty())
171       findwhat_array.push_back(word.value());
172     index++;
173   }
174   return findwhat_array;
175 }
176 
177 }  // namespace
178 
179 // static
Create(const CPDF_TextPage * pTextPage,const WideString & findwhat,const Options & options,std::optional<size_t> startPos)180 std::unique_ptr<CPDF_TextPageFind> CPDF_TextPageFind::Create(
181     const CPDF_TextPage* pTextPage,
182     const WideString& findwhat,
183     const Options& options,
184     std::optional<size_t> startPos) {
185   std::vector<WideString> findwhat_array =
186       ExtractFindWhat(GetStringCase(findwhat, options.bMatchCase));
187   auto find = pdfium::WrapUnique(
188       new CPDF_TextPageFind(pTextPage, findwhat_array, options, startPos));
189   find->FindFirst();
190   return find;
191 }
192 
CPDF_TextPageFind(const CPDF_TextPage * pTextPage,const std::vector<WideString> & findwhat_array,const Options & options,std::optional<size_t> startPos)193 CPDF_TextPageFind::CPDF_TextPageFind(
194     const CPDF_TextPage* pTextPage,
195     const std::vector<WideString>& findwhat_array,
196     const Options& options,
197     std::optional<size_t> startPos)
198     : m_pTextPage(pTextPage),
199       m_strText(GetStringCase(pTextPage->GetAllPageText(), options.bMatchCase)),
200       m_csFindWhatArray(findwhat_array),
201       m_options(options) {
202   if (!m_strText.IsEmpty()) {
203     m_findNextStart = startPos;
204     m_findPreStart = startPos.value_or(m_strText.GetLength() - 1);
205   }
206 }
207 
208 CPDF_TextPageFind::~CPDF_TextPageFind() = default;
209 
GetCharIndex(int index) const210 int CPDF_TextPageFind::GetCharIndex(int index) const {
211   return m_pTextPage->CharIndexFromTextIndex(index);
212 }
213 
FindFirst()214 bool CPDF_TextPageFind::FindFirst() {
215   return m_strText.IsEmpty() || !m_csFindWhatArray.empty();
216 }
217 
FindNext()218 bool CPDF_TextPageFind::FindNext() {
219   if (m_strText.IsEmpty() || !m_findNextStart.has_value())
220     return false;
221 
222   const size_t strLen = m_strText.GetLength();
223   size_t nStartPos = m_findNextStart.value();
224   if (nStartPos >= strLen) {
225     return false;
226   }
227 
228   int nCount = fxcrt::CollectionSize<int>(m_csFindWhatArray);
229   std::optional<size_t> nResultPos = 0;
230   bool bSpaceStart = false;
231   for (int iWord = 0; iWord < nCount; iWord++) {
232     WideString csWord = m_csFindWhatArray[iWord];
233     if (csWord.IsEmpty()) {
234       if (iWord == nCount - 1) {
235         if (nStartPos >= strLen) {
236           return false;
237         }
238         wchar_t strInsert = m_strText[nStartPos];
239         if (strInsert == L'\n' || strInsert == L' ' || strInsert == L'\r' ||
240             strInsert == kNonBreakingSpace) {
241           nResultPos = nStartPos + 1;
242           break;
243         }
244         iWord = -1;
245       } else if (iWord == 0) {
246         bSpaceStart = true;
247       }
248       continue;
249     }
250     nResultPos = m_strText.Find(csWord.AsStringView(), nStartPos);
251     if (!nResultPos.has_value())
252       return false;
253 
254     size_t endIndex = nResultPos.value() + csWord.GetLength() - 1;
255     if (iWord == 0)
256       m_resStart = nResultPos.value();
257     bool bMatch = true;
258     if (iWord != 0 && !bSpaceStart) {
259       size_t PreResEndPos = nStartPos;
260       int curChar = csWord[0];
261       WideString lastWord = m_csFindWhatArray[iWord - 1];
262       int lastChar = lastWord.Back();
263       if (nStartPos == nResultPos.value() &&
264           !(IsIgnoreSpaceCharacter(lastChar) ||
265             IsIgnoreSpaceCharacter(curChar))) {
266         bMatch = false;
267       }
268       for (size_t d = PreResEndPos; d < nResultPos.value(); d++) {
269         wchar_t strInsert = m_strText[d];
270         if (strInsert != L'\n' && strInsert != L' ' && strInsert != L'\r' &&
271             strInsert != kNonBreakingSpace) {
272           bMatch = false;
273           break;
274         }
275       }
276     } else if (bSpaceStart) {
277       if (nResultPos.value() > 0) {
278         wchar_t strInsert = m_strText[nResultPos.value() - 1];
279         if (strInsert != L'\n' && strInsert != L' ' && strInsert != L'\r' &&
280             strInsert != kNonBreakingSpace) {
281           bMatch = false;
282           m_resStart = nResultPos.value();
283         } else {
284           m_resStart = nResultPos.value() - 1;
285         }
286       }
287     }
288     if (m_options.bMatchWholeWord && bMatch)
289       bMatch = IsMatchWholeWord(m_strText, nResultPos.value(), endIndex);
290 
291     if (bMatch) {
292       nStartPos = endIndex + 1;
293     } else {
294       iWord = -1;
295       size_t index = bSpaceStart ? 1 : 0;
296       nStartPos = m_resStart + m_csFindWhatArray[index].GetLength();
297     }
298   }
299   m_resEnd = nResultPos.value() + m_csFindWhatArray.back().GetLength() - 1;
300   if (m_options.bConsecutive) {
301     m_findNextStart = m_resStart + 1;
302     m_findPreStart = m_resEnd - 1;
303   } else {
304     m_findNextStart = m_resEnd + 1;
305     m_findPreStart = m_resStart - 1;
306   }
307   return true;
308 }
309 
FindPrev()310 bool CPDF_TextPageFind::FindPrev() {
311   if (m_strText.IsEmpty() || !m_findPreStart.has_value())
312     return false;
313 
314   CPDF_TextPageFind find_engine(m_pTextPage, m_csFindWhatArray, m_options, 0);
315   if (!find_engine.FindFirst())
316     return false;
317 
318   int order = -1;
319   int matches = 0;
320   while (find_engine.FindNext()) {
321     int cur_order = find_engine.GetCurOrder();
322     int cur_match = find_engine.GetMatchedCount();
323     int temp = cur_order + cur_match;
324     if (temp < 0 || static_cast<size_t>(temp) > m_findPreStart.value() + 1)
325       break;
326 
327     order = cur_order;
328     matches = cur_match;
329   }
330   if (order == -1)
331     return false;
332 
333   m_resStart = m_pTextPage->TextIndexFromCharIndex(order);
334   m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + matches - 1);
335   if (m_options.bConsecutive) {
336     m_findNextStart = m_resStart + 1;
337     m_findPreStart = m_resEnd - 1;
338   } else {
339     m_findNextStart = m_resEnd + 1;
340     m_findPreStart = m_resStart - 1;
341   }
342   return true;
343 }
344 
GetCurOrder() const345 int CPDF_TextPageFind::GetCurOrder() const {
346   return GetCharIndex(m_resStart);
347 }
348 
GetMatchedCount() const349 int CPDF_TextPageFind::GetMatchedCount() const {
350   int resStart = GetCharIndex(m_resStart);
351   int resEnd = GetCharIndex(m_resEnd);
352   return resEnd - resStart + 1;
353 }
354