• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "chrome/browser/history/query_parser.h"
6 
7 #include <algorithm>
8 
9 #include "base/i18n/break_iterator.h"
10 #include "base/logging.h"
11 #include "base/memory/scoped_vector.h"
12 #include "base/string_util.h"
13 #include "base/utf_string_conversions.h"
14 #include "ui/base/l10n/l10n_util.h"
15 #include "unicode/uscript.h"
16 
17 namespace {
18 
19 // Returns true if |mp1.first| is less than |mp2.first|. This is used to
20 // sort match positions.
CompareMatchPosition(const Snippet::MatchPosition & mp1,const Snippet::MatchPosition & mp2)21 int CompareMatchPosition(const Snippet::MatchPosition& mp1,
22                          const Snippet::MatchPosition& mp2) {
23   return mp1.first < mp2.first;
24 }
25 
26 // Returns true if |mp2| intersects |mp1|. This is intended for use by
27 // CoalesceMatchesFrom and isn't meant as a general intersectpion comparison
28 // function.
SnippetIntersects(const Snippet::MatchPosition & mp1,const Snippet::MatchPosition & mp2)29 bool SnippetIntersects(const Snippet::MatchPosition& mp1,
30                        const Snippet::MatchPosition& mp2) {
31   return mp2.first >= mp1.first && mp2.first <= mp1.second;
32 }
33 
34 // Coalesces match positions in |matches| after index that intersect the match
35 // position at |index|.
CoalesceMatchesFrom(size_t index,Snippet::MatchPositions * matches)36 void CoalesceMatchesFrom(size_t index,
37                          Snippet::MatchPositions* matches) {
38   Snippet::MatchPosition& mp = (*matches)[index];
39   for (Snippet::MatchPositions::iterator i = matches->begin() + index + 1;
40        i != matches->end(); ) {
41     if (SnippetIntersects(mp, *i)) {
42       mp.second = i->second;
43       i = matches->erase(i);
44     } else {
45       return;
46     }
47   }
48 }
49 
50 // Sorts the match positions in |matches| by their first index, then coalesces
51 // any match positions that intersect each other.
CoalseAndSortMatchPositions(Snippet::MatchPositions * matches)52 void CoalseAndSortMatchPositions(Snippet::MatchPositions* matches) {
53   std::sort(matches->begin(), matches->end(), &CompareMatchPosition);
54   // WARNING: we don't use iterator here as CoalesceMatchesFrom may remove
55   // from matches.
56   for (size_t i = 0; i < matches->size(); ++i)
57     CoalesceMatchesFrom(i, matches);
58 }
59 
60 }  // namespace
61 
62 // Inheritance structure:
63 // Queries are represented as trees of QueryNodes.
64 // QueryNodes are either a collection of subnodes (a QueryNodeList)
65 // or a single word (a QueryNodeWord).
66 
67 // A QueryNodeWord is a single word in the query.
68 class QueryNodeWord : public QueryNode {
69  public:
QueryNodeWord(const string16 & word)70   explicit QueryNodeWord(const string16& word)
71       : word_(word), literal_(false) {}
~QueryNodeWord()72   virtual ~QueryNodeWord() {}
73   virtual int AppendToSQLiteQuery(string16* query) const;
IsWord() const74   virtual bool IsWord() const { return true; }
75 
word() const76   const string16& word() const { return word_; }
set_literal(bool literal)77   void set_literal(bool literal) { literal_ = literal; }
78 
79   virtual bool HasMatchIn(const std::vector<QueryWord>& words,
80                           Snippet::MatchPositions* match_positions) const;
81 
82   virtual bool Matches(const string16& word, bool exact) const;
83   virtual void AppendWords(std::vector<string16>* words) const;
84 
85  private:
86   string16 word_;
87   bool literal_;
88 };
89 
HasMatchIn(const std::vector<QueryWord> & words,Snippet::MatchPositions * match_positions) const90 bool QueryNodeWord::HasMatchIn(const std::vector<QueryWord>& words,
91                                Snippet::MatchPositions* match_positions) const {
92   for (size_t i = 0; i < words.size(); ++i) {
93     if (Matches(words[i].word, false)) {
94       size_t match_start = words[i].position;
95       match_positions->push_back(
96           Snippet::MatchPosition(match_start,
97                                  match_start + static_cast<int>(word_.size())));
98       return true;
99     }
100   }
101   return false;
102 }
103 
Matches(const string16 & word,bool exact) const104 bool QueryNodeWord::Matches(const string16& word, bool exact) const {
105   if (exact || !QueryParser::IsWordLongEnoughForPrefixSearch(word_))
106     return word == word_;
107   return word.size() >= word_.size() &&
108          (word_.compare(0, word_.size(), word, 0, word_.size()) == 0);
109 }
110 
AppendWords(std::vector<string16> * words) const111 void QueryNodeWord::AppendWords(std::vector<string16>* words) const {
112   words->push_back(word_);
113 }
114 
AppendToSQLiteQuery(string16 * query) const115 int QueryNodeWord::AppendToSQLiteQuery(string16* query) const {
116   query->append(word_);
117 
118   // Use prefix search if we're not literal and long enough.
119   if (!literal_ && QueryParser::IsWordLongEnoughForPrefixSearch(word_))
120     *query += L'*';
121   return 1;
122 }
123 
124 // A QueryNodeList has a collection of child QueryNodes
125 // which it cleans up after.
126 class QueryNodeList : public QueryNode {
127  public:
128   virtual ~QueryNodeList();
129 
AppendToSQLiteQuery(string16 * query) const130   virtual int AppendToSQLiteQuery(string16* query) const {
131     return AppendChildrenToString(query);
132   }
IsWord() const133   virtual bool IsWord() const { return false; }
134 
AddChild(QueryNode * node)135   void AddChild(QueryNode* node) { children_.push_back(node); }
136 
137   typedef std::vector<QueryNode*> QueryNodeVector;
children()138   QueryNodeVector* children() { return &children_; }
139 
140   // Remove empty subnodes left over from other parsing.
141   void RemoveEmptySubnodes();
142 
143   // QueryNodeList is never used with Matches or HasMatchIn.
Matches(const string16 & word,bool exact) const144   virtual bool Matches(const string16& word, bool exact) const {
145     NOTREACHED();
146     return false;
147   }
HasMatchIn(const std::vector<QueryWord> & words,Snippet::MatchPositions * match_positions) const148   virtual bool HasMatchIn(const std::vector<QueryWord>& words,
149                           Snippet::MatchPositions* match_positions) const {
150     NOTREACHED();
151     return false;
152   }
153   virtual void AppendWords(std::vector<string16>* words) const;
154 
155  protected:
156   int AppendChildrenToString(string16* query) const;
157 
158   QueryNodeVector children_;
159 };
160 
~QueryNodeList()161 QueryNodeList::~QueryNodeList() {
162   for (QueryNodeVector::iterator node = children_.begin();
163        node != children_.end(); ++node)
164     delete *node;
165 }
166 
RemoveEmptySubnodes()167 void QueryNodeList::RemoveEmptySubnodes() {
168   for (size_t i = 0; i < children_.size(); ++i) {
169     if (children_[i]->IsWord())
170       continue;
171 
172     QueryNodeList* list_node = static_cast<QueryNodeList*>(children_[i]);
173     list_node->RemoveEmptySubnodes();
174     if (list_node->children()->empty()) {
175       children_.erase(children_.begin() + i);
176       --i;
177       delete list_node;
178     }
179   }
180 }
181 
AppendWords(std::vector<string16> * words) const182 void QueryNodeList::AppendWords(std::vector<string16>* words) const {
183   for (size_t i = 0; i < children_.size(); ++i)
184     children_[i]->AppendWords(words);
185 }
186 
AppendChildrenToString(string16 * query) const187 int QueryNodeList::AppendChildrenToString(string16* query) const {
188   int num_words = 0;
189   for (QueryNodeVector::const_iterator node = children_.begin();
190        node != children_.end(); ++node) {
191     if (node != children_.begin())
192       query->push_back(L' ');
193     num_words += (*node)->AppendToSQLiteQuery(query);
194   }
195   return num_words;
196 }
197 
198 // A QueryNodePhrase is a phrase query ("quoted").
199 class QueryNodePhrase : public QueryNodeList {
200  public:
AppendToSQLiteQuery(string16 * query) const201   virtual int AppendToSQLiteQuery(string16* query) const {
202     query->push_back(L'"');
203     int num_words = AppendChildrenToString(query);
204     query->push_back(L'"');
205     return num_words;
206   }
207 
208   virtual bool Matches(const string16& word, bool exact) const;
209   virtual bool HasMatchIn(const std::vector<QueryWord>& words,
210                           Snippet::MatchPositions* match_positions) const;
211 };
212 
Matches(const string16 & word,bool exact) const213 bool QueryNodePhrase::Matches(const string16& word, bool exact) const {
214   NOTREACHED();
215   return false;
216 }
217 
HasMatchIn(const std::vector<QueryWord> & words,Snippet::MatchPositions * match_positions) const218 bool QueryNodePhrase::HasMatchIn(
219     const std::vector<QueryWord>& words,
220     Snippet::MatchPositions* match_positions) const {
221   if (words.size() < children_.size())
222     return false;
223 
224   for (size_t i = 0, max = words.size() - children_.size() + 1; i < max; ++i) {
225     bool matched_all = true;
226     for (size_t j = 0; j < children_.size(); ++j) {
227       if (!children_[j]->Matches(words[i + j].word, true)) {
228         matched_all = false;
229         break;
230       }
231     }
232     if (matched_all) {
233       const QueryWord& last_word = words[i + children_.size() - 1];
234       match_positions->push_back(
235           Snippet::MatchPosition(words[i].position,
236                                  last_word.position + last_word.word.length()));
237       return true;
238     }
239   }
240   return false;
241 }
242 
QueryParser()243 QueryParser::QueryParser() {
244 }
245 
246 // static
IsWordLongEnoughForPrefixSearch(const string16 & word)247 bool QueryParser::IsWordLongEnoughForPrefixSearch(const string16& word) {
248   DCHECK(!word.empty());
249   size_t minimum_length = 3;
250   // We intentionally exclude Hangul Jamos (both Conjoining and compatibility)
251   // because they 'behave like' Latin letters. Moreover, we should
252   // normalize the former before reaching here.
253   if (0xAC00 <= word[0] && word[0] <= 0xD7A3)
254     minimum_length = 2;
255   return word.size() >= minimum_length;
256 }
257 
258 // Returns true if the character is considered a quote.
IsQueryQuote(wchar_t ch)259 static bool IsQueryQuote(wchar_t ch) {
260   return ch == '"' ||
261          ch == 0xab ||    // left pointing double angle bracket
262          ch == 0xbb ||    // right pointing double angle bracket
263          ch == 0x201c ||  // left double quotation mark
264          ch == 0x201d ||  // right double quotation mark
265          ch == 0x201e;    // double low-9 quotation mark
266 }
267 
ParseQuery(const string16 & query,string16 * sqlite_query)268 int QueryParser::ParseQuery(const string16& query,
269                             string16* sqlite_query) {
270   QueryNodeList root;
271   if (!ParseQueryImpl(query, &root))
272     return 0;
273   return root.AppendToSQLiteQuery(sqlite_query);
274 }
275 
ParseQuery(const string16 & query,std::vector<QueryNode * > * nodes)276 void QueryParser::ParseQuery(const string16& query,
277                              std::vector<QueryNode*>* nodes) {
278   QueryNodeList root;
279   if (ParseQueryImpl(l10n_util::ToLower(query), &root))
280     nodes->swap(*root.children());
281 }
282 
283 
ExtractQueryWords(const string16 & query,std::vector<string16> * words)284 void QueryParser::ExtractQueryWords(const string16& query,
285                                     std::vector<string16>* words) {
286   QueryNodeList root;
287   if (!ParseQueryImpl(query, &root))
288     return;
289   root.AppendWords(words);
290 }
291 
DoesQueryMatch(const string16 & text,const std::vector<QueryNode * > & query_nodes,Snippet::MatchPositions * match_positions)292 bool QueryParser::DoesQueryMatch(const string16& text,
293                                  const std::vector<QueryNode*>& query_nodes,
294                                  Snippet::MatchPositions* match_positions) {
295   if (query_nodes.empty())
296     return false;
297 
298   std::vector<QueryWord> query_words;
299   string16 lower_text = l10n_util::ToLower(text);
300   ExtractQueryWords(lower_text, &query_words);
301 
302   if (query_words.empty())
303     return false;
304 
305   Snippet::MatchPositions matches;
306   for (size_t i = 0; i < query_nodes.size(); ++i) {
307     if (!query_nodes[i]->HasMatchIn(query_words, &matches))
308       return false;
309   }
310   if (lower_text.length() != text.length()) {
311     // The lower case string differs from the original string. The matches are
312     // meaningless.
313     // TODO(sky): we need a better way to align the positions so that we don't
314     // completely punt here.
315     match_positions->clear();
316   } else {
317     CoalseAndSortMatchPositions(&matches);
318     match_positions->swap(matches);
319   }
320   return true;
321 }
322 
ParseQueryImpl(const string16 & query,QueryNodeList * root)323 bool QueryParser::ParseQueryImpl(const string16& query,
324                                 QueryNodeList* root) {
325   base::BreakIterator iter(&query, base::BreakIterator::BREAK_WORD);
326   // TODO(evanm): support a locale here
327   if (!iter.Init())
328     return false;
329 
330   // To handle nesting, we maintain a stack of QueryNodeLists.
331   // The last element (back) of the stack contains the current, deepest node.
332   std::vector<QueryNodeList*> query_stack;
333   query_stack.push_back(root);
334 
335   bool in_quotes = false;  // whether we're currently in a quoted phrase
336   while (iter.Advance()) {
337     // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It
338     // is not necessarily a word, but could also be a sequence of punctuation
339     // or whitespace.
340     if (iter.IsWord()) {
341       string16 word = iter.GetString();
342 
343       QueryNodeWord* word_node = new QueryNodeWord(word);
344       if (in_quotes)
345         word_node->set_literal(true);
346       query_stack.back()->AddChild(word_node);
347     } else {  // Punctuation.
348       if (IsQueryQuote(query[iter.prev()])) {
349         if (!in_quotes) {
350           QueryNodeList* quotes_node = new QueryNodePhrase;
351           query_stack.back()->AddChild(quotes_node);
352           query_stack.push_back(quotes_node);
353           in_quotes = true;
354         } else {
355           query_stack.pop_back();  // Stop adding to the quoted phrase.
356           in_quotes = false;
357         }
358       }
359     }
360   }
361 
362   root->RemoveEmptySubnodes();
363   return true;
364 }
365 
ExtractQueryWords(const string16 & text,std::vector<QueryWord> * words)366 void QueryParser::ExtractQueryWords(const string16& text,
367                                     std::vector<QueryWord>* words) {
368   base::BreakIterator iter(&text, base::BreakIterator::BREAK_WORD);
369   // TODO(evanm): support a locale here
370   if (!iter.Init())
371     return;
372 
373   while (iter.Advance()) {
374     // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It
375     // is not necessarily a word, but could also be a sequence of punctuation
376     // or whitespace.
377     if (iter.IsWord()) {
378       string16 word = iter.GetString();
379       if (!word.empty()) {
380         words->push_back(QueryWord());
381         words->back().word = word;
382         words->back().position = iter.prev();
383       }
384     }
385   }
386 }
387