• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "components/query_parser/query_parser.h"
6 
7 #include <algorithm>
8 
9 #include "base/compiler_specific.h"
10 #include "base/i18n/break_iterator.h"
11 #include "base/i18n/case_conversion.h"
12 #include "base/logging.h"
13 #include "base/stl_util.h"
14 #include "base/strings/utf_string_conversions.h"
15 
16 namespace query_parser {
17 namespace {
18 
19 // Returns true if |mp1.first| is less than |mp2.first|. This is used to
20 // sort match positions.
CompareMatchPosition(const Snippet::MatchPosition & mp1,const Snippet::MatchPosition & mp2)21 int CompareMatchPosition(const Snippet::MatchPosition& mp1,
22                          const Snippet::MatchPosition& mp2) {
23   return mp1.first < mp2.first;
24 }
25 
26 // Returns true if |mp2| intersects |mp1|. This is intended for use by
27 // CoalesceMatchesFrom and isn't meant as a general intersection comparison
28 // function.
SnippetIntersects(const Snippet::MatchPosition & mp1,const Snippet::MatchPosition & mp2)29 bool SnippetIntersects(const Snippet::MatchPosition& mp1,
30                        const Snippet::MatchPosition& mp2) {
31   return mp2.first >= mp1.first && mp2.first <= mp1.second;
32 }
33 
34 // Coalesces match positions in |matches| after index that intersect the match
35 // position at |index|.
CoalesceMatchesFrom(size_t index,Snippet::MatchPositions * matches)36 void CoalesceMatchesFrom(size_t index, Snippet::MatchPositions* matches) {
37   Snippet::MatchPosition& mp = (*matches)[index];
38   for (Snippet::MatchPositions::iterator i = matches->begin() + index + 1;
39        i != matches->end(); ) {
40     if (SnippetIntersects(mp, *i)) {
41       mp.second = std::max(mp.second, i->second);
42       i = matches->erase(i);
43     } else {
44       return;
45     }
46   }
47 }
48 
49 // Returns true if the character is considered a quote.
IsQueryQuote(wchar_t ch)50 bool IsQueryQuote(wchar_t ch) {
51   return ch == '"' ||
52          ch == 0xab ||    // left pointing double angle bracket
53          ch == 0xbb ||    // right pointing double angle bracket
54          ch == 0x201c ||  // left double quotation mark
55          ch == 0x201d ||  // right double quotation mark
56          ch == 0x201e;    // double low-9 quotation mark
57 }
58 
59 }  // namespace
60 
61 // Inheritance structure:
62 // Queries are represented as trees of QueryNodes.
63 // QueryNodes are either a collection of subnodes (a QueryNodeList)
64 // or a single word (a QueryNodeWord).
65 
66 // A QueryNodeWord is a single word in the query.
67 class QueryNodeWord : public QueryNode {
68  public:
69   explicit QueryNodeWord(const base::string16& word);
70   virtual ~QueryNodeWord();
71 
word() const72   const base::string16& word() const { return word_; }
73 
set_literal(bool literal)74   void set_literal(bool literal) { literal_ = literal; }
75 
76   // QueryNode:
77   virtual int AppendToSQLiteQuery(base::string16* query) const OVERRIDE;
78   virtual bool IsWord() const OVERRIDE;
79   virtual bool Matches(const base::string16& word, bool exact) const OVERRIDE;
80   virtual bool HasMatchIn(
81       const QueryWordVector& words,
82       Snippet::MatchPositions* match_positions) const OVERRIDE;
83   virtual bool HasMatchIn(
84       const QueryWordVector& words) const OVERRIDE;
85   virtual void AppendWords(std::vector<base::string16>* words) const OVERRIDE;
86 
87  private:
88   base::string16 word_;
89   bool literal_;
90 
91   DISALLOW_COPY_AND_ASSIGN(QueryNodeWord);
92 };
93 
QueryNodeWord(const base::string16 & word)94 QueryNodeWord::QueryNodeWord(const base::string16& word)
95     : word_(word),
96       literal_(false) {}
97 
~QueryNodeWord()98 QueryNodeWord::~QueryNodeWord() {}
99 
AppendToSQLiteQuery(base::string16 * query) const100 int QueryNodeWord::AppendToSQLiteQuery(base::string16* query) const {
101   query->append(word_);
102 
103   // Use prefix search if we're not literal and long enough.
104   if (!literal_ && QueryParser::IsWordLongEnoughForPrefixSearch(word_))
105     *query += L'*';
106   return 1;
107 }
108 
IsWord() const109 bool QueryNodeWord::IsWord() const {
110   return true;
111 }
112 
Matches(const base::string16 & word,bool exact) const113 bool QueryNodeWord::Matches(const base::string16& word, bool exact) const {
114   if (exact || !QueryParser::IsWordLongEnoughForPrefixSearch(word_))
115     return word == word_;
116   return word.size() >= word_.size() &&
117          (word_.compare(0, word_.size(), word, 0, word_.size()) == 0);
118 }
119 
HasMatchIn(const QueryWordVector & words,Snippet::MatchPositions * match_positions) const120 bool QueryNodeWord::HasMatchIn(const QueryWordVector& words,
121                                Snippet::MatchPositions* match_positions) const {
122   bool matched = false;
123   for (size_t i = 0; i < words.size(); ++i) {
124     if (Matches(words[i].word, false)) {
125       size_t match_start = words[i].position;
126       match_positions->push_back(
127           Snippet::MatchPosition(match_start,
128                                  match_start + static_cast<int>(word_.size())));
129       matched = true;
130     }
131   }
132   return matched;
133 }
134 
HasMatchIn(const QueryWordVector & words) const135 bool QueryNodeWord::HasMatchIn(const QueryWordVector& words) const {
136   for (size_t i = 0; i < words.size(); ++i) {
137     if (Matches(words[i].word, false))
138       return true;
139   }
140   return false;
141 }
142 
AppendWords(std::vector<base::string16> * words) const143 void QueryNodeWord::AppendWords(std::vector<base::string16>* words) const {
144   words->push_back(word_);
145 }
146 
147 // A QueryNodeList has a collection of QueryNodes which are deleted in the end.
148 class QueryNodeList : public QueryNode {
149  public:
150   QueryNodeList();
151   virtual ~QueryNodeList();
152 
children()153   QueryNodeStarVector* children() { return &children_; }
154 
155   void AddChild(QueryNode* node);
156 
157   // Remove empty subnodes left over from other parsing.
158   void RemoveEmptySubnodes();
159 
160   // QueryNode:
161   virtual int AppendToSQLiteQuery(base::string16* query) const OVERRIDE;
162   virtual bool IsWord() const OVERRIDE;
163   virtual bool Matches(const base::string16& word, bool exact) const OVERRIDE;
164   virtual bool HasMatchIn(
165       const QueryWordVector& words,
166       Snippet::MatchPositions* match_positions) const OVERRIDE;
167   virtual bool HasMatchIn(const QueryWordVector& words) const OVERRIDE;
168   virtual void AppendWords(std::vector<base::string16>* words) const OVERRIDE;
169 
170  protected:
171   int AppendChildrenToString(base::string16* query) const;
172 
173   QueryNodeStarVector children_;
174 
175  private:
176   DISALLOW_COPY_AND_ASSIGN(QueryNodeList);
177 };
178 
QueryNodeList()179 QueryNodeList::QueryNodeList() {}
180 
~QueryNodeList()181 QueryNodeList::~QueryNodeList() {
182   STLDeleteElements(&children_);
183 }
184 
AddChild(QueryNode * node)185 void QueryNodeList::AddChild(QueryNode* node) {
186   children_.push_back(node);
187 }
188 
RemoveEmptySubnodes()189 void QueryNodeList::RemoveEmptySubnodes() {
190   for (size_t i = 0; i < children_.size(); ++i) {
191     if (children_[i]->IsWord())
192       continue;
193 
194     QueryNodeList* list_node = static_cast<QueryNodeList*>(children_[i]);
195     list_node->RemoveEmptySubnodes();
196     if (list_node->children()->empty()) {
197       children_.erase(children_.begin() + i);
198       --i;
199       delete list_node;
200     }
201   }
202 }
203 
AppendToSQLiteQuery(base::string16 * query) const204 int QueryNodeList::AppendToSQLiteQuery(base::string16* query) const {
205   return AppendChildrenToString(query);
206 }
207 
IsWord() const208 bool QueryNodeList::IsWord() const {
209   return false;
210 }
211 
Matches(const base::string16 & word,bool exact) const212 bool QueryNodeList::Matches(const base::string16& word, bool exact) const {
213   NOTREACHED();
214   return false;
215 }
216 
HasMatchIn(const QueryWordVector & words,Snippet::MatchPositions * match_positions) const217 bool QueryNodeList::HasMatchIn(const QueryWordVector& words,
218                                Snippet::MatchPositions* match_positions) const {
219   NOTREACHED();
220   return false;
221 }
222 
HasMatchIn(const QueryWordVector & words) const223 bool QueryNodeList::HasMatchIn(const QueryWordVector& words) const {
224   NOTREACHED();
225   return false;
226 }
227 
AppendWords(std::vector<base::string16> * words) const228 void QueryNodeList::AppendWords(std::vector<base::string16>* words) const {
229   for (size_t i = 0; i < children_.size(); ++i)
230     children_[i]->AppendWords(words);
231 }
232 
AppendChildrenToString(base::string16 * query) const233 int QueryNodeList::AppendChildrenToString(base::string16* query) const {
234   int num_words = 0;
235   for (QueryNodeStarVector::const_iterator node = children_.begin();
236        node != children_.end(); ++node) {
237     if (node != children_.begin())
238       query->push_back(L' ');
239     num_words += (*node)->AppendToSQLiteQuery(query);
240   }
241   return num_words;
242 }
243 
244 // A QueryNodePhrase is a phrase query ("quoted").
245 class QueryNodePhrase : public QueryNodeList {
246  public:
247   QueryNodePhrase();
248   virtual ~QueryNodePhrase();
249 
250   // QueryNodeList:
251   virtual int AppendToSQLiteQuery(base::string16* query) const OVERRIDE;
252   virtual bool HasMatchIn(
253       const QueryWordVector& words,
254       Snippet::MatchPositions* match_positions) const OVERRIDE;
255   virtual bool HasMatchIn(const QueryWordVector& words) const OVERRIDE;
256 
257  private:
258   bool MatchesAll(const QueryWordVector& words,
259                   const QueryWord** first_word,
260                   const QueryWord** last_word) const;
261   DISALLOW_COPY_AND_ASSIGN(QueryNodePhrase);
262 };
263 
QueryNodePhrase()264 QueryNodePhrase::QueryNodePhrase() {}
265 
~QueryNodePhrase()266 QueryNodePhrase::~QueryNodePhrase() {}
267 
AppendToSQLiteQuery(base::string16 * query) const268 int QueryNodePhrase::AppendToSQLiteQuery(base::string16* query) const {
269   query->push_back(L'"');
270   int num_words = AppendChildrenToString(query);
271   query->push_back(L'"');
272   return num_words;
273 }
274 
MatchesAll(const QueryWordVector & words,const QueryWord ** first_word,const QueryWord ** last_word) const275 bool QueryNodePhrase::MatchesAll(const QueryWordVector& words,
276                                  const QueryWord** first_word,
277                                  const QueryWord** last_word) const {
278   if (words.size() < children_.size())
279     return false;
280 
281   for (size_t i = 0, max = words.size() - children_.size() + 1; i < max; ++i) {
282     bool matched_all = true;
283     for (size_t j = 0; j < children_.size(); ++j) {
284       if (!children_[j]->Matches(words[i + j].word, true)) {
285         matched_all = false;
286         break;
287       }
288     }
289     if (matched_all) {
290       *first_word = &words[i];
291       *last_word = &words[i + children_.size() - 1];
292       return true;
293     }
294   }
295   return false;
296 }
297 
HasMatchIn(const QueryWordVector & words,Snippet::MatchPositions * match_positions) const298 bool QueryNodePhrase::HasMatchIn(
299     const QueryWordVector& words,
300     Snippet::MatchPositions* match_positions) const {
301   const QueryWord* first_word;
302   const QueryWord* last_word;
303 
304   if (MatchesAll(words, &first_word, &last_word)) {
305     match_positions->push_back(
306         Snippet::MatchPosition(first_word->position,
307                                last_word->position + last_word->word.length()));
308       return true;
309   }
310   return false;
311 }
312 
HasMatchIn(const QueryWordVector & words) const313 bool QueryNodePhrase::HasMatchIn(const QueryWordVector& words) const {
314   const QueryWord* first_word;
315   const QueryWord* last_word;
316   return MatchesAll(words, &first_word, &last_word);
317 }
318 
QueryParser()319 QueryParser::QueryParser() {}
320 
321 // static
IsWordLongEnoughForPrefixSearch(const base::string16 & word)322 bool QueryParser::IsWordLongEnoughForPrefixSearch(const base::string16& word) {
323   DCHECK(!word.empty());
324   size_t minimum_length = 3;
325   // We intentionally exclude Hangul Jamos (both Conjoining and compatibility)
326   // because they 'behave like' Latin letters. Moreover, we should
327   // normalize the former before reaching here.
328   if (0xAC00 <= word[0] && word[0] <= 0xD7A3)
329     minimum_length = 2;
330   return word.size() >= minimum_length;
331 }
332 
ParseQuery(const base::string16 & query,base::string16 * sqlite_query)333 int QueryParser::ParseQuery(const base::string16& query,
334                             base::string16* sqlite_query) {
335   QueryNodeList root;
336   if (!ParseQueryImpl(query, &root))
337     return 0;
338   return root.AppendToSQLiteQuery(sqlite_query);
339 }
340 
ParseQueryWords(const base::string16 & query,std::vector<base::string16> * words)341 void QueryParser::ParseQueryWords(const base::string16& query,
342                                   std::vector<base::string16>* words) {
343   QueryNodeList root;
344   if (!ParseQueryImpl(query, &root))
345     return;
346   root.AppendWords(words);
347 }
348 
ParseQueryNodes(const base::string16 & query,QueryNodeStarVector * nodes)349 void QueryParser::ParseQueryNodes(const base::string16& query,
350                                   QueryNodeStarVector* nodes) {
351   QueryNodeList root;
352   if (ParseQueryImpl(base::i18n::ToLower(query), &root))
353     nodes->swap(*root.children());
354 }
355 
DoesQueryMatch(const base::string16 & text,const QueryNodeStarVector & query_nodes,Snippet::MatchPositions * match_positions)356 bool QueryParser::DoesQueryMatch(const base::string16& text,
357                                  const QueryNodeStarVector& query_nodes,
358                                  Snippet::MatchPositions* match_positions) {
359   if (query_nodes.empty())
360     return false;
361 
362   QueryWordVector query_words;
363   base::string16 lower_text = base::i18n::ToLower(text);
364   ExtractQueryWords(lower_text, &query_words);
365 
366   if (query_words.empty())
367     return false;
368 
369   Snippet::MatchPositions matches;
370   for (size_t i = 0; i < query_nodes.size(); ++i) {
371     if (!query_nodes[i]->HasMatchIn(query_words, &matches))
372       return false;
373   }
374   if (lower_text.length() != text.length()) {
375     // The lower case string differs from the original string. The matches are
376     // meaningless.
377     // TODO(sky): we need a better way to align the positions so that we don't
378     // completely punt here.
379     match_positions->clear();
380   } else {
381     SortAndCoalesceMatchPositions(&matches);
382     match_positions->swap(matches);
383   }
384   return true;
385 }
386 
DoesQueryMatch(const QueryWordVector & query_words,const QueryNodeStarVector & query_nodes)387 bool QueryParser::DoesQueryMatch(const QueryWordVector& query_words,
388                                  const QueryNodeStarVector& query_nodes) {
389   if (query_nodes.empty() || query_words.empty())
390     return false;
391 
392   for (size_t i = 0; i < query_nodes.size(); ++i) {
393     if (!query_nodes[i]->HasMatchIn(query_words))
394       return false;
395   }
396   return true;
397 }
398 
ParseQueryImpl(const base::string16 & query,QueryNodeList * root)399 bool QueryParser::ParseQueryImpl(const base::string16& query,
400                                  QueryNodeList* root) {
401   base::i18n::BreakIterator iter(query, base::i18n::BreakIterator::BREAK_WORD);
402   // TODO(evanm): support a locale here
403   if (!iter.Init())
404     return false;
405 
406   // To handle nesting, we maintain a stack of QueryNodeLists.
407   // The last element (back) of the stack contains the current, deepest node.
408   std::vector<QueryNodeList*> query_stack;
409   query_stack.push_back(root);
410 
411   bool in_quotes = false;  // whether we're currently in a quoted phrase
412   while (iter.Advance()) {
413     // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It
414     // is not necessarily a word, but could also be a sequence of punctuation
415     // or whitespace.
416     if (iter.IsWord()) {
417       QueryNodeWord* word_node = new QueryNodeWord(iter.GetString());
418       if (in_quotes)
419         word_node->set_literal(true);
420       query_stack.back()->AddChild(word_node);
421     } else {  // Punctuation.
422       if (IsQueryQuote(query[iter.prev()])) {
423         if (!in_quotes) {
424           QueryNodeList* quotes_node = new QueryNodePhrase;
425           query_stack.back()->AddChild(quotes_node);
426           query_stack.push_back(quotes_node);
427           in_quotes = true;
428         } else {
429           query_stack.pop_back();  // Stop adding to the quoted phrase.
430           in_quotes = false;
431         }
432       }
433     }
434   }
435 
436   root->RemoveEmptySubnodes();
437   return true;
438 }
439 
ExtractQueryWords(const base::string16 & text,QueryWordVector * words)440 void QueryParser::ExtractQueryWords(const base::string16& text,
441                                     QueryWordVector* words) {
442   base::i18n::BreakIterator iter(text, base::i18n::BreakIterator::BREAK_WORD);
443   // TODO(evanm): support a locale here
444   if (!iter.Init())
445     return;
446 
447   while (iter.Advance()) {
448     // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It
449     // is not necessarily a word, but could also be a sequence of punctuation
450     // or whitespace.
451     if (iter.IsWord()) {
452       base::string16 word = iter.GetString();
453       if (!word.empty()) {
454         words->push_back(QueryWord());
455         words->back().word = word;
456         words->back().position = iter.prev();
457      }
458     }
459   }
460 }
461 
462 // static
SortAndCoalesceMatchPositions(Snippet::MatchPositions * matches)463 void QueryParser::SortAndCoalesceMatchPositions(
464     Snippet::MatchPositions* matches) {
465   std::sort(matches->begin(), matches->end(), &CompareMatchPosition);
466   // WARNING: we don't use iterator here as CoalesceMatchesFrom may remove
467   // from matches.
468   for (size_t i = 0; i < matches->size(); ++i)
469     CoalesceMatchesFrom(i, matches);
470 }
471 
472 }  // namespace query_parser
473