• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "components/query_parser/snippet.h"
6 
7 #include <algorithm>
8 
9 #include "base/strings/string_split.h"
10 #include "base/strings/string_util.h"
11 #include "base/strings/utf_string_conversions.h"
12 #include "testing/gtest/include/gtest/gtest.h"
13 
14 namespace query_parser {
15 namespace {
16 
17 // A sample document to compute snippets of.
18 // The \x bits after the first "Google" are UTF-8 of U+2122 TRADE MARK SIGN,
19 // and are useful for verifying we don't screw up in UTF-8/UTF-16 conversion.
20 const char* kSampleDocument = "Google\xe2\x84\xa2 Terms of Service "
21 "Welcome to Google! "
22 "1. Your relationship with Google "
23 "1.1 Your use of Google's products, software, services and web sites "
24 "(referred to collectively as the \"Services\" in this document and excluding "
25 "any services provided to you by Google under a separate written agreement) "
26 "is subject to the terms of a legal agreement between you and Google. "
27 "\"Google\" means Google Inc., whose principal place of business is at 1600 "
28 "Amphitheatre Parkway, Mountain View, CA 94043, United States. This document "
29 "explains how the agreement is made up, and sets out some of the terms of "
30 "that agreement.";
31 };
32 
33 // Thai sample taken from http://www.google.co.th/intl/th/privacy.html
34 // TODO(jungshik) : Add more samples (e.g. Hindi) after porting
35 // ICU 4.0's character iterator changes to our copy of ICU 3.8 to get
36 // grapheme clusters in Indic scripts handled more reasonably.
37 const char* kThaiSample = "Google \xE0\xB9\x80\xE0\xB8\x81\xE0\xB9\x87"
38 "\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7"
39 "\xE0\xB8\xA1 \xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9"
40 "\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88\xE0\xB8\xA7\xE0\xB8\x99\xE0\xB8\x9A"
41 "\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0\xB8\xA5 \xE0\xB9\x80\xE0\xB8\xA1"
42 "\xE0\xB8\xB7\xE0\xB9\x88\xE0\xB8\xAD\xE0\xB8\x84\xE0\xB8\xB8\xE0\xB8\x93"
43 "\xE0\xB8\xA5\xE0\xB8\x87\xE0\xB8\x97\xE0\xB8\xB0\xE0\xB9\x80\xE0\xB8\x9A"
44 "\xE0\xB8\xB5\xE0\xB8\xA2\xE0\xB8\x99\xE0\xB9\x80\xE0\xB8\x9E\xE0\xB8\xB7"
45 "\xE0\xB9\x88\xE0\xB8\xAD\xE0\xB9\x83\xE0\xB8\x8A\xE0\xB9\x89\xE0\xB8\x9A"
46 "\xE0\xB8\xA3\xE0\xB8\xB4\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\x82"
47 "\xE0\xB8\xAD\xE0\xB8\x87 Google \xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB7"
48 "\xE0\xB8\xAD\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89\xE0\xB8\x82\xE0\xB9\x89"
49 "\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5\xE0\xB8\x94\xE0\xB8\xB1"
50 "\xE0\xB8\x87\xE0\xB8\x81\xE0\xB8\xA5\xE0\xB9\x88\xE0\xB8\xB2\xE0\xB8\xA7"
51 "\xE0\xB9\x82\xE0\xB8\x94\xE0\xB8\xA2\xE0\xB8\xAA\xE0\xB8\xA1\xE0\xB8\xB1"
52 "\xE0\xB8\x84\xE0\xB8\xA3\xE0\xB9\x83\xE0\xB8\x88 \xE0\xB9\x80\xE0\xB8\xA3"
53 "\xE0\xB8\xB2\xE0\xB8\xAD\xE0\xB8\xB2\xE0\xB8\x88\xE0\xB8\xA3\xE0\xB8\xA7"
54 "\xE0\xB8\xA1\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9"
55 "\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88\xE0\xB8\xA7\xE0\xB8\x99\xE0\xB8\x9A"
56 "\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0\xB8\xA5\xE0\xB8\x97\xE0\xB8\xB5"
57 "\xE0\xB9\x88\xE0\xB9\x80\xE0\xB8\x81\xE0\xB9\x87\xE0\xB8\x9A\xE0\xB8\xA3"
58 "\xE0\xB8\xA7\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\xA1\xE0\xB8\x88"
59 "\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\x84\xE0\xB8\xB8\xE0\xB8\x93\xE0\xB9\x80"
60 "\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\xB1\xE0\xB8\x9A"
61 "\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5"
62 "\xE0\xB8\x88\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xB4"
63 "\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\xAD\xE0\xB8\xB7\xE0\xB9\x88"
64 "\xE0\xB8\x99\xE0\xB8\x82\xE0\xB8\xAD\xE0\xB8\x87 Google \xE0\xB8\xAB"
65 "\xE0\xB8\xA3\xE0\xB8\xB7\xE0\xB8\xAD\xE0\xB8\x9A\xE0\xB8\xB8\xE0\xB8\x84"
66 "\xE0\xB8\x84\xE0\xB8\xA5\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB8\xAA"
67 "\xE0\xB8\xB2\xE0\xB8\xA1 \xE0\xB9\x80\xE0\xB8\x9E\xE0\xB8\xB7\xE0\xB9\x88"
68 "\xE0\xB8\xAD\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89\xE0\xB8\x9C\xE0\xB8\xB9"
69 "\xE0\xB9\x89\xE0\xB9\x83\xE0\xB8\x8A\xE0\xB9\x89\xE0\xB9\x84\xE0\xB8\x94"
70 "\xE0\xB9\x89\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB8\x9B\xE0\xB8\xA3"
71 "\xE0\xB8\xB0\xE0\xB8\xAA\xE0\xB8\x9A\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3"
72 "\xE0\xB8\x93\xE0\xB9\x8C\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB8\x94"
73 "\xE0\xB8\xB5\xE0\xB8\x82\xE0\xB8\xB6\xE0\xB9\x89\xE0\xB8\x99 \xE0\xB8\xA3"
74 "\xE0\xB8\xA7\xE0\xB8\xA1\xE0\xB8\x97\xE0\xB8\xB1\xE0\xB9\x89\xE0\xB8\x87"
75 "\xE0\xB8\x9B\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB9\x81\xE0\xB8\x95"
76 "\xE0\xB9\x88\xE0\xB8\x87\xE0\xB9\x80\xE0\xB8\x99\xE0\xB8\xB7\xE0\xB9\x89"
77 "\xE0\xB8\xAD\xE0\xB8\xAB\xE0\xB8\xB2\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89"
78 "\xE0\xB9\x80\xE0\xB8\xAB\xE0\xB8\xA1\xE0\xB8\xB2\xE0\xB8\xB0\xE0\xB8\xAA"
79 "\xE0\xB8\xB3\xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB8\x84"
80 "\xE0\xB8\xB8\xE0\xB8\x93";
81 
82 // Comparator for sorting by the first element in a pair.
ComparePair1st(const Snippet::MatchPosition & a,const Snippet::MatchPosition & b)83 bool ComparePair1st(const Snippet::MatchPosition& a,
84                     const Snippet::MatchPosition& b) {
85   return a.first < b.first;
86 }
87 
88 // For testing, we'll compute the match positions manually instead of using
89 // sqlite's FTS matching.  BuildSnippet returns the snippet for matching
90 // |query| against |document|.  Matches are surrounded by "**".
BuildSnippet(const std::string & document,const std::string & query)91 base::string16 BuildSnippet(const std::string& document,
92                             const std::string& query) {
93   // This function assumes that |document| does not contain
94   // any character for which lowercasing changes its length. Further,
95   // it's assumed that lowercasing only the ASCII-portion works for
96   // |document|. We need to add more test cases and change this function
97   // to be more generic depending on how we deal with 'folding for match'
98   // in history.
99   const std::string document_folded =
100       base::StringToLowerASCII(std::string(document));
101 
102   std::vector<std::string> query_words;
103   base::SplitString(query, ' ', &query_words);
104 
105   // Manually construct match_positions of the document.
106   Snippet::MatchPositions match_positions;
107   match_positions.clear();
108   for (std::vector<std::string>::iterator qw = query_words.begin();
109        qw != query_words.end(); ++qw) {
110     // Insert all instances of this word into match_pairs.
111     size_t ofs = 0;
112     while ((ofs = document_folded.find(*qw, ofs)) != std::string::npos) {
113       match_positions.push_back(std::make_pair(ofs, ofs + qw->size()));
114       ofs += qw->size();
115     }
116   }
117   // Sort match_positions in order of increasing offset.
118   std::sort(match_positions.begin(), match_positions.end(), ComparePair1st);
119 
120   // Compute the snippet.
121   Snippet snippet;
122   snippet.ComputeSnippet(match_positions, document);
123 
124   // Now "highlight" all matches in the snippet with **.
125   base::string16 star_snippet;
126   Snippet::MatchPositions::const_iterator match;
127   size_t pos = 0;
128   for (match = snippet.matches().begin();
129        match != snippet.matches().end(); ++match) {
130     star_snippet += snippet.text().substr(pos, match->first - pos);
131     star_snippet += base::UTF8ToUTF16("**");
132     star_snippet += snippet.text().substr(match->first,
133                                           match->second - match->first);
134     star_snippet += base::UTF8ToUTF16("**");
135     pos = match->second;
136   }
137   star_snippet += snippet.text().substr(pos);
138 
139   return star_snippet;
140 }
141 
TEST(Snippets,SimpleQuery)142 TEST(Snippets, SimpleQuery) {
143   ASSERT_EQ(" ... eferred to collectively as the \"Services\" in this "
144             "**document** and excluding any services provided to you by "
145             "Goo ...  ... way, Mountain View, CA 94043, United States. This "
146             "**document** explains how the agreement is made up, and sets "
147             "o ... ",
148             base::UTF16ToUTF8(BuildSnippet(kSampleDocument, "document")));
149 }
150 
151 // Test that two words that are near each other don't produce two elided bits.
TEST(Snippets,NearbyWords)152 TEST(Snippets, NearbyWords) {
153   ASSERT_EQ(" ... lace of business is at 1600 Amphitheatre Parkway, "
154             "**Mountain** **View**, CA 94043, United States. This "
155             "document explains  ... ",
156             base::UTF16ToUTF8(BuildSnippet(kSampleDocument, "mountain view")));
157 }
158 
159 // The above tests already test that we get byte offsets correct, but here's
160 // one that gets the "TM" in its snippet.
TEST(Snippets,UTF8)161 TEST(Snippets, UTF8) {
162   ASSERT_EQ(" ... ogle\xe2\x84\xa2 Terms of Service Welcome to Google! "
163             "1. Your **relationship** with Google 1.1 Your use of Google's "
164             "products, so ... ",
165             base::UTF16ToUTF8(BuildSnippet(kSampleDocument, "relationship")));
166 }
167 
TEST(Snippets,ThaiUTF8)168 TEST(Snippets, ThaiUTF8) {
169   // There are 3 instances of '\u0E43\u0E2B\u0E49'
170   // (\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89) in kThaiSample.
171   // The 1st is more than |kSniipetContext| graphemes away from the
172   // 2nd while the 2nd and 3rd are within that window. However, with
173   // the 2nd match added, the snippet goes over the size limit so that
174   // the snippet ends right before the 3rd match.
175   ASSERT_EQ(" ...  "
176             "\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9"
177             "\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88\xE0\xB8\xA7\xE0\xB8\x99"
178             "\xE0\xB8\x9A\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0\xB8\xA5 "
179             "\xE0\xB9\x80\xE0\xB8\xA1\xE0\xB8\xB7\xE0\xB9\x88\xE0\xB8\xAD"
180             "\xE0\xB8\x84\xE0\xB8\xB8\xE0\xB8\x93\xE0\xB8\xA5\xE0\xB8\x87"
181             "\xE0\xB8\x97\xE0\xB8\xB0\xE0\xB9\x80\xE0\xB8\x9A\xE0\xB8\xB5"
182             "\xE0\xB8\xA2\xE0\xB8\x99\xE0\xB9\x80\xE0\xB8\x9E\xE0\xB8\xB7"
183             "\xE0\xB9\x88\xE0\xB8\xAD\xE0\xB9\x83\xE0\xB8\x8A\xE0\xB9\x89"
184             "\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xB4\xE0\xB8\x81\xE0\xB8\xB2"
185             "\xE0\xB8\xA3\xE0\xB8\x82\xE0\xB8\xAD\xE0\xB8\x87 Google "
186             "\xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB7\xE0\xB8\xAD**\xE0\xB9\x83"
187             "\xE0\xB8\xAB\xE0\xB9\x89**\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD"
188             "\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5\xE0\xB8\x94\xE0\xB8\xB1"
189             "\xE0\xB8\x87\xE0\xB8\x81\xE0\xB8\xA5\xE0\xB9\x88\xE0\xB8\xB2"
190             "\xE0\xB8\xA7\xE0\xB9\x82\xE0\xB8\x94\xE0\xB8\xA2\xE0\xB8\xAA"
191             "\xE0\xB8\xA1\xE0\xB8\xB1\xE0\xB8\x84\xE0\xB8\xA3\xE0\xB9\x83"
192             "\xE0\xB8\x88 \xE0\xB9\x80\xE0\xB8\xA3\xE0\xB8\xB2\xE0\xB8\xAD"
193             "\xE0\xB8\xB2\xE0\xB8\x88\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\xA1"
194             "\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9"
195             "\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88\xE0\xB8\xA7\xE0\xB8\x99"
196             "\xE0\xB8\x9A\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0\xB8\xA5"
197             "\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB9\x80\xE0\xB8\x81"
198             "\xE0\xB9\x87\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\x9A"
199             "\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\xA1 ...  ... \xE0\xB8\x88"
200             "\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\x84\xE0\xB8\xB8\xE0\xB8\x93"
201             "\xE0\xB9\x80\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xB2\xE0\xB8\x81"
202             "\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD"
203             "\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5\xE0\xB8\x88\xE0\xB8\xB2"
204             "\xE0\xB8\x81\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xB4\xE0\xB8\x81"
205             "\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\xAD\xE0\xB8\xB7\xE0\xB9\x88"
206             "\xE0\xB8\x99\xE0\xB8\x82\xE0\xB8\xAD\xE0\xB8\x87 Google "
207             "\xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB7\xE0\xB8\xAD\xE0\xB8\x9A"
208             "\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0\xB8\xA5\xE0\xB8\x97"
209             "\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB8\xAA\xE0\xB8\xB2\xE0\xB8\xA1 "
210             "\xE0\xB9\x80\xE0\xB8\x9E\xE0\xB8\xB7\xE0\xB9\x88\xE0\xB8\xAD**"
211             "\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89**\xE0\xB8\x9C\xE0\xB8\xB9"
212             "\xE0\xB9\x89\xE0\xB9\x83\xE0\xB8\x8A\xE0\xB9\x89\xE0\xB9\x84"
213             "\xE0\xB8\x94\xE0\xB9\x89\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A"
214             "\xE0\xB8\x9B\xE0\xB8\xA3\xE0\xB8\xB0\xE0\xB8\xAA\xE0\xB8\x9A"
215             "\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\x93\xE0\xB9\x8C"
216             "\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB8\x94\xE0\xB8\xB5"
217             "\xE0\xB8\x82\xE0\xB8\xB6\xE0\xB9\x89\xE0\xB8\x99 \xE0\xB8\xA3"
218             "\xE0\xB8\xA7\xE0\xB8\xA1\xE0\xB8\x97\xE0\xB8\xB1\xE0\xB9\x89"
219             "\xE0\xB8\x87\xE0\xB8\x9B\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A"
220             "\xE0\xB9\x81\xE0\xB8\x95\xE0\xB9\x88\xE0\xB8\x87\xE0\xB9\x80"
221             "\xE0\xB8\x99\xE0\xB8\xB7\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xAB"
222             "\xE0\xB8\xB2",
223             base::UTF16ToUTF8(BuildSnippet(kThaiSample,
224                                      "\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89")));
225 }
226 
TEST(Snippets,ExtractMatchPositions)227 TEST(Snippets, ExtractMatchPositions) {
228   struct TestData {
229     const std::string offsets_string;
230     const size_t expected_match_count;
231     const size_t expected_matches[10];
232   } data[] = {
233     { "0 0 1 2 0 0 4 1 0 0 1 5",            1,     { 1, 6 } },
234     { "0 0 1 4 0 0 2 1",                    1,     { 1, 5 } },
235     { "0 0 4 1 0 0 2 1",                    2,     { 2, 3, 4, 5 } },
236     { "0 0 0 1",                            1,     { 0, 1 } },
237     { "0 0 0 1 0 0 0 2",                    1,     { 0, 2 } },
238     { "0 0 1 1 0 0 1 2",                    1,     { 1, 3 } },
239     { "0 0 1 2 0 0 4 3 0 0 3 1",            1,     { 1, 7 } },
240     { "0 0 1 4 0 0 2 5",                    1,     { 1, 7 } },
241     { "0 0 1 2 0 0 1 1",                    1,     { 1, 3 } },
242     { "0 0 1 1 0 0 5 2 0 0 10 1 0 0 3 10",  2,     { 1, 2, 3, 13 } },
243   };
244   for (size_t i = 0; i < ARRAYSIZE_UNSAFE(data); ++i) {
245     Snippet::MatchPositions matches;
246     Snippet::ExtractMatchPositions(data[i].offsets_string, "0", &matches);
247     EXPECT_EQ(data[i].expected_match_count, matches.size());
248     for (size_t j = 0; j < data[i].expected_match_count; ++j) {
249       EXPECT_EQ(data[i].expected_matches[2 * j], matches[j].first);
250       EXPECT_EQ(data[i].expected_matches[2 * j + 1], matches[j].second);
251     }
252   }
253 }
254 
255 }  // namespace query_parser
256