1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/browser/history/snippet.h"
6
7 #include <algorithm>
8
9 #include "base/logging.h"
10 #include "base/memory/scoped_ptr.h"
11 #include "base/string_split.h"
12 #include "base/string_util.h"
13 #include "base/utf_string_conversions.h"
14 #include "unicode/brkiter.h"
15 #include "unicode/utext.h"
16 #include "unicode/utf8.h"
17
18 namespace {
19
PairFirstLessThan(const Snippet::MatchPosition & a,const Snippet::MatchPosition & b)20 bool PairFirstLessThan(const Snippet::MatchPosition& a,
21 const Snippet::MatchPosition& b) {
22 return a.first < b.first;
23 }
24
25 // Combines all pairs after offset in match_positions that are contained
26 // or touch the pair at offset.
CoalescePositionsFrom(size_t offset,Snippet::MatchPositions * match_positions)27 void CoalescePositionsFrom(size_t offset,
28 Snippet::MatchPositions* match_positions) {
29 DCHECK(offset < match_positions->size());
30 Snippet::MatchPosition& pair((*match_positions)[offset]);
31 ++offset;
32 while (offset < match_positions->size() &&
33 pair.second >= (*match_positions)[offset].first) {
34 pair.second = std::max(pair.second, (*match_positions)[offset].second);
35 match_positions->erase(match_positions->begin() + offset);
36 }
37 }
38
39 // Makes sure there is a pair in match_positions that contains the specified
40 // range. This keeps the pairs ordered in match_positions by first, and makes
41 // sure none of the pairs in match_positions touch each other.
AddMatch(size_t start,size_t end,Snippet::MatchPositions * match_positions)42 void AddMatch(size_t start,
43 size_t end,
44 Snippet::MatchPositions* match_positions) {
45 DCHECK(start < end);
46 DCHECK(match_positions);
47 Snippet::MatchPosition pair(start, end);
48 if (match_positions->empty()) {
49 match_positions->push_back(pair);
50 return;
51 }
52 // There's at least one match. Find the position of the new match,
53 // potentially extending pairs around it.
54 Snippet::MatchPositions::iterator i =
55 std::lower_bound(match_positions->begin(), match_positions->end(),
56 pair, &PairFirstLessThan);
57 if (i != match_positions->end() && i->first == start) {
58 // Match not at the end and there is already a pair with the same
59 // start.
60 if (end > i->second) {
61 // New pair extends beyond existing pair. Extend existing pair and
62 // coalesce matches after it.
63 i->second = end;
64 CoalescePositionsFrom(i - match_positions->begin(), match_positions);
65 } // else case, new pair completely contained in existing pair, nothing
66 // to do.
67 } else if (i == match_positions->begin()) {
68 // Match at the beginning and the first pair doesn't have the same
69 // start. Insert new pair and coalesce matches after it.
70 match_positions->insert(i, pair);
71 CoalescePositionsFrom(0, match_positions);
72 } else {
73 // Not at the beginning (but may be at the end).
74 --i;
75 if (start <= i->second && end > i->second) {
76 // Previous element contains match. Extend it and coalesce.
77 i->second = end;
78 CoalescePositionsFrom(i - match_positions->begin(), match_positions);
79 } else if (end > i->second) {
80 // Region doesn't touch previous element. See if region touches current
81 // element.
82 ++i;
83 if (i == match_positions->end() || end < i->first) {
84 match_positions->insert(i, pair);
85 } else {
86 i->first = start;
87 i->second = end;
88 CoalescePositionsFrom(i - match_positions->begin(), match_positions);
89 }
90 }
91 }
92 }
93
94 // Converts an index in a utf8 string into the index in the corresponding utf16
95 // string and returns the utf16 index. This is intended to be called in a loop
96 // iterating through a utf8 string.
97 //
98 // utf8_string: the utf8 string.
99 // utf8_length: length of the utf8 string.
100 // offset: the utf8 offset to convert.
101 // utf8_pos: current offset in the utf8 string. This is modified and on return
102 // matches offset.
103 // wide_pos: current index in the wide string. This is the same as the return
104 // value.
AdvanceAndReturnUTF16Pos(const char * utf8_string,int32_t utf8_length,int32_t offset,int32_t * utf8_pos,size_t * utf16_pos)105 size_t AdvanceAndReturnUTF16Pos(const char* utf8_string,
106 int32_t utf8_length,
107 int32_t offset,
108 int32_t* utf8_pos,
109 size_t* utf16_pos) {
110 DCHECK(offset >= *utf8_pos && offset <= utf8_length);
111
112 UChar32 wide_char;
113 while (*utf8_pos < offset) {
114 U8_NEXT(utf8_string, *utf8_pos, utf8_length, wide_char);
115 *utf16_pos += (wide_char <= 0xFFFF) ? 1 : 2;
116 }
117 return *utf16_pos;
118 }
119
120 // Given a character break iterator over a UTF-8 string, set the iterator
121 // position to |*utf8_pos| and move by |count| characters. |count| can
122 // be either positive or negative.
MoveByNGraphemes(icu::BreakIterator * bi,int count,size_t * utf8_pos)123 void MoveByNGraphemes(icu::BreakIterator* bi, int count, size_t* utf8_pos) {
124 // Ignore the return value. A side effect of the current position
125 // being set at or following |*utf8_pos| is exploited here.
126 // It's simpler than calling following(n) and then previous().
127 // isBoundary() is not very fast, but should be good enough for the
128 // snippet generation. If not, revisit the way we scan in ComputeSnippet.
129 bi->isBoundary(*utf8_pos);
130 bi->next(count);
131 *utf8_pos = static_cast<size_t>(bi->current());
132 }
133
134 // The amount of context to include for a given hit. Note that it's counted
135 // in terms of graphemes rather than bytes.
136 const int kSnippetContext = 50;
137
138 // Returns true if next match falls within a snippet window
139 // from the previous match. The window size is counted in terms
140 // of graphemes rather than bytes in UTF-8.
IsNextMatchWithinSnippetWindow(icu::BreakIterator * bi,size_t previous_match_end,size_t next_match_start)141 bool IsNextMatchWithinSnippetWindow(icu::BreakIterator* bi,
142 size_t previous_match_end,
143 size_t next_match_start) {
144 // If it's within a window in terms of bytes, it's certain
145 // that it's within a window in terms of graphemes as well.
146 if (next_match_start < previous_match_end + kSnippetContext)
147 return true;
148 bi->isBoundary(previous_match_end);
149 // An alternative to this is to call |bi->next()| at most
150 // kSnippetContext times, compare |bi->current()| with |next_match_start|
151 // after each call and return early if possible. There are other
152 // heuristics to speed things up if necessary, but it's not likely that
153 // we need to bother.
154 bi->next(kSnippetContext);
155 int64 current = bi->current();
156 return (next_match_start < static_cast<uint64>(current) ||
157 current == icu::BreakIterator::DONE);
158 }
159
160 } // namespace
161
162 // static
ExtractMatchPositions(const std::string & offsets_str,const std::string & column_num,MatchPositions * match_positions)163 void Snippet::ExtractMatchPositions(const std::string& offsets_str,
164 const std::string& column_num,
165 MatchPositions* match_positions) {
166 DCHECK(match_positions);
167 if (offsets_str.empty())
168 return;
169 std::vector<std::string> offsets;
170 base::SplitString(offsets_str, ' ', &offsets);
171 // SQLite offsets are sets of four integers:
172 // column, query term, match offset, match length
173 // Matches within a string are marked by (start, end) pairs.
174 for (size_t i = 0; i < offsets.size() - 3; i += 4) {
175 if (offsets[i] != column_num)
176 continue;
177 const size_t start = atoi(offsets[i + 2].c_str());
178 const size_t end = start + atoi(offsets[i + 3].c_str());
179 // Switch to DCHECK after debugging http://crbug.com/15261.
180 CHECK(end >= start);
181 AddMatch(start, end, match_positions);
182 }
183 }
184
185 // static
ConvertMatchPositionsToWide(const std::string & utf8_string,Snippet::MatchPositions * match_positions)186 void Snippet::ConvertMatchPositionsToWide(
187 const std::string& utf8_string,
188 Snippet::MatchPositions* match_positions) {
189 DCHECK(match_positions);
190 int32_t utf8_pos = 0;
191 size_t utf16_pos = 0;
192 const char* utf8_cstring = utf8_string.c_str();
193 const int32_t utf8_length = static_cast<int32_t>(utf8_string.size());
194 for (Snippet::MatchPositions::iterator i = match_positions->begin();
195 i != match_positions->end(); ++i) {
196 i->first = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length,
197 i->first, &utf8_pos, &utf16_pos);
198 i->second = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length,
199 i->second, &utf8_pos, &utf16_pos);
200 }
201 }
202
Snippet()203 Snippet::Snippet() {
204 }
205
~Snippet()206 Snippet::~Snippet() {
207 }
208
ComputeSnippet(const MatchPositions & match_positions,const std::string & document)209 void Snippet::ComputeSnippet(const MatchPositions& match_positions,
210 const std::string& document) {
211 // The length of snippets we try to produce.
212 // We can generate longer snippets but stop once we cross kSnippetMaxLength.
213 const size_t kSnippetMaxLength = 200;
214 const string16 kEllipsis = ASCIIToUTF16(" ... ");
215
216 UText* document_utext = NULL;
217 UErrorCode status = U_ZERO_ERROR;
218 document_utext = utext_openUTF8(document_utext, document.data(),
219 document.size(), &status);
220 // Locale does not matter because there's no per-locale customization
221 // for character iterator.
222 scoped_ptr<icu::BreakIterator> bi(icu::BreakIterator::createCharacterInstance(
223 icu::Locale::getDefault(), status));
224 bi->setText(document_utext, status);
225 DCHECK(U_SUCCESS(status));
226
227 // We build the snippet by iterating through the matches and then grabbing
228 // context around each match. If matches are near enough each other (within
229 // kSnippetContext), we skip the "..." between them.
230 string16 snippet;
231 size_t start = 0;
232 for (size_t i = 0; i < match_positions.size(); ++i) {
233 // Some shorter names for the current match.
234 const size_t match_start = match_positions[i].first;
235 const size_t match_end = match_positions[i].second;
236
237 // Switch to DCHECK after debugging http://crbug.com/15261.
238 CHECK(match_end > match_start);
239 CHECK(match_end <= document.size());
240
241 // Add the context, if any, to show before the match.
242 size_t context_start = match_start;
243 MoveByNGraphemes(bi.get(), -kSnippetContext, &context_start);
244 start = std::max(start, context_start);
245 if (start < match_start) {
246 if (start > 0)
247 snippet += kEllipsis;
248 // Switch to DCHECK after debugging http://crbug.com/15261.
249 CHECK(start < document.size());
250 snippet += UTF8ToUTF16(document.substr(start, match_start - start));
251 }
252
253 // Add the match.
254 const size_t first = snippet.size();
255 snippet += UTF8ToUTF16(document.substr(match_start,
256 match_end - match_start));
257 matches_.push_back(std::make_pair(first, snippet.size()));
258
259 // Compute the context, if any, to show after the match.
260 size_t end;
261 // Check if the next match falls within our snippet window.
262 if (i + 1 < match_positions.size() &&
263 IsNextMatchWithinSnippetWindow(bi.get(), match_end,
264 match_positions[i + 1].first)) {
265 // Yes, it's within the window. Make the end context extend just up
266 // to the next match.
267 end = match_positions[i + 1].first;
268 // Switch to DCHECK after debugging http://crbug.com/15261.
269 CHECK(end >= match_end);
270 CHECK(end <= document.size());
271 snippet += UTF8ToUTF16(document.substr(match_end, end - match_end));
272 } else {
273 // No, there's either no next match or the next match is too far away.
274 end = match_end;
275 MoveByNGraphemes(bi.get(), kSnippetContext, &end);
276 // Switch to DCHECK after debugging http://crbug.com/15261.
277 CHECK(end >= match_end);
278 CHECK(end <= document.size());
279 snippet += UTF8ToUTF16(document.substr(match_end, end - match_end));
280 if (end < document.size())
281 snippet += kEllipsis;
282 }
283 start = end;
284
285 // Stop here if we have enough snippet computed.
286 if (snippet.size() >= kSnippetMaxLength)
287 break;
288 }
289
290 utext_close(document_utext);
291 swap(text_, snippet);
292 }
293
Swap(Snippet * other)294 void Snippet::Swap(Snippet* other) {
295 text_.swap(other->text_);
296 matches_.swap(other->matches_);
297 }
298