1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/browser/history/in_memory_url_index_types.h"
6
7 #include <algorithm>
8 #include <functional>
9 #include <iterator>
10 #include <numeric>
11 #include <set>
12
13 #include "base/i18n/break_iterator.h"
14 #include "base/i18n/case_conversion.h"
15 #include "base/strings/string_util.h"
16 #include "net/base/escape.h"
17 #include "net/base/net_util.h"
18
19 namespace history {
20
21 // Matches within URL and Title Strings ----------------------------------------
22
MatchTermInString(const base::string16 & term,const base::string16 & cleaned_string,int term_num)23 TermMatches MatchTermInString(const base::string16& term,
24 const base::string16& cleaned_string,
25 int term_num) {
26 const size_t kMaxCompareLength = 2048;
27 const base::string16& short_string =
28 (cleaned_string.length() > kMaxCompareLength) ?
29 cleaned_string.substr(0, kMaxCompareLength) : cleaned_string;
30 TermMatches matches;
31 for (size_t location = short_string.find(term);
32 location != base::string16::npos;
33 location = short_string.find(term, location + 1))
34 matches.push_back(TermMatch(term_num, location, term.length()));
35 return matches;
36 }
37
38 // Comparison function for sorting TermMatches by their offsets.
MatchOffsetLess(const TermMatch & m1,const TermMatch & m2)39 bool MatchOffsetLess(const TermMatch& m1, const TermMatch& m2) {
40 return m1.offset < m2.offset;
41 }
42
SortAndDeoverlapMatches(const TermMatches & matches)43 TermMatches SortAndDeoverlapMatches(const TermMatches& matches) {
44 if (matches.empty())
45 return matches;
46 TermMatches sorted_matches = matches;
47 std::sort(sorted_matches.begin(), sorted_matches.end(), MatchOffsetLess);
48 TermMatches clean_matches;
49 TermMatch last_match;
50 for (TermMatches::const_iterator iter = sorted_matches.begin();
51 iter != sorted_matches.end(); ++iter) {
52 if (iter->offset >= last_match.offset + last_match.length) {
53 last_match = *iter;
54 clean_matches.push_back(last_match);
55 }
56 }
57 return clean_matches;
58 }
59
OffsetsFromTermMatches(const TermMatches & matches)60 std::vector<size_t> OffsetsFromTermMatches(const TermMatches& matches) {
61 std::vector<size_t> offsets;
62 for (TermMatches::const_iterator i = matches.begin(); i != matches.end();
63 ++i) {
64 offsets.push_back(i->offset);
65 offsets.push_back(i->offset + i->length);
66 }
67 return offsets;
68 }
69
ReplaceOffsetsInTermMatches(const TermMatches & matches,const std::vector<size_t> & offsets)70 TermMatches ReplaceOffsetsInTermMatches(const TermMatches& matches,
71 const std::vector<size_t>& offsets) {
72 DCHECK_EQ(2 * matches.size(), offsets.size());
73 TermMatches new_matches;
74 std::vector<size_t>::const_iterator offset_iter = offsets.begin();
75 for (TermMatches::const_iterator term_iter = matches.begin();
76 term_iter != matches.end(); ++term_iter, ++offset_iter) {
77 const size_t starting_offset = *offset_iter;
78 ++offset_iter;
79 const size_t ending_offset = *offset_iter;
80 if ((starting_offset != base::string16::npos) &&
81 (ending_offset != base::string16::npos) &&
82 (starting_offset != ending_offset)) {
83 TermMatch new_match(*term_iter);
84 new_match.offset = starting_offset;
85 new_match.length = ending_offset - starting_offset;
86 new_matches.push_back(new_match);
87 }
88 }
89 return new_matches;
90 }
91
92 // Utility Functions -----------------------------------------------------------
93
String16SetFromString16(const base::string16 & cleaned_uni_string,WordStarts * word_starts)94 String16Set String16SetFromString16(const base::string16& cleaned_uni_string,
95 WordStarts* word_starts) {
96 String16Vector words =
97 String16VectorFromString16(cleaned_uni_string, false, word_starts);
98 String16Set word_set;
99 for (String16Vector::const_iterator iter = words.begin(); iter != words.end();
100 ++iter)
101 word_set.insert(base::i18n::ToLower(*iter).substr(0, kMaxSignificantChars));
102 return word_set;
103 }
104
String16VectorFromString16(const base::string16 & cleaned_uni_string,bool break_on_space,WordStarts * word_starts)105 String16Vector String16VectorFromString16(
106 const base::string16& cleaned_uni_string,
107 bool break_on_space,
108 WordStarts* word_starts) {
109 if (word_starts)
110 word_starts->clear();
111 base::i18n::BreakIterator iter(cleaned_uni_string,
112 break_on_space ? base::i18n::BreakIterator::BREAK_SPACE :
113 base::i18n::BreakIterator::BREAK_WORD);
114 String16Vector words;
115 if (!iter.Init())
116 return words;
117 while (iter.Advance()) {
118 if (break_on_space || iter.IsWord()) {
119 base::string16 word(iter.GetString());
120 size_t initial_whitespace = 0;
121 if (break_on_space) {
122 base::string16 trimmed_word;
123 base::TrimWhitespace(word, base::TRIM_LEADING, &trimmed_word);
124 initial_whitespace = word.length() - trimmed_word.length();
125 base::TrimWhitespace(trimmed_word, base::TRIM_TRAILING, &word);
126 }
127 if (word.empty())
128 continue;
129 words.push_back(word);
130 if (!word_starts)
131 continue;
132 size_t word_start = iter.prev() + initial_whitespace;
133 if (word_start < kMaxSignificantChars)
134 word_starts->push_back(word_start);
135 }
136 }
137 return words;
138 }
139
Char16SetFromString16(const base::string16 & term)140 Char16Set Char16SetFromString16(const base::string16& term) {
141 Char16Set characters;
142 for (base::string16::const_iterator iter = term.begin(); iter != term.end();
143 ++iter)
144 characters.insert(*iter);
145 return characters;
146 }
147
148 // HistoryInfoMapValue ---------------------------------------------------------
149
HistoryInfoMapValue()150 HistoryInfoMapValue::HistoryInfoMapValue() {}
~HistoryInfoMapValue()151 HistoryInfoMapValue::~HistoryInfoMapValue() {}
152
153 // RowWordStarts ---------------------------------------------------------------
154
RowWordStarts()155 RowWordStarts::RowWordStarts() {}
~RowWordStarts()156 RowWordStarts::~RowWordStarts() {}
157
Clear()158 void RowWordStarts::Clear() {
159 url_word_starts_.clear();
160 title_word_starts_.clear();
161 }
162
163 } // namespace history
164