1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/browser/history/visitsegment_database.h"
6
7 #include <math.h>
8
9 #include <algorithm>
10 #include <string>
11 #include <vector>
12
13 #include "base/logging.h"
14 #include "base/stl_util.h"
15 #include "base/strings/string_util.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "chrome/browser/history/page_usage_data.h"
18 #include "sql/statement.h"
19 #include "sql/transaction.h"
20
21 // The following tables are used to store url segment information.
22 //
23 // segments
24 // id Primary key
25 // name A unique string to represent that segment. (URL derived)
26 // url_id ID of the url currently used to represent this segment.
27 //
28 // segment_usage
29 // id Primary key
30 // segment_id Corresponding segment id
31 // time_slot time stamp identifying for what day this entry is about
32 // visit_count Number of visit in the segment
33 //
34
35 namespace history {
36
VisitSegmentDatabase()37 VisitSegmentDatabase::VisitSegmentDatabase() {
38 }
39
~VisitSegmentDatabase()40 VisitSegmentDatabase::~VisitSegmentDatabase() {
41 }
42
InitSegmentTables()43 bool VisitSegmentDatabase::InitSegmentTables() {
44 // Segments table.
45 if (!GetDB().DoesTableExist("segments")) {
46 if (!GetDB().Execute("CREATE TABLE segments ("
47 "id INTEGER PRIMARY KEY,"
48 "name VARCHAR,"
49 "url_id INTEGER NON NULL)")) {
50 return false;
51 }
52
53 if (!GetDB().Execute(
54 "CREATE INDEX segments_name ON segments(name)")) {
55 return false;
56 }
57 }
58
59 // This was added later, so we need to try to create it even if the table
60 // already exists.
61 if (!GetDB().Execute("CREATE INDEX IF NOT EXISTS segments_url_id ON "
62 "segments(url_id)"))
63 return false;
64
65 // Segment usage table.
66 if (!GetDB().DoesTableExist("segment_usage")) {
67 if (!GetDB().Execute("CREATE TABLE segment_usage ("
68 "id INTEGER PRIMARY KEY,"
69 "segment_id INTEGER NOT NULL,"
70 "time_slot INTEGER NOT NULL,"
71 "visit_count INTEGER DEFAULT 0 NOT NULL)")) {
72 return false;
73 }
74 if (!GetDB().Execute(
75 "CREATE INDEX segment_usage_time_slot_segment_id ON "
76 "segment_usage(time_slot, segment_id)")) {
77 return false;
78 }
79 }
80
81 // Added in a later version, so we always need to try to creat this index.
82 if (!GetDB().Execute("CREATE INDEX IF NOT EXISTS segments_usage_seg_id "
83 "ON segment_usage(segment_id)"))
84 return false;
85
86 return true;
87 }
88
DropSegmentTables()89 bool VisitSegmentDatabase::DropSegmentTables() {
90 // Dropping the tables will implicitly delete the indices.
91 return GetDB().Execute("DROP TABLE segments") &&
92 GetDB().Execute("DROP TABLE segment_usage");
93 }
94
95 // Note: the segment name is derived from the URL but is not a URL. It is
96 // a string that can be easily recreated from various URLS. Maybe this should
97 // be an MD5 to limit the length.
98 //
99 // static
ComputeSegmentName(const GURL & url)100 std::string VisitSegmentDatabase::ComputeSegmentName(const GURL& url) {
101 // TODO(brettw) this should probably use the registry controlled
102 // domains service.
103 GURL::Replacements r;
104 const char kWWWDot[] = "www.";
105 const int kWWWDotLen = arraysize(kWWWDot) - 1;
106
107 std::string host = url.host();
108 const char* host_c = host.c_str();
109 // Remove www. to avoid some dups.
110 if (static_cast<int>(host.size()) > kWWWDotLen &&
111 LowerCaseEqualsASCII(host_c, host_c + kWWWDotLen, kWWWDot)) {
112 r.SetHost(host.c_str(),
113 url::Component(kWWWDotLen,
114 static_cast<int>(host.size()) - kWWWDotLen));
115 }
116 // Remove other stuff we don't want.
117 r.ClearUsername();
118 r.ClearPassword();
119 r.ClearQuery();
120 r.ClearRef();
121 r.ClearPort();
122
123 return url.ReplaceComponents(r).spec();
124 }
125
GetSegmentNamed(const std::string & segment_name)126 SegmentID VisitSegmentDatabase::GetSegmentNamed(
127 const std::string& segment_name) {
128 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
129 "SELECT id FROM segments WHERE name = ?"));
130 statement.BindString(0, segment_name);
131
132 if (statement.Step())
133 return statement.ColumnInt64(0);
134 return 0;
135 }
136
UpdateSegmentRepresentationURL(SegmentID segment_id,URLID url_id)137 bool VisitSegmentDatabase::UpdateSegmentRepresentationURL(SegmentID segment_id,
138 URLID url_id) {
139 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
140 "UPDATE segments SET url_id = ? WHERE id = ?"));
141 statement.BindInt64(0, url_id);
142 statement.BindInt64(1, segment_id);
143
144 return statement.Run();
145 }
146
GetSegmentRepresentationURL(SegmentID segment_id)147 URLID VisitSegmentDatabase::GetSegmentRepresentationURL(SegmentID segment_id) {
148 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
149 "SELECT url_id FROM segments WHERE id = ?"));
150 statement.BindInt64(0, segment_id);
151
152 if (statement.Step())
153 return statement.ColumnInt64(0);
154 return 0;
155 }
156
CreateSegment(URLID url_id,const std::string & segment_name)157 SegmentID VisitSegmentDatabase::CreateSegment(URLID url_id,
158 const std::string& segment_name) {
159 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
160 "INSERT INTO segments (name, url_id) VALUES (?,?)"));
161 statement.BindString(0, segment_name);
162 statement.BindInt64(1, url_id);
163
164 if (statement.Run())
165 return GetDB().GetLastInsertRowId();
166 return 0;
167 }
168
IncreaseSegmentVisitCount(SegmentID segment_id,base::Time ts,int amount)169 bool VisitSegmentDatabase::IncreaseSegmentVisitCount(SegmentID segment_id,
170 base::Time ts,
171 int amount) {
172 base::Time t = ts.LocalMidnight();
173
174 sql::Statement select(GetDB().GetCachedStatement(SQL_FROM_HERE,
175 "SELECT id, visit_count FROM segment_usage "
176 "WHERE time_slot = ? AND segment_id = ?"));
177 select.BindInt64(0, t.ToInternalValue());
178 select.BindInt64(1, segment_id);
179
180 if (!select.is_valid())
181 return false;
182
183 if (select.Step()) {
184 sql::Statement update(GetDB().GetCachedStatement(SQL_FROM_HERE,
185 "UPDATE segment_usage SET visit_count = ? WHERE id = ?"));
186 update.BindInt64(0, select.ColumnInt64(1) + static_cast<int64>(amount));
187 update.BindInt64(1, select.ColumnInt64(0));
188
189 return update.Run();
190 } else {
191 sql::Statement insert(GetDB().GetCachedStatement(SQL_FROM_HERE,
192 "INSERT INTO segment_usage "
193 "(segment_id, time_slot, visit_count) VALUES (?, ?, ?)"));
194 insert.BindInt64(0, segment_id);
195 insert.BindInt64(1, t.ToInternalValue());
196 insert.BindInt64(2, static_cast<int64>(amount));
197
198 return insert.Run();
199 }
200 }
201
QuerySegmentUsage(base::Time from_time,int max_result_count,std::vector<PageUsageData * > * results)202 void VisitSegmentDatabase::QuerySegmentUsage(
203 base::Time from_time,
204 int max_result_count,
205 std::vector<PageUsageData*>* results) {
206 // This function gathers the highest-ranked segments in two queries.
207 // The first gathers scores for all segments.
208 // The second gathers segment data (url, title, etc.) for the highest-ranked
209 // segments.
210
211 // Gather all the segment scores.
212 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
213 "SELECT segment_id, time_slot, visit_count "
214 "FROM segment_usage WHERE time_slot >= ? "
215 "ORDER BY segment_id"));
216 if (!statement.is_valid())
217 return;
218
219 base::Time ts = from_time.LocalMidnight();
220 statement.BindInt64(0, ts.ToInternalValue());
221
222 base::Time now = base::Time::Now();
223 SegmentID last_segment_id = 0;
224 PageUsageData* pud = NULL;
225 float score = 0;
226 while (statement.Step()) {
227 SegmentID segment_id = statement.ColumnInt64(0);
228 if (segment_id != last_segment_id) {
229 if (pud) {
230 pud->SetScore(score);
231 results->push_back(pud);
232 }
233
234 pud = new PageUsageData(segment_id);
235 score = 0;
236 last_segment_id = segment_id;
237 }
238
239 base::Time timeslot =
240 base::Time::FromInternalValue(statement.ColumnInt64(1));
241 int visit_count = statement.ColumnInt(2);
242 int days_ago = (now - timeslot).InDays();
243
244 // Score for this day in isolation.
245 float day_visits_score = 1.0f + log(static_cast<float>(visit_count));
246 // Recent visits count more than historical ones, so we multiply in a boost
247 // related to how long ago this day was.
248 // This boost is a curve that smoothly goes through these values:
249 // Today gets 3x, a week ago 2x, three weeks ago 1.5x, falling off to 1x
250 // at the limit of how far we reach into the past.
251 float recency_boost = 1.0f + (2.0f * (1.0f / (1.0f + days_ago/7.0f)));
252 score += recency_boost * day_visits_score;
253 }
254
255 if (pud) {
256 pud->SetScore(score);
257 results->push_back(pud);
258 }
259
260 // Limit to the top kResultCount results.
261 std::sort(results->begin(), results->end(), PageUsageData::Predicate);
262 if (static_cast<int>(results->size()) > max_result_count) {
263 STLDeleteContainerPointers(results->begin() + max_result_count,
264 results->end());
265 results->resize(max_result_count);
266 }
267
268 // Now fetch the details about the entries we care about.
269 sql::Statement statement2(GetDB().GetCachedStatement(SQL_FROM_HERE,
270 "SELECT urls.url, urls.title FROM urls "
271 "JOIN segments ON segments.url_id = urls.id "
272 "WHERE segments.id = ?"));
273
274 if (!statement2.is_valid())
275 return;
276
277 for (size_t i = 0; i < results->size(); ++i) {
278 PageUsageData* pud = (*results)[i];
279 statement2.BindInt64(0, pud->GetID());
280 if (statement2.Step()) {
281 pud->SetURL(GURL(statement2.ColumnString(0)));
282 pud->SetTitle(statement2.ColumnString16(1));
283 }
284 statement2.Reset(true);
285 }
286 }
287
DeleteSegmentData(base::Time older_than)288 bool VisitSegmentDatabase::DeleteSegmentData(base::Time older_than) {
289 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
290 "DELETE FROM segment_usage WHERE time_slot < ?"));
291 statement.BindInt64(0, older_than.LocalMidnight().ToInternalValue());
292
293 return statement.Run();
294 }
295
DeleteSegmentForURL(URLID url_id)296 bool VisitSegmentDatabase::DeleteSegmentForURL(URLID url_id) {
297 sql::Statement delete_usage(GetDB().GetCachedStatement(SQL_FROM_HERE,
298 "DELETE FROM segment_usage WHERE segment_id IN "
299 "(SELECT id FROM segments WHERE url_id = ?)"));
300 delete_usage.BindInt64(0, url_id);
301
302 if (!delete_usage.Run())
303 return false;
304
305 sql::Statement delete_seg(GetDB().GetCachedStatement(SQL_FROM_HERE,
306 "DELETE FROM segments WHERE url_id = ?"));
307 delete_seg.BindInt64(0, url_id);
308
309 return delete_seg.Run();
310 }
311
MigratePresentationIndex()312 bool VisitSegmentDatabase::MigratePresentationIndex() {
313 sql::Transaction transaction(&GetDB());
314 return transaction.Begin() &&
315 GetDB().Execute("DROP TABLE presentation") &&
316 GetDB().Execute("CREATE TABLE segments_tmp ("
317 "id INTEGER PRIMARY KEY,"
318 "name VARCHAR,"
319 "url_id INTEGER NON NULL)") &&
320 GetDB().Execute("INSERT INTO segments_tmp SELECT "
321 "id, name, url_id FROM segments") &&
322 GetDB().Execute("DROP TABLE segments") &&
323 GetDB().Execute("ALTER TABLE segments_tmp RENAME TO segments") &&
324 transaction.Commit();
325 }
326
327 } // namespace history
328