1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/browser/history/visitsegment_database.h"
6
7 #include <math.h>
8
9 #include <algorithm>
10 #include <string>
11 #include <vector>
12
13 #include "app/sql/statement.h"
14 #include "base/logging.h"
15 #include "base/stl_util-inl.h"
16 #include "base/string_util.h"
17 #include "base/utf_string_conversions.h"
18 #include "chrome/browser/history/page_usage_data.h"
19
20 // The following tables are used to store url segment information.
21 //
22 // segments
23 // id Primary key
24 // name A unique string to represent that segment. (URL derived)
25 // url_id ID of the url currently used to represent this segment.
26 // pres_index index used to store a fixed presentation position.
27 //
28 // segment_usage
29 // id Primary key
30 // segment_id Corresponding segment id
31 // time_slot time stamp identifying for what day this entry is about
32 // visit_count Number of visit in the segment
33 //
34
35 namespace history {
36
VisitSegmentDatabase()37 VisitSegmentDatabase::VisitSegmentDatabase() {
38 }
39
~VisitSegmentDatabase()40 VisitSegmentDatabase::~VisitSegmentDatabase() {
41 }
42
InitSegmentTables()43 bool VisitSegmentDatabase::InitSegmentTables() {
44 // Segments table.
45 if (!GetDB().DoesTableExist("segments")) {
46 if (!GetDB().Execute("CREATE TABLE segments ("
47 "id INTEGER PRIMARY KEY,"
48 "name VARCHAR,"
49 "url_id INTEGER NON NULL,"
50 "pres_index INTEGER DEFAULT -1 NOT NULL)")) {
51 NOTREACHED();
52 return false;
53 }
54
55 if (!GetDB().Execute("CREATE INDEX segments_name ON segments(name)")) {
56 NOTREACHED();
57 return false;
58 }
59 }
60
61 // This was added later, so we need to try to create it even if the table
62 // already exists.
63 GetDB().Execute("CREATE INDEX segments_url_id ON segments(url_id)");
64
65 // Segment usage table.
66 if (!GetDB().DoesTableExist("segment_usage")) {
67 if (!GetDB().Execute("CREATE TABLE segment_usage ("
68 "id INTEGER PRIMARY KEY,"
69 "segment_id INTEGER NOT NULL,"
70 "time_slot INTEGER NOT NULL,"
71 "visit_count INTEGER DEFAULT 0 NOT NULL)")) {
72 NOTREACHED();
73 return false;
74 }
75 if (!GetDB().Execute(
76 "CREATE INDEX segment_usage_time_slot_segment_id ON "
77 "segment_usage(time_slot, segment_id)")) {
78 NOTREACHED();
79 return false;
80 }
81 }
82
83 // Added in a later version, so we always need to try to creat this index.
84 GetDB().Execute("CREATE INDEX segments_usage_seg_id "
85 "ON segment_usage(segment_id)");
86
87 // Presentation index table.
88 //
89 // Important note:
90 // Right now, this table is only used to store the presentation index.
91 // If you need to add more columns, keep in mind that rows are currently
92 // deleted when the presentation index is changed to -1.
93 // See SetPagePresentationIndex() in this file
94 if (!GetDB().DoesTableExist("presentation")) {
95 if (!GetDB().Execute("CREATE TABLE presentation("
96 "url_id INTEGER PRIMARY KEY,"
97 "pres_index INTEGER NOT NULL)"))
98 return false;
99 }
100 return true;
101 }
102
DropSegmentTables()103 bool VisitSegmentDatabase::DropSegmentTables() {
104 // Dropping the tables will implicitly delete the indices.
105 return GetDB().Execute("DROP TABLE segments") &&
106 GetDB().Execute("DROP TABLE segment_usage");
107 }
108
109 // Note: the segment name is derived from the URL but is not a URL. It is
110 // a string that can be easily recreated from various URLS. Maybe this should
111 // be an MD5 to limit the length.
112 //
113 // static
ComputeSegmentName(const GURL & url)114 std::string VisitSegmentDatabase::ComputeSegmentName(const GURL& url) {
115 // TODO(brettw) this should probably use the registry controlled
116 // domains service.
117 GURL::Replacements r;
118 const char kWWWDot[] = "www.";
119 const int kWWWDotLen = arraysize(kWWWDot) - 1;
120
121 std::string host = url.host();
122 const char* host_c = host.c_str();
123 // Remove www. to avoid some dups.
124 if (static_cast<int>(host.size()) > kWWWDotLen &&
125 LowerCaseEqualsASCII(host_c, host_c + kWWWDotLen, kWWWDot)) {
126 r.SetHost(host.c_str(),
127 url_parse::Component(kWWWDotLen,
128 static_cast<int>(host.size()) - kWWWDotLen));
129 }
130 // Remove other stuff we don't want.
131 r.ClearUsername();
132 r.ClearPassword();
133 r.ClearQuery();
134 r.ClearRef();
135 r.ClearPort();
136
137 return url.ReplaceComponents(r).spec();
138 }
139
GetSegmentNamed(const std::string & segment_name)140 SegmentID VisitSegmentDatabase::GetSegmentNamed(
141 const std::string& segment_name) {
142 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
143 "SELECT id FROM segments WHERE name = ?"));
144 if (!statement)
145 return 0;
146
147 statement.BindString(0, segment_name);
148 if (statement.Step())
149 return statement.ColumnInt64(0);
150 return 0;
151 }
152
UpdateSegmentRepresentationURL(SegmentID segment_id,URLID url_id)153 bool VisitSegmentDatabase::UpdateSegmentRepresentationURL(SegmentID segment_id,
154 URLID url_id) {
155 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
156 "UPDATE segments SET url_id = ? WHERE id = ?"));
157 if (!statement)
158 return false;
159
160 statement.BindInt64(0, url_id);
161 statement.BindInt64(1, segment_id);
162 return statement.Run();
163 }
164
GetSegmentRepresentationURL(SegmentID segment_id)165 URLID VisitSegmentDatabase::GetSegmentRepresentationURL(SegmentID segment_id) {
166 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
167 "SELECT url_id FROM segments WHERE id = ?"));
168 if (!statement)
169 return 0;
170
171 statement.BindInt64(0, segment_id);
172 if (statement.Step())
173 return statement.ColumnInt64(0);
174 return 0;
175 }
176
CreateSegment(URLID url_id,const std::string & segment_name)177 SegmentID VisitSegmentDatabase::CreateSegment(URLID url_id,
178 const std::string& segment_name) {
179 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
180 "INSERT INTO segments (name, url_id) VALUES (?,?)"));
181 if (!statement)
182 return false;
183
184 statement.BindString(0, segment_name);
185 statement.BindInt64(1, url_id);
186 if (statement.Run())
187 return GetDB().GetLastInsertRowId();
188 return false;
189 }
190
IncreaseSegmentVisitCount(SegmentID segment_id,base::Time ts,int amount)191 bool VisitSegmentDatabase::IncreaseSegmentVisitCount(SegmentID segment_id,
192 base::Time ts,
193 int amount) {
194 base::Time t = ts.LocalMidnight();
195
196 sql::Statement select(GetDB().GetCachedStatement(SQL_FROM_HERE,
197 "SELECT id, visit_count FROM segment_usage "
198 "WHERE time_slot = ? AND segment_id = ?"));
199 if (!select)
200 return false;
201
202 select.BindInt64(0, t.ToInternalValue());
203 select.BindInt64(1, segment_id);
204 if (select.Step()) {
205 sql::Statement update(GetDB().GetCachedStatement(SQL_FROM_HERE,
206 "UPDATE segment_usage SET visit_count = ? WHERE id = ?"));
207 if (!update)
208 return false;
209
210 update.BindInt64(0, select.ColumnInt64(1) + static_cast<int64>(amount));
211 update.BindInt64(1, select.ColumnInt64(0));
212 return update.Run();
213
214 } else {
215 sql::Statement insert(GetDB().GetCachedStatement(SQL_FROM_HERE,
216 "INSERT INTO segment_usage "
217 "(segment_id, time_slot, visit_count) VALUES (?, ?, ?)"));
218 if (!insert)
219 return false;
220
221 insert.BindInt64(0, segment_id);
222 insert.BindInt64(1, t.ToInternalValue());
223 insert.BindInt64(2, static_cast<int64>(amount));
224 return insert.Run();
225 }
226 }
227
QuerySegmentUsage(base::Time from_time,int max_result_count,std::vector<PageUsageData * > * results)228 void VisitSegmentDatabase::QuerySegmentUsage(
229 base::Time from_time,
230 int max_result_count,
231 std::vector<PageUsageData*>* results) {
232 // This function gathers the highest-ranked segments in two queries.
233 // The first gathers scores for all segments.
234 // The second gathers segment data (url, title, etc.) for the highest-ranked
235 // segments.
236 // TODO(evanm): this disregards the "presentation index", which was what was
237 // used to lock results into position. But the rest of our code currently
238 // does as well.
239
240 // Gather all the segment scores.
241 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
242 "SELECT segment_id, time_slot, visit_count "
243 "FROM segment_usage WHERE time_slot >= ? "
244 "ORDER BY segment_id"));
245 if (!statement) {
246 NOTREACHED() << GetDB().GetErrorMessage();
247 return;
248 }
249
250 base::Time ts = from_time.LocalMidnight();
251 statement.BindInt64(0, ts.ToInternalValue());
252
253 base::Time now = base::Time::Now();
254 SegmentID last_segment_id = 0;
255 PageUsageData* pud = NULL;
256 float score = 0;
257 while (statement.Step()) {
258 SegmentID segment_id = statement.ColumnInt64(0);
259 if (segment_id != last_segment_id) {
260 if (pud) {
261 pud->SetScore(score);
262 results->push_back(pud);
263 }
264
265 pud = new PageUsageData(segment_id);
266 score = 0;
267 last_segment_id = segment_id;
268 }
269
270 base::Time timeslot =
271 base::Time::FromInternalValue(statement.ColumnInt64(1));
272 int visit_count = statement.ColumnInt(2);
273 int days_ago = (now - timeslot).InDays();
274
275 // Score for this day in isolation.
276 float day_visits_score = 1.0f + log(static_cast<float>(visit_count));
277 // Recent visits count more than historical ones, so we multiply in a boost
278 // related to how long ago this day was.
279 // This boost is a curve that smoothly goes through these values:
280 // Today gets 3x, a week ago 2x, three weeks ago 1.5x, falling off to 1x
281 // at the limit of how far we reach into the past.
282 float recency_boost = 1.0f + (2.0f * (1.0f / (1.0f + days_ago/7.0f)));
283 score += recency_boost * day_visits_score;
284 }
285
286 if (pud) {
287 pud->SetScore(score);
288 results->push_back(pud);
289 }
290
291 // Limit to the top kResultCount results.
292 sort(results->begin(), results->end(), PageUsageData::Predicate);
293 if (static_cast<int>(results->size()) > max_result_count) {
294 STLDeleteContainerPointers(results->begin() + max_result_count,
295 results->end());
296 results->resize(max_result_count);
297 }
298
299 // Now fetch the details about the entries we care about.
300 sql::Statement statement2(GetDB().GetCachedStatement(SQL_FROM_HERE,
301 "SELECT urls.url, urls.title FROM urls "
302 "JOIN segments ON segments.url_id = urls.id "
303 "WHERE segments.id = ?"));
304 if (!statement2) {
305 NOTREACHED() << GetDB().GetErrorMessage();
306 return;
307 }
308 for (size_t i = 0; i < results->size(); ++i) {
309 PageUsageData* pud = (*results)[i];
310 statement2.BindInt64(0, pud->GetID());
311 if (statement2.Step()) {
312 pud->SetURL(GURL(statement2.ColumnString(0)));
313 pud->SetTitle(UTF8ToUTF16(statement2.ColumnString(1)));
314 }
315 statement2.Reset();
316 }
317 }
318
DeleteSegmentData(base::Time older_than)319 void VisitSegmentDatabase::DeleteSegmentData(base::Time older_than) {
320 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
321 "DELETE FROM segment_usage WHERE time_slot < ?"));
322 if (!statement)
323 return;
324
325 statement.BindInt64(0, older_than.LocalMidnight().ToInternalValue());
326 if (!statement.Run())
327 NOTREACHED();
328 }
329
SetSegmentPresentationIndex(SegmentID segment_id,int index)330 void VisitSegmentDatabase::SetSegmentPresentationIndex(SegmentID segment_id,
331 int index) {
332 sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
333 "UPDATE segments SET pres_index = ? WHERE id = ?"));
334 if (!statement)
335 return;
336
337 statement.BindInt(0, index);
338 statement.BindInt64(1, segment_id);
339 if (!statement.Run())
340 NOTREACHED();
341 else
342 DCHECK_EQ(1, GetDB().GetLastChangeCount());
343 }
344
DeleteSegmentForURL(URLID url_id)345 bool VisitSegmentDatabase::DeleteSegmentForURL(URLID url_id) {
346 sql::Statement select(GetDB().GetCachedStatement(SQL_FROM_HERE,
347 "SELECT id FROM segments WHERE url_id = ?"));
348 if (!select)
349 return false;
350
351 sql::Statement delete_seg(GetDB().GetCachedStatement(SQL_FROM_HERE,
352 "DELETE FROM segments WHERE id = ?"));
353 if (!delete_seg)
354 return false;
355
356 sql::Statement delete_usage(GetDB().GetCachedStatement(SQL_FROM_HERE,
357 "DELETE FROM segment_usage WHERE segment_id = ?"));
358 if (!delete_usage)
359 return false;
360
361 bool r = true;
362 select.BindInt64(0, url_id);
363 // In theory there could not be more than one segment using that URL but we
364 // loop anyway to cleanup any inconsistency.
365 while (select.Step()) {
366 SegmentID segment_id = select.ColumnInt64(0);
367
368 delete_usage.BindInt64(0, segment_id);
369 if (!delete_usage.Run()) {
370 NOTREACHED();
371 r = false;
372 }
373
374 delete_seg.BindInt64(0, segment_id);
375 if (!delete_seg.Run()) {
376 NOTREACHED();
377 r = false;
378 }
379 delete_usage.Reset();
380 delete_seg.Reset();
381 }
382 return r;
383 }
384
385 } // namespace history
386