• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "chrome/browser/devtools/devtools_file_system_indexer.h"
6 
7 #include <iterator>
8 
9 #include "base/bind.h"
10 #include "base/callback.h"
11 #include "base/file_util.h"
12 #include "base/files/file_enumerator.h"
13 #include "base/files/file_util_proxy.h"
14 #include "base/lazy_instance.h"
15 #include "base/logging.h"
16 #include "base/platform_file.h"
17 #include "base/strings/utf_string_conversions.h"
18 #include "content/public/browser/browser_thread.h"
19 
20 using base::Bind;
21 using base::Callback;
22 using base::FileEnumerator;
23 using base::FilePath;
24 using base::FileUtilProxy;
25 using base::Time;
26 using base::TimeDelta;
27 using base::TimeTicks;
28 using base::PassPlatformFile;
29 using base::PlatformFile;
30 using base::PlatformFileError;
31 using base::PlatformFileInfo;
32 using content::BrowserThread;
33 using std::map;
34 using std::set;
35 using std::string;
36 using std::vector;
37 
38 namespace {
39 
40 typedef int32 Trigram;
41 typedef char TrigramChar;
42 typedef uint16 FileId;
43 
44 const int kMinTimeoutBetweenWorkedNitification = 200;
45 // Trigram characters include all ASCII printable characters (32-126) except for
46 // the capital letters, because the index is case insensitive.
47 const size_t kTrigramCharacterCount = 126 - 'Z' - 1 + 'A' - ' ' + 1;
48 const size_t kTrigramCount =
49     kTrigramCharacterCount * kTrigramCharacterCount * kTrigramCharacterCount;
50 const int kMaxReadLength = 10 * 1024;
51 const TrigramChar kUndefinedTrigramChar = -1;
52 const Trigram kUndefinedTrigram = -1;
53 
54 base::LazyInstance<vector<bool> >::Leaky g_is_binary_char =
55     LAZY_INSTANCE_INITIALIZER;
56 
57 base::LazyInstance<vector<TrigramChar> >::Leaky g_trigram_chars =
58     LAZY_INSTANCE_INITIALIZER;
59 
60 class Index {
61  public:
62   Index();
63   Time LastModifiedTimeForFile(const FilePath& file_path);
64   void SetTrigramsForFile(const FilePath& file_path,
65                           const vector<Trigram>& index,
66                           const Time& time);
67   vector<FilePath> Search(string query);
68   void PrintStats();
69   void NormalizeVectors();
70 
71  private:
72   ~Index();
73 
74   FileId GetFileId(const FilePath& file_path);
75 
76   typedef map<FilePath, FileId> FileIdsMap;
77   FileIdsMap file_ids_;
78   FileId last_file_id_;
79   // The index in this vector is the trigram id.
80   vector<vector<FileId> > index_;
81   typedef map<FilePath, Time> IndexedFilesMap;
82   IndexedFilesMap index_times_;
83   vector<bool> is_normalized_;
84 
85   DISALLOW_COPY_AND_ASSIGN(Index);
86 };
87 
88 base::LazyInstance<Index>::Leaky g_trigram_index = LAZY_INSTANCE_INITIALIZER;
89 
InitIsBinaryCharMap()90 void InitIsBinaryCharMap() {
91   for (size_t i = 0; i < 256; ++i) {
92     unsigned char ch = i;
93     bool is_binary_char = ch < 9 || (ch >= 14 && ch < 32) || ch == 127;
94     g_is_binary_char.Get().push_back(is_binary_char);
95   }
96 }
97 
IsBinaryChar(char c)98 bool IsBinaryChar(char c) {
99   unsigned char uc = static_cast<unsigned char>(c);
100   return g_is_binary_char.Get()[uc];
101 }
102 
InitTrigramCharsMap()103 void InitTrigramCharsMap() {
104   for (size_t i = 0; i < 256; ++i) {
105     if (i > 127) {
106       g_trigram_chars.Get().push_back(kUndefinedTrigramChar);
107       continue;
108     }
109     char ch = i;
110     if (ch == '\t')
111       ch = ' ';
112     if (ch >= 'A' && ch <= 'Z')
113       ch = ch - 'A' + 'a';
114     if ((IsBinaryChar(ch)) || (ch < ' ')) {
115       g_trigram_chars.Get().push_back(kUndefinedTrigramChar);
116       continue;
117     }
118 
119     if (ch >= 'Z')
120       ch = ch - 'Z' - 1 + 'A';
121     ch -= ' ';
122     char signed_trigram_char_count = static_cast<char>(kTrigramCharacterCount);
123     CHECK(ch >= 0 && ch < signed_trigram_char_count);
124     g_trigram_chars.Get().push_back(ch);
125   }
126 }
127 
TrigramCharForChar(char c)128 TrigramChar TrigramCharForChar(char c) {
129   unsigned char uc = static_cast<unsigned char>(c);
130   return g_trigram_chars.Get()[uc];
131 }
132 
TrigramAtIndex(const vector<TrigramChar> & trigram_chars,size_t index)133 Trigram TrigramAtIndex(const vector<TrigramChar>& trigram_chars, size_t index) {
134   static int kTrigramCharacterCountSquared =
135       kTrigramCharacterCount * kTrigramCharacterCount;
136   if (trigram_chars[index] == kUndefinedTrigramChar ||
137       trigram_chars[index + 1] == kUndefinedTrigramChar ||
138       trigram_chars[index + 2] == kUndefinedTrigramChar)
139     return kUndefinedTrigram;
140   Trigram trigram = kTrigramCharacterCountSquared * trigram_chars[index] +
141                     kTrigramCharacterCount * trigram_chars[index + 1] +
142                     trigram_chars[index + 2];
143   return trigram;
144 }
145 
Index()146 Index::Index() : last_file_id_(0) {
147   index_.resize(kTrigramCount);
148   is_normalized_.resize(kTrigramCount);
149   std::fill(is_normalized_.begin(), is_normalized_.end(), true);
150 }
151 
~Index()152 Index::~Index() {}
153 
LastModifiedTimeForFile(const FilePath & file_path)154 Time Index::LastModifiedTimeForFile(const FilePath& file_path) {
155   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
156   Time last_modified_time;
157   if (index_times_.find(file_path) != index_times_.end())
158     last_modified_time = index_times_[file_path];
159   return last_modified_time;
160 }
161 
SetTrigramsForFile(const FilePath & file_path,const vector<Trigram> & index,const Time & time)162 void Index::SetTrigramsForFile(const FilePath& file_path,
163                                const vector<Trigram>& index,
164                                const Time& time) {
165   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
166   FileId file_id = GetFileId(file_path);
167   vector<Trigram>::const_iterator it = index.begin();
168   for (; it != index.end(); ++it) {
169     Trigram trigram = *it;
170     index_[trigram].push_back(file_id);
171     is_normalized_[trigram] = false;
172   }
173   index_times_[file_path] = time;
174 }
175 
Search(string query)176 vector<FilePath> Index::Search(string query) {
177   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
178   const char* data = query.c_str();
179   vector<TrigramChar> trigram_chars;
180   trigram_chars.reserve(query.size());
181   for (size_t i = 0; i < query.size(); ++i)
182       trigram_chars.push_back(TrigramCharForChar(data[i]));
183   vector<Trigram> trigrams;
184   for (size_t i = 0; i + 2 < query.size(); ++i) {
185     Trigram trigram = TrigramAtIndex(trigram_chars, i);
186     if (trigram != kUndefinedTrigram)
187       trigrams.push_back(trigram);
188   }
189   set<FileId> file_ids;
190   bool first = true;
191   vector<Trigram>::const_iterator it = trigrams.begin();
192   for (; it != trigrams.end(); ++it) {
193     Trigram trigram = *it;
194     if (first) {
195       std::copy(index_[trigram].begin(),
196                 index_[trigram].end(),
197                 std::inserter(file_ids, file_ids.begin()));
198       first = false;
199       continue;
200     }
201     set<FileId> intersection;
202     std::set_intersection(file_ids.begin(),
203                           file_ids.end(),
204                           index_[trigram].begin(),
205                           index_[trigram].end(),
206                           std::inserter(intersection, intersection.begin()));
207     file_ids.swap(intersection);
208   }
209   vector<FilePath> result;
210   FileIdsMap::const_iterator ids_it = file_ids_.begin();
211   for (; ids_it != file_ids_.end(); ++ids_it) {
212     if (trigrams.size() == 0 ||
213         file_ids.find(ids_it->second) != file_ids.end()) {
214       result.push_back(ids_it->first);
215     }
216   }
217   return result;
218 }
219 
GetFileId(const FilePath & file_path)220 FileId Index::GetFileId(const FilePath& file_path) {
221   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
222   string file_path_str = file_path.AsUTF8Unsafe();
223   if (file_ids_.find(file_path) != file_ids_.end())
224     return file_ids_[file_path];
225   file_ids_[file_path] = ++last_file_id_;
226   return last_file_id_;
227 }
228 
NormalizeVectors()229 void Index::NormalizeVectors() {
230   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
231   for (size_t i = 0; i < kTrigramCount; ++i) {
232     if (!is_normalized_[i]) {
233       std::sort(index_[i].begin(), index_[i].end());
234       if (index_[i].capacity() > index_[i].size())
235         vector<FileId>(index_[i]).swap(index_[i]);
236       is_normalized_[i] = true;
237     }
238   }
239 }
240 
PrintStats()241 void Index::PrintStats() {
242   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
243   LOG(ERROR) << "Index stats:";
244   size_t size = 0;
245   size_t maxSize = 0;
246   size_t capacity = 0;
247   for (size_t i = 0; i < kTrigramCount; ++i) {
248     if (index_[i].size() > maxSize)
249       maxSize = index_[i].size();
250     size += index_[i].size();
251     capacity += index_[i].capacity();
252   }
253   LOG(ERROR) << "  - total trigram count: " << size;
254   LOG(ERROR) << "  - max file count per trigram: " << maxSize;
255   LOG(ERROR) << "  - total vectors capacity " << capacity;
256   size_t total_index_size =
257       capacity * sizeof(FileId) + sizeof(vector<FileId>) * kTrigramCount;
258   LOG(ERROR) << "  - estimated total index size " << total_index_size;
259 }
260 
261 typedef Callback<void(bool, const vector<bool>&)> IndexerCallback;
262 
263 }  // namespace
264 
FileSystemIndexingJob(const FilePath & file_system_path,const TotalWorkCallback & total_work_callback,const WorkedCallback & worked_callback,const DoneCallback & done_callback)265 DevToolsFileSystemIndexer::FileSystemIndexingJob::FileSystemIndexingJob(
266     const FilePath& file_system_path,
267     const TotalWorkCallback& total_work_callback,
268     const WorkedCallback& worked_callback,
269     const DoneCallback& done_callback)
270     : file_system_path_(file_system_path),
271       total_work_callback_(total_work_callback),
272       worked_callback_(worked_callback),
273       done_callback_(done_callback),
274       files_indexed_(0),
275       stopped_(false) {
276   current_trigrams_set_.resize(kTrigramCount);
277   current_trigrams_.reserve(kTrigramCount);
278 }
279 
~FileSystemIndexingJob()280 DevToolsFileSystemIndexer::FileSystemIndexingJob::~FileSystemIndexingJob() {}
281 
Start()282 void DevToolsFileSystemIndexer::FileSystemIndexingJob::Start() {
283   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
284   BrowserThread::PostTask(
285       BrowserThread::FILE,
286       FROM_HERE,
287       Bind(&FileSystemIndexingJob::CollectFilesToIndex, this));
288 }
289 
Stop()290 void DevToolsFileSystemIndexer::FileSystemIndexingJob::Stop() {
291   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
292   BrowserThread::PostTask(BrowserThread::FILE,
293                           FROM_HERE,
294                           Bind(&FileSystemIndexingJob::StopOnFileThread, this));
295 }
296 
StopOnFileThread()297 void DevToolsFileSystemIndexer::FileSystemIndexingJob::StopOnFileThread() {
298   stopped_ = true;
299 }
300 
CollectFilesToIndex()301 void DevToolsFileSystemIndexer::FileSystemIndexingJob::CollectFilesToIndex() {
302   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
303   if (stopped_)
304     return;
305   if (!file_enumerator_) {
306     file_enumerator_.reset(
307         new FileEnumerator(file_system_path_, true, FileEnumerator::FILES));
308   }
309   FilePath file_path = file_enumerator_->Next();
310   if (file_path.empty()) {
311     BrowserThread::PostTask(
312         BrowserThread::UI,
313         FROM_HERE,
314         Bind(total_work_callback_, file_path_times_.size()));
315     indexing_it_ = file_path_times_.begin();
316     IndexFiles();
317     return;
318   }
319   Time saved_last_modified_time =
320       g_trigram_index.Get().LastModifiedTimeForFile(file_path);
321   FileEnumerator::FileInfo file_info = file_enumerator_->GetInfo();
322   Time current_last_modified_time = file_info.GetLastModifiedTime();
323   if (current_last_modified_time > saved_last_modified_time) {
324     file_path_times_[file_path] = current_last_modified_time;
325   }
326   BrowserThread::PostTask(
327       BrowserThread::FILE,
328       FROM_HERE,
329       Bind(&FileSystemIndexingJob::CollectFilesToIndex, this));
330 }
331 
IndexFiles()332 void DevToolsFileSystemIndexer::FileSystemIndexingJob::IndexFiles() {
333   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
334   if (stopped_)
335     return;
336   if (indexing_it_ == file_path_times_.end()) {
337     g_trigram_index.Get().NormalizeVectors();
338     BrowserThread::PostTask(BrowserThread::UI, FROM_HERE, done_callback_);
339     return;
340   }
341   FilePath file_path = indexing_it_->first;
342   FileUtilProxy::CreateOrOpen(
343       BrowserThread::GetMessageLoopProxyForThread(BrowserThread::FILE).get(),
344       file_path,
345       base::PLATFORM_FILE_OPEN | base::PLATFORM_FILE_READ,
346       Bind(&FileSystemIndexingJob::StartFileIndexing, this));
347 }
348 
StartFileIndexing(PlatformFileError error,PassPlatformFile pass_file,bool)349 void DevToolsFileSystemIndexer::FileSystemIndexingJob::StartFileIndexing(
350     PlatformFileError error,
351     PassPlatformFile pass_file,
352     bool) {
353   if (error != base::PLATFORM_FILE_OK) {
354     current_file_ = base::kInvalidPlatformFileValue;
355     FinishFileIndexing(false);
356     return;
357   }
358   current_file_ = pass_file.ReleaseValue();
359   current_file_offset_ = 0;
360   current_trigrams_.clear();
361   std::fill(current_trigrams_set_.begin(), current_trigrams_set_.end(), false);
362   ReadFromFile();
363 }
364 
ReadFromFile()365 void DevToolsFileSystemIndexer::FileSystemIndexingJob::ReadFromFile() {
366   if (stopped_) {
367     CloseFile();
368     return;
369   }
370   FileUtilProxy::Read(
371       BrowserThread::GetMessageLoopProxyForThread(BrowserThread::FILE).get(),
372       current_file_,
373       current_file_offset_,
374       kMaxReadLength,
375       Bind(&FileSystemIndexingJob::OnRead, this));
376 }
377 
OnRead(PlatformFileError error,const char * data,int bytes_read)378 void DevToolsFileSystemIndexer::FileSystemIndexingJob::OnRead(
379     PlatformFileError error,
380     const char* data,
381     int bytes_read) {
382   if (error != base::PLATFORM_FILE_OK) {
383     FinishFileIndexing(false);
384     return;
385   }
386 
387   if (!bytes_read || bytes_read < 3) {
388     FinishFileIndexing(true);
389     return;
390   }
391 
392   size_t size = static_cast<size_t>(bytes_read);
393   vector<TrigramChar> trigram_chars;
394   trigram_chars.reserve(size);
395   for (size_t i = 0; i < size; ++i) {
396     if (IsBinaryChar(data[i])) {
397       current_trigrams_.clear();
398       FinishFileIndexing(true);
399       return;
400     }
401     trigram_chars.push_back(TrigramCharForChar(data[i]));
402   }
403 
404   for (size_t i = 0; i + 2 < size; ++i) {
405     Trigram trigram = TrigramAtIndex(trigram_chars, i);
406     if ((trigram != kUndefinedTrigram) && !current_trigrams_set_[trigram]) {
407       current_trigrams_set_[trigram] = true;
408       current_trigrams_.push_back(trigram);
409     }
410   }
411   current_file_offset_ += bytes_read - 2;
412   ReadFromFile();
413 }
414 
FinishFileIndexing(bool success)415 void DevToolsFileSystemIndexer::FileSystemIndexingJob::FinishFileIndexing(
416     bool success) {
417   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
418   CloseFile();
419   if (success) {
420     FilePath file_path = indexing_it_->first;
421     g_trigram_index.Get().SetTrigramsForFile(
422         file_path, current_trigrams_, file_path_times_[file_path]);
423   }
424   ReportWorked();
425   ++indexing_it_;
426   IndexFiles();
427 }
428 
CloseFile()429 void DevToolsFileSystemIndexer::FileSystemIndexingJob::CloseFile() {
430   if (current_file_ != base::kInvalidPlatformFileValue) {
431     FileUtilProxy::Close(
432         BrowserThread::GetMessageLoopProxyForThread(BrowserThread::FILE).get(),
433         current_file_,
434         Bind(&FileSystemIndexingJob::CloseCallback, this));
435   }
436 }
437 
CloseCallback(PlatformFileError error)438 void DevToolsFileSystemIndexer::FileSystemIndexingJob::CloseCallback(
439     PlatformFileError error) {}
440 
ReportWorked()441 void DevToolsFileSystemIndexer::FileSystemIndexingJob::ReportWorked() {
442   TimeTicks current_time = TimeTicks::Now();
443   bool should_send_worked_nitification = true;
444   if (!last_worked_notification_time_.is_null()) {
445     TimeDelta delta = current_time - last_worked_notification_time_;
446     if (delta.InMilliseconds() < kMinTimeoutBetweenWorkedNitification)
447       should_send_worked_nitification = false;
448   }
449   ++files_indexed_;
450   if (should_send_worked_nitification) {
451     last_worked_notification_time_ = current_time;
452     BrowserThread::PostTask(
453         BrowserThread::UI, FROM_HERE, Bind(worked_callback_, files_indexed_));
454     files_indexed_ = 0;
455   }
456 }
457 
DevToolsFileSystemIndexer()458 DevToolsFileSystemIndexer::DevToolsFileSystemIndexer() {
459   static bool maps_initialized = false;
460   if (!maps_initialized) {
461     InitIsBinaryCharMap();
462     InitTrigramCharsMap();
463     maps_initialized = true;
464   }
465 }
466 
~DevToolsFileSystemIndexer()467 DevToolsFileSystemIndexer::~DevToolsFileSystemIndexer() {}
468 
469 scoped_refptr<DevToolsFileSystemIndexer::FileSystemIndexingJob>
IndexPath(const string & file_system_path,const TotalWorkCallback & total_work_callback,const WorkedCallback & worked_callback,const DoneCallback & done_callback)470 DevToolsFileSystemIndexer::IndexPath(
471     const string& file_system_path,
472     const TotalWorkCallback& total_work_callback,
473     const WorkedCallback& worked_callback,
474     const DoneCallback& done_callback) {
475   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
476   scoped_refptr<FileSystemIndexingJob> indexing_job =
477       new FileSystemIndexingJob(FilePath::FromUTF8Unsafe(file_system_path),
478                                 total_work_callback,
479                                 worked_callback,
480                                 done_callback);
481   indexing_job->Start();
482   return indexing_job;
483 }
484 
SearchInPath(const string & file_system_path,const string & query,const SearchCallback & callback)485 void DevToolsFileSystemIndexer::SearchInPath(const string& file_system_path,
486                                              const string& query,
487                                              const SearchCallback& callback) {
488   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
489   BrowserThread::PostTask(
490       BrowserThread::FILE,
491       FROM_HERE,
492       Bind(&DevToolsFileSystemIndexer::SearchInPathOnFileThread,
493            this,
494            file_system_path,
495            query,
496            callback));
497 }
498 
SearchInPathOnFileThread(const string & file_system_path,const string & query,const SearchCallback & callback)499 void DevToolsFileSystemIndexer::SearchInPathOnFileThread(
500     const string& file_system_path,
501     const string& query,
502     const SearchCallback& callback) {
503   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
504   vector<FilePath> file_paths = g_trigram_index.Get().Search(query);
505   vector<string> result;
506   FilePath path = FilePath::FromUTF8Unsafe(file_system_path);
507   vector<FilePath>::const_iterator it = file_paths.begin();
508   for (; it != file_paths.end(); ++it) {
509     if (path.IsParent(*it))
510       result.push_back(it->AsUTF8Unsafe());
511   }
512   BrowserThread::PostTask(BrowserThread::UI, FROM_HERE, Bind(callback, result));
513 }
514