• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "chrome/browser/metrics/thread_watcher.h"
6 
7 #include <math.h>  // ceil
8 
9 #include "base/bind.h"
10 #include "base/compiler_specific.h"
11 #include "base/debug/alias.h"
12 #include "base/debug/dump_without_crashing.h"
13 #include "base/lazy_instance.h"
14 #include "base/metrics/field_trial.h"
15 #include "base/strings/string_number_conversions.h"
16 #include "base/strings/string_split.h"
17 #include "base/strings/string_tokenizer.h"
18 #include "base/strings/stringprintf.h"
19 #include "base/threading/thread_restrictions.h"
20 #include "build/build_config.h"
21 #include "chrome/browser/chrome_notification_types.h"
22 #include "chrome/common/chrome_switches.h"
23 #include "chrome/common/chrome_version_info.h"
24 #include "chrome/common/logging_chrome.h"
25 #include "content/public/browser/notification_service.h"
26 
27 #if defined(OS_WIN)
28 #include "base/win/windows_version.h"
29 #endif
30 
31 using content::BrowserThread;
32 
33 namespace {
34 
35 // The following are unique function names for forcing the crash when a thread
36 // is unresponsive. This makes it possible to tell from the callstack alone what
37 // thread was unresponsive.
38 //
39 // We disable optimizations for this block of functions so the compiler doesn't
40 // merge them all together.
41 MSVC_DISABLE_OPTIMIZE()
42 MSVC_PUSH_DISABLE_WARNING(4748)
43 
44 #ifndef NDEBUG
// Hands back a null int pointer. NullPointerCrash() stores through the result
// to force a crash.
int* NullPointer() {
  int* const crash_target = reinterpret_cast<int*>(NULL);
  return crash_target;
}
48 #endif
49 
NullPointerCrash(int line_number)50 void NullPointerCrash(int line_number) {
51 #ifndef NDEBUG
52   *NullPointer() = line_number;  // Crash.
53 #else
54   base::debug::DumpWithoutCrashing();
55 #endif
56 }
57 
58 #if !defined(OS_ANDROID) || !defined(NDEBUG)
59 // TODO(rtenneti): Enabled crashing, after getting data.
StartupCrash()60 NOINLINE void StartupCrash() {
61   NullPointerCrash(__LINE__);
62 }
63 #endif  // OS_ANDROID
64 
ShutdownCrash()65 NOINLINE void ShutdownCrash() {
66   NullPointerCrash(__LINE__);
67 }
68 
ThreadUnresponsive_UI()69 NOINLINE void ThreadUnresponsive_UI() {
70   NullPointerCrash(__LINE__);
71 }
72 
ThreadUnresponsive_DB()73 NOINLINE void ThreadUnresponsive_DB() {
74   NullPointerCrash(__LINE__);
75 }
76 
ThreadUnresponsive_FILE()77 NOINLINE void ThreadUnresponsive_FILE() {
78   NullPointerCrash(__LINE__);
79 }
80 
ThreadUnresponsive_FILE_USER_BLOCKING()81 NOINLINE void ThreadUnresponsive_FILE_USER_BLOCKING() {
82   NullPointerCrash(__LINE__);
83 }
84 
ThreadUnresponsive_PROCESS_LAUNCHER()85 NOINLINE void ThreadUnresponsive_PROCESS_LAUNCHER() {
86   NullPointerCrash(__LINE__);
87 }
88 
ThreadUnresponsive_CACHE()89 NOINLINE void ThreadUnresponsive_CACHE() {
90   NullPointerCrash(__LINE__);
91 }
92 
ThreadUnresponsive_IO()93 NOINLINE void ThreadUnresponsive_IO() {
94   NullPointerCrash(__LINE__);
95 }
96 
97 MSVC_POP_WARNING()
98 MSVC_ENABLE_OPTIMIZE();
99 
CrashBecauseThreadWasUnresponsive(BrowserThread::ID thread_id)100 void CrashBecauseThreadWasUnresponsive(BrowserThread::ID thread_id) {
101   base::debug::Alias(&thread_id);
102 
103   switch (thread_id) {
104     case BrowserThread::UI:
105       return ThreadUnresponsive_UI();
106     case BrowserThread::DB:
107       return ThreadUnresponsive_DB();
108     case BrowserThread::FILE:
109       return ThreadUnresponsive_FILE();
110     case BrowserThread::FILE_USER_BLOCKING:
111       return ThreadUnresponsive_FILE_USER_BLOCKING();
112     case BrowserThread::PROCESS_LAUNCHER:
113       return ThreadUnresponsive_PROCESS_LAUNCHER();
114     case BrowserThread::CACHE:
115       return ThreadUnresponsive_CACHE();
116     case BrowserThread::IO:
117       return ThreadUnresponsive_IO();
118     case BrowserThread::ID_COUNT:
119       CHECK(false);  // This shouldn't actually be reached!
120       break;
121 
122     // Omission of the default hander is intentional -- that way the compiler
123     // should warn if our switch becomes outdated.
124   }
125 
126   CHECK(false) << "Unknown thread was unresponsive.";  // Shouldn't be reached.
127 }
128 
129 }  // namespace
130 
131 // ThreadWatcher methods and members.
ThreadWatcher(const WatchingParams & params)132 ThreadWatcher::ThreadWatcher(const WatchingParams& params)
133     : thread_id_(params.thread_id),
134       thread_name_(params.thread_name),
135       watched_loop_(
136           BrowserThread::GetMessageLoopProxyForThread(params.thread_id)),
137       sleep_time_(params.sleep_time),
138       unresponsive_time_(params.unresponsive_time),
139       ping_time_(base::TimeTicks::Now()),
140       pong_time_(ping_time_),
141       ping_sequence_number_(0),
142       active_(false),
143       ping_count_(params.unresponsive_threshold),
144       response_time_histogram_(NULL),
145       unresponsive_time_histogram_(NULL),
146       unresponsive_count_(0),
147       hung_processing_complete_(false),
148       unresponsive_threshold_(params.unresponsive_threshold),
149       crash_on_hang_(params.crash_on_hang),
150       live_threads_threshold_(params.live_threads_threshold),
151       weak_ptr_factory_(this) {
152   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
153   Initialize();
154 }
155 
~ThreadWatcher()156 ThreadWatcher::~ThreadWatcher() {}
157 
158 // static
StartWatching(const WatchingParams & params)159 void ThreadWatcher::StartWatching(const WatchingParams& params) {
160   DCHECK_GE(params.sleep_time.InMilliseconds(), 0);
161   DCHECK_GE(params.unresponsive_time.InMilliseconds(),
162             params.sleep_time.InMilliseconds());
163 
164   // If we are not on WatchDogThread, then post a task to call StartWatching on
165   // WatchDogThread.
166   if (!WatchDogThread::CurrentlyOnWatchDogThread()) {
167     WatchDogThread::PostTask(
168         FROM_HERE,
169         base::Bind(&ThreadWatcher::StartWatching, params));
170     return;
171   }
172 
173   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
174 
175   // Create a new thread watcher object for the given thread and activate it.
176   ThreadWatcher* watcher = new ThreadWatcher(params);
177 
178   DCHECK(watcher);
179   // If we couldn't register the thread watcher object, we are shutting down,
180   // then don't activate thread watching.
181   if (!ThreadWatcherList::IsRegistered(params.thread_id))
182     return;
183   watcher->ActivateThreadWatching();
184 }
185 
ActivateThreadWatching()186 void ThreadWatcher::ActivateThreadWatching() {
187   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
188   if (active_) return;
189   active_ = true;
190   ping_count_ = unresponsive_threshold_;
191   ResetHangCounters();
192   base::MessageLoop::current()->PostTask(
193       FROM_HERE,
194       base::Bind(&ThreadWatcher::PostPingMessage,
195                  weak_ptr_factory_.GetWeakPtr()));
196 }
197 
DeActivateThreadWatching()198 void ThreadWatcher::DeActivateThreadWatching() {
199   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
200   active_ = false;
201   ping_count_ = 0;
202   weak_ptr_factory_.InvalidateWeakPtrs();
203 }
204 
WakeUp()205 void ThreadWatcher::WakeUp() {
206   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
207   // There is some user activity, PostPingMessage task of thread watcher if
208   // needed.
209   if (!active_) return;
210 
211   // Throw away the previous |unresponsive_count_| and start over again. Just
212   // before going to sleep, |unresponsive_count_| could be very close to
213   // |unresponsive_threshold_| and when user becomes active,
214   // |unresponsive_count_| can go over |unresponsive_threshold_| if there was no
215   // response for ping messages. Reset |unresponsive_count_| to start measuring
216   // the unresponsiveness of the threads when system becomes active.
217   unresponsive_count_ = 0;
218 
219   if (ping_count_ <= 0) {
220     ping_count_ = unresponsive_threshold_;
221     ResetHangCounters();
222     PostPingMessage();
223   } else {
224     ping_count_ = unresponsive_threshold_;
225   }
226 }
227 
PostPingMessage()228 void ThreadWatcher::PostPingMessage() {
229   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
230   // If we have stopped watching or if the user is idle, then stop sending
231   // ping messages.
232   if (!active_ || ping_count_ <= 0)
233     return;
234 
235   // Save the current time when we have sent ping message.
236   ping_time_ = base::TimeTicks::Now();
237 
238   // Send a ping message to the watched thread. Callback will be called on
239   // the WatchDogThread.
240   base::Closure callback(
241       base::Bind(&ThreadWatcher::OnPongMessage, weak_ptr_factory_.GetWeakPtr(),
242                  ping_sequence_number_));
243   if (watched_loop_->PostTask(
244           FROM_HERE,
245           base::Bind(&ThreadWatcher::OnPingMessage, thread_id_,
246                      callback))) {
247       // Post a task to check the responsiveness of watched thread.
248       base::MessageLoop::current()->PostDelayedTask(
249           FROM_HERE,
250           base::Bind(&ThreadWatcher::OnCheckResponsiveness,
251                      weak_ptr_factory_.GetWeakPtr(), ping_sequence_number_),
252           unresponsive_time_);
253   } else {
254     // Watched thread might have gone away, stop watching it.
255     DeActivateThreadWatching();
256   }
257 }
258 
OnPongMessage(uint64 ping_sequence_number)259 void ThreadWatcher::OnPongMessage(uint64 ping_sequence_number) {
260   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
261 
262   // Record watched thread's response time.
263   base::TimeTicks now = base::TimeTicks::Now();
264   base::TimeDelta response_time = now - ping_time_;
265   response_time_histogram_->AddTime(response_time);
266 
267   // Save the current time when we have got pong message.
268   pong_time_ = now;
269 
270   // Check if there are any extra pings in flight.
271   DCHECK_EQ(ping_sequence_number_, ping_sequence_number);
272   if (ping_sequence_number_ != ping_sequence_number)
273     return;
274 
275   // Increment sequence number for the next ping message to indicate watched
276   // thread is responsive.
277   ++ping_sequence_number_;
278 
279   // If we have stopped watching or if the user is idle, then stop sending
280   // ping messages.
281   if (!active_ || --ping_count_ <= 0)
282     return;
283 
284   base::MessageLoop::current()->PostDelayedTask(
285       FROM_HERE,
286       base::Bind(&ThreadWatcher::PostPingMessage,
287                  weak_ptr_factory_.GetWeakPtr()),
288       sleep_time_);
289 }
290 
OnCheckResponsiveness(uint64 ping_sequence_number)291 void ThreadWatcher::OnCheckResponsiveness(uint64 ping_sequence_number) {
292   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
293   // If we have stopped watching then consider thread as responding.
294   if (!active_) {
295     responsive_ = true;
296     return;
297   }
298   // If the latest ping_sequence_number_ is not same as the ping_sequence_number
299   // that is passed in, then we can assume OnPongMessage was called.
300   // OnPongMessage increments ping_sequence_number_.
301   if (ping_sequence_number_ != ping_sequence_number) {
302     // Reset unresponsive_count_ to zero because we got a response from the
303     // watched thread.
304     ResetHangCounters();
305 
306     responsive_ = true;
307     return;
308   }
309   // Record that we got no response from watched thread.
310   GotNoResponse();
311 
312   // Post a task to check the responsiveness of watched thread.
313   base::MessageLoop::current()->PostDelayedTask(
314       FROM_HERE,
315       base::Bind(&ThreadWatcher::OnCheckResponsiveness,
316                  weak_ptr_factory_.GetWeakPtr(), ping_sequence_number_),
317       unresponsive_time_);
318   responsive_ = false;
319 }
320 
Initialize()321 void ThreadWatcher::Initialize() {
322   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
323   ThreadWatcherList::Register(this);
324 
325   const std::string response_time_histogram_name =
326       "ThreadWatcher.ResponseTime." + thread_name_;
327   response_time_histogram_ = base::Histogram::FactoryTimeGet(
328       response_time_histogram_name,
329       base::TimeDelta::FromMilliseconds(1),
330       base::TimeDelta::FromSeconds(100), 50,
331       base::Histogram::kUmaTargetedHistogramFlag);
332 
333   const std::string unresponsive_time_histogram_name =
334       "ThreadWatcher.Unresponsive." + thread_name_;
335   unresponsive_time_histogram_ = base::Histogram::FactoryTimeGet(
336       unresponsive_time_histogram_name,
337       base::TimeDelta::FromMilliseconds(1),
338       base::TimeDelta::FromSeconds(100), 50,
339       base::Histogram::kUmaTargetedHistogramFlag);
340 
341   const std::string responsive_count_histogram_name =
342       "ThreadWatcher.ResponsiveThreads." + thread_name_;
343   responsive_count_histogram_ = base::LinearHistogram::FactoryGet(
344       responsive_count_histogram_name, 1, 10, 11,
345       base::Histogram::kUmaTargetedHistogramFlag);
346 
347   const std::string unresponsive_count_histogram_name =
348       "ThreadWatcher.UnresponsiveThreads." + thread_name_;
349   unresponsive_count_histogram_ = base::LinearHistogram::FactoryGet(
350       unresponsive_count_histogram_name, 1, 10, 11,
351       base::Histogram::kUmaTargetedHistogramFlag);
352 }
353 
354 // static
OnPingMessage(const BrowserThread::ID & thread_id,const base::Closure & callback_task)355 void ThreadWatcher::OnPingMessage(const BrowserThread::ID& thread_id,
356                                   const base::Closure& callback_task) {
357   // This method is called on watched thread.
358   DCHECK(BrowserThread::CurrentlyOn(thread_id));
359   WatchDogThread::PostTask(FROM_HERE, callback_task);
360 }
361 
ResetHangCounters()362 void ThreadWatcher::ResetHangCounters() {
363   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
364   unresponsive_count_ = 0;
365   hung_processing_complete_ = false;
366 }
367 
GotNoResponse()368 void ThreadWatcher::GotNoResponse() {
369   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
370 
371   ++unresponsive_count_;
372   if (!IsVeryUnresponsive())
373     return;
374 
375   // Record total unresponsive_time since last pong message.
376   base::TimeDelta unresponse_time = base::TimeTicks::Now() - pong_time_;
377   unresponsive_time_histogram_->AddTime(unresponse_time);
378 
379   // We have already collected stats for the non-responding watched thread.
380   if (hung_processing_complete_)
381     return;
382 
383   // Record how other threads are responding.
384   uint32 responding_thread_count = 0;
385   uint32 unresponding_thread_count = 0;
386   ThreadWatcherList::GetStatusOfThreads(&responding_thread_count,
387                                         &unresponding_thread_count);
388 
389   // Record how many watched threads are responding.
390   responsive_count_histogram_->Add(responding_thread_count);
391 
392   // Record how many watched threads are not responding.
393   unresponsive_count_histogram_->Add(unresponding_thread_count);
394 
395   // Crash the browser if the watched thread is to be crashed on hang and if the
396   // number of other threads responding is less than or equal to
397   // live_threads_threshold_ and at least one other thread is responding.
398   if (crash_on_hang_ &&
399       responding_thread_count > 0 &&
400       responding_thread_count <= live_threads_threshold_) {
401     static bool crashed_once = false;
402     if (!crashed_once) {
403       crashed_once = true;
404       CrashBecauseThreadWasUnresponsive(thread_id_);
405     }
406   }
407 
408   hung_processing_complete_ = true;
409 }
410 
IsVeryUnresponsive()411 bool ThreadWatcher::IsVeryUnresponsive() {
412   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
413   return unresponsive_count_ >= unresponsive_threshold_;
414 }
415 
416 // ThreadWatcherList methods and members.
417 //
418 // static
419 ThreadWatcherList* ThreadWatcherList::g_thread_watcher_list_ = NULL;
420 // static
421 bool ThreadWatcherList::g_stopped_ = false;
422 // static
423 const int ThreadWatcherList::kSleepSeconds = 1;
424 // static
425 const int ThreadWatcherList::kUnresponsiveSeconds = 2;
426 // static
427 const int ThreadWatcherList::kUnresponsiveCount = 9;
428 // static
429 const int ThreadWatcherList::kLiveThreadsThreshold = 2;
430 // static, non-const for tests.
431 int ThreadWatcherList::g_initialize_delay_seconds = 120;
432 
CrashDataThresholds(uint32 live_threads_threshold,uint32 unresponsive_threshold)433 ThreadWatcherList::CrashDataThresholds::CrashDataThresholds(
434     uint32 live_threads_threshold,
435     uint32 unresponsive_threshold)
436     : live_threads_threshold(live_threads_threshold),
437       unresponsive_threshold(unresponsive_threshold) {
438 }
439 
CrashDataThresholds()440 ThreadWatcherList::CrashDataThresholds::CrashDataThresholds()
441     : live_threads_threshold(kLiveThreadsThreshold),
442       unresponsive_threshold(kUnresponsiveCount) {
443 }
444 
445 // static
StartWatchingAll(const CommandLine & command_line)446 void ThreadWatcherList::StartWatchingAll(const CommandLine& command_line) {
447   // TODO(rtenneti): Enable ThreadWatcher.
448   uint32 unresponsive_threshold;
449   CrashOnHangThreadMap crash_on_hang_threads;
450   ParseCommandLine(command_line,
451                    &unresponsive_threshold,
452                    &crash_on_hang_threads);
453 
454   ThreadWatcherObserver::SetupNotifications(
455       base::TimeDelta::FromSeconds(kSleepSeconds * unresponsive_threshold));
456 
457   WatchDogThread::PostTask(
458       FROM_HERE,
459       base::Bind(&ThreadWatcherList::SetStopped, false));
460 
461   WatchDogThread::PostDelayedTask(
462       FROM_HERE,
463       base::Bind(&ThreadWatcherList::InitializeAndStartWatching,
464                  unresponsive_threshold,
465                  crash_on_hang_threads),
466       base::TimeDelta::FromSeconds(g_initialize_delay_seconds));
467 }
468 
469 // static
StopWatchingAll()470 void ThreadWatcherList::StopWatchingAll() {
471   // TODO(rtenneti): Enable ThreadWatcher.
472   ThreadWatcherObserver::RemoveNotifications();
473   DeleteAll();
474 }
475 
476 // static
Register(ThreadWatcher * watcher)477 void ThreadWatcherList::Register(ThreadWatcher* watcher) {
478   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
479   if (!g_thread_watcher_list_)
480     return;
481   DCHECK(!g_thread_watcher_list_->Find(watcher->thread_id()));
482   g_thread_watcher_list_->registered_[watcher->thread_id()] = watcher;
483 }
484 
485 // static
IsRegistered(const BrowserThread::ID thread_id)486 bool ThreadWatcherList::IsRegistered(const BrowserThread::ID thread_id) {
487   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
488   return NULL != ThreadWatcherList::Find(thread_id);
489 }
490 
491 // static
GetStatusOfThreads(uint32 * responding_thread_count,uint32 * unresponding_thread_count)492 void ThreadWatcherList::GetStatusOfThreads(uint32* responding_thread_count,
493                                            uint32* unresponding_thread_count) {
494   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
495   *responding_thread_count = 0;
496   *unresponding_thread_count = 0;
497   if (!g_thread_watcher_list_)
498     return;
499 
500   for (RegistrationList::iterator it =
501            g_thread_watcher_list_->registered_.begin();
502        g_thread_watcher_list_->registered_.end() != it;
503        ++it) {
504     if (it->second->IsVeryUnresponsive())
505       ++(*unresponding_thread_count);
506     else
507       ++(*responding_thread_count);
508   }
509 }
510 
511 // static
WakeUpAll()512 void ThreadWatcherList::WakeUpAll() {
513   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
514   if (!g_thread_watcher_list_)
515     return;
516 
517   for (RegistrationList::iterator it =
518            g_thread_watcher_list_->registered_.begin();
519        g_thread_watcher_list_->registered_.end() != it;
520        ++it)
521     it->second->WakeUp();
522 }
523 
ThreadWatcherList()524 ThreadWatcherList::ThreadWatcherList() {
525   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
526   CHECK(!g_thread_watcher_list_);
527   g_thread_watcher_list_ = this;
528 }
529 
~ThreadWatcherList()530 ThreadWatcherList::~ThreadWatcherList() {
531   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
532   DCHECK(this == g_thread_watcher_list_);
533   g_thread_watcher_list_ = NULL;
534 }
535 
536 // static
ParseCommandLine(const CommandLine & command_line,uint32 * unresponsive_threshold,CrashOnHangThreadMap * crash_on_hang_threads)537 void ThreadWatcherList::ParseCommandLine(
538     const CommandLine& command_line,
539     uint32* unresponsive_threshold,
540     CrashOnHangThreadMap* crash_on_hang_threads) {
541   // Initialize |unresponsive_threshold| to a default value.
542   *unresponsive_threshold = kUnresponsiveCount;
543 
544   // Increase the unresponsive_threshold on the Stable and Beta channels to
545   // reduce the number of crashes due to ThreadWatcher.
546   chrome::VersionInfo::Channel channel = chrome::VersionInfo::GetChannel();
547   if (channel == chrome::VersionInfo::CHANNEL_STABLE) {
548     *unresponsive_threshold *= 4;
549   } else if (channel == chrome::VersionInfo::CHANNEL_BETA) {
550     *unresponsive_threshold *= 2;
551   }
552 
553 #if defined(OS_WIN)
554   // For Windows XP (old systems), double the unresponsive_threshold to give
555   // the OS a chance to schedule UI/IO threads a time slice to respond with a
556   // pong message (to get around limitations with the OS).
557   if (base::win::GetVersion() <= base::win::VERSION_XP)
558     *unresponsive_threshold *= 2;
559 #endif
560 
561   uint32 crash_seconds = *unresponsive_threshold * kUnresponsiveSeconds;
562   std::string crash_on_hang_thread_names;
563   bool has_command_line_overwrite = false;
564   if (command_line.HasSwitch(switches::kCrashOnHangThreads)) {
565     crash_on_hang_thread_names =
566         command_line.GetSwitchValueASCII(switches::kCrashOnHangThreads);
567     has_command_line_overwrite = true;
568   } else if (channel != chrome::VersionInfo::CHANNEL_STABLE) {
569     // Default to crashing the browser if UI or IO or FILE threads are not
570     // responsive except in stable channel.
571     crash_on_hang_thread_names = base::StringPrintf(
572         "UI:%d:%d,IO:%d:%d,FILE:%d:%d",
573         kLiveThreadsThreshold, crash_seconds,
574         kLiveThreadsThreshold, crash_seconds,
575         kLiveThreadsThreshold, crash_seconds * 5);
576   }
577 
578   ParseCommandLineCrashOnHangThreads(crash_on_hang_thread_names,
579                                      kLiveThreadsThreshold,
580                                      crash_seconds,
581                                      crash_on_hang_threads);
582 
583   if (channel != chrome::VersionInfo::CHANNEL_CANARY ||
584       has_command_line_overwrite) {
585     return;
586   }
587 
588   const char* kFieldTrialName = "ThreadWatcher";
589 
590   // Nothing else to be done if the trial has already been set (i.e., when
591   // StartWatchingAll() has been already called once).
592   if (base::FieldTrialList::TrialExists(kFieldTrialName))
593     return;
594 
595   // Set up a field trial for 100% of the users to crash if either UI or IO
596   // thread is not responsive for 30 seconds (or 15 pings).
597   scoped_refptr<base::FieldTrial> field_trial(
598       base::FieldTrialList::FactoryGetFieldTrial(
599           kFieldTrialName, 100, "default_hung_threads",
600           2014, 10, 30, base::FieldTrial::SESSION_RANDOMIZED, NULL));
601   int hung_thread_group = field_trial->AppendGroup("hung_thread", 100);
602   if (field_trial->group() == hung_thread_group) {
603     for (CrashOnHangThreadMap::iterator it = crash_on_hang_threads->begin();
604          crash_on_hang_threads->end() != it;
605          ++it) {
606       if (it->first == "FILE")
607         continue;
608       it->second.live_threads_threshold = INT_MAX;
609       if (it->first == "UI") {
610         // TODO(rtenneti): set unresponsive threshold to 120 seconds to catch
611         // the worst UI hangs and for fewer crashes due to ThreadWatcher. Reduce
612         // it to a more reasonable time ala IO thread.
613         it->second.unresponsive_threshold = 60;
614       } else {
615         it->second.unresponsive_threshold = 15;
616       }
617     }
618   }
619 }
620 
621 // static
ParseCommandLineCrashOnHangThreads(const std::string & crash_on_hang_thread_names,uint32 default_live_threads_threshold,uint32 default_crash_seconds,CrashOnHangThreadMap * crash_on_hang_threads)622 void ThreadWatcherList::ParseCommandLineCrashOnHangThreads(
623     const std::string& crash_on_hang_thread_names,
624     uint32 default_live_threads_threshold,
625     uint32 default_crash_seconds,
626     CrashOnHangThreadMap* crash_on_hang_threads) {
627   base::StringTokenizer tokens(crash_on_hang_thread_names, ",");
628   std::vector<std::string> values;
629   while (tokens.GetNext()) {
630     const std::string& token = tokens.token();
631     base::SplitString(token, ':', &values);
632     std::string thread_name = values[0];
633 
634     uint32 live_threads_threshold = default_live_threads_threshold;
635     uint32 crash_seconds = default_crash_seconds;
636     if (values.size() >= 2 &&
637         (!base::StringToUint(values[1], &live_threads_threshold))) {
638       continue;
639     }
640     if (values.size() >= 3 &&
641         (!base::StringToUint(values[2], &crash_seconds))) {
642       continue;
643     }
644     uint32 unresponsive_threshold = static_cast<uint32>(
645         ceil(static_cast<float>(crash_seconds) / kUnresponsiveSeconds));
646 
647     CrashDataThresholds crash_data(live_threads_threshold,
648                                    unresponsive_threshold);
649     // Use the last specifier.
650     (*crash_on_hang_threads)[thread_name] = crash_data;
651   }
652 }
653 
654 // static
InitializeAndStartWatching(uint32 unresponsive_threshold,const CrashOnHangThreadMap & crash_on_hang_threads)655 void ThreadWatcherList::InitializeAndStartWatching(
656     uint32 unresponsive_threshold,
657     const CrashOnHangThreadMap& crash_on_hang_threads) {
658   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
659 
660   // Disarm the startup timebomb, even if stop has been called.
661   BrowserThread::PostTask(
662       BrowserThread::UI,
663       FROM_HERE,
664       base::Bind(&StartupTimeBomb::DisarmStartupTimeBomb));
665 
666   // This method is deferred in relationship to its StopWatchingAll()
667   // counterpart. If a previous initialization has already happened, or if
668   // stop has been called, there's nothing left to do here.
669   if (g_thread_watcher_list_ || g_stopped_)
670     return;
671 
672   ThreadWatcherList* thread_watcher_list = new ThreadWatcherList();
673   CHECK(thread_watcher_list);
674 
675   const base::TimeDelta kSleepTime =
676       base::TimeDelta::FromSeconds(kSleepSeconds);
677   const base::TimeDelta kUnresponsiveTime =
678       base::TimeDelta::FromSeconds(kUnresponsiveSeconds);
679 
680   StartWatching(BrowserThread::UI, "UI", kSleepTime, kUnresponsiveTime,
681                 unresponsive_threshold, crash_on_hang_threads);
682   StartWatching(BrowserThread::IO, "IO", kSleepTime, kUnresponsiveTime,
683                 unresponsive_threshold, crash_on_hang_threads);
684   StartWatching(BrowserThread::DB, "DB", kSleepTime, kUnresponsiveTime,
685                 unresponsive_threshold, crash_on_hang_threads);
686   StartWatching(BrowserThread::FILE, "FILE", kSleepTime, kUnresponsiveTime,
687                 unresponsive_threshold, crash_on_hang_threads);
688   StartWatching(BrowserThread::CACHE, "CACHE", kSleepTime, kUnresponsiveTime,
689                 unresponsive_threshold, crash_on_hang_threads);
690 }
691 
692 // static
StartWatching(const BrowserThread::ID & thread_id,const std::string & thread_name,const base::TimeDelta & sleep_time,const base::TimeDelta & unresponsive_time,uint32 unresponsive_threshold,const CrashOnHangThreadMap & crash_on_hang_threads)693 void ThreadWatcherList::StartWatching(
694     const BrowserThread::ID& thread_id,
695     const std::string& thread_name,
696     const base::TimeDelta& sleep_time,
697     const base::TimeDelta& unresponsive_time,
698     uint32 unresponsive_threshold,
699     const CrashOnHangThreadMap& crash_on_hang_threads) {
700   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
701 
702   CrashOnHangThreadMap::const_iterator it =
703       crash_on_hang_threads.find(thread_name);
704   bool crash_on_hang = false;
705   uint32 live_threads_threshold = 0;
706   if (it != crash_on_hang_threads.end()) {
707     crash_on_hang = true;
708     live_threads_threshold = it->second.live_threads_threshold;
709     unresponsive_threshold = it->second.unresponsive_threshold;
710   }
711 
712   ThreadWatcher::StartWatching(
713       ThreadWatcher::WatchingParams(thread_id,
714                                     thread_name,
715                                     sleep_time,
716                                     unresponsive_time,
717                                     unresponsive_threshold,
718                                     crash_on_hang,
719                                     live_threads_threshold));
720 }
721 
722 // static
DeleteAll()723 void ThreadWatcherList::DeleteAll() {
724   if (!WatchDogThread::CurrentlyOnWatchDogThread()) {
725     WatchDogThread::PostTask(
726         FROM_HERE,
727         base::Bind(&ThreadWatcherList::DeleteAll));
728     return;
729   }
730 
731   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
732 
733   SetStopped(true);
734 
735   if (!g_thread_watcher_list_)
736     return;
737 
738   // Delete all thread watcher objects.
739   while (!g_thread_watcher_list_->registered_.empty()) {
740     RegistrationList::iterator it = g_thread_watcher_list_->registered_.begin();
741     delete it->second;
742     g_thread_watcher_list_->registered_.erase(it);
743   }
744 
745   delete g_thread_watcher_list_;
746 }
747 
748 // static
Find(const BrowserThread::ID & thread_id)749 ThreadWatcher* ThreadWatcherList::Find(const BrowserThread::ID& thread_id) {
750   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
751   if (!g_thread_watcher_list_)
752     return NULL;
753   RegistrationList::iterator it =
754       g_thread_watcher_list_->registered_.find(thread_id);
755   if (g_thread_watcher_list_->registered_.end() == it)
756     return NULL;
757   return it->second;
758 }
759 
760 // static
SetStopped(bool stopped)761 void ThreadWatcherList::SetStopped(bool stopped) {
762   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
763   g_stopped_ = stopped;
764 }
765 
766 // ThreadWatcherObserver methods and members.
767 //
768 // static
769 ThreadWatcherObserver* ThreadWatcherObserver::g_thread_watcher_observer_ = NULL;
770 
ThreadWatcherObserver(const base::TimeDelta & wakeup_interval)771 ThreadWatcherObserver::ThreadWatcherObserver(
772     const base::TimeDelta& wakeup_interval)
773     : last_wakeup_time_(base::TimeTicks::Now()),
774       wakeup_interval_(wakeup_interval) {
775   CHECK(!g_thread_watcher_observer_);
776   g_thread_watcher_observer_ = this;
777 }
778 
~ThreadWatcherObserver()779 ThreadWatcherObserver::~ThreadWatcherObserver() {
780   DCHECK(this == g_thread_watcher_observer_);
781   g_thread_watcher_observer_ = NULL;
782 }
783 
784 // static
SetupNotifications(const base::TimeDelta & wakeup_interval)785 void ThreadWatcherObserver::SetupNotifications(
786     const base::TimeDelta& wakeup_interval) {
787   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
788   ThreadWatcherObserver* observer = new ThreadWatcherObserver(wakeup_interval);
789   observer->registrar_.Add(
790       observer,
791       chrome::NOTIFICATION_BROWSER_OPENED,
792       content::NotificationService::AllBrowserContextsAndSources());
793   observer->registrar_.Add(observer,
794                            chrome::NOTIFICATION_BROWSER_CLOSED,
795                            content::NotificationService::AllSources());
796   observer->registrar_.Add(observer,
797                            chrome::NOTIFICATION_TAB_PARENTED,
798                            content::NotificationService::AllSources());
799   observer->registrar_.Add(observer,
800                            chrome::NOTIFICATION_TAB_CLOSING,
801                            content::NotificationService::AllSources());
802   observer->registrar_.Add(observer,
803                            content::NOTIFICATION_LOAD_START,
804                            content::NotificationService::AllSources());
805   observer->registrar_.Add(observer,
806                            content::NOTIFICATION_LOAD_STOP,
807                            content::NotificationService::AllSources());
808   observer->registrar_.Add(observer,
809                            content::NOTIFICATION_RENDERER_PROCESS_CLOSED,
810                            content::NotificationService::AllSources());
811   observer->registrar_.Add(observer,
812                            content::NOTIFICATION_RENDER_WIDGET_HOST_HANG,
813                            content::NotificationService::AllSources());
814   observer->registrar_.Add(observer,
815                            chrome::NOTIFICATION_OMNIBOX_OPENED_URL,
816                            content::NotificationService::AllSources());
817 }
818 
819 // static
RemoveNotifications()820 void ThreadWatcherObserver::RemoveNotifications() {
821   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
822   if (!g_thread_watcher_observer_)
823     return;
824   g_thread_watcher_observer_->registrar_.RemoveAll();
825   delete g_thread_watcher_observer_;
826 }
827 
Observe(int type,const content::NotificationSource & source,const content::NotificationDetails & details)828 void ThreadWatcherObserver::Observe(
829     int type,
830     const content::NotificationSource& source,
831     const content::NotificationDetails& details) {
832   // There is some user activity, see if thread watchers are to be awakened.
833   base::TimeTicks now = base::TimeTicks::Now();
834   if ((now - last_wakeup_time_) < wakeup_interval_)
835     return;
836   last_wakeup_time_ = now;
837   WatchDogThread::PostTask(
838       FROM_HERE,
839       base::Bind(&ThreadWatcherList::WakeUpAll));
840 }
841 
842 // WatchDogThread methods and members.
843 
844 // This lock protects g_watchdog_thread.
845 static base::LazyInstance<base::Lock>::Leaky
846     g_watchdog_lock = LAZY_INSTANCE_INITIALIZER;
847 
848 // The singleton of this class.
849 static WatchDogThread* g_watchdog_thread = NULL;
850 
WatchDogThread()851 WatchDogThread::WatchDogThread() : Thread("BrowserWatchdog") {
852 }
853 
~WatchDogThread()854 WatchDogThread::~WatchDogThread() {
855   Stop();
856 }
857 
858 // static
CurrentlyOnWatchDogThread()859 bool WatchDogThread::CurrentlyOnWatchDogThread() {
860   base::AutoLock lock(g_watchdog_lock.Get());
861   return g_watchdog_thread &&
862       g_watchdog_thread->message_loop() == base::MessageLoop::current();
863 }
864 
865 // static
PostTask(const tracked_objects::Location & from_here,const base::Closure & task)866 bool WatchDogThread::PostTask(const tracked_objects::Location& from_here,
867                               const base::Closure& task) {
868   return PostTaskHelper(from_here, task, base::TimeDelta());
869 }
870 
871 // static
PostDelayedTask(const tracked_objects::Location & from_here,const base::Closure & task,base::TimeDelta delay)872 bool WatchDogThread::PostDelayedTask(const tracked_objects::Location& from_here,
873                                      const base::Closure& task,
874                                      base::TimeDelta delay) {
875   return PostTaskHelper(from_here, task, delay);
876 }
877 
878 // static
PostTaskHelper(const tracked_objects::Location & from_here,const base::Closure & task,base::TimeDelta delay)879 bool WatchDogThread::PostTaskHelper(
880     const tracked_objects::Location& from_here,
881     const base::Closure& task,
882     base::TimeDelta delay) {
883   {
884     base::AutoLock lock(g_watchdog_lock.Get());
885 
886     base::MessageLoop* message_loop = g_watchdog_thread ?
887         g_watchdog_thread->message_loop() : NULL;
888     if (message_loop) {
889       message_loop->PostDelayedTask(from_here, task, delay);
890       return true;
891     }
892   }
893 
894   return false;
895 }
896 
Init()897 void WatchDogThread::Init() {
898   // This thread shouldn't be allowed to perform any blocking disk I/O.
899   base::ThreadRestrictions::SetIOAllowed(false);
900 
901   base::AutoLock lock(g_watchdog_lock.Get());
902   CHECK(!g_watchdog_thread);
903   g_watchdog_thread = this;
904 }
905 
CleanUp()906 void WatchDogThread::CleanUp() {
907   base::AutoLock lock(g_watchdog_lock.Get());
908   g_watchdog_thread = NULL;
909 }
910 
911 namespace {
912 
913 // StartupWatchDogThread methods and members.
914 //
915 // Class for detecting hangs during startup.
916 class StartupWatchDogThread : public base::Watchdog {
917  public:
918   // Constructor specifies how long the StartupWatchDogThread will wait before
919   // alarming.
StartupWatchDogThread(const base::TimeDelta & duration)920   explicit StartupWatchDogThread(const base::TimeDelta& duration)
921       : base::Watchdog(duration, "Startup watchdog thread", true) {
922 #if defined(OS_ANDROID)
923     // TODO(rtenneti): Delete this code, after getting data.
924     start_time_clock_= base::Time::Now();
925     start_time_monotonic_ = base::TimeTicks::Now();
926     start_time_thread_now_ = base::TimeTicks::IsThreadNowSupported()
927         ? base::TimeTicks::ThreadNow() : base::TimeTicks::Now();
928 #endif  // OS_ANDROID
929   }
930 
931   // Alarm is called if the time expires after an Arm() without someone calling
932   // Disarm(). When Alarm goes off, in release mode we get the crash dump
933   // without crashing and in debug mode we break into the debugger.
Alarm()934   virtual void Alarm() OVERRIDE {
935 #if !defined(NDEBUG)
936     StartupCrash();
937     return;
938 #elif !defined(OS_ANDROID)
939     WatchDogThread::PostTask(FROM_HERE, base::Bind(&StartupCrash));
940     return;
941 #else  // Android release: gather stats to figure out when to crash.
942     // TODO(rtenneti): Delete this code, after getting data.
943     UMA_HISTOGRAM_TIMES("StartupTimeBomb.Alarm.TimeDuration",
944                         base::Time::Now() - start_time_clock_);
945     UMA_HISTOGRAM_TIMES("StartupTimeBomb.Alarm.TimeTicksDuration",
946                         base::TimeTicks::Now() - start_time_monotonic_);
947     if (base::TimeTicks::IsThreadNowSupported()) {
948       UMA_HISTOGRAM_TIMES(
949           "StartupTimeBomb.Alarm.ThreadNowDuration",
950           base::TimeTicks::ThreadNow() - start_time_thread_now_);
951     }
952     return;
953 #endif  // OS_ANDROID
954   }
955 
956  private:
957 #if defined(OS_ANDROID)
958   // TODO(rtenneti): Delete this code, after getting data.
959   base::Time start_time_clock_;
960   base::TimeTicks start_time_monotonic_;
961   base::TimeTicks start_time_thread_now_;
962 #endif  // OS_ANDROID
963 
964   DISALLOW_COPY_AND_ASSIGN(StartupWatchDogThread);
965 };
966 
967 // ShutdownWatchDogThread methods and members.
968 //
969 // Class for detecting hangs during shutdown.
970 class ShutdownWatchDogThread : public base::Watchdog {
971  public:
972   // Constructor specifies how long the ShutdownWatchDogThread will wait before
973   // alarming.
ShutdownWatchDogThread(const base::TimeDelta & duration)974   explicit ShutdownWatchDogThread(const base::TimeDelta& duration)
975       : base::Watchdog(duration, "Shutdown watchdog thread", true) {
976   }
977 
978   // Alarm is called if the time expires after an Arm() without someone calling
979   // Disarm(). We crash the browser if this method is called.
Alarm()980   virtual void Alarm() OVERRIDE {
981     ShutdownCrash();
982   }
983 
984  private:
985   DISALLOW_COPY_AND_ASSIGN(ShutdownWatchDogThread);
986 };
987 }  // namespace
988 
989 // StartupTimeBomb methods and members.
990 //
991 // static
992 StartupTimeBomb* StartupTimeBomb::g_startup_timebomb_ = NULL;
993 
StartupTimeBomb()994 StartupTimeBomb::StartupTimeBomb()
995     : startup_watchdog_(NULL),
996       thread_id_(base::PlatformThread::CurrentId()) {
997   CHECK(!g_startup_timebomb_);
998   g_startup_timebomb_ = this;
999 }
1000 
~StartupTimeBomb()1001 StartupTimeBomb::~StartupTimeBomb() {
1002   DCHECK(this == g_startup_timebomb_);
1003   DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
1004   if (startup_watchdog_)
1005     Disarm();
1006   g_startup_timebomb_ = NULL;
1007 }
1008 
Arm(const base::TimeDelta & duration)1009 void StartupTimeBomb::Arm(const base::TimeDelta& duration) {
1010   DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
1011   DCHECK(!startup_watchdog_);
1012   startup_watchdog_ = new StartupWatchDogThread(duration);
1013   startup_watchdog_->Arm();
1014   return;
1015 }
1016 
Disarm()1017 void StartupTimeBomb::Disarm() {
1018   DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
1019   if (startup_watchdog_) {
1020     startup_watchdog_->Disarm();
1021     startup_watchdog_->Cleanup();
1022     DeleteStartupWatchdog();
1023   }
1024 }
1025 
DeleteStartupWatchdog()1026 void StartupTimeBomb::DeleteStartupWatchdog() {
1027   DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
1028   if (startup_watchdog_->IsJoinable()) {
1029     // Allow the watchdog thread to shutdown on UI. Watchdog thread shutdowns
1030     // very fast.
1031     base::ThreadRestrictions::SetIOAllowed(true);
1032     delete startup_watchdog_;
1033     startup_watchdog_ = NULL;
1034     return;
1035   }
1036   base::MessageLoop::current()->PostDelayedTask(
1037       FROM_HERE,
1038       base::Bind(&StartupTimeBomb::DeleteStartupWatchdog,
1039                  base::Unretained(this)),
1040       base::TimeDelta::FromSeconds(10));
1041 }
1042 
1043 // static
DisarmStartupTimeBomb()1044 void StartupTimeBomb::DisarmStartupTimeBomb() {
1045   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
1046   if (g_startup_timebomb_)
1047     g_startup_timebomb_->Disarm();
1048 }
1049 
1050 // ShutdownWatcherHelper methods and members.
1051 //
1052 // ShutdownWatcherHelper is a wrapper class for detecting hangs during
1053 // shutdown.
ShutdownWatcherHelper()1054 ShutdownWatcherHelper::ShutdownWatcherHelper()
1055     : shutdown_watchdog_(NULL),
1056       thread_id_(base::PlatformThread::CurrentId()) {
1057 }
1058 
~ShutdownWatcherHelper()1059 ShutdownWatcherHelper::~ShutdownWatcherHelper() {
1060   DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
1061   if (shutdown_watchdog_) {
1062     shutdown_watchdog_->Disarm();
1063     delete shutdown_watchdog_;
1064     shutdown_watchdog_ = NULL;
1065   }
1066 }
1067 
Arm(const base::TimeDelta & duration)1068 void ShutdownWatcherHelper::Arm(const base::TimeDelta& duration) {
1069   DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
1070   DCHECK(!shutdown_watchdog_);
1071   base::TimeDelta actual_duration = duration;
1072 
1073   chrome::VersionInfo::Channel channel = chrome::VersionInfo::GetChannel();
1074   if (channel == chrome::VersionInfo::CHANNEL_STABLE) {
1075     actual_duration *= 20;
1076   } else if (channel == chrome::VersionInfo::CHANNEL_BETA ||
1077              channel == chrome::VersionInfo::CHANNEL_DEV) {
1078     actual_duration *= 10;
1079   }
1080 
1081 #if defined(OS_WIN)
1082   // On Windows XP, give twice the time for shutdown.
1083   if (base::win::GetVersion() <= base::win::VERSION_XP)
1084     actual_duration *= 2;
1085 #endif
1086 
1087   shutdown_watchdog_ = new ShutdownWatchDogThread(actual_duration);
1088   shutdown_watchdog_->Arm();
1089 }
1090