1 /*
2  * Copyright (C) 2011 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "thread_list.h"
18 
19 #include <dirent.h>
20 #include <nativehelper/scoped_local_ref.h>
21 #include <nativehelper/scoped_utf_chars.h>
22 #include <sys/resource.h>  // For getpriority()
23 #include <sys/types.h>
24 #include <unistd.h>
25 
26 #include <map>
27 #include <sstream>
28 #include <tuple>
29 #include <vector>
30 
31 #include "android-base/properties.h"
32 #include "android-base/stringprintf.h"
33 #include "art_field-inl.h"
34 #include "base/aborting.h"
35 #include "base/histogram-inl.h"
36 #include "base/mutex-inl.h"
37 #include "base/systrace.h"
38 #include "base/time_utils.h"
39 #include "base/timing_logger.h"
40 #include "debugger.h"
41 #include "gc/collector/concurrent_copying.h"
42 #include "gc/gc_pause_listener.h"
43 #include "gc/heap.h"
44 #include "gc/reference_processor.h"
45 #include "gc_root.h"
46 #include "jni/jni_internal.h"
47 #include "lock_word.h"
48 #include "mirror/string.h"
49 #include "monitor.h"
50 #include "native_stack_dump.h"
51 #include "obj_ptr-inl.h"
52 #include "scoped_thread_state_change-inl.h"
53 #include "thread.h"
54 #include "trace.h"
55 #include "unwindstack/AndroidUnwinder.h"
56 #include "well_known_classes.h"
57 
58 #if ART_USE_FUTEXES
59 #include <linux/futex.h>
60 #include <sys/syscall.h>
61 #endif  // ART_USE_FUTEXES
62 
63 namespace art HIDDEN {
64 
65 using android::base::StringPrintf;
66 
67 static constexpr uint64_t kLongThreadSuspendThreshold = MsToNs(5);
68 
69 // Whether we should try to dump the native stack of unattached threads. See commit ed8b723 for
70 // some history.
71 static constexpr bool kDumpUnattachedThreadNativeStackForSigQuit = true;
72 
73 ThreadList::ThreadList(uint64_t thread_suspend_timeout_ns)
74     : suspend_all_count_(0),
75       unregistering_count_(0),
76       suspend_all_histogram_("suspend all histogram", 16, 64),
77       long_suspend_(false),
78       shut_down_(false),
79       thread_suspend_timeout_ns_(thread_suspend_timeout_ns),
80       empty_checkpoint_barrier_(new Barrier(0)) {
81   CHECK(Monitor::IsValidLockWord(LockWord::FromThinLockId(kMaxThreadId, 1, 0U)));
82 }
83 
84 ThreadList::~ThreadList() {
85   CHECK(shut_down_);
86 }
87 
88 void ThreadList::ShutDown() {
89   ScopedTrace trace(__PRETTY_FUNCTION__);
90   // Detach the current thread if necessary. If we failed to start, there might not be any threads.
91   // We need to detach the current thread here in case there's another thread waiting to join with
92   // us.
93   bool contains = false;
94   Thread* self = Thread::Current();
95   {
96     MutexLock mu(self, *Locks::thread_list_lock_);
97     contains = Contains(self);
98   }
99   if (contains) {
100     Runtime::Current()->DetachCurrentThread();
101   }
102   WaitForOtherNonDaemonThreadsToExit();
103   // The only caller of this function, ~Runtime, has already disabled GC and
104   // ensured that the last GC is finished.
105   gc::Heap* const heap = Runtime::Current()->GetHeap();
106   CHECK(heap->IsGCDisabledForShutdown());
107 
108   // TODO: there's an unaddressed race here where a thread may attach during shutdown, see
109   //       Thread::Init.
110   SuspendAllDaemonThreadsForShutdown();
111 
112   shut_down_ = true;
113 }
114 
115 bool ThreadList::Contains(Thread* thread) {
116   return find(list_.begin(), list_.end(), thread) != list_.end();
117 }
118 
119 pid_t ThreadList::GetLockOwner() {
120   return Locks::thread_list_lock_->GetExclusiveOwnerTid();
121 }
122 
123 void ThreadList::DumpNativeStacks(std::ostream& os) {
124   MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
125   unwindstack::AndroidLocalUnwinder unwinder;
126   for (const auto& thread : list_) {
127     os << "DUMPING THREAD " << thread->GetTid() << "\n";
128     DumpNativeStack(os, unwinder, thread->GetTid(), "\t");
129     os << "\n";
130   }
131 }
132 
133 void ThreadList::DumpForSigQuit(std::ostream& os) {
134   {
135     ScopedObjectAccess soa(Thread::Current());
136     // Only print if we have samples.
137     if (suspend_all_histogram_.SampleSize() > 0) {
138       Histogram<uint64_t>::CumulativeData data;
139       suspend_all_histogram_.CreateHistogram(&data);
140       suspend_all_histogram_.PrintConfidenceIntervals(os, 0.99, data);  // Dump time to suspend.
141     }
142   }
143   bool dump_native_stack = Runtime::Current()->GetDumpNativeStackOnSigQuit();
144   Dump(os, dump_native_stack);
145   DumpUnattachedThreads(os, dump_native_stack && kDumpUnattachedThreadNativeStackForSigQuit);
146 }
147 
148 static void DumpUnattachedThread(std::ostream& os, pid_t tid, bool dump_native_stack)
149     NO_THREAD_SAFETY_ANALYSIS {
150   // TODO: No thread safety analysis as DumpState with a null thread won't access fields, should
151   // refactor DumpState to avoid skipping analysis.
152   Thread::DumpState(os, nullptr, tid);
153   if (dump_native_stack) {
154     DumpNativeStack(os, tid, "  native: ");
155   }
156   os << std::endl;
157 }
158 
159 void ThreadList::DumpUnattachedThreads(std::ostream& os, bool dump_native_stack) {
160   DIR* d = opendir("/proc/self/task");
161   if (!d) {
162     return;
163   }
164 
165   Thread* self = Thread::Current();
166   dirent* e;
167   while ((e = readdir(d)) != nullptr) {
168     char* end;
169     pid_t tid = strtol(e->d_name, &end, 10);
170     if (!*end) {
171       Thread* thread;
172       {
173         MutexLock mu(self, *Locks::thread_list_lock_);
174         thread = FindThreadByTid(tid);
175       }
176       if (thread == nullptr) {
177         DumpUnattachedThread(os, tid, dump_native_stack);
178       }
179     }
180   }
181   closedir(d);
182 }
183 
184 // Dump checkpoint timeout in milliseconds. Larger amount on the target, since the device could be
185 // overloaded with ANR dumps.
186 static constexpr uint32_t kDumpWaitTimeout = kIsTargetBuild ? 100000 : 20000;
187 
188 // A closure used by Thread::Dump.
189 class DumpCheckpoint final : public Closure {
190  public:
191   DumpCheckpoint(bool dump_native_stack)
192       : lock_("Dump checkpoint lock", kGenericBottomLock),
193         os_(),
194         // Avoid verifying count in case a thread doesn't end up passing through the barrier.
195         // This avoids a SIGABRT that would otherwise happen in the destructor.
196         barrier_(0, /*verify_count_on_shutdown=*/false),
197         unwinder_(std::vector<std::string>{}, std::vector<std::string> {"oat", "odex"}),
198         dump_native_stack_(dump_native_stack) {
199   }
200 
201   void Run(Thread* thread) override {
202     // Note thread and self may not be equal if thread was already suspended at the point of the
203     // request.
204     Thread* self = Thread::Current();
205     CHECK(self != nullptr);
206     std::ostringstream local_os;
207     Locks::mutator_lock_->AssertSharedHeld(self);
208     Thread::DumpOrder dump_order = thread->Dump(local_os, unwinder_, dump_native_stack_);
209     {
210       MutexLock mu(self, lock_);
211       // Sort, so that the most interesting threads for ANR are printed first (ANRs can be trimmed).
212       std::pair<Thread::DumpOrder, uint32_t> sort_key(dump_order, thread->GetThreadId());
213       os_.emplace(sort_key, std::move(local_os));
214     }
215     barrier_.Pass(self);
216   }
217 
218   // Called at the end to print all the dumps in sequential prioritized order.
219   void Dump(Thread* self, std::ostream& os) {
220     MutexLock mu(self, lock_);
221     for (const auto& it : os_) {
222       os << it.second.str() << std::endl;
223     }
224   }
225 
226   void WaitForThreadsToRunThroughCheckpoint(size_t threads_running_checkpoint) {
227     Thread* self = Thread::Current();
228     ScopedThreadStateChange tsc(self, ThreadState::kWaitingForCheckPointsToRun);
229     bool timed_out = barrier_.Increment(self, threads_running_checkpoint, kDumpWaitTimeout);
230     if (timed_out) {
231       // Avoid a recursive abort.
232       LOG((kIsDebugBuild && (gAborting == 0)) ? ::android::base::FATAL : ::android::base::ERROR)
233           << "Unexpected time out during dump checkpoint.";
234     }
235   }
236 
237  private:
238   // Storage for the per-thread dumps (guarded by lock since they are generated in parallel).
239   // Map is used to obtain sorted order. The key is unique, but use multimap just in case.
240   Mutex lock_;
241   std::multimap<std::pair<Thread::DumpOrder, uint32_t>, std::ostringstream> os_ GUARDED_BY(lock_);
242   // The barrier to be passed through and for the requestor to wait upon.
243   Barrier barrier_;
244   // A backtrace map, so that all threads use a shared info and don't reacquire/parse separately.
245   unwindstack::AndroidLocalUnwinder unwinder_;
246   // Whether we should dump the native stack.
247   const bool dump_native_stack_;
248 };
249 
250 void ThreadList::Dump(std::ostream& os, bool dump_native_stack) {
251   Thread* self = Thread::Current();
252   {
253     MutexLock mu(self, *Locks::thread_list_lock_);
254     os << "DALVIK THREADS (" << list_.size() << "):\n";
255   }
256   if (self != nullptr) {
257     // Dump() can be called in any mutator lock state.
258     bool mutator_lock_held = Locks::mutator_lock_->IsSharedHeld(self);
259     DumpCheckpoint checkpoint(dump_native_stack);
260     // Acquire mutator lock separately for each thread, to avoid long runnable code sequence
261     // without suspend checks.
262     size_t threads_running_checkpoint =
263         RunCheckpoint(&checkpoint,
264                       nullptr,
265                       true,
266                       /* acquire_mutator_lock= */ !mutator_lock_held);
267     if (threads_running_checkpoint != 0) {
268       checkpoint.WaitForThreadsToRunThroughCheckpoint(threads_running_checkpoint);
269     }
270     checkpoint.Dump(self, os);
271   } else {
272     DumpUnattachedThreads(os, dump_native_stack);
273   }
274 }
275 
276 void ThreadList::AssertOtherThreadsAreSuspended(Thread* self) {
277   MutexLock mu(self, *Locks::thread_list_lock_);
278   MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
279   for (const auto& thread : list_) {
280     if (thread != self) {
281       CHECK(thread->IsSuspended())
282             << "\nUnsuspended thread: <<" << *thread << "\n"
283             << "self: <<" << *Thread::Current();
284     }
285   }
286 }
287 
288 #if HAVE_TIMED_RWLOCK
289 // Attempt to rectify locks so that we dump thread list with required locks before exiting.
290 NO_RETURN static void UnsafeLogFatalForThreadSuspendAllTimeout() {
291   // Increment gAborting before doing the thread list dump since we don't want any failures from
292   // AssertThreadSuspensionIsAllowable in cases where thread suspension is not allowed.
293   // See b/69044468.
294   ++gAborting;
295   Runtime* runtime = Runtime::Current();
296   std::ostringstream ss;
297   ss << "Thread suspend timeout\n";
298   Locks::mutator_lock_->Dump(ss);
299   ss << "\n";
300   runtime->GetThreadList()->Dump(ss);
301   --gAborting;
302   LOG(FATAL) << ss.str();
303   exit(0);
304 }
305 #endif
306 
307 size_t ThreadList::RunCheckpoint(Closure* checkpoint_function,
308                                  Closure* callback,
309                                  bool allow_lock_checking,
310                                  bool acquire_mutator_lock) {
311   Thread* self = Thread::Current();
312   Locks::mutator_lock_->AssertNotExclusiveHeld(self);
313   Locks::thread_list_lock_->AssertNotHeld(self);
314   Locks::thread_suspend_count_lock_->AssertNotHeld(self);
315   if (kIsDebugBuild && allow_lock_checking && !acquire_mutator_lock) {
316     // TODO: Consider better checking with acquire_mutator_lock.
317     self->DisallowPreMonitorMutexes();
318   }
319 
320   std::vector<Thread*> remaining_threads;
321   size_t count = 0;
322   bool mutator_lock_held = Locks::mutator_lock_->IsSharedHeld(self);
323   ThreadState old_thread_state = self->GetState();
324   DCHECK(!(mutator_lock_held && acquire_mutator_lock));
325 
326   // Thread-safety analysis wants the lock state to always be the same at every program point.
327   // Allow us to pretend it is.
328   auto fake_mutator_lock = []() SHARED_LOCK_FUNCTION(Locks::mutator_lock_)
329                                NO_THREAD_SAFETY_ANALYSIS {};
330   auto fake_mutator_unlock = []() UNLOCK_FUNCTION(Locks::mutator_lock_)
331                                  NO_THREAD_SAFETY_ANALYSIS {};
332 
333   if (acquire_mutator_lock) {
334     self->TransitionFromSuspendedToRunnable();
335   } else {
336     fake_mutator_lock();
337   }
338   Locks::thread_list_lock_->Lock(self);
339   Locks::thread_suspend_count_lock_->Lock(self);
340 
341   // First try to install checkpoint function in each thread. This will succeed only for
342   // runnable threads. Track others in remaining_threads.
343   count = list_.size();
344   for (const auto& thread : list_) {
345     if (thread != self) {
346       if (thread->RequestCheckpoint(checkpoint_function)) {
347         // This thread will run its checkpoint some time in the near future.
348       } else {
349         remaining_threads.push_back(thread);
350       }
351     }
352     // Thread either has honored or will honor the checkpoint, or it has been added to
353     // remaining_threads.
354   }
355 
356   // ith entry corresponds to remaining_threads[i]:
357   std::unique_ptr<ThreadExitFlag[]> tefs(new ThreadExitFlag[remaining_threads.size()]);
358 
359   // Register a ThreadExitFlag for each remaining thread.
360   for (size_t i = 0; i < remaining_threads.size(); ++i) {
361     remaining_threads[i]->NotifyOnThreadExit(&tefs[i]);
362   }
363 
364   // Run the callback to be called inside this critical section.
365   if (callback != nullptr) {
366     callback->Run(self);
367   }
368 
369   size_t nthreads = remaining_threads.size();
370   size_t starting_thread = 0;
371   size_t next_starting_thread;  // First possible remaining non-null entry in remaining_threads.
372   // Run the checkpoint for the suspended threads.
373   do {
374     // We hold mutator_lock_ (if desired), thread_list_lock_, and suspend_count_lock_
375     next_starting_thread = nthreads;
376     for (size_t i = 0; i < nthreads; ++i) {
377       Thread* thread = remaining_threads[i];
378       if (thread == nullptr) {
379         continue;
380       }
381       if (tefs[i].HasExited()) {
382         remaining_threads[i] = nullptr;
383         --count;
384         continue;
385       }
386       bool was_runnable = thread->RequestCheckpoint(checkpoint_function);
387       if (was_runnable) {
388         // Thread became runnable, and will run the checkpoint; we're done.
389         thread->UnregisterThreadExitFlag(&tefs[i]);
390         remaining_threads[i] = nullptr;
391         continue;
392       }
393       // Thread was still suspended, as expected.
394       // We need to run the checkpoint ourselves. Suspend thread so it stays suspended.
395       thread->IncrementSuspendCount(self);
396       if (LIKELY(thread->IsSuspended())) {
397         // Run the checkpoint function ourselves.
398         // We need to run the checkpoint function without the thread_list and suspend_count locks.
399         Locks::thread_suspend_count_lock_->Unlock(self);
400         Locks::thread_list_lock_->Unlock(self);
401         if (mutator_lock_held || acquire_mutator_lock) {
402           // Make sure there is no pending flip function before running Java-heap-accessing
403           // checkpoint on behalf of thread.
404           Thread::EnsureFlipFunctionStarted(self, thread);
405           if (thread->GetStateAndFlags(std::memory_order_acquire)
406                   .IsAnyOfFlagsSet(Thread::FlipFunctionFlags())) {
407             // There is another thread running the flip function for 'thread'.
408             // Instead of waiting for it to complete, move to the next thread.
409             // Retry this one later from scratch.
410             next_starting_thread = std::min(next_starting_thread, i);
411             Locks::thread_list_lock_->Lock(self);
412             Locks::thread_suspend_count_lock_->Lock(self);
413             thread->DecrementSuspendCount(self);
414             Thread::resume_cond_->Broadcast(self);
415             continue;
416           }
417         }  // O.w. the checkpoint will not access Java data structures, and doesn't care whether
418            // the flip function has been called.
419         checkpoint_function->Run(thread);
420         if (acquire_mutator_lock) {
421           {
422             MutexLock mu3(self, *Locks::thread_suspend_count_lock_);
423             thread->DecrementSuspendCount(self);
424             // In the case of a thread waiting for IO or the like, there will be no waiters
425             // on resume_cond_, so Broadcast() will not enter the kernel, and thus be cheap.
426             Thread::resume_cond_->Broadcast(self);
427           }
428           {
429             // Allow us to run checkpoints, or be suspended between checkpoint invocations.
430             ScopedThreadSuspension sts(self, old_thread_state);
431           }
432           Locks::thread_list_lock_->Lock(self);
433           Locks::thread_suspend_count_lock_->Lock(self);
434         } else {
435           Locks::thread_list_lock_->Lock(self);
436           Locks::thread_suspend_count_lock_->Lock(self);
437           thread->DecrementSuspendCount(self);
438           Thread::resume_cond_->Broadcast(self);
439         }
440         thread->UnregisterThreadExitFlag(&tefs[i]);
441         remaining_threads[i] = nullptr;
442       } else {
443         // Thread may have become runnable between the time we last checked and
444         // the time we incremented the suspend count. We defer to the next attempt, rather than
445         // waiting for it to suspend. Note that this may still unnecessarily trigger a signal
446         // handler, but it should be exceedingly rare.
447         thread->DecrementSuspendCount(self);
448         Thread::resume_cond_->Broadcast(self);
449         next_starting_thread = std::min(next_starting_thread, i);
450       }
451     }
452     starting_thread = next_starting_thread;
453   } while (starting_thread != nthreads);
454 
455   // Finally run the checkpoint on ourself. We will already have run the flip function, if we're
456   // runnable.
457   Locks::thread_list_lock_->Unlock(self);
458   Locks::thread_suspend_count_lock_->Unlock(self);
459   checkpoint_function->Run(self);
460 
461   if (acquire_mutator_lock) {
462     self->TransitionFromRunnableToSuspended(old_thread_state);
463   } else {
464     fake_mutator_unlock();
465   }
466 
467   DCHECK(std::all_of(remaining_threads.cbegin(), remaining_threads.cend(), [](Thread* thread) {
468     return thread == nullptr;
469   }));
470   Thread::DCheckUnregisteredEverywhere(&tefs[0], &tefs[nthreads - 1]);
471 
472   if (kIsDebugBuild && allow_lock_checking && !acquire_mutator_lock) {
473     self->AllowPreMonitorMutexes();
474   }
475   return count;
476 }
477 
478 void ThreadList::RunEmptyCheckpoint() {
479   Thread* self = Thread::Current();
480   Locks::mutator_lock_->AssertNotExclusiveHeld(self);
481   Locks::thread_list_lock_->AssertNotHeld(self);
482   Locks::thread_suspend_count_lock_->AssertNotHeld(self);
483   std::vector<uint32_t> runnable_thread_ids;
484   size_t count = 0;
485   Barrier* barrier = empty_checkpoint_barrier_.get();
486   barrier->Init(self, 0);
487   {
488     MutexLock mu(self, *Locks::thread_list_lock_);
489     MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
490     for (Thread* thread : list_) {
491       if (thread != self) {
492         while (true) {
493           if (thread->RequestEmptyCheckpoint()) {
494             // This thread will run an empty checkpoint (decrement the empty checkpoint barrier)
495             // some time in the near future.
496             ++count;
497             if (kIsDebugBuild) {
498               runnable_thread_ids.push_back(thread->GetThreadId());
499             }
500             break;
501           }
502           if (thread->GetState() != ThreadState::kRunnable) {
503             // It's seen suspended, we are done because it must not be in the middle of a mutator
504             // heap access.
505             break;
506           }
507         }
508       }
509     }
510   }
511 
512   // Wake up the threads blocking for weak ref access so that they will respond to the empty
513   // checkpoint request. Otherwise we will hang as they are blocking in the kRunnable state.
514   Runtime::Current()->GetHeap()->GetReferenceProcessor()->BroadcastForSlowPath(self);
515   Runtime::Current()->BroadcastForNewSystemWeaks(/*broadcast_for_checkpoint=*/true);
516   {
517     ScopedThreadStateChange tsc(self, ThreadState::kWaitingForCheckPointsToRun);
518     uint64_t total_wait_time = 0;
519     bool first_iter = true;
520     while (true) {
521       // Wake up the runnable threads blocked on the mutexes that another thread, which is blocked
522       // on a weak ref access, holds (indirectly blocking for weak ref access through another thread
523       // and a mutex.) This needs to be done periodically because the thread may be preempted
524       // between the CheckEmptyCheckpointFromMutex call and the subsequent futex wait in
525       // Mutex::ExclusiveLock, etc. when the wakeup via WakeupToRespondToEmptyCheckpoint
526       // arrives. This could cause a *very rare* deadlock, if not repeated. Most of the cases are
527       // handled in the first iteration.
528       for (BaseMutex* mutex : Locks::expected_mutexes_on_weak_ref_access_) {
529         mutex->WakeupToRespondToEmptyCheckpoint();
530       }
531       static constexpr uint64_t kEmptyCheckpointPeriodicTimeoutMs = 100;  // 100ms
532       static constexpr uint64_t kEmptyCheckpointTotalTimeoutMs = 600 * 1000;  // 10 minutes.
533       size_t barrier_count = first_iter ? count : 0;
534       first_iter = false;  // Don't add to the barrier count from the second iteration on.
535       bool timed_out = barrier->Increment(self, barrier_count, kEmptyCheckpointPeriodicTimeoutMs);
536       if (!timed_out) {
537         break;  // Success
538       }
539       // This is a very rare case.
540       total_wait_time += kEmptyCheckpointPeriodicTimeoutMs;
541       if (kIsDebugBuild && total_wait_time > kEmptyCheckpointTotalTimeoutMs) {
542         std::ostringstream ss;
543         ss << "Empty checkpoint timeout\n";
544         ss << "Barrier count " << barrier->GetCount(self) << "\n";
545         ss << "Runnable thread IDs";
546         for (uint32_t tid : runnable_thread_ids) {
547           ss << " " << tid;
548         }
549         ss << "\n";
550         Locks::mutator_lock_->Dump(ss);
551         ss << "\n";
552         LOG(FATAL_WITHOUT_ABORT) << ss.str();
553         // Some threads in 'runnable_thread_ids' are probably stuck. Try to dump their stacks.
554         // Avoid using ThreadList::Dump() initially because it is likely to get stuck as well.
555         {
556           ScopedObjectAccess soa(self);
557           MutexLock mu1(self, *Locks::thread_list_lock_);
558           for (Thread* thread : GetList()) {
559             uint32_t tid = thread->GetThreadId();
560             bool is_in_runnable_thread_ids =
561                 std::find(runnable_thread_ids.begin(), runnable_thread_ids.end(), tid) !=
562                 runnable_thread_ids.end();
563             if (is_in_runnable_thread_ids &&
564                 thread->ReadFlag(ThreadFlag::kEmptyCheckpointRequest, std::memory_order_relaxed)) {
565               // Found a runnable thread that hasn't responded to the empty checkpoint request.
566               // Assume it's stuck and safe to dump its stack.
567               thread->Dump(LOG_STREAM(FATAL_WITHOUT_ABORT),
568                            /*dump_native_stack=*/ true,
569                            /*force_dump_stack=*/ true);
570             }
571           }
572         }
573         LOG(FATAL_WITHOUT_ABORT)
574             << "Dumped runnable threads that haven't responded to empty checkpoint.";
575         // Now use ThreadList::Dump() to dump more threads, noting it may get stuck.
576         Dump(LOG_STREAM(FATAL_WITHOUT_ABORT));
577         LOG(FATAL) << "Dumped all threads.";
578       }
579     }
580   }
581 }
582 
583 // Separate function to disable just the right amount of thread-safety analysis.
584 ALWAYS_INLINE void AcquireMutatorLockSharedUncontended(Thread* self)
585     ACQUIRE_SHARED(*Locks::mutator_lock_) NO_THREAD_SAFETY_ANALYSIS {
586   bool success = Locks::mutator_lock_->SharedTryLock(self, /*check=*/false);
587   CHECK(success);
588 }
589 
590 // A checkpoint/suspend-all hybrid to switch thread roots from
591 // from-space to to-space refs. Used to synchronize threads at a point
592 // to mark the initiation of marking while maintaining the to-space
593 // invariant.
594 void ThreadList::FlipThreadRoots(Closure* thread_flip_visitor,
595                                  Closure* flip_callback,
596                                  gc::collector::GarbageCollector* collector,
597                                  gc::GcPauseListener* pause_listener) {
598   TimingLogger::ScopedTiming split("ThreadListFlip", collector->GetTimings());
599   Thread* self = Thread::Current();
600   Locks::mutator_lock_->AssertNotHeld(self);
601   Locks::thread_list_lock_->AssertNotHeld(self);
602   Locks::thread_suspend_count_lock_->AssertNotHeld(self);
603   CHECK_NE(self->GetState(), ThreadState::kRunnable);
604 
605   collector->GetHeap()->ThreadFlipBegin(self);  // Sync with JNI critical calls.
606 
607   // ThreadFlipBegin happens before we suspend all the threads, so it does not
608   // count towards the pause.
609   const uint64_t suspend_start_time = NanoTime();
610   VLOG(threads) << "Suspending all for thread flip";
611   {
612     ScopedTrace trace("ThreadFlipSuspendAll");
613     SuspendAllInternal(self);
614   }
615 
616   std::vector<Thread*> flipping_threads;  // All suspended threads. Includes us.
617   int thread_count;
618   // Flipping threads might exit between the time we resume them and try to run the flip function.
619   // Track that in a parallel vector.
620   std::unique_ptr<ThreadExitFlag[]> exit_flags;
621 
622   {
623     TimingLogger::ScopedTiming t("FlipThreadSuspension", collector->GetTimings());
624     if (pause_listener != nullptr) {
625       pause_listener->StartPause();
626     }
627 
628     // Run the flip callback for the collector.
629     Locks::mutator_lock_->ExclusiveLock(self);
630     suspend_all_histogram_.AdjustAndAddValue(NanoTime() - suspend_start_time);
631     flip_callback->Run(self);
632 
633     {
634       MutexLock mu(self, *Locks::thread_list_lock_);
635       MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
636       thread_count = list_.size();
637       exit_flags.reset(new ThreadExitFlag[thread_count]);
638       flipping_threads.resize(thread_count, nullptr);
639       int i = 1;
640       for (Thread* thread : list_) {
641         // Set the flip function for all threads because once we start resuming any threads,
642         // they may need to run the flip function on behalf of other threads, even this one.
643         DCHECK(thread == self || thread->IsSuspended());
644         thread->SetFlipFunction(thread_flip_visitor);
645         // Put ourselves first, so other threads are more likely to have finished before we get
646         // there.
647         int thread_index = thread == self ? 0 : i++;
648         flipping_threads[thread_index] = thread;
649         thread->NotifyOnThreadExit(&exit_flags[thread_index]);
650       }
651       DCHECK(i == thread_count);
652     }
653 
654     if (pause_listener != nullptr) {
655       pause_listener->EndPause();
656     }
657   }
658   // Any new threads created after this will be created by threads that already ran their flip
659   // functions. In the normal GC use case in which the flip function converts all local references
660   // to to-space references, these newly created threads will also see only to-space references.
661 
662   // Resume threads, making sure that we do not release suspend_count_lock_ until we've reacquired
663   // the mutator_lock_ in shared mode, and decremented suspend_all_count_.  This avoids a
664   // concurrent SuspendAll, and ensures that newly started threads see a correct value of
665   // suspend_all_count.
666   {
667     MutexLock mu(self, *Locks::thread_list_lock_);
668     Locks::thread_suspend_count_lock_->Lock(self);
669     ResumeAllInternal(self);
670   }
671   collector->RegisterPause(NanoTime() - suspend_start_time);
672 
673   // Since all threads were suspended, they will attempt to run the flip function before
674   // reentering a runnable state. We will also attempt to run the flip functions ourselves.  Any
675   // intervening checkpoint request will do the same.  Exactly one of those flip function attempts
676   // will succeed, and the target thread will not be able to reenter a runnable state until one of
677   // them does.
678 
679   // Try to run the closure on the other threads.
680   TimingLogger::ScopedTiming split3("RunningThreadFlips", collector->GetTimings());
681   // Reacquire the mutator lock while holding suspend_count_lock. This cannot fail, since we
682   // do not acquire the mutator lock unless suspend_all_count was read as 0 while holding
683   // suspend_count_lock. We did not release suspend_count_lock since releasing the mutator
684   // lock.
685   AcquireMutatorLockSharedUncontended(self);
686 
687   Locks::thread_suspend_count_lock_->Unlock(self);
688   // Concurrent SuspendAll may now see zero suspend_all_count_, but block on mutator_lock_.
689 
690   collector->GetHeap()->ThreadFlipEnd(self);
691 
692   for (int i = 0; i < thread_count; ++i) {
693     bool finished;
694     Thread::EnsureFlipFunctionStarted(
695         self, flipping_threads[i], Thread::StateAndFlags(0), &exit_flags[i], &finished);
696     if (finished) {
697       MutexLock mu2(self, *Locks::thread_list_lock_);
698       flipping_threads[i]->UnregisterThreadExitFlag(&exit_flags[i]);
699       flipping_threads[i] = nullptr;
700     }
701   }
702   // Make sure all flips complete before we return.
703   for (int i = 0; i < thread_count; ++i) {
704     if (UNLIKELY(flipping_threads[i] != nullptr)) {
705       flipping_threads[i]->WaitForFlipFunctionTestingExited(self, &exit_flags[i]);
706       MutexLock mu2(self, *Locks::thread_list_lock_);
707       flipping_threads[i]->UnregisterThreadExitFlag(&exit_flags[i]);
708     }
709   }
710 
711   Thread::DCheckUnregisteredEverywhere(&exit_flags[0], &exit_flags[thread_count - 1]);
712 
713   Locks::mutator_lock_->SharedUnlock(self);
714 }
715 
716 // True only for debugging suspend timeout code. The resulting timeouts are short enough that
717 // failures are expected.
718 static constexpr bool kShortSuspendTimeouts = false;
719 
720 static constexpr unsigned kSuspendBarrierIters = kShortSuspendTimeouts ? 5 : 20;
721 
722 #if ART_USE_FUTEXES
723 
724 // Returns true if it timed out. Times out after timeout_ns/kSuspendBarrierIters nsecs
725 static bool WaitOnceForSuspendBarrier(AtomicInteger* barrier,
726                                       int32_t cur_val,
727                                       uint64_t timeout_ns) {
728   timespec wait_timeout;
729   if (kShortSuspendTimeouts) {
730     timeout_ns = MsToNs(kSuspendBarrierIters);
731     CHECK_GE(NsToMs(timeout_ns / kSuspendBarrierIters), 1ul);
732   } else {
733     DCHECK_GE(NsToMs(timeout_ns / kSuspendBarrierIters), 10ul);
734   }
735   InitTimeSpec(false, CLOCK_MONOTONIC, NsToMs(timeout_ns / kSuspendBarrierIters), 0, &wait_timeout);
736   if (futex(barrier->Address(), FUTEX_WAIT_PRIVATE, cur_val, &wait_timeout, nullptr, 0) != 0) {
737     if (errno == ETIMEDOUT) {
738       return true;
739     } else if (errno != EAGAIN && errno != EINTR) {
740       PLOG(FATAL) << "futex wait for suspend barrier failed";
741     }
742   }
743   return false;
744 }
745 
746 #else
747 
748 static bool WaitOnceForSuspendBarrier(AtomicInteger* barrier,
749                                       int32_t cur_val,
750                                       uint64_t timeout_ns) {
751   // In the normal case, aim for a couple of hundred milliseconds.
752   // Not a compile-time constant: the non-short-timeout value depends on timeout_ns.
753   const unsigned kInnerIters =
754       kShortSuspendTimeouts ? 1'000 : (timeout_ns / 1000) / kSuspendBarrierIters;
754   DCHECK_GE(kInnerIters, 1'000u);
755   for (int i = 0; i < kInnerIters; ++i) {
756     sched_yield();
757     if (barrier->load(std::memory_order_acquire) == 0) {
758       return false;
759     }
760   }
761   return true;
762 }
763 
764 #endif  // ART_USE_FUTEXES
765 
766 std::optional<std::string> ThreadList::WaitForSuspendBarrier(AtomicInteger* barrier,
767                                                              pid_t t,
768                                                              int attempt_of_4) {
769 #if ART_USE_FUTEXES
770   const uint64_t start_time = NanoTime();
771 #endif
772   uint64_t timeout_ns =
773       attempt_of_4 == 0 ? thread_suspend_timeout_ns_ : thread_suspend_timeout_ns_ / 4;
774   static bool is_user_build = (android::base::GetProperty("ro.build.type", "") == "user");
775   // Significantly increase timeouts in user builds, since they result in crashes.
776   // Many of these are likely to turn into ANRs, which are less informative for the developer, but
777   // friendlier to the user. We do not completely suppress timeouts, so that we avoid invisible
778   // problems for cases not covered by ANR detection, e.g. a problem in a clean-up daemon.
779   if (is_user_build) {
780     static constexpr int USER_MULTIPLIER = 2;  // Start out small, perhaps increase later if we
781                                                // still have an issue?
782     timeout_ns *= USER_MULTIPLIER;
783   }
784   uint64_t avg_wait_multiplier = 1;
785   uint64_t wait_multiplier = 1;
786   if (attempt_of_4 != 1) {
787     // TODO: RequestSynchronousCheckpoint routinely passes attempt_of_4 = 0. Can
788     // we avoid the getpriority() call?
789     if (getpriority(PRIO_PROCESS, 0 /* this thread */) > 0) {
790       // We're a low priority thread, and thus have a longer ANR timeout. Increase the suspend
791       // timeout.
792       avg_wait_multiplier = 3;
793     }
794     // To avoid the system calls in the common case, we fail to increase the first of 4 waits, but
795     // then compensate during the last one. This also allows somewhat longer thread monitoring
796     // before we time out.
797     wait_multiplier = attempt_of_4 == 4 ? 2 * avg_wait_multiplier - 1 : avg_wait_multiplier;
798     timeout_ns *= wait_multiplier;
799   }
800   bool collect_state = (t != 0 && (attempt_of_4 == 0 || attempt_of_4 == 4));
801   int32_t cur_val = barrier->load(std::memory_order_acquire);
802   if (cur_val <= 0) {
803     DCHECK_EQ(cur_val, 0);
804     return std::nullopt;
805   }
806   unsigned i = 0;
807   if (WaitOnceForSuspendBarrier(barrier, cur_val, timeout_ns)) {
808     i = 1;
809   }
810   cur_val = barrier->load(std::memory_order_acquire);
811   if (cur_val <= 0) {
812     DCHECK_EQ(cur_val, 0);
813     return std::nullopt;
814   }
815 
816   // Extra timeout to compensate for concurrent thread dumps, so that we are less likely to time
817   // out during ANR dumps.
818   uint64_t dump_adjustment_ns = 0;
819   // Total timeout increment if we see a concurrent thread dump. Distributed evenly across
820   // remaining iterations.
821   static constexpr uint64_t kDumpWaitNSecs = 30'000'000'000ull;  // 30 seconds
822   // Replacement timeout if thread is stopped for tracing, probably by a debugger.
823   static constexpr uint64_t kTracingWaitNSecs = 7'200'000'000'000ull;  // wait a bit < 2 hours;
824 
825   // Long wait; gather information in case of timeout.
826   std::string sampled_state = collect_state ? GetOsThreadStatQuick(t) : "";
827   if (collect_state && GetStateFromStatString(sampled_state) == 't') {
828     LOG(WARNING) << "Thread suspension nearly timed out due to Tracing stop (debugger attached?)";
829     timeout_ns = kTracingWaitNSecs;
830   }
831   // Only fail after kSuspendBarrierIters timeouts, to make us robust against app freezing.
832   while (i < kSuspendBarrierIters) {
833     if (WaitOnceForSuspendBarrier(barrier, cur_val, timeout_ns + dump_adjustment_ns)) {
834       ++i;
835 #if ART_USE_FUTEXES
836       if (!kShortSuspendTimeouts) {
837         CHECK_GE(NanoTime() - start_time, i * timeout_ns / kSuspendBarrierIters - 1'000'000);
838       }
839 #endif
840     }
841     cur_val = barrier->load(std::memory_order_acquire);
842     if (cur_val <= 0) {
843       DCHECK_EQ(cur_val, 0);
844       return std::nullopt;
845     }
846     std::optional<uint64_t> last_sigquit_nanotime = Runtime::Current()->SigQuitNanoTime();
847     if (last_sigquit_nanotime.has_value() && i < kSuspendBarrierIters) {
848       // Adjust dump_adjustment_ns to reflect the number of iterations we have left and how long
849       // ago we started dumping threads.
850       uint64_t new_unscaled_adj = kDumpWaitNSecs + last_sigquit_nanotime.value() - NanoTime();
851       // Scale by the fraction of iterations still remaining.
852       dump_adjustment_ns = new_unscaled_adj * kSuspendBarrierIters / (kSuspendBarrierIters - i);
853     }
854     // Keep the old dump_adjustment_ns if SigQuitNanoTime() was cleared.
855   }
856   uint64_t final_wait_time = NanoTime() - start_time;
857   uint64_t total_wait_time = attempt_of_4 == 0 ?
858                                  final_wait_time :
859                                  4 * final_wait_time * avg_wait_multiplier / wait_multiplier;
860   return collect_state ? "Target states: [" + sampled_state + ", " + GetOsThreadStatQuick(t) + "]" +
861                              (cur_val == 0 ? "(barrier now passed)" : "") +
862                              " Final wait time: " + PrettyDuration(final_wait_time) +
863                              "; appr. total wait time: " + PrettyDuration(total_wait_time) :
864                          "";
865 }
866 
867 void ThreadList::SuspendAll(const char* cause, bool long_suspend) {
868   Thread* self = Thread::Current();
869 
870   if (self != nullptr) {
871     VLOG(threads) << *self << " SuspendAll for " << cause << " starting...";
872   } else {
873     VLOG(threads) << "Thread[null] SuspendAll for " << cause << " starting...";
874   }
875   {
876     ScopedTrace trace("Suspending mutator threads");
877     const uint64_t start_time = NanoTime();
878 
879     SuspendAllInternal(self);
880     // All threads are known to have suspended (but a thread may still own the mutator lock)
881     // Make sure this thread grabs exclusive access to the mutator lock and its protected data.
882 #if HAVE_TIMED_RWLOCK
883     while (true) {
884       if (Locks::mutator_lock_->ExclusiveLockWithTimeout(self,
885                                                          NsToMs(thread_suspend_timeout_ns_),
886                                                          0)) {
887         break;
888       } else if (!long_suspend_) {
889         // Reading long_suspend without the mutator lock is slightly racy, in some rare cases, this
890         // could result in a thread suspend timeout.
891         // Timeout if we wait more than thread_suspend_timeout_ns_ nanoseconds.
892         UnsafeLogFatalForThreadSuspendAllTimeout();
893       }
894     }
895 #else
896     Locks::mutator_lock_->ExclusiveLock(self);
897 #endif
898 
899     long_suspend_ = long_suspend;
900 
901     const uint64_t end_time = NanoTime();
902     const uint64_t suspend_time = end_time - start_time;
903     suspend_all_histogram_.AdjustAndAddValue(suspend_time);
904     if (suspend_time > kLongThreadSuspendThreshold) {
905       LOG(WARNING) << "Suspending all threads took: " << PrettyDuration(suspend_time);
906     }
907 
908     if (kDebugLocking) {
909       // Debug check that all threads are suspended.
910       AssertOtherThreadsAreSuspended(self);
911     }
912   }
913 
914   // SuspendAllInternal blocks if we are in the middle of a flip.
915   DCHECK(!self->ReadFlag(ThreadFlag::kPendingFlipFunction, std::memory_order_relaxed));
916   DCHECK(!self->ReadFlag(ThreadFlag::kRunningFlipFunction, std::memory_order_relaxed));
917 
918   ATraceBegin((std::string("Mutator threads suspended for ") + cause).c_str());
919 
920   if (self != nullptr) {
921     VLOG(threads) << *self << " SuspendAll complete";
922   } else {
923     VLOG(threads) << "Thread[null] SuspendAll complete";
924   }
925 }
926 
927 // Ensures all threads running Java suspend and that those not running Java don't start.
928 void ThreadList::SuspendAllInternal(Thread* self, SuspendReason reason) {
929   // self can be nullptr if this is an unregistered thread.
930   Locks::mutator_lock_->AssertNotExclusiveHeld(self);
931   Locks::thread_list_lock_->AssertNotHeld(self);
932   Locks::thread_suspend_count_lock_->AssertNotHeld(self);
933   if (kDebugLocking && self != nullptr) {
934     CHECK_NE(self->GetState(), ThreadState::kRunnable);
935   }
936 
937   // First request that all threads suspend, then wait for them to suspend before
938   // returning. This suspension scheme also relies on other behaviour:
939   // 1. Threads cannot be deleted while they are suspended or have a suspend-
940   //    request flag set - (see Unregister() below).
941   // 2. When threads are created, they are created in a suspended state (actually
942   //    kNative) and will never begin executing Java code without first checking
943   //    the suspend-request flag.
944 
945   // The atomic counter for number of threads that need to pass the barrier.
946   AtomicInteger pending_threads;
947 
948   for (int iter_count = 1;; ++iter_count) {
949     {
950       MutexLock mu(self, *Locks::thread_list_lock_);
951       MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
952       if (suspend_all_count_ == 0) {
953         // Never run multiple SuspendAlls concurrently.
954         // If we are asked to suspend ourselves, we proceed anyway, but must ignore suspend
955         // request from other threads until we resume them.
956         bool found_myself = false;
957         // Update global suspend all state for attaching threads.
958         ++suspend_all_count_;
959         pending_threads.store(list_.size() - (self == nullptr ? 0 : 1), std::memory_order_relaxed);
960         // Increment everybody else's suspend count.
961         for (const auto& thread : list_) {
962           if (thread == self) {
963             found_myself = true;
964           } else {
965             VLOG(threads) << "requesting thread suspend: " << *thread;
966             DCHECK_EQ(suspend_all_count_, 1);
967             thread->IncrementSuspendCount(self, &pending_threads, nullptr, reason);
968             if (thread->IsSuspended()) {
969               // Effectively pass the barrier on behalf of the already suspended thread.
970               // The thread itself cannot yet have acted on our request since we still hold the
971               // suspend_count_lock_, and it will notice that kActiveSuspendBarrier has already
972               // been cleared if and when it acquires the lock in PassActiveSuspendBarriers().
973               DCHECK_EQ(thread->tlsPtr_.active_suspendall_barrier, &pending_threads);
974               pending_threads.fetch_sub(1, std::memory_order_seq_cst);
975               thread->tlsPtr_.active_suspendall_barrier = nullptr;
976               if (!thread->HasActiveSuspendBarrier()) {
977                 thread->AtomicClearFlag(ThreadFlag::kActiveSuspendBarrier);
978               }
979             }
980             // else:
981             // The target thread was not yet suspended, and hence will be forced to execute
982             // TransitionFromRunnableToSuspended shortly. Since we set the kSuspendRequest flag
983             // before checking, and it checks kActiveSuspendBarrier after noticing kSuspendRequest,
984             // it must notice kActiveSuspendBarrier when it does. Thus it is guaranteed to
985             // decrement the suspend barrier. We're relying on store; load ordering here, but
986             // that's not a problem, since state and flags all reside in the same atomic, and
987             // are thus properly ordered, even for relaxed accesses.
988           }
989         }
990         self->AtomicSetFlag(ThreadFlag::kSuspensionImmune, std::memory_order_relaxed);
991         DCHECK(self == nullptr || found_myself);
992         break;
993       }
994     }
995     if (iter_count >= kMaxSuspendRetries) {
996       LOG(FATAL) << "Too many SuspendAll retries: " << iter_count;
997     } else {
998       MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
999       DCHECK_LE(suspend_all_count_, 1);
1000       if (suspend_all_count_ != 0) {
1001         // This may take a while, and we're not runnable, and thus would otherwise not block.
1002         Thread::resume_cond_->WaitHoldingLocks(self);
1003         continue;
1004       }
1005     }
1006     // We're already not runnable, so an attempt to suspend us should succeed.
1007   }
1008 
1009   Thread* culprit = nullptr;
1010   pid_t tid = 0;
1011   std::ostringstream oss;
1012   for (int attempt_of_4 = 1; attempt_of_4 <= 4; ++attempt_of_4) {
1013     auto result = WaitForSuspendBarrier(&pending_threads, tid, attempt_of_4);
1014     if (!result.has_value()) {
1015       // Wait succeeded.
1016       break;
1017     }
1018     if (attempt_of_4 == 3) {
1019       // Second to the last attempt; Try to gather more information in case we time out.
1020       MutexLock mu(self, *Locks::thread_list_lock_);
1021       MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
1022       oss << "remaining threads: ";
1023       for (const auto& thread : list_) {
1024         if (thread != self && !thread->IsSuspended()) {
1025           culprit = thread;
1026           oss << *thread << ", ";
1027         }
1028       }
1029       if (culprit != nullptr) {
1030         tid = culprit->GetTid();
1031       }
1032     } else if (attempt_of_4 == 4) {
1033       // Final attempt still timed out.
1034       if (culprit == nullptr) {
1035         LOG(FATAL) << "SuspendAll timeout. Couldn't find holdouts.";
1036       } else {
1037         std::string name;
1038         culprit->GetThreadName(name);
1039         oss << "Info for " << name << ": ";
1040         std::string thr_descr =
1041             StringPrintf("state&flags: 0x%x, Java/native priority: %d/%d, barrier value: %d, ",
1042                          culprit->GetStateAndFlags(std::memory_order_relaxed).GetValue(),
1043                          culprit->GetNativePriority(),
1044                          getpriority(PRIO_PROCESS /* really thread */, culprit->GetTid()),
1045                          pending_threads.load());
1046         oss << thr_descr << result.value();
1047         culprit->AbortInThis("SuspendAll timeout; " + oss.str());
1048       }
1049     }
1050   }
1051 }
1052 
1053 void ThreadList::ResumeAll() {
1054   Thread* self = Thread::Current();
1055   if (kDebugLocking) {
1056     // Debug check that all threads are suspended.
1057     AssertOtherThreadsAreSuspended(self);
1058   }
1059   MutexLock mu(self, *Locks::thread_list_lock_);
1060   MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
1061   ATraceEnd();  // Matching "Mutator threads suspended ..." in SuspendAll.
1062   ResumeAllInternal(self);
1063 }
1064 
1065 // Holds thread_list_lock_ and suspend_count_lock_
1066 void ThreadList::ResumeAllInternal(Thread* self) {
1067   DCHECK_NE(self->GetState(), ThreadState::kRunnable);
1068   if (self != nullptr) {
1069     VLOG(threads) << *self << " ResumeAll starting";
1070   } else {
1071     VLOG(threads) << "Thread[null] ResumeAll starting";
1072   }
1073 
1074   ScopedTrace trace("Resuming mutator threads");
1075 
1076   long_suspend_ = false;
1077 
1078   Locks::mutator_lock_->ExclusiveUnlock(self);
1079 
1080   // Decrement the suspend counts for all threads.
1081   for (const auto& thread : list_) {
1082     if (thread != self) {
1083       thread->DecrementSuspendCount(self);
1084     }
1085   }
1086 
1087   // Update global suspend all state for attaching threads. Unblocks other SuspendAlls once
1088   // suspend_count_lock_ is released.
1089   --suspend_all_count_;
1090   self->AtomicClearFlag(ThreadFlag::kSuspensionImmune, std::memory_order_relaxed);
1091   // Pending suspend requests for us will be handled when we become Runnable again.
1092 
1093   // Broadcast a notification to all suspended threads, some or all of
1094   // which may choose to wake up.  No need to wait for them.
1095   if (self != nullptr) {
1096     VLOG(threads) << *self << " ResumeAll waking others";
1097   } else {
1098     VLOG(threads) << "Thread[null] ResumeAll waking others";
1099   }
1100   Thread::resume_cond_->Broadcast(self);
1101 
1102   if (self != nullptr) {
1103     VLOG(threads) << *self << " ResumeAll complete";
1104   } else {
1105     VLOG(threads) << "Thread[null] ResumeAll complete";
1106   }
1107 }
1108 
1109 bool ThreadList::Resume(Thread* thread, SuspendReason reason) {
1110   // This assumes there was an ATraceBegin when we suspended the thread.
1111   ATraceEnd();
1112 
1113   Thread* self = Thread::Current();
1114   DCHECK_NE(thread, self);
1115   VLOG(threads) << "Resume(" << reinterpret_cast<void*>(thread) << ") starting..." << reason;
1116 
1117   {
1118     // To check Contains.
1119     MutexLock mu(self, *Locks::thread_list_lock_);
1120     // To check IsSuspended.
1121     MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
1122     if (UNLIKELY(!thread->IsSuspended())) {
1123       LOG(reason == SuspendReason::kForUserCode ? ERROR : FATAL)
1124           << "Resume(" << reinterpret_cast<void*>(thread) << ") thread not suspended";
1125       return false;
1126     }
1127     if (!Contains(thread)) {
1128       // We only expect threads within the thread-list to have been suspended otherwise we can't
1129       // stop such threads from delete-ing themselves.
1130       LOG(reason == SuspendReason::kForUserCode ? ERROR : FATAL)
1131           << "Resume(" << reinterpret_cast<void*>(thread) << ") thread not within thread list";
1132       return false;
1133     }
1134     thread->DecrementSuspendCount(self, /*for_user_code=*/(reason == SuspendReason::kForUserCode));
1135     Thread::resume_cond_->Broadcast(self);
1136   }
1137 
1138   VLOG(threads) << "Resume(" << reinterpret_cast<void*>(thread) << ") finished waking others";
1139   return true;
1140 }
1141 
1142 bool ThreadList::SuspendThread(Thread* self,
1143                                Thread* thread,
1144                                SuspendReason reason,
1145                                ThreadState self_state,
1146                                const char* func_name,
1147                                int attempt_of_4) {
1148   bool is_suspended = false;
1149   VLOG(threads) << func_name << " starting";
1150   pid_t tid = thread->GetTid();
1151   uint8_t suspended_count;
1152   uint8_t checkpoint_count;
1153   WrappedSuspend1Barrier wrapped_barrier{};
1154   static_assert(sizeof wrapped_barrier.barrier_ == sizeof(uint32_t));
1155   ThreadExitFlag tef;
1156   bool exited = false;
1157   thread->NotifyOnThreadExit(&tef);
1158   int iter_count = 1;
1159   do {
1160     {
1161       Locks::mutator_lock_->AssertSharedHeld(self);
1162       Locks::thread_list_lock_->AssertHeld(self);
1163       // Note: this will transition to runnable and potentially suspend.
1164       DCHECK(Contains(thread));
1165       // This implementation fails if thread == self. Let the clients handle that case
1166       // appropriately.
1167       CHECK_NE(thread, self) << func_name << "(self)";
1168       VLOG(threads) << func_name << " suspending: " << *thread;
1169       {
1170         MutexLock suspend_count_mu(self, *Locks::thread_suspend_count_lock_);
1171         if (LIKELY(self->GetSuspendCount() == 0)) {
1172           suspended_count = thread->suspended_count_;
1173           checkpoint_count = thread->checkpoint_count_;
1174           thread->IncrementSuspendCount(self, nullptr, &wrapped_barrier, reason);
1175           if (thread->IsSuspended()) {
1176             // See the discussion in mutator_gc_coord.md and SuspendAllInternal for the race here.
1177             thread->RemoveFirstSuspend1Barrier(&wrapped_barrier);
1178             // PassActiveSuspendBarriers couldn't have seen our barrier, since it also acquires
1179             // 'thread_suspend_count_lock_'. `wrapped_barrier` will not be accessed.
1180             if (!thread->HasActiveSuspendBarrier()) {
1181               thread->AtomicClearFlag(ThreadFlag::kActiveSuspendBarrier);
1182             }
1183             is_suspended = true;
1184           }
1185           DCHECK_GT(thread->GetSuspendCount(), 0);
1186           break;
1187         }
1188         // Else we hold the suspend count lock but another thread is trying to suspend us,
1189         // making it unsafe to try to suspend another thread in case we get a cycle.
1190         // Start the loop again, which will allow this thread to be suspended.
1191       }
1192     }
1193     // All locks are released, and we should quickly exit the suspend-unfriendly state. Retry.
1194     if (iter_count >= kMaxSuspendRetries) {
1195       LOG(FATAL) << "Too many suspend retries";
1196     }
1197     Locks::thread_list_lock_->ExclusiveUnlock(self);
1198     {
1199       ScopedThreadSuspension sts(self, ThreadState::kSuspended);
1200       usleep(kThreadSuspendSleepUs);
1201       ++iter_count;
1202     }
1203     Locks::thread_list_lock_->ExclusiveLock(self);
1204     exited = tef.HasExited();
1205   } while (!exited);
1206   thread->UnregisterThreadExitFlag(&tef);
1207   Locks::thread_list_lock_->ExclusiveUnlock(self);
1208   self->TransitionFromRunnableToSuspended(self_state);
1209   if (exited) {
1210     // This is OK: There's a race in inflating a lock and the owner giving up ownership and then
1211     // dying.
1212     LOG(WARNING) << StringPrintf("Thread with tid %d exited before suspending", tid);
1213     return false;
1214   }
1215   // Now wait for target to decrement suspend barrier.
1216   std::optional<std::string> failure_info;
1217   if (!is_suspended) {
1218     failure_info = WaitForSuspendBarrier(&wrapped_barrier.barrier_, tid, attempt_of_4);
1219     if (!failure_info.has_value()) {
1220       is_suspended = true;
1221     }
1222   }
1223   while (!is_suspended) {
1224     if (attempt_of_4 > 0 && attempt_of_4 < 4) {
1225       // Caller will try again. Give up and resume the thread for now.  We need to make sure
1226       // that wrapped_barrier is removed from the list before we deallocate it.
1227       MutexLock suspend_count_mu(self, *Locks::thread_suspend_count_lock_);
1228       if (wrapped_barrier.barrier_.load() == 0) {
1229         // Succeeded in the meantime.
1230         is_suspended = true;
1231         continue;
1232       }
1233       thread->RemoveSuspend1Barrier(&wrapped_barrier);
1234       if (!thread->HasActiveSuspendBarrier()) {
1235         thread->AtomicClearFlag(ThreadFlag::kActiveSuspendBarrier);
1236       }
1237       // Do not call Resume(), since we are probably not fully suspended.
1238       thread->DecrementSuspendCount(self,
1239                                     /*for_user_code=*/(reason == SuspendReason::kForUserCode));
1240       Thread::resume_cond_->Broadcast(self);
1241       return false;
1242     }
1243     std::string name;
1244     thread->GetThreadName(name);
1245     WrappedSuspend1Barrier* first_barrier;
1246     {
1247       MutexLock suspend_count_mu(self, *Locks::thread_suspend_count_lock_);
1248       first_barrier = thread->tlsPtr_.active_suspend1_barriers;
1249     }
1250     // 'thread' should still have a suspend request pending, and hence stick around. Try to abort
1251     // there, since its stack trace is much more interesting than ours.
1252     std::string message = StringPrintf(
1253         "%s timed out: %s: state&flags: 0x%x, Java/native priority: %d/%d,"
1254         " barriers: %p, ours: %p, barrier value: %d, nsusps: %d, ncheckpts: %d, thread_info: %s",
1255         func_name,
1256         name.c_str(),
1257         thread->GetStateAndFlags(std::memory_order_relaxed).GetValue(),
1258         thread->GetNativePriority(),
1259         getpriority(PRIO_PROCESS /* really thread */, thread->GetTid()),
1260         first_barrier,
1261         &wrapped_barrier,
1262         wrapped_barrier.barrier_.load(),
1263         thread->suspended_count_ - suspended_count,
1264         thread->checkpoint_count_ - checkpoint_count,
1265         failure_info.value().c_str());
1266     // Check one last time whether thread passed the suspend barrier. Empirically this seems to
1267     // happen maybe between 1 and 5% of the time.
1268     if (wrapped_barrier.barrier_.load() != 0) {
1269       // thread still has a pointer to wrapped_barrier. Returning and continuing would be unsafe
1270       // without additional cleanup.
1271       thread->AbortInThis(message);
1272       UNREACHABLE();
1273     }
1274     is_suspended = true;
1275   }
1276   // wrapped_barrier.barrier_ will no longer be accessed.
1277   VLOG(threads) << func_name << " suspended: " << *thread;
1278   if (ATraceEnabled()) {
1279     std::string name;
1280     thread->GetThreadName(name);
1281     ATraceBegin(
1282         StringPrintf("%s suspended %s for tid=%d", func_name, name.c_str(), thread->GetTid())
1283             .c_str());
1284   }
1285   if (kIsDebugBuild) {
1286     CHECK(thread->IsSuspended());
1287     MutexLock suspend_count_mu(self, *Locks::thread_suspend_count_lock_);
1288     thread->CheckBarrierInactive(&wrapped_barrier);
1289   }
1290   return true;
1291 }
1292 
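// A minimal usage sketch (illustrative only, not part of the original file): SuspendThreadByPeer()
// below and SuspendThreadByThreadId() after it both delegate to SuspendThread() above, and a
// caller is expected to pair them with ThreadList::Resume() using the same SuspendReason
// (Resume() is assumed here to be the matching release declared in thread_list.h). Roughly:
//
//   ThreadList* thread_list = Runtime::Current()->GetThreadList();
//   Thread* target = thread_list->SuspendThreadByPeer(peer, SuspendReason::kForUserCode);
//   if (target != nullptr) {
//     // 'target' is suspended here and safe to inspect.
//     bool resumed = thread_list->Resume(target, SuspendReason::kForUserCode);
//     DCHECK(resumed);
//   }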
1293 Thread* ThreadList::SuspendThreadByPeer(jobject peer, SuspendReason reason) {
1294   Thread* const self = Thread::Current();
1295   ThreadState old_self_state = self->GetState();
1296   self->TransitionFromSuspendedToRunnable();
1297   Locks::thread_list_lock_->ExclusiveLock(self);
1298   ObjPtr<mirror::Object> thread_ptr = self->DecodeJObject(peer);
1299   Thread* thread = Thread::FromManagedThread(self, thread_ptr);
1300   if (thread == nullptr || !Contains(thread)) {
1301     if (thread == nullptr) {
1302       ObjPtr<mirror::Object> name = WellKnownClasses::java_lang_Thread_name->GetObject(thread_ptr);
1303       std::string thr_name = (name == nullptr ? "<unknown>" : name->AsString()->ToModifiedUtf8());
1304       LOG(WARNING) << "No such thread for suspend"
1305                    << ": " << peer << ":" << thr_name;
1306     } else {
1307       LOG(WARNING) << "SuspendThreadByPeer failed for unattached thread: "
1308                    << reinterpret_cast<void*>(thread);
1309     }
1310     Locks::thread_list_lock_->ExclusiveUnlock(self);
1311     self->TransitionFromRunnableToSuspended(old_self_state);
1312     return nullptr;
1313   }
1314   VLOG(threads) << "SuspendThreadByPeer found thread: " << *thread;
1315   // Releases thread_list_lock_ and mutator lock.
1316   bool success = SuspendThread(self, thread, reason, old_self_state, __func__, 0);
1317   Locks::thread_list_lock_->AssertNotHeld(self);
1318   return success ? thread : nullptr;
1319 }
1320 
1321 Thread* ThreadList::SuspendThreadByThreadId(uint32_t thread_id,
1322                                             SuspendReason reason,
1323                                             int attempt_of_4) {
1324   Thread* const self = Thread::Current();
1325   ThreadState old_self_state = self->GetState();
1326   CHECK_NE(thread_id, kInvalidThreadId);
1327   VLOG(threads) << "SuspendThreadByThreadId starting";
1328   self->TransitionFromSuspendedToRunnable();
1329   Locks::thread_list_lock_->ExclusiveLock(self);
1330   Thread* thread = FindThreadByThreadId(thread_id);
1331   if (thread == nullptr) {
1332     // There's a race between inflating a lock and its owner giving up ownership and then dying.
1333     LOG(WARNING) << StringPrintf("No such thread id %d for suspend", thread_id);
1334     Locks::thread_list_lock_->ExclusiveUnlock(self);
1335     self->TransitionFromRunnableToSuspended(old_self_state);
1336     return nullptr;
1337   }
1338   DCHECK(Contains(thread));
1339   VLOG(threads) << "SuspendThreadByThreadId found thread: " << *thread;
1340   // Releases thread_list_lock_ and mutator lock.
1341   bool success = SuspendThread(self, thread, reason, old_self_state, __func__, attempt_of_4);
1342   Locks::thread_list_lock_->AssertNotHeld(self);
1343   return success ? thread : nullptr;
1344 }
1345 
1346 Thread* ThreadList::FindThreadByThreadId(uint32_t thread_id) {
1347   for (const auto& thread : list_) {
1348     if (thread->GetThreadId() == thread_id) {
1349       return thread;
1350     }
1351   }
1352   return nullptr;
1353 }
1354 
1355 Thread* ThreadList::FindThreadByTid(int tid) {
1356   for (const auto& thread : list_) {
1357     if (thread->GetTid() == tid) {
1358       return thread;
1359     }
1360   }
1361   return nullptr;
1362 }
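// Note: both lookup helpers above simply scan list_, so callers are expected to hold
// Locks::thread_list_lock_ for the lookup and for as long as they use the returned Thread*
// (as SuspendThreadByThreadId() does above); otherwise the target could be unregistered and
// deleted underneath them.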
1363 
1364 void ThreadList::WaitForOtherNonDaemonThreadsToExit(bool check_no_birth) {
1365   ScopedTrace trace(__PRETTY_FUNCTION__);
1366   Thread* self = Thread::Current();
1367   Locks::mutator_lock_->AssertNotHeld(self);
1368   while (true) {
1369     Locks::runtime_shutdown_lock_->Lock(self);
1370     if (check_no_birth) {
1371       // No more threads can be born after we start to shutdown.
1372       CHECK(Runtime::Current()->IsShuttingDownLocked());
1373       CHECK_EQ(Runtime::Current()->NumberOfThreadsBeingBorn(), 0U);
1374     } else {
1375       if (Runtime::Current()->NumberOfThreadsBeingBorn() != 0U) {
1376         // Awkward: shutdown_cond_ is private, but the only live thread may not be registered yet.
1377         // Fortunately, this is used mostly for testing, and not performance-critical.
1378         Locks::runtime_shutdown_lock_->Unlock(self);
1379         usleep(1000);
1380         continue;
1381       }
1382     }
1383     MutexLock mu(self, *Locks::thread_list_lock_);
1384     Locks::runtime_shutdown_lock_->Unlock(self);
1385     // Also wait for any threads that are unregistering to finish. This is required so that no
1386     // threads access the thread list after it is deleted. TODO: This may not work for user daemon
1387     // threads since they could unregister at the wrong time.
1388     bool done = unregistering_count_ == 0;
1389     if (done) {
1390       for (const auto& thread : list_) {
1391         if (thread != self && !thread->IsDaemon()) {
1392           done = false;
1393           break;
1394         }
1395       }
1396     }
1397     if (done) {
1398       break;
1399     }
1400     // Wait for another thread to exit before re-checking.
1401     Locks::thread_exit_cond_->Wait(self);
1402   }
1403 }
1404 
1405 void ThreadList::SuspendAllDaemonThreadsForShutdown() {
1406   ScopedTrace trace(__PRETTY_FUNCTION__);
1407   Thread* self = Thread::Current();
1408   size_t daemons_left = 0;
1409   {
1410     // Tell all the daemons it's time to suspend.
1411     MutexLock mu(self, *Locks::thread_list_lock_);
1412     MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
1413     for (const auto& thread : list_) {
1414       // This is only run after all non-daemon threads have exited, so the remainder should all be
1415       // daemons.
1416       CHECK(thread->IsDaemon()) << *thread;
1417       if (thread != self) {
1418         thread->IncrementSuspendCount(self);
1419         ++daemons_left;
1420       }
1421       // We are shutting down the runtime, so set the JNI functions of all the JNIEnvs to the
1422       // sleep-forever versions.
1423       thread->GetJniEnv()->SetFunctionsToRuntimeShutdownFunctions();
1424     }
1425   }
1426   if (daemons_left == 0) {
1427     // No threads left; safe to shut down.
1428     return;
1429   }
1430   // There is not a clean way to shut down if we have daemons left. We have no mechanism for
1431   // killing them and reclaiming thread stacks. We also have no mechanism for waiting until they
1432   // have truly finished touching the memory we are about to deallocate. We do the best we can with
1433   // timeouts.
1434   //
1435   // If we have any daemons left, wait until they are (a) suspended and (b) they are not stuck
1436   // in a place where they are about to access runtime state and are not in a runnable state.
1437   // We attempt to do the latter by just waiting long enough for things to
1438   // quiesce. Examples: Monitor code or waking up from a condition variable.
1439   //
1440   // Give the threads a chance to suspend, complaining if they're slow. (a)
1441   bool have_complained = false;
1442   static constexpr size_t kTimeoutMicroseconds = 2000 * 1000;
1443   static constexpr size_t kSleepMicroseconds = 1000;
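  // I.e. poll in 1 ms steps for at most kTimeoutMicroseconds / kSleepMicroseconds = 2000
  // iterations, roughly two seconds of sleeping in total, before giving up on suspension.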
1444   bool all_suspended = false;
1445   for (size_t i = 0; !all_suspended && i < kTimeoutMicroseconds / kSleepMicroseconds; ++i) {
1446     bool found_running = false;
1447     {
1448       MutexLock mu(self, *Locks::thread_list_lock_);
1449       for (const auto& thread : list_) {
1450         if (thread != self && thread->GetState() == ThreadState::kRunnable) {
1451           if (!have_complained) {
1452             LOG(WARNING) << "daemon thread not yet suspended: " << *thread;
1453             have_complained = true;
1454           }
1455           found_running = true;
1456         }
1457       }
1458     }
1459     if (found_running) {
1460       // Sleep briefly before checking again. Max total sleep time is kTimeoutMicroseconds.
1461       usleep(kSleepMicroseconds);
1462     } else {
1463       all_suspended = true;
1464     }
1465   }
1466   if (!all_suspended) {
1467     // We can get here if a daemon thread executed a fastnative native call, so that it
1468     // remained in a runnable state, and then made a JNI call after we called
1469     // SetFunctionsToRuntimeShutdownFunctions(), causing it to permanently stay in a harmless
1470     // but runnable state. See b/147804269.
1471     LOG(WARNING) << "timed out suspending all daemon threads";
1472   }
1473   // Assume all threads are either suspended or somehow wedged.
1474   // Wait again for all the now "suspended" threads to actually quiesce. (b)
1475   static constexpr size_t kDaemonSleepTime = 400'000;
1476   usleep(kDaemonSleepTime);
1477   std::list<Thread*> list_copy;
1478   {
1479     MutexLock mu(self, *Locks::thread_list_lock_);
1480     // Half-way through the wait, set the "runtime deleted" flag, causing any newly awoken
1481     // threads to immediately go back to sleep without touching memory. This prevents us from
1482     // touching deallocated memory, but it also prevents mutexes from getting released. Thus we
1483     // only do this once we're reasonably sure that no system mutexes are still held.
1484     for (const auto& thread : list_) {
1485       DCHECK(thread == self || !all_suspended || thread->GetState() != ThreadState::kRunnable);
1486       // In the !all_suspended case, the target is probably sleeping.
1487       thread->GetJniEnv()->SetRuntimeDeleted();
1488       // Possibly contended Mutex acquisitions are unsafe after this.
1489       // Releasing thread_list_lock_ is OK, since it can't block.
1490     }
1491   }
1492   // Finally wait for any threads woken before we set the "runtime deleted" flags to finish
1493   // touching memory.
1494   usleep(kDaemonSleepTime);
1495 #if defined(__has_feature)
1496 #if __has_feature(address_sanitizer) || __has_feature(hwaddress_sanitizer)
1497   // Sleep a bit longer with -fsanitize=address, since everything is slower.
1498   usleep(2 * kDaemonSleepTime);
1499 #endif
1500 #endif
1501   // At this point no threads should be touching our data structures anymore.
1502 }
1503 
1504 void ThreadList::Register(Thread* self) {
1505   DCHECK_EQ(self, Thread::Current());
1506   CHECK(!shut_down_);
1507 
1508   if (VLOG_IS_ON(threads)) {
1509     std::ostringstream oss;
1510     self->ShortDump(oss);  // We don't hold the mutator_lock_ yet and so cannot call Dump.
1511     LOG(INFO) << "ThreadList::Register() " << *self  << "\n" << oss.str();
1512   }
1513 
1514   // Atomically add self to the thread list and make its thread_suspend_count_ reflect ongoing
1515   // SuspendAll requests.
1516   MutexLock mu(self, *Locks::thread_list_lock_);
1517   MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
1518   if (suspend_all_count_ == 1) {
1519     self->IncrementSuspendCount(self);
1520   } else {
1521     DCHECK_EQ(suspend_all_count_, 0);
1522   }
1523   CHECK(!Contains(self));
1524   list_.push_back(self);
1525   if (gUseReadBarrier) {
1526     gc::collector::ConcurrentCopying* const cc =
1527         Runtime::Current()->GetHeap()->ConcurrentCopyingCollector();
1528     // Initialize according to the state of the CC collector.
1529     self->SetIsGcMarkingAndUpdateEntrypoints(cc->IsMarking());
1530     if (cc->IsUsingReadBarrierEntrypoints()) {
1531       self->SetReadBarrierEntrypoints();
1532     }
1533     self->SetWeakRefAccessEnabled(cc->IsWeakRefAccessEnabled());
1534   }
1535 }
1536 
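// Summary of the teardown sequence implemented below: Unregister() first runs the potentially
// slow parts of thread teardown via Destroy(), then loops until it can remove 'self' from list_
// at a moment when no suspend request or flip function is pending, releases any per-thread trace
// buffer, deletes the Thread object, releases its thin-lock id, clears the thread-local "current
// thread" slot, and finally broadcasts thread_exit_cond_ so that shutdown can make progress.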
1537 void ThreadList::Unregister(Thread* self, bool should_run_callbacks) {
1538   DCHECK_EQ(self, Thread::Current());
1539   CHECK_NE(self->GetState(), ThreadState::kRunnable);
1540   Locks::mutator_lock_->AssertNotHeld(self);
1541   if (self->tls32_.disable_thread_flip_count != 0) {
1542     LOG(FATAL) << "Incomplete PrimitiveArrayCritical section at exit: " << *self << " count = "
1543                << self->tls32_.disable_thread_flip_count;
1544   }
1545 
1546   VLOG(threads) << "ThreadList::Unregister() " << *self;
1547 
1548   {
1549     MutexLock mu(self, *Locks::thread_list_lock_);
1550     ++unregistering_count_;
1551   }
1552 
1553   // Any time-consuming destruction, plus anything that can call back into managed code or
1554   // suspend and so on, must happen at this point, and not in ~Thread. The self->Destroy is what
1555   // causes the threads to join. It is important to do this after incrementing unregistering_count_
1556   // since we want the runtime to wait for the daemon threads to exit before deleting the thread
1557   // list.
1558   self->Destroy(should_run_callbacks);
1559 
1560   uint32_t thin_lock_id = self->GetThreadId();
1561   while (true) {
1562     // Remove and delete the Thread* while holding the thread_list_lock_ and
1563     // thread_suspend_count_lock_ so that the unregistering thread cannot be suspended.
1564     // Note: deliberately not using MutexLock that could hold a stale self pointer.
1565     {
1566       MutexLock mu(self, *Locks::thread_list_lock_);
1567       if (!Contains(self)) {
1568         std::string thread_name;
1569         self->GetThreadName(thread_name);
1570         std::ostringstream os;
1571         DumpNativeStack(os, GetTid(), "  native: ", nullptr);
1572         LOG(FATAL) << "Request to unregister unattached thread " << thread_name << "\n" << os.str();
1573         UNREACHABLE();
1574       } else {
1575         MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
1576         Thread::StateAndFlags state_and_flags = self->GetStateAndFlags(std::memory_order_acquire);
1577         if (!state_and_flags.IsFlagSet(ThreadFlag::kRunningFlipFunction) &&
1578             !state_and_flags.IsFlagSet(ThreadFlag::kSuspendRequest)) {
1579           list_.remove(self);
1580           self->SignalExitFlags();
1581           break;
1582         }
1583       }
1584     }
1585     // In the case where we are not suspended yet, sleep to give other threads time to execute.
1586     // This is important if there are realtime threads. b/111277984
1587     usleep(1);
1588     // We failed to remove the thread due to a suspend request or the like, loop and try again.
1589   }
1590 
1591   // We flush the trace buffer in Thread::Destroy. We have to check again here because once
1592   // Thread::Destroy finishes we wait for any active suspend requests to finish before deleting
1593   // the thread, and if a new trace was started during that wait we may allocate the trace buffer
1594   // again. Such a buffer would only contain the method entry events for the methods on the stack
1595   // of an exiting thread. It is not required to flush these entries, but we do need to release
1596   // the buffer. Ideally we would either not generate trace events for an exiting thread, or
1597   // report the initial events on a trace start through a mechanism that doesn't use a per-thread
1598   // buffer. Neither approach is trivial to implement, so for now we go with the simple approach
1599   // of releasing the buffer here.
1600   if (UNLIKELY(self->GetMethodTraceBuffer() != nullptr)) {
1601     Trace::ReleaseThreadBuffer(self);
1602   }
1603   CHECK_EQ(self->GetMethodTraceBuffer(), nullptr) << Trace::GetDebugInformation();
1604   delete self;
1605 
1606   // Release the thread ID after the thread is finished and deleted to avoid cases where we can
1607   // temporarily have multiple threads with the same thread id. When this occurs, it causes
1608   // problems in FindThreadByThreadId / SuspendThreadByThreadId.
1609   ReleaseThreadId(nullptr, thin_lock_id);
1610 
1611   // Clear the TLS data, so that the underlying native thread is recognizably detached.
1612   // (It may wish to reattach later.)
1613 #ifdef __BIONIC__
1614   __get_tls()[TLS_SLOT_ART_THREAD_SELF] = nullptr;
1615 #else
1616   CHECK_PTHREAD_CALL(pthread_setspecific, (Thread::pthread_key_self_, nullptr), "detach self");
1617   Thread::self_tls_ = nullptr;
1618 #endif
1619 
1620   // Signal that a thread just detached.
1621   MutexLock mu(nullptr, *Locks::thread_list_lock_);
1622   --unregistering_count_;
1623   Locks::thread_exit_cond_->Broadcast(nullptr);
1624 }
1625 
1626 void ThreadList::ForEach(void (*callback)(Thread*, void*), void* context) {
1627   for (const auto& thread : list_) {
1628     callback(thread, context);
1629   }
1630 }
1631 
1632 void ThreadList::WaitForUnregisterToComplete(Thread* self) {
1633   // We hold thread_list_lock_.
1634   while (unregistering_count_ != 0) {
1635     LOG(WARNING) << "Waiting for a thread to finish unregistering";
1636     Locks::thread_exit_cond_->Wait(self);
1637   }
1638 }
1639 
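// Summary of the pattern below: rather than forcing every thread to suspend, this visitor bumps
// each thread's suspend count and only visits the roots of threads that are already suspended
// (or the calling thread itself); threads that were still runnable get their count decremented
// again immediately, and all counts are restored and resume_cond_ broadcast once the visit is
// done.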
1640 void ThreadList::VisitRootsForSuspendedThreads(RootVisitor* visitor) {
1641   Thread* const self = Thread::Current();
1642   std::vector<Thread*> threads_to_visit;
1643 
1644   // Tell threads to suspend and copy them into list.
1645   {
1646     MutexLock mu(self, *Locks::thread_list_lock_);
1647     MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
1648     for (Thread* thread : list_) {
1649       thread->IncrementSuspendCount(self);
1650       if (thread == self || thread->IsSuspended()) {
1651         threads_to_visit.push_back(thread);
1652       } else {
1653         thread->DecrementSuspendCount(self);
1654       }
1655     }
1656   }
1657 
1658   // Visit roots without holding thread_list_lock_ and thread_suspend_count_lock_ to prevent lock
1659   // order violations.
1660   for (Thread* thread : threads_to_visit) {
1661     thread->VisitRoots(visitor, kVisitRootFlagAllRoots);
1662   }
1663 
1664   // Restore suspend counts.
1665   {
1666     MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
1667     for (Thread* thread : threads_to_visit) {
1668       thread->DecrementSuspendCount(self);
1669     }
1670     Thread::resume_cond_->Broadcast(self);
1671   }
1672 }
1673 
1674 void ThreadList::VisitRoots(RootVisitor* visitor, VisitRootFlags flags) const {
1675   MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
1676   for (const auto& thread : list_) {
1677     thread->VisitRoots(visitor, flags);
1678   }
1679 }
1680 
1681 void ThreadList::VisitReflectiveTargets(ReflectiveValueVisitor* visitor) const {
1682   MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
1683   for (const auto& thread : list_) {
1684     thread->VisitReflectiveTargets(visitor);
1685   }
1686 }
1687 
1688 void ThreadList::SweepInterpreterCaches(IsMarkedVisitor* visitor) const {
1689   MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
1690   for (const auto& thread : list_) {
1691     thread->SweepInterpreterCache(visitor);
1692   }
1693 }
1694 
1695 void ThreadList::ClearInterpreterCaches() const {
1696   Thread* self = Thread::Current();
1697   Locks::mutator_lock_->AssertExclusiveHeld(self);
1698   MutexLock mu(self, *Locks::thread_list_lock_);
1699   for (const auto& thread : list_) {
1700     thread->GetInterpreterCache()->Clear(thread);
1701   }
1702 }
1703 
1704 uint32_t ThreadList::AllocThreadId(Thread* self) {
1705   MutexLock mu(self, *Locks::allocated_thread_ids_lock_);
1706   for (size_t i = 0; i < allocated_ids_.size(); ++i) {
1707     if (!allocated_ids_[i]) {
1708       allocated_ids_.set(i);
1709       return i + 1;  // Zero is reserved to mean "invalid".
1710     }
1711   }
1712   LOG(FATAL) << "Out of internal thread ids";
1713   UNREACHABLE();
1714 }
1715 
1716 void ThreadList::ReleaseThreadId(Thread* self, uint32_t id) {
1717   MutexLock mu(self, *Locks::allocated_thread_ids_lock_);
1718   --id;  // Zero is reserved to mean "invalid".
1719   DCHECK(allocated_ids_[id]) << id;
1720   allocated_ids_.reset(id);
1721 }
1722 
1723 ScopedSuspendAll::ScopedSuspendAll(const char* cause, bool long_suspend) {
1724   Runtime::Current()->GetThreadList()->SuspendAll(cause, long_suspend);
1725 }
1726 
1727 ScopedSuspendAll::~ScopedSuspendAll() {
1728   Runtime::Current()->GetThreadList()->ResumeAll();
1729 }
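// Usage sketch: ScopedSuspendAll is the RAII form of the SuspendAll()/ResumeAll() pair defined
// above, so a caller typically writes something like:
//
//   {
//     ScopedSuspendAll ssa(__FUNCTION__, /*long_suspend=*/false);
//     // All other threads are suspended within this scope.
//   }  // ResumeAll() runs here when 'ssa' is destroyed.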
1730 
1731 }  // namespace art
1732