1 /*
2 * Copyright (C) 2021 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #define LOG_TAG "TimerThread"
18
19 #include <optional>
20 #include <sstream>
21 #include <unistd.h>
22 #include <vector>
23
24 #include <audio_utils/mutex.h>
25 #include <mediautils/MediaUtilsDelayed.h>
26 #include <mediautils/TidWrapper.h>
27 #include <mediautils/TimerThread.h>
28 #include <utils/Log.h>
29 #include <utils/ThreadDefs.h>
30
31 using namespace std::chrono_literals;
32
33 namespace android::mediautils {
34
35 extern std::string formatTime(std::chrono::system_clock::time_point t);
36 extern std::string_view timeSuffix(std::string_view time1, std::string_view time2);
37
scheduleTask(std::string_view tag,TimerCallback && func,Duration timeoutDuration,Duration secondChanceDuration)38 TimerThread::Handle TimerThread::scheduleTask(
39 std::string_view tag, TimerCallback&& func,
40 Duration timeoutDuration, Duration secondChanceDuration) {
41 const auto now = std::chrono::system_clock::now();
42 auto request = std::make_shared<const Request>(now, now +
43 std::chrono::duration_cast<std::chrono::system_clock::duration>(timeoutDuration),
44 secondChanceDuration, getThreadIdWrapper(), tag);
45 return mMonitorThread.add(std::move(request), std::move(func), timeoutDuration);
46 }
47
trackTask(std::string_view tag)48 TimerThread::Handle TimerThread::trackTask(std::string_view tag) {
49 const auto now = std::chrono::system_clock::now();
50 auto request = std::make_shared<const Request>(now, now,
51 Duration{} /* secondChanceDuration */, getThreadIdWrapper(), tag);
52 return mNoTimeoutMap.add(std::move(request));
53 }
54
cancelTask(Handle handle)55 bool TimerThread::cancelTask(Handle handle) {
56 std::shared_ptr<const Request> request = isNoTimeoutHandle(handle) ?
57 mNoTimeoutMap.remove(handle) : mMonitorThread.remove(handle);
58 if (!request) return false;
59 mRetiredQueue.add(std::move(request));
60 return true;
61 }
62
// Renders the snapshot analysis as a human-readable dump string: the current
// time, second chance counter, analysis summary, the timeout/pending/retired
// request queues, and (optionally) call stacks for the timed-out thread, the
// suspected blocking thread, and the mutex wait chain.
std::string TimerThread::SnapshotAnalysis::toString(bool showTimeoutStack) const {
    // Note: These request queues are snapshot very close together but
    // not at "identical" times as we don't use a class-wide lock.
    std::string analysisSummary = std::string("\nanalysis [ ").append(description).append(" ]");
    std::string timeoutStack;
    std::string blockedStack;
    std::string mutexWaitChainStack;
    // A timeoutTid of -1 means no thread timed out (see getSnapshotAnalysis).
    if (showTimeoutStack && timeoutTid != -1) {
        // If the timed-out thread is also the suspect, label the stack as both.
        timeoutStack = std::string(suspectTid == timeoutTid ? "\ntimeout/blocked(" : "\ntimeout(")
                .append(std::to_string(timeoutTid)).append(") callstack [\n")
                .append(getCallStackStringForTid(timeoutTid)).append("]");
    }

    // Only print a separate blocked stack when the suspect differs from the
    // timed-out thread (otherwise it was already labeled above).
    if (suspectTid != -1 && suspectTid != timeoutTid) {
        blockedStack = std::string("\nblocked(")
                .append(std::to_string(suspectTid)).append(") callstack [\n")
                .append(getCallStackStringForTid(suspectTid)).append("]");
    }

    if (!mutexWaitChain.empty()) {
        mutexWaitChainStack.append("\nmutex wait chain [\n");
        // the wait chain omits the initial timeout tid (which is good as we don't
        // need to suppress it).
        for (size_t i = 0; i < mutexWaitChain.size(); ++i) {
            const auto& [tid, name] = mutexWaitChain[i];
            mutexWaitChainStack.append("{ tid: ").append(std::to_string(tid))
                    .append(" (holding ").append(name).append(")");
            // Suppress duplicate stacks: refer to the already-printed stack
            // rather than dumping the same tid's call stack twice.
            if (tid == timeoutTid) {
                mutexWaitChainStack.append(" TIMEOUT_STACK }\n");
            } else if (tid == suspectTid) {
                mutexWaitChainStack.append(" BLOCKED_STACK }\n");
            } else if (hasMutexCycle && i == mutexWaitChain.size() - 1) {
                // for a cycle, the last pid in the chain is repeated.
                mutexWaitChainStack.append(" CYCLE_STACK }\n");
            } else {
                mutexWaitChainStack.append(" callstack [\n")
                        .append(getCallStackStringForTid(tid)).append("] }\n");
            }
        }
        mutexWaitChainStack.append("]");
    }

    // Assemble the final dump in a fixed order; tooling may rely on the labels.
    return std::string("now ")
            .append(formatTime(std::chrono::system_clock::now()))
            .append("\nsecondChanceCount ")
            .append(std::to_string(secondChanceCount))
            .append(analysisSummary)
            .append("\ntimeout [ ")
            .append(requestsToString(timeoutRequests))
            .append(" ]\npending [ ")
            .append(requestsToString(pendingRequests))
            .append(" ]\nretired [ ")
            .append(requestsToString(retiredRequests))
            .append(" ]")
            .append(timeoutStack)
            .append(blockedStack)
            .append(mutexWaitChainStack);
}
121
122 // A HAL method is where the substring "Hidl" is in the class name.
123 // The tag should look like: ... Hidl ... :: ...
124 // When the audio HAL is updated to AIDL perhaps we will use instead
125 // a global directory of HAL classes.
126 //
127 // See MethodStatistics.cpp:
128 // mediautils::getStatisticsClassesForModule(METHOD_STATISTICS_MODULE_NAME_AUDIO_HIDL)
129 //
130 /* static */
isRequestFromHal(const std::shared_ptr<const Request> & request)131 bool TimerThread::isRequestFromHal(const std::shared_ptr<const Request>& request) {
132 for (const auto& s : {"Hidl", "Aidl"}) {
133 const auto& tagSV = request->tag.asStringView();
134 const size_t halStrPos = tagSV.find(s);
135 // should be a separator afterwards Hidl/Aidl which indicates the string was in the class.
136 if (halStrPos != std::string::npos && tagSV.find("::", halStrPos) != std::string::npos) {
137 return true;
138 }
139 }
140
141 return false;
142 }
143
// Builds a diagnostic snapshot of the TimerThread state and, if a timeout has
// occurred, attempts to identify the thread (suspectTid) that blocked the
// timed-out thread via mutex wait-chain analysis and HAL-call correlation.
// retiredCount limits how many retired requests are copied into the snapshot.
struct TimerThread::SnapshotAnalysis TimerThread::getSnapshotAnalysis(size_t retiredCount) const {
    struct SnapshotAnalysis analysis{};
    // The following snapshot of the TimerThread state will be utilized for
    // analysis. Note, there is no lock around these calls, so there could be
    // a state update between them.
    mTimeoutQueue.copyRequests(analysis.timeoutRequests);
    mRetiredQueue.copyRequests(analysis.retiredRequests, retiredCount);
    analysis.pendingRequests = getPendingRequests();
    analysis.secondChanceCount = mMonitorThread.getSecondChanceCount();
    // No call has timed out, so there is no analysis to be done.
    if (analysis.timeoutRequests.empty()) {
        return analysis;
    }

    // for now look at last timeout (in our case, the only timeout)
    const std::shared_ptr<const Request> timeout = analysis.timeoutRequests.back();
    analysis.timeoutTid = timeout->tid;

    std::string& description = analysis.description;

    // audio mutex specific wait chain analysis
    auto deadlockInfo = audio_utils::mutex::deadlock_detection(analysis.timeoutTid);
    ALOGD("%s: deadlockInfo: %s", __func__, deadlockInfo.to_string().c_str());

    if (!deadlockInfo.empty()) {
        if (!description.empty()) description.append("\n");
        description.append(deadlockInfo.to_string());
    }

    analysis.hasMutexCycle = deadlockInfo.has_cycle;
    analysis.mutexWaitChain = std::move(deadlockInfo.chain);

    // no pending requests, we don't attempt to use temporal correlation between a recent call.
    if (analysis.pendingRequests.empty()) {
        return analysis;
    }

    // pending Requests that are problematic.
    // pendingExact: pending requests on the same thread as the timeout.
    // pendingPossible: pending requests scheduled close in time to the timeout.
    std::vector<std::shared_ptr<const Request>> pendingExact;
    std::vector<std::shared_ptr<const Request>> pendingPossible;

    // We look at pending requests that were scheduled no later than kPendingDuration
    // after the timeout request. This prevents false matches with calls
    // that naturally block for a short period of time
    // such as HAL write() and read().
    //
    constexpr Duration kPendingDuration = 1000ms;
    for (const auto& pending : analysis.pendingRequests) {
        // If the pending tid is the same as timeout tid, problem identified.
        if (pending->tid == timeout->tid) {
            pendingExact.emplace_back(pending);
            continue;
        }

        // if the pending tid is scheduled within time limit
        if (pending->scheduled - timeout->scheduled < kPendingDuration) {
            pendingPossible.emplace_back(pending);
        }
    }

    // Prefer the exact (same-tid) match: a HAL call still pending on the
    // timed-out thread directly explains the timeout.
    if (!pendingExact.empty()) {
        const auto& request = pendingExact.front();
        const bool hal = isRequestFromHal(request);

        if (hal) {
            if (!description.empty()) description.append("\n");
            description.append("Blocked directly due to HAL call: ")
                    .append(request->toString());
            analysis.suspectTid = request->tid;
        }
    }
    // Otherwise fall back to temporal correlation with other threads' HAL calls.
    if (description.empty() && !pendingPossible.empty()) {
        for (const auto& request : pendingPossible) {
            const bool hal = isRequestFromHal(request);
            if (hal) {
                // The first blocked call is the most likely one.
                // Recent calls might be temporarily blocked
                // calls such as write() or read() depending on kDuration.
                description = std::string("Blocked possibly due to HAL call: ")
                        .append(request->toString());
                analysis.suspectTid = request->tid;
            }
        }
    }
    return analysis;
}
230
getPendingRequests() const231 std::vector<std::shared_ptr<const TimerThread::Request>> TimerThread::getPendingRequests() const {
232 constexpr size_t kEstimatedPendingRequests = 8; // approx 128 byte alloc.
233 std::vector<std::shared_ptr<const Request>> pendingRequests;
234 pendingRequests.reserve(kEstimatedPendingRequests); // preallocate vector out of lock.
235
236 // following are internally locked calls, which add to our local pendingRequests.
237 mMonitorThread.copyRequests(pendingRequests);
238 mNoTimeoutMap.copyRequests(pendingRequests);
239
240 // Sort in order of scheduled time.
241 std::sort(pendingRequests.begin(), pendingRequests.end(),
242 [](const std::shared_ptr<const Request>& r1,
243 const std::shared_ptr<const Request>& r2) {
244 return r1->scheduled < r2->scheduled;
245 });
246 return pendingRequests;
247 }
248
pendingToString() const249 std::string TimerThread::pendingToString() const {
250 return requestsToString(getPendingRequests());
251 }
252
retiredToString(size_t n) const253 std::string TimerThread::retiredToString(size_t n) const {
254 std::vector<std::shared_ptr<const Request>> retiredRequests;
255 mRetiredQueue.copyRequests(retiredRequests, n);
256
257 // Dump to string
258 return requestsToString(retiredRequests);
259 }
260
timeoutToString(size_t n) const261 std::string TimerThread::timeoutToString(size_t n) const {
262 std::vector<std::shared_ptr<const Request>> timeoutRequests;
263 mTimeoutQueue.copyRequests(timeoutRequests, n);
264
265 // Dump to string
266 return requestsToString(timeoutRequests);
267 }
268
toString() const269 std::string TimerThread::Request::toString() const {
270 const auto scheduledString = formatTime(scheduled);
271 const auto deadlineString = formatTime(deadline);
272 return std::string(tag)
273 .append(" scheduled ").append(scheduledString)
274 .append(" deadline ").append(timeSuffix(scheduledString, deadlineString))
275 .append(" tid ").append(std::to_string(tid));
276 }
277
add(std::shared_ptr<const Request> request)278 void TimerThread::RequestQueue::add(std::shared_ptr<const Request> request) {
279 std::lock_guard lg(mRQMutex);
280 mRequestQueue.emplace_back(std::chrono::system_clock::now(), std::move(request));
281 if (mRequestQueue.size() > mRequestQueueMax) {
282 mRequestQueue.pop_front();
283 }
284 }
285
copyRequests(std::vector<std::shared_ptr<const Request>> & requests,size_t n) const286 void TimerThread::RequestQueue::copyRequests(
287 std::vector<std::shared_ptr<const Request>>& requests, size_t n) const {
288 std::lock_guard lg(mRQMutex);
289 const size_t size = mRequestQueue.size();
290 size_t i = n >= size ? 0 : size - n;
291 for (; i < size; ++i) {
292 const auto &[time, request] = mRequestQueue[i];
293 requests.emplace_back(request);
294 }
295 }
296
add(std::shared_ptr<const Request> request)297 TimerThread::Handle TimerThread::NoTimeoutMap::add(std::shared_ptr<const Request> request) {
298 std::lock_guard lg(mNTMutex);
299 // A unique handle is obtained by mNoTimeoutRequests.fetch_add(1),
300 // This need not be under a lock, but we do so anyhow.
301 const Handle handle = getUniqueHandle_l();
302 mMap[handle] = request;
303 return handle;
304 }
305
remove(Handle handle)306 std::shared_ptr<const TimerThread::Request> TimerThread::NoTimeoutMap::remove(Handle handle) {
307 std::lock_guard lg(mNTMutex);
308 auto it = mMap.find(handle);
309 if (it == mMap.end()) return {};
310 auto request = it->second;
311 mMap.erase(it);
312 return request;
313 }
314
copyRequests(std::vector<std::shared_ptr<const Request>> & requests) const315 void TimerThread::NoTimeoutMap::copyRequests(
316 std::vector<std::shared_ptr<const Request>>& requests) const {
317 std::lock_guard lg(mNTMutex);
318 for (const auto &[handle, request] : mMap) {
319 requests.emplace_back(request);
320 }
321 }
322
// Starts the monitor thread immediately; the thread blocks on mCond in
// threadFunc() until requests are added or shutdown is requested.
// timeoutQueue is where expired requests are recorded (not owned).
TimerThread::MonitorThread::MonitorThread(RequestQueue& timeoutQueue)
    : mTimeoutQueue(timeoutQueue)
    , mThread([this] { threadFunc(); }) {
    // Name and prioritize the already-running thread via its native handle.
    pthread_setname_np(mThread.native_handle(), "TimerThread");
    pthread_setschedprio(mThread.native_handle(), PRIORITY_URGENT_AUDIO);
}
329
// Signals the monitor thread to exit and joins it.
TimerThread::MonitorThread::~MonitorThread() {
    {
        // Set the exit flag and notify while holding the lock so the
        // monitor thread cannot miss the wakeup between its flag check
        // and its wait on mCond.
        std::lock_guard _l(mMutex);
        mShouldExit = true;
        mCond.notify_all();
    }
    mThread.join();
}
338
// Monitor thread main loop. Waits for the earliest deadline among the primary
// monitor requests and the second chance requests; on expiry either promotes
// the request to the second chance map (to tolerate clock advancement during
// suspend) or invokes its TimerCallback. Exits when mShouldExit is set.
void TimerThread::MonitorThread::threadFunc() {
    std::unique_lock _l(mMutex);
    ::android::base::ScopedLockAssertion lock_assertion(mMutex);
    while (!mShouldExit) {
        // INVALID_HANDLE doubles as "not yet computed" for both values below;
        // 'now' is sampled lazily so an idle pass does not read the clock.
        Handle nextDeadline = INVALID_HANDLE;
        Handle now = INVALID_HANDLE;
        if (!mMonitorRequests.empty()) {
            // The map is ordered by deadline, so begin() is the earliest.
            nextDeadline = mMonitorRequests.begin()->first;
            now = std::chrono::steady_clock::now();
            if (nextDeadline < now) {
                auto node = mMonitorRequests.extract(mMonitorRequests.begin());
                // Deadline has expired, handle the request.
                auto secondChanceDuration = node.mapped().first->secondChanceDuration;
                if (secondChanceDuration.count() != 0) {
                    // We now apply the second chance duration to find the clock
                    // monotonic second deadline.  The unique key is then the
                    // pair<second_deadline, first_deadline>.
                    //
                    // The second chance prevents a false timeout should there be
                    // any clock monotonic advancement during suspend.
                    auto newHandle = now + secondChanceDuration;
                    ALOGD("%s: TimeCheck second chance applied for %s",
                            __func__, node.mapped().first->tag.c_str()); // should be rare event.
                    mSecondChanceRequests.emplace_hint(mSecondChanceRequests.end(),
                            std::make_pair(newHandle, nextDeadline),
                            std::move(node.mapped()));
                    // increment second chance counter.
                    mSecondChanceCount.fetch_add(1 /* arg */, std::memory_order_relaxed);
                } else {
                    {
                        // Drop the lock before calling out: the callback and
                        // the queue add must not run under mMutex.
                        _l.unlock();
                        // We add Request to retired queue early so that it can be dumped out.
                        mTimeoutQueue.add(std::move(node.mapped().first));
                        node.mapped().second(nextDeadline);
                        // Caution: we don't hold lock when we call TimerCallback,
                        // but this is the timeout case!  We will crash soon,
                        // maybe before returning.
                        // anything left over is released here outside lock.
                    }
                    // reacquire the lock - if something was added, we loop immediately to check.
                    _l.lock();
                }
                // always process expiring monitor requests first.
                continue;
            }
        }
        // now process any second chance requests.
        if (!mSecondChanceRequests.empty()) {
            // Keyed by pair<second_deadline, original_deadline>; begin() is earliest.
            Handle secondDeadline = mSecondChanceRequests.begin()->first.first;
            if (now == INVALID_HANDLE) now = std::chrono::steady_clock::now();
            if (secondDeadline < now) {
                auto node = mSecondChanceRequests.extract(mSecondChanceRequests.begin());
                {
                    _l.unlock();
                    // We add Request to retired queue early so that it can be dumped out.
                    mTimeoutQueue.add(std::move(node.mapped().first));
                    // Report the original (first) deadline to the callback.
                    const Handle originalHandle = node.key().second;
                    node.mapped().second(originalHandle);
                    // Caution: we don't hold lock when we call TimerCallback.
                    // This is benign issue - we permit concurrent operations
                    // while in the callback to the MonitorQueue.
                    //
                    // Anything left over is released here outside lock.
                }
                // reacquire the lock - if something was added, we loop immediately to check.
                _l.lock();
                continue;
            }
            // update the deadline.
            if (nextDeadline == INVALID_HANDLE) {
                nextDeadline = secondDeadline;
            } else {
                nextDeadline = std::min(nextDeadline, secondDeadline);
            }
        }
        // Sleep until the earliest pending deadline, or indefinitely if there
        // is no pending work; add()/~MonitorThread() wake us via mCond.
        if (nextDeadline != INVALID_HANDLE) {
            mCond.wait_until(_l, nextDeadline);
        } else {
            mCond.wait(_l);
        }
    }
}
421
add(std::shared_ptr<const Request> request,TimerCallback && func,Duration timeout)422 TimerThread::Handle TimerThread::MonitorThread::add(
423 std::shared_ptr<const Request> request, TimerCallback&& func, Duration timeout) {
424 std::lock_guard _l(mMutex);
425 const Handle handle = getUniqueHandle_l(timeout);
426 mMonitorRequests.emplace_hint(mMonitorRequests.end(),
427 handle, std::make_pair(std::move(request), std::move(func)));
428 mCond.notify_all();
429 return handle;
430 }
431
// Removes a pending request by handle, searching the primary monitor map
// first and then the second chance map. Returns the removed request, or
// nullptr if not found (e.g. it already expired). The associated
// TimerCallback is destroyed outside the lock.
std::shared_ptr<const TimerThread::Request> TimerThread::MonitorThread::remove(Handle handle) {
    // 'data' is declared before the lock so the TimerCallback it holds is
    // destroyed after the lock is released (see manual unlock calls below).
    std::pair<std::shared_ptr<const Request>, TimerCallback> data;
    std::unique_lock ul(mMutex);
    ::android::base::ScopedLockAssertion lock_assertion(mMutex);
    if (const auto it = mMonitorRequests.find(handle);
        it != mMonitorRequests.end()) {
        data = std::move(it->second);
        mMonitorRequests.erase(it);
        ul.unlock();  // manually release lock here so func (data.second)
                      // is released outside of lock.
        return data.first;  // request
    }

    // this check is O(N), but since the second chance requests are ordered
    // in terms of earliest expiration time, we would expect better than average results.
    // (The handle being removed is the original deadline, stored as the
    // second element of the second chance map's pair key.)
    for (auto it = mSecondChanceRequests.begin(); it != mSecondChanceRequests.end(); ++it) {
        if (it->first.second == handle) {
            data = std::move(it->second);
            mSecondChanceRequests.erase(it);
            ul.unlock();  // manually release lock here so func (data.second)
                          // is released outside of lock.
            return data.first;  // request
        }
    }
    return {};
}
458
copyRequests(std::vector<std::shared_ptr<const Request>> & requests) const459 void TimerThread::MonitorThread::copyRequests(
460 std::vector<std::shared_ptr<const Request>>& requests) const {
461 std::lock_guard lg(mMutex);
462 for (const auto &[deadline, monitorpair] : mMonitorRequests) {
463 requests.emplace_back(monitorpair.first);
464 }
465 // we combine the second map with the first map - this is
466 // everything that is pending on the monitor thread.
467 // The second map will be older than the first map so this
468 // is in order.
469 for (const auto &[deadline, monitorpair] : mSecondChanceRequests) {
470 requests.emplace_back(monitorpair.first);
471 }
472 }
473
474 } // namespace android::mediautils
475