1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <csignal>
18 #include "mediautils/TimerThread.h"
19 #define LOG_TAG "TimeCheck"
20
21 #include <optional>
22
23 #include <android-base/logging.h>
24 #include <android-base/strings.h>
25 #include <audio_utils/clock.h>
26 #include <mediautils/EventLog.h>
27 #include <mediautils/FixedString.h>
28 #include <mediautils/MethodStatistics.h>
29 #include <mediautils/TimeCheck.h>
30 #include <mediautils/TidWrapper.h>
31 #include <utils/Log.h>
32
33 #if defined(__ANDROID__)
34 #include "debuggerd/handler.h"
35 #endif
36
37
38 namespace android::mediautils {
39 // This function appropriately signals a pid to dump a backtrace if we are
40 // running on device (and the HAL exists). If we are not running on an Android
41 // device, there is no HAL to signal (so we do nothing).
signalAudioHAL(pid_t pid)42 static inline void signalAudioHAL([[maybe_unused]] pid_t pid) {
43 #if defined(__ANDROID__)
44 sigqueue(pid, DEBUGGER_SIGNAL, {.sival_int = 0});
45 #endif
46 }
47
48 /**
49 * Returns the std::string "HH:MM:SS.MSc" from a system_clock time_point.
50 */
formatTime(std::chrono::system_clock::time_point t)51 std::string formatTime(std::chrono::system_clock::time_point t) {
52 auto time_string = audio_utils_time_string_from_ns(
53 std::chrono::nanoseconds(t.time_since_epoch()).count());
54
55 // The time string is 19 characters (including null termination).
56 // Example: "03-27 16:47:06.187"
57 // MM DD HH MM SS MS
58 // We offset by 6 to get HH:MM:SS.MSc
59 //
60 return time_string.time + 6; // offset to remove month/day.
61 }
62
63 /**
64 * Finds the end of the common time prefix.
65 *
66 * This is as an option to remove the common time prefix to avoid
67 * unnecessary duplicated strings.
68 *
69 * \param time1 a time string
70 * \param time2 a time string
71 * \return the position where the common time prefix ends. For abbreviated
72 * printing of time2, offset the character pointer by this position.
73 */
commonTimePrefixPosition(std::string_view time1,std::string_view time2)74 static size_t commonTimePrefixPosition(std::string_view time1, std::string_view time2) {
75 const size_t endPos = std::min(time1.size(), time2.size());
76 size_t i;
77
78 // Find location of the first mismatch between strings
79 for (i = 0; ; ++i) {
80 if (i == endPos) {
81 return i; // strings match completely to the length of one of the strings.
82 }
83 if (time1[i] != time2[i]) {
84 break;
85 }
86 if (time1[i] == '\0') {
87 return i; // "printed" strings match completely. No need to check further.
88 }
89 }
90
91 // Go backwards until we find a delimeter or space.
92 for (; i > 0
93 && isdigit(time1[i]) // still a number
94 && time1[i - 1] != ' '
95 ; --i) {
96 }
97 return i;
98 }
99
100 /**
101 * Returns the unique suffix of time2 that isn't present in time1.
102 *
103 * If time2 is identical to time1, then an empty string_view is returned.
104 * This method is used to elide the common prefix when printing times.
105 */
timeSuffix(std::string_view time1,std::string_view time2)106 std::string_view timeSuffix(std::string_view time1, std::string_view time2) {
107 const size_t pos = commonTimePrefixPosition(time1, time2);
108 return time2.substr(pos);
109 }
110
111 // Audio HAL server pids vector used to generate audio HAL processes tombstone
112 // when audioserver watchdog triggers.
113 // We use a lockless storage to avoid potential deadlocks in the context of watchdog
114 // trigger.
115 // Protection again simultaneous writes is not needed given one update takes place
116 // during AudioFlinger construction and other comes necessarily later once the IAudioFlinger
117 // interface is available.
118 // The use of an atomic index just guaranties that current vector is fully initialized
119 // when read.
120 /* static */
accessAudioHalPids(std::vector<pid_t> * pids,bool update)121 void TimeCheck::accessAudioHalPids(std::vector<pid_t>* pids, bool update) {
122 static constexpr int kNumAudioHalPidsVectors = 3;
123 static std::vector<pid_t> audioHalPids[kNumAudioHalPidsVectors];
124 static std::atomic<unsigned> curAudioHalPids = 0;
125
126 if (update) {
127 audioHalPids[(curAudioHalPids++ + 1) % kNumAudioHalPidsVectors] = *pids;
128 } else {
129 *pids = audioHalPids[curAudioHalPids % kNumAudioHalPidsVectors];
130 }
131 }
132
133 /* static */
setAudioHalPids(const std::vector<pid_t> & pids)134 void TimeCheck::setAudioHalPids(const std::vector<pid_t>& pids) {
135 accessAudioHalPids(&(const_cast<std::vector<pid_t>&>(pids)), true);
136 }
137
138 /* static */
getAudioHalPids()139 std::vector<pid_t> TimeCheck::getAudioHalPids() {
140 std::vector<pid_t> pids;
141 accessAudioHalPids(&pids, false);
142 return pids;
143 }
144
145 /* static */
getTimeCheckThread()146 TimerThread& TimeCheck::getTimeCheckThread() {
147 static TimerThread sTimeCheckThread{};
148 return sTimeCheckThread;
149 }
150
151 /* static */
toString()152 std::string TimeCheck::toString() {
153 // note pending and retired are individually locked for maximum concurrency,
154 // snapshot is not instantaneous at a single time.
155 return getTimeCheckThread().getSnapshotAnalysis().toString();
156 }
157
TimeCheck(std::string_view tag,OnTimerFunc && onTimer,Duration requestedTimeoutDuration,Duration secondChanceDuration,bool crashOnTimeout)158 TimeCheck::TimeCheck(std::string_view tag, OnTimerFunc&& onTimer, Duration requestedTimeoutDuration,
159 Duration secondChanceDuration, bool crashOnTimeout)
160 : mTimeCheckHandler{ std::make_shared<TimeCheckHandler>(
161 tag, std::move(onTimer), crashOnTimeout, requestedTimeoutDuration,
162 secondChanceDuration, std::chrono::system_clock::now(), getThreadIdWrapper()) }
163 , mTimerHandle(requestedTimeoutDuration.count() == 0
164 /* for TimeCheck we don't consider a non-zero secondChanceDuration here */
165 ? getTimeCheckThread().trackTask(mTimeCheckHandler->tag)
166 : getTimeCheckThread().scheduleTask(
167 mTimeCheckHandler->tag,
168 // Pass in all the arguments by value to this task for safety.
169 // The thread could call the callback before the constructor is finished.
170 // The destructor is not blocked on callback.
__anona50c4d160102(TimerThread::Handle timerHandle) 171 [ timeCheckHandler = mTimeCheckHandler ](TimerThread::Handle timerHandle) {
172 timeCheckHandler->onTimeout(timerHandle);
173 },
174 requestedTimeoutDuration,
175 secondChanceDuration)) {}
176
~TimeCheck()177 TimeCheck::~TimeCheck() {
178 if (mTimeCheckHandler) {
179 mTimeCheckHandler->onCancel(mTimerHandle);
180 }
181 }
182
183 /* static */
analyzeTimeouts(float requestedTimeoutMs,float elapsedSteadyMs,float elapsedSystemMs)184 std::string TimeCheck::analyzeTimeouts(
185 float requestedTimeoutMs, float elapsedSteadyMs, float elapsedSystemMs) {
186 // Track any OS clock issues with suspend.
187 // It is possible that the elapsedSystemMs is much greater than elapsedSteadyMs if
188 // a suspend occurs; however, we always expect the timeout ms should always be slightly
189 // less than the elapsed steady ms regardless of whether a suspend occurs or not.
190
191 std::string s("Timeout ms ");
192 s.append(std::to_string(requestedTimeoutMs))
193 .append(" elapsed steady ms ").append(std::to_string(elapsedSteadyMs))
194 .append(" elapsed system ms ").append(std::to_string(elapsedSystemMs));
195
196 // Is there something unusual?
197 static constexpr float TOLERANCE_CONTEXT_SWITCH_MS = 200.f;
198
199 if (requestedTimeoutMs > elapsedSteadyMs || requestedTimeoutMs > elapsedSystemMs) {
200 s.append("\nError: early expiration - "
201 "requestedTimeoutMs should be less than elapsed time");
202 }
203
204 if (elapsedSteadyMs > elapsedSystemMs + TOLERANCE_CONTEXT_SWITCH_MS) {
205 s.append("\nWarning: steady time should not advance faster than system time");
206 }
207
208 // This has been found in suspend stress testing.
209 if (elapsedSteadyMs > requestedTimeoutMs + TOLERANCE_CONTEXT_SWITCH_MS) {
210 s.append("\nWarning: steady time significantly exceeds timeout "
211 "- possible thread stall or aborted suspend");
212 }
213
214 // This has been found in suspend stress testing.
215 if (elapsedSystemMs > requestedTimeoutMs + TOLERANCE_CONTEXT_SWITCH_MS) {
216 s.append("\nInformation: system time significantly exceeds timeout "
217 "- possible suspend");
218 }
219 return s;
220 }
221
222 // To avoid any potential race conditions, the timer handle
223 // (expiration = clock steady start + timeout) is passed into the callback.
onCancel(TimerThread::Handle timerHandle) const224 void TimeCheck::TimeCheckHandler::onCancel(TimerThread::Handle timerHandle) const
225 {
226 if (TimeCheck::getTimeCheckThread().cancelTask(timerHandle) && onTimer) {
227 const std::chrono::steady_clock::time_point endSteadyTime =
228 std::chrono::steady_clock::now();
229 const float elapsedSteadyMs = std::chrono::duration_cast<FloatMs>(
230 endSteadyTime - timerHandle + timeoutDuration).count();
231 // send the elapsed steady time for statistics.
232 onTimer(false /* timeout */, elapsedSteadyMs);
233 }
234 }
235
236 // To avoid any potential race conditions, the timer handle
237 // (expiration = clock steady start + timeout) is passed into the callback.
onTimeout(TimerThread::Handle timerHandle) const238 void TimeCheck::TimeCheckHandler::onTimeout(TimerThread::Handle timerHandle) const
239 {
240 const std::chrono::steady_clock::time_point endSteadyTime = std::chrono::steady_clock::now();
241 const std::chrono::system_clock::time_point endSystemTime = std::chrono::system_clock::now();
242 // timerHandle incorporates the timeout
243 const float elapsedSteadyMs = std::chrono::duration_cast<FloatMs>(
244 endSteadyTime - (timerHandle - timeoutDuration)).count();
245 const float elapsedSystemMs = std::chrono::duration_cast<FloatMs>(
246 endSystemTime - startSystemTime).count();
247 const float requestedTimeoutMs = std::chrono::duration_cast<FloatMs>(
248 timeoutDuration).count();
249 const float secondChanceMs = std::chrono::duration_cast<FloatMs>(
250 secondChanceDuration).count();
251
252 if (onTimer) {
253 onTimer(true /* timeout */, elapsedSteadyMs);
254 }
255
256 if (!crashOnTimeout) return;
257
258 // Generate the TimerThread summary string early before sending signals to the
259 // HAL processes which can affect thread behavior.
260 const auto snapshotAnalysis = getTimeCheckThread().getSnapshotAnalysis(4 /* retiredCount */);
261
262 // Generate audio HAL processes tombstones and allow time to complete
263 // before forcing restart
264 std::vector<pid_t> pids = TimeCheck::getAudioHalPids();
265 std::string halPids = "HAL pids [ ";
266 if (pids.size() != 0) {
267 for (const auto& pid : pids) {
268 ALOGI("requesting tombstone for pid: %d", pid);
269 halPids.append(std::to_string(pid)).append(" ");
270 signalAudioHAL(pid);
271 }
272 sleep(1);
273 } else {
274 ALOGI("No HAL process pid available, skipping tombstones");
275 }
276 halPids.append("]");
277
278 LOG_EVENT_STRING(LOGTAG_AUDIO_BINDER_TIMEOUT, tag.c_str());
279
280 // Create abort message string - caution: this can be very large.
281 const std::string abortMessage = std::string("TimeCheck timeout for ")
282 .append(tag)
283 .append(" scheduled ").append(formatTime(startSystemTime))
284 .append(" on thread ").append(std::to_string(tid)).append("\n")
285 .append(analyzeTimeouts(requestedTimeoutMs + secondChanceMs,
286 elapsedSteadyMs, elapsedSystemMs)).append("\n")
287 .append(halPids).append("\n")
288 .append(snapshotAnalysis.toString());
289
290 // In many cases, the initial timeout stack differs from the abort backtrace because
291 // (1) the time difference between initial timeout and the final abort signal
292 // and (2) signalling the HAL audio service may cause
293 // the thread to unblock and continue.
294
295 // Note: LOG_ALWAYS_FATAL limits the size of the string - per log/log.h:
296 // Log message text may be truncated to less than an
297 // implementation-specific limit (1023 bytes).
298 //
299 // Here, we send the string through android-base/logging.h LOG()
300 // to avoid the size limitation. LOG(FATAL) does an abort whereas
301 // LOG(FATAL_WITHOUT_ABORT) does not abort.
302
303 static constexpr pid_t invalidPid = TimerThread::SnapshotAnalysis::INVALID_PID;
304 pid_t tidToAbort = invalidPid;
305 if (snapshotAnalysis.suspectTid != invalidPid) {
306 tidToAbort = snapshotAnalysis.suspectTid;
307 } else if (snapshotAnalysis.timeoutTid != invalidPid) {
308 tidToAbort = snapshotAnalysis.timeoutTid;
309 }
310
311 LOG(FATAL_WITHOUT_ABORT) << abortMessage;
312 const auto ret = abortTid(tidToAbort);
313 if (ret < 0) {
314 LOG(FATAL) << "TimeCheck thread signal failed, aborting process. "
315 "errno: " << errno << base::ErrnoNumberAsString(errno);
316 }
317 }
318
319 // Automatically create a TimeCheck class for a class and method.
320 // This is used for Audio HAL support.
makeTimeCheckStatsForClassMethod(std::string_view className,std::string_view methodName)321 mediautils::TimeCheck makeTimeCheckStatsForClassMethod(
322 std::string_view className, std::string_view methodName) {
323 std::shared_ptr<MethodStatistics<std::string>> statistics =
324 mediautils::getStatisticsForClass(className);
325 if (!statistics) return {}; // empty TimeCheck.
326 return mediautils::TimeCheck(
327 FixedString62(className).append("::").append(methodName),
328 [ safeMethodName = FixedString30(methodName),
329 stats = std::move(statistics) ]
330 (bool timeout, float elapsedMs) {
331 if (timeout) {
332 ; // ignored, there is no timeout value.
333 } else {
334 stats->event(safeMethodName.asStringView(), elapsedMs);
335 }
336 }, {} /* timeoutDuration */, {} /* secondChanceDuration */, false /* crashOnTimeout */);
337 }
338
339 } // namespace android::mediautils
340