1 /*
2 * Copyright (c) 2021-2022 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "gc_hung.h"
17
18 #include <unistd.h>
19 #include <cstdlib>
20 #include <dlfcn.h>
21 #include <ostream>
22 #include <sstream>
23 #include <sys/types.h>
24 #include <ctime>
25 #include <cinttypes>
26 #include <cstdio>
27 #include <csignal>
28 #include <sys/time.h>
29 #include "libpandabase/utils/logger.h"
30 #include "libpandabase/utils/type_converter.h"
31
32 namespace panda::mem {
33
34 /*
35 * If the macro HUNG_SYSTEM_SERVER_ONLY was defined, the module only
36 * supervise system_server. Otherwise, all the processes running in ARK
37 */
38 using std::string;
39
40 GcHung *GcHung::instance_ = nullptr;
41
42 // NOLINT(google-runtime-references)
Split(const PandaString & str,char delim,PandaVector<PandaString> * elems,bool skipEmpty=true)43 static void Split(const PandaString &str, char delim, PandaVector<PandaString> *elems, bool skipEmpty = true)
44 {
45 PandaIStringStream iss(str);
46 for (PandaString item; getline(iss, item, delim);) {
47 if (!(skipEmpty && item.empty())) {
48 elems->push_back(item);
49 }
50 }
51 }
52
GcHung()53 GcHung::GcHung()
54 : interval_limit_ms_(INTERVAL_LIMIT_MS_INIT),
55 over_time_limit_ms_(OVER_TIME_LIMIT_INIT_MS),
56 water_mark_limit_(WATER_MARK_LIMIT)
57 {
58 LOG(DEBUG, GC) << "GcHung: Instance created";
59 }
60
~GcHung()61 GcHung::~GcHung()
62 {
63 if (libimonitor_dl_handler_.GetNativeHandle() != nullptr) {
64 dlclose(libimonitor_dl_handler_.GetNativeHandle());
65 }
66 LOG(DEBUG, GC) << "GcHung: Instance deleted: water_mark: " << water_mark_;
67 }
68
GetConfig()69 int GcHung::GetConfig()
70 {
71 const unsigned int maxParaLen = 100;
72 // get parameter from zrhung
73 if (zrhung_get_config_ == nullptr) {
74 return -1;
75 }
76
77 char paraBuf[maxParaLen] = {0}; // NOLINT(modernize-avoid-c-arrays)
78 if (zrhung_get_config_(ZRHUNG_WP_GC, paraBuf, maxParaLen) != 0) {
79 LOG(DEBUG, GC) << "GcHung: failed to get config";
80 return -1;
81 }
82
83 PandaString paraStr(paraBuf);
84 PandaVector<PandaString> paraVec;
85 Split(paraStr, ',', ¶Vec);
86
87 if (paraVec.size() != GC_PARA_COUNT) {
88 LOG(ERROR, GC) << "GcHung: parse parameters failed";
89 return -1;
90 }
91
92 #ifdef HUNG_SYSTEM_SERVER_ONLY
93 enabled_ = (stoi(PandaStringToStd(paraVec[GC_PARA_ENABLE])) == 1) && is_systemserver_;
94 #else
95 enabled_ = (stoi(PandaStringToStd(paraVec[GC_PARA_ENABLE])) == 1);
96 #endif // HUNG_SYSTEM_SERVER_ONLY
97 LOG(INFO, GC) << "GcHung: module enable:" << enabled_;
98
99 interval_limit_ms_ = static_cast<uint64_t>(std::stoi(PandaStringToStd(paraVec[GC_PARA_INTERVAL])));
100 water_mark_limit_ = std::stoi(PandaStringToStd(paraVec[GC_PARA_WATERMARK]));
101 over_time_limit_ms_ = static_cast<uint64_t>(std::stoi(PandaStringToStd(paraVec[GC_PARA_OVERTIME])));
102 LOG(DEBUG, GC) << "GcHung: set parameter: interval_limit_ms_ = " << interval_limit_ms_ << "ms";
103 LOG(DEBUG, GC) << "GcHung: set parameter: water_mark_limit_ = " << water_mark_limit_;
104 LOG(DEBUG, GC) << "GcHung: set parameter: over_time_limit_ms_ = " << over_time_limit_ms_ << "ms";
105 config_ready_ = true;
106
107 return 0;
108 }
109
LoadLibimonitor()110 int GcHung::LoadLibimonitor()
111 {
112 if ((zrhung_send_event_ != nullptr) && (zrhung_get_config_ != nullptr)) {
113 return 1;
114 }
115 LOG(DEBUG, GC) << "GcHung: load libimonitor";
116 auto res_load = os::library_loader::Load(LIB_IMONITOR);
117 if (!res_load) {
118 LOG(ERROR, RUNTIME) << "failed to load " << LIB_IMONITOR << " Error: " << res_load.Error().ToString();
119 return -1;
120 }
121 libimonitor_dl_handler_ = std::move(res_load.Value());
122
123 auto zrhung_send_event_dlsym = os::library_loader::ResolveSymbol(libimonitor_dl_handler_, "zrhung_send_event");
124 if (!zrhung_send_event_dlsym) {
125 LOG(ERROR, RUNTIME) << "failed to dlsym symbol: zrhung_send_event";
126 dlclose(libimonitor_dl_handler_.GetNativeHandle());
127 return -1;
128 }
129 zrhung_send_event_ = reinterpret_cast<ZRHUNG_SEND_EVENT>(zrhung_send_event_dlsym.Value());
130
131 auto zrhung_get_config_dlsym = os::library_loader::ResolveSymbol(libimonitor_dl_handler_, "zrhung_get_config");
132 if (!zrhung_get_config_dlsym) {
133 LOG(ERROR, RUNTIME) << "failed to dlsym symbol: zrhung_get_config";
134 dlclose(libimonitor_dl_handler_.GetNativeHandle());
135 return -1;
136 }
137 zrhung_get_config_ = reinterpret_cast<ZRHUNG_GET_CONFIG>(zrhung_get_config_dlsym.Value());
138 return 0;
139 }
140
InitInternal(bool is_systemserver)141 void GcHung::InitInternal(bool is_systemserver)
142 {
143 pid_ = getpid();
144 last_gc_time_ns_ = 0;
145 congestion_duration_ns_ = 0;
146 water_mark_ = 0;
147 report_count_ = 0;
148 start_time_ns_ = 0;
149 is_systemserver_ = is_systemserver;
150 #ifdef HUNG_SYSTEM_SERVER_ONLY
151 ready_ = is_systemserver; // if is_systemserver == false, hung will be close and no way to open again
152 #else
153 ready_ = true;
154 #endif // HUNG_SYSTEM_SERVER_ONLY
155
156 LOG(DEBUG, GC) << "GcHung InitInternal: pid=" << pid_ << " enabled_=" << enabled_;
157 }
158
SendZerohungEvent(const PandaString & error,int pid,PandaString msg)159 void GcHung::SendZerohungEvent(const PandaString &error, int pid, PandaString msg)
160 {
161 msg = ">>>*******************" + error + "******************\n" + msg;
162 if ((zrhung_send_event_ == nullptr) || (zrhung_get_config_ == nullptr)) {
163 LOG(ERROR, GC) << "GcHung: zrhung functions not defined";
164 return;
165 }
166 if (pid > 0) {
167 PandaString command = "P=" + ToPandaString(pid);
168 zrhung_send_event_(ZRHUNG_WP_GC, command.c_str(), msg.c_str());
169 } else {
170 zrhung_send_event_(ZRHUNG_WP_GC, nullptr, msg.c_str());
171 }
172 }
173
174 // check threads suspend while get "Locks::mutator_lock->WriteLock()", and report to hung
CheckSuspend(const PandaList<MTManagedThread * > & threads,uint64_t start_time)175 void GcHung::CheckSuspend(const PandaList<MTManagedThread *> &threads, uint64_t start_time)
176 {
177 LOG(DEBUG, GC) << "GcHung: check suspend timeout";
178 PandaOStringStream oss;
179 for (const auto &thread : threads) {
180 if (thread == nullptr) {
181 continue;
182 }
183 if (!thread->IsSuspended()) {
184 auto tid = thread->GetId();
185
186 oss << "GcHung: Timed out waiting for thread " << tid << " to suspend, waited for "
187 << helpers::TimeConverter(time::GetCurrentTimeInNanos() - start_time) << std::endl;
188 }
189 }
190 LOG(ERROR, GC) << oss.str();
191
192 if (config_ready_ && enabled_) {
193 SendZerohungEvent("SuspendAll timed out", getpid(), oss.str());
194 }
195 }
196
CheckFrequency()197 void GcHung::CheckFrequency()
198 {
199 LOG(DEBUG, GC) << "GcHung: gc frequency check: PID = " << pid_
200 << " last_gc_time_ns_=" << helpers::TimeConverter(last_gc_time_ns_)
201 << " current_time=" << helpers::TimeConverter(time::GetCurrentTimeInNanos());
202
203 if (last_gc_time_ns_ == 0) {
204 last_gc_time_ns_ = time::GetCurrentTimeInNanos();
205 return;
206 }
207
208 using RESULT_DURATION = std::chrono::duration<uint64_t, std::deca>;
209 std::chrono::microseconds msec(interval_limit_ms_);
210 if ((start_time_ns_ - last_gc_time_ns_) < std::chrono::duration_cast<RESULT_DURATION>(msec).count()) {
211 water_mark_++;
212 congestion_duration_ns_ += (time::GetCurrentTimeInNanos() - last_gc_time_ns_);
213 LOG(DEBUG, GC) << "GcHung: proc " << pid_ << " water_mark_:" << water_mark_
214 << " duration:" << helpers::TimeConverter(time::GetCurrentTimeInNanos() - last_gc_time_ns_);
215 } else {
216 water_mark_ = 0;
217 congestion_duration_ns_ = 0;
218 }
219
220 if (water_mark_ > water_mark_limit_) {
221 PandaOStringStream oss;
222 oss << "GcHung: GC congestion PID:" << pid_ << " Freq:" << water_mark_ << "/"
223 << helpers::TimeConverter(congestion_duration_ns_);
224
225 LOG(ERROR, GC) << oss.str();
226 if (config_ready_ && enabled_) {
227 SendZerohungEvent("GC congestion", -1, oss.str()); // -1: invalid pid
228 }
229 water_mark_ = 0;
230 congestion_duration_ns_ = 0;
231 }
232 last_gc_time_ns_ = time::GetCurrentTimeInNanos();
233 }
234
CheckOvertime(const GCTask & task)235 void GcHung::CheckOvertime(const GCTask &task)
236 {
237 uint64_t gcTime = time::GetCurrentTimeInNanos() - start_time_ns_;
238 LOG(DEBUG, GC) << "GcHung: gc overtime check: start_time_ns_=" << helpers::TimeConverter(start_time_ns_)
239 << " current_time=" << helpers::TimeConverter(time::GetCurrentTimeInNanos())
240 << " total_time=" << helpers::TimeConverter(gcTime);
241 using RESULT_DURATION = std::chrono::duration<uint64_t, std::deca>;
242 std::chrono::microseconds msec(over_time_limit_ms_);
243 if (gcTime > std::chrono::duration_cast<RESULT_DURATION>(msec).count()) {
244 PandaOStringStream oss;
245 oss << "GcHung: GC overtime: total:" << helpers::TimeConverter(gcTime) << " cause: " << task.reason_;
246 LOG(ERROR, GC) << oss.str();
247 if (config_ready_ && enabled_) {
248 SendZerohungEvent("GC overtime", -1, oss.str()); // -1: invalid pid
249 }
250 }
251 }
252
UpdateStartTime()253 void GcHung::UpdateStartTime()
254 {
255 start_time_ns_ = time::GetCurrentTimeInNanos();
256 }
257
Start()258 void GcHung::Start()
259 {
260 if (instance_ == nullptr) {
261 return;
262 }
263 if (!instance_->IsEnabled() || !instance_->IsReady()) {
264 return;
265 }
266 instance_->UpdateStartTime();
267 }
268
Check(const GCTask & task)269 void GcHung::Check(const GCTask &task)
270 {
271 if (instance_ == nullptr) {
272 LOG(WARNING, GC) << "GcHung not initiated yet";
273 return;
274 }
275 if (!instance_->IsEnabled() || !instance_->IsReady()) {
276 return;
277 }
278 if (task.reason_ != GCTaskCause::NATIVE_ALLOC_CAUSE) {
279 instance_->CheckFrequency();
280 }
281 instance_->CheckOvertime(task);
282 }
283
284 // NOLINTNEXTLINE(google-runtime-references)
Check(const PandaList<MTManagedThread * > & threads,uint64_t start_time)285 void GcHung::Check(const PandaList<MTManagedThread *> &threads, uint64_t start_time)
286 {
287 if (instance_ != nullptr) {
288 instance_->CheckSuspend(threads, start_time);
289 } else {
290 LOG(INFO, GC) << "GcHung not initiated yet, skip checking";
291 }
292 }
293
UpdateConfig()294 bool GcHung::UpdateConfig()
295 {
296 if (instance_ == nullptr) {
297 LOG(ERROR, GC) << "GcHung Update Config failed, GcHung not initiated yet";
298 return false;
299 }
300 if (!instance_->IsReady()) {
301 LOG(ERROR, GC) << "GcHung Update Config failed, hung not ready";
302 return false;
303 }
304 if (instance_->GetConfig() != 0) {
305 LOG(ERROR, GC) << "GcHung Update Config failed, GetConfig again failed";
306 return false;
307 }
308 LOG(ERROR, GC) << "GcHung Update Config success";
309 return true;
310 }
311
InitPreFork(bool enabled)312 void GcHung::InitPreFork(bool enabled)
313 {
314 LOG(DEBUG, GC) << "GcHung: InitPreFork";
315 if (instance_ == nullptr) {
316 instance_ = new GcHung();
317 }
318 instance_->SetEnabled(enabled);
319 if (enabled) {
320 instance_->LoadLibimonitor();
321 instance_->GetConfig();
322 }
323 }
324
InitPostFork(bool is_systemserver)325 void GcHung::InitPostFork(bool is_systemserver)
326 {
327 LOG(DEBUG, GC) << "GcHung: InitPostFork";
328 if (instance_ != nullptr) {
329 instance_->InitInternal(is_systemserver);
330 }
331 }
332
Current()333 GcHung *GcHung::Current()
334 {
335 return instance_;
336 }
337
338 } // namespace panda::mem