• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021-2022 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "gc_hung.h"
17 
18 #include <unistd.h>
19 #include <cstdlib>
20 #include <dlfcn.h>
21 #include <ostream>
22 #include <sstream>
23 #include <sys/types.h>
24 #include <ctime>
25 #include <cinttypes>
26 #include <cstdio>
27 #include <csignal>
28 #include <sys/time.h>
29 #include "libpandabase/utils/logger.h"
30 #include "libpandabase/utils/type_converter.h"
31 
32 namespace panda::mem {
33 
34 /*
35  * If the macro HUNG_SYSTEM_SERVER_ONLY was defined, the module only
36  * supervise system_server. Otherwise, all the processes running in ARK
37  */
38 using std::string;
39 
40 GcHung *GcHung::instance_ = nullptr;
41 
42 // NOLINT(google-runtime-references)
Split(const PandaString & str,char delim,PandaVector<PandaString> * elems,bool skipEmpty=true)43 static void Split(const PandaString &str, char delim, PandaVector<PandaString> *elems, bool skipEmpty = true)
44 {
45     PandaIStringStream iss(str);
46     for (PandaString item; getline(iss, item, delim);) {
47         if (!(skipEmpty && item.empty())) {
48             elems->push_back(item);
49         }
50     }
51 }
52 
GcHung()53 GcHung::GcHung()
54     : interval_limit_ms_(INTERVAL_LIMIT_MS_INIT),
55       over_time_limit_ms_(OVER_TIME_LIMIT_INIT_MS),
56       water_mark_limit_(WATER_MARK_LIMIT)
57 {
58     LOG(DEBUG, GC) << "GcHung: Instance created";
59 }
60 
~GcHung()61 GcHung::~GcHung()
62 {
63     if (libimonitor_dl_handler_.GetNativeHandle() != nullptr) {
64         dlclose(libimonitor_dl_handler_.GetNativeHandle());
65     }
66     LOG(DEBUG, GC) << "GcHung: Instance deleted: water_mark: " << water_mark_;
67 }
68 
GetConfig()69 int GcHung::GetConfig()
70 {
71     const unsigned int maxParaLen = 100;
72     // get parameter from zrhung
73     if (zrhung_get_config_ == nullptr) {
74         return -1;
75     }
76 
77     char paraBuf[maxParaLen] = {0};  // NOLINT(modernize-avoid-c-arrays)
78     if (zrhung_get_config_(ZRHUNG_WP_GC, paraBuf, maxParaLen) != 0) {
79         LOG(DEBUG, GC) << "GcHung: failed to get config";
80         return -1;
81     }
82 
83     PandaString paraStr(paraBuf);
84     PandaVector<PandaString> paraVec;
85     Split(paraStr, ',', &paraVec);
86 
87     if (paraVec.size() != GC_PARA_COUNT) {
88         LOG(ERROR, GC) << "GcHung: parse parameters failed";
89         return -1;
90     }
91 
92 #ifdef HUNG_SYSTEM_SERVER_ONLY
93     enabled_ = (stoi(PandaStringToStd(paraVec[GC_PARA_ENABLE])) == 1) && is_systemserver_;
94 #else
95     enabled_ = (stoi(PandaStringToStd(paraVec[GC_PARA_ENABLE])) == 1);
96 #endif  // HUNG_SYSTEM_SERVER_ONLY
97     LOG(INFO, GC) << "GcHung: module enable:" << enabled_;
98 
99     interval_limit_ms_ = static_cast<uint64_t>(std::stoi(PandaStringToStd(paraVec[GC_PARA_INTERVAL])));
100     water_mark_limit_ = std::stoi(PandaStringToStd(paraVec[GC_PARA_WATERMARK]));
101     over_time_limit_ms_ = static_cast<uint64_t>(std::stoi(PandaStringToStd(paraVec[GC_PARA_OVERTIME])));
102     LOG(DEBUG, GC) << "GcHung: set parameter: interval_limit_ms_ = " << interval_limit_ms_ << "ms";
103     LOG(DEBUG, GC) << "GcHung: set parameter: water_mark_limit_ = " << water_mark_limit_;
104     LOG(DEBUG, GC) << "GcHung: set parameter: over_time_limit_ms_ = " << over_time_limit_ms_ << "ms";
105     config_ready_ = true;
106 
107     return 0;
108 }
109 
LoadLibimonitor()110 int GcHung::LoadLibimonitor()
111 {
112     if ((zrhung_send_event_ != nullptr) && (zrhung_get_config_ != nullptr)) {
113         return 1;
114     }
115     LOG(DEBUG, GC) << "GcHung: load libimonitor";
116     auto res_load = os::library_loader::Load(LIB_IMONITOR);
117     if (!res_load) {
118         LOG(ERROR, RUNTIME) << "failed to load " << LIB_IMONITOR << " Error: " << res_load.Error().ToString();
119         return -1;
120     }
121     libimonitor_dl_handler_ = std::move(res_load.Value());
122 
123     auto zrhung_send_event_dlsym = os::library_loader::ResolveSymbol(libimonitor_dl_handler_, "zrhung_send_event");
124     if (!zrhung_send_event_dlsym) {
125         LOG(ERROR, RUNTIME) << "failed to dlsym symbol: zrhung_send_event";
126         dlclose(libimonitor_dl_handler_.GetNativeHandle());
127         return -1;
128     }
129     zrhung_send_event_ = reinterpret_cast<ZRHUNG_SEND_EVENT>(zrhung_send_event_dlsym.Value());
130 
131     auto zrhung_get_config_dlsym = os::library_loader::ResolveSymbol(libimonitor_dl_handler_, "zrhung_get_config");
132     if (!zrhung_get_config_dlsym) {
133         LOG(ERROR, RUNTIME) << "failed to dlsym symbol: zrhung_get_config";
134         dlclose(libimonitor_dl_handler_.GetNativeHandle());
135         return -1;
136     }
137     zrhung_get_config_ = reinterpret_cast<ZRHUNG_GET_CONFIG>(zrhung_get_config_dlsym.Value());
138     return 0;
139 }
140 
InitInternal(bool is_systemserver)141 void GcHung::InitInternal(bool is_systemserver)
142 {
143     pid_ = getpid();
144     last_gc_time_ns_ = 0;
145     congestion_duration_ns_ = 0;
146     water_mark_ = 0;
147     report_count_ = 0;
148     start_time_ns_ = 0;
149     is_systemserver_ = is_systemserver;
150 #ifdef HUNG_SYSTEM_SERVER_ONLY
151     ready_ = is_systemserver;  // if is_systemserver == false, hung will be close and no way to open again
152 #else
153     ready_ = true;
154 #endif  // HUNG_SYSTEM_SERVER_ONLY
155 
156     LOG(DEBUG, GC) << "GcHung InitInternal: pid=" << pid_ << " enabled_=" << enabled_;
157 }
158 
SendZerohungEvent(const PandaString & error,int pid,PandaString msg)159 void GcHung::SendZerohungEvent(const PandaString &error, int pid, PandaString msg)
160 {
161     msg = ">>>*******************" + error + "******************\n" + msg;
162     if ((zrhung_send_event_ == nullptr) || (zrhung_get_config_ == nullptr)) {
163         LOG(ERROR, GC) << "GcHung: zrhung functions not defined";
164         return;
165     }
166     if (pid > 0) {
167         PandaString command = "P=" + ToPandaString(pid);
168         zrhung_send_event_(ZRHUNG_WP_GC, command.c_str(), msg.c_str());
169     } else {
170         zrhung_send_event_(ZRHUNG_WP_GC, nullptr, msg.c_str());
171     }
172 }
173 
174 // check threads suspend while get "Locks::mutator_lock->WriteLock()", and report to hung
CheckSuspend(const PandaList<MTManagedThread * > & threads,uint64_t start_time)175 void GcHung::CheckSuspend(const PandaList<MTManagedThread *> &threads, uint64_t start_time)
176 {
177     LOG(DEBUG, GC) << "GcHung: check suspend timeout";
178     PandaOStringStream oss;
179     for (const auto &thread : threads) {
180         if (thread == nullptr) {
181             continue;
182         }
183         if (!thread->IsSuspended()) {
184             auto tid = thread->GetId();
185 
186             oss << "GcHung: Timed out waiting for thread " << tid << " to suspend, waited for "
187                 << helpers::TimeConverter(time::GetCurrentTimeInNanos() - start_time) << std::endl;
188         }
189     }
190     LOG(ERROR, GC) << oss.str();
191 
192     if (config_ready_ && enabled_) {
193         SendZerohungEvent("SuspendAll timed out", getpid(), oss.str());
194     }
195 }
196 
CheckFrequency()197 void GcHung::CheckFrequency()
198 {
199     LOG(DEBUG, GC) << "GcHung: gc frequency check: PID = " << pid_
200                    << " last_gc_time_ns_=" << helpers::TimeConverter(last_gc_time_ns_)
201                    << " current_time=" << helpers::TimeConverter(time::GetCurrentTimeInNanos());
202 
203     if (last_gc_time_ns_ == 0) {
204         last_gc_time_ns_ = time::GetCurrentTimeInNanos();
205         return;
206     }
207 
208     using RESULT_DURATION = std::chrono::duration<uint64_t, std::deca>;
209     std::chrono::microseconds msec(interval_limit_ms_);
210     if ((start_time_ns_ - last_gc_time_ns_) < std::chrono::duration_cast<RESULT_DURATION>(msec).count()) {
211         water_mark_++;
212         congestion_duration_ns_ += (time::GetCurrentTimeInNanos() - last_gc_time_ns_);
213         LOG(DEBUG, GC) << "GcHung: proc " << pid_ << " water_mark_:" << water_mark_
214                        << " duration:" << helpers::TimeConverter(time::GetCurrentTimeInNanos() - last_gc_time_ns_);
215     } else {
216         water_mark_ = 0;
217         congestion_duration_ns_ = 0;
218     }
219 
220     if (water_mark_ > water_mark_limit_) {
221         PandaOStringStream oss;
222         oss << "GcHung: GC congestion PID:" << pid_ << " Freq:" << water_mark_ << "/"
223             << helpers::TimeConverter(congestion_duration_ns_);
224 
225         LOG(ERROR, GC) << oss.str();
226         if (config_ready_ && enabled_) {
227             SendZerohungEvent("GC congestion", -1, oss.str());  // -1: invalid pid
228         }
229         water_mark_ = 0;
230         congestion_duration_ns_ = 0;
231     }
232     last_gc_time_ns_ = time::GetCurrentTimeInNanos();
233 }
234 
CheckOvertime(const GCTask & task)235 void GcHung::CheckOvertime(const GCTask &task)
236 {
237     uint64_t gcTime = time::GetCurrentTimeInNanos() - start_time_ns_;
238     LOG(DEBUG, GC) << "GcHung: gc overtime check: start_time_ns_=" << helpers::TimeConverter(start_time_ns_)
239                    << " current_time=" << helpers::TimeConverter(time::GetCurrentTimeInNanos())
240                    << " total_time=" << helpers::TimeConverter(gcTime);
241     using RESULT_DURATION = std::chrono::duration<uint64_t, std::deca>;
242     std::chrono::microseconds msec(over_time_limit_ms_);
243     if (gcTime > std::chrono::duration_cast<RESULT_DURATION>(msec).count()) {
244         PandaOStringStream oss;
245         oss << "GcHung: GC overtime: total:" << helpers::TimeConverter(gcTime) << " cause: " << task.reason_;
246         LOG(ERROR, GC) << oss.str();
247         if (config_ready_ && enabled_) {
248             SendZerohungEvent("GC overtime", -1, oss.str());  // -1: invalid pid
249         }
250     }
251 }
252 
UpdateStartTime()253 void GcHung::UpdateStartTime()
254 {
255     start_time_ns_ = time::GetCurrentTimeInNanos();
256 }
257 
Start()258 void GcHung::Start()
259 {
260     if (instance_ == nullptr) {
261         return;
262     }
263     if (!instance_->IsEnabled() || !instance_->IsReady()) {
264         return;
265     }
266     instance_->UpdateStartTime();
267 }
268 
Check(const GCTask & task)269 void GcHung::Check(const GCTask &task)
270 {
271     if (instance_ == nullptr) {
272         LOG(WARNING, GC) << "GcHung not initiated yet";
273         return;
274     }
275     if (!instance_->IsEnabled() || !instance_->IsReady()) {
276         return;
277     }
278     if (task.reason_ != GCTaskCause::NATIVE_ALLOC_CAUSE) {
279         instance_->CheckFrequency();
280     }
281     instance_->CheckOvertime(task);
282 }
283 
284 // NOLINTNEXTLINE(google-runtime-references)
Check(const PandaList<MTManagedThread * > & threads,uint64_t start_time)285 void GcHung::Check(const PandaList<MTManagedThread *> &threads, uint64_t start_time)
286 {
287     if (instance_ != nullptr) {
288         instance_->CheckSuspend(threads, start_time);
289     } else {
290         LOG(INFO, GC) << "GcHung not initiated yet, skip checking";
291     }
292 }
293 
UpdateConfig()294 bool GcHung::UpdateConfig()
295 {
296     if (instance_ == nullptr) {
297         LOG(ERROR, GC) << "GcHung Update Config failed, GcHung not initiated yet";
298         return false;
299     }
300     if (!instance_->IsReady()) {
301         LOG(ERROR, GC) << "GcHung Update Config failed, hung not ready";
302         return false;
303     }
304     if (instance_->GetConfig() != 0) {
305         LOG(ERROR, GC) << "GcHung Update Config failed, GetConfig again failed";
306         return false;
307     }
308     LOG(ERROR, GC) << "GcHung Update Config success";
309     return true;
310 }
311 
InitPreFork(bool enabled)312 void GcHung::InitPreFork(bool enabled)
313 {
314     LOG(DEBUG, GC) << "GcHung: InitPreFork";
315     if (instance_ == nullptr) {
316         instance_ = new GcHung();
317     }
318     instance_->SetEnabled(enabled);
319     if (enabled) {
320         instance_->LoadLibimonitor();
321         instance_->GetConfig();
322     }
323 }
324 
InitPostFork(bool is_systemserver)325 void GcHung::InitPostFork(bool is_systemserver)
326 {
327     LOG(DEBUG, GC) << "GcHung: InitPostFork";
328     if (instance_ != nullptr) {
329         instance_->InitInternal(is_systemserver);
330     }
331 }
332 
Current()333 GcHung *GcHung::Current()
334 {
335     return instance_;
336 }
337 
338 }  // namespace panda::mem