1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
#include "host/libs/process_monitor/process_monitor.h"

#ifdef __linux__
#include <sys/prctl.h>
#endif

#include <sys/socket.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>

#include <algorithm>
#include <atomic>
#include <cstdint>
#include <cstdlib>
#include <future>
#include <memory>
#include <mutex>
#include <optional>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include <android-base/file.h>
#include <android-base/logging.h>
#include "android-base/strings.h"

#include "common/libs/transport/channel.h"
#include "common/libs/transport/channel_sharedfd.h"
#include "common/libs/utils/contains.h"
#include "common/libs/utils/result.h"
#include "common/libs/utils/subprocess.h"
#include "host/libs/command_util/util.h"
#include "host/libs/config/known_paths.h"
48
49 namespace cuttlefish {
50 namespace {
51
52 using transport::Channel;
53 using transport::CreateMessage;
54 using transport::ManagedMessage;
55
// Command values the parent process sends to the monitor (child) process over
// the monitor socket. The numeric values are part of the wire protocol between
// the two processes and must not change.
enum ParentToChildMessageType : std::uint8_t {
  // Ask the monitor to stop monitoring and shut down.
  kStop = 1,
  // Ask the monitor to resume the host processes.
  kHostResume = 2,
  // Ask the monitor to suspend the host processes.
  kHostSuspend = 3,
  // Not referenced by the code visible in this file.
  kError = 4,
};
62
// Status values the monitor (child) process reports back to the parent in
// response to a suspend or resume request.
enum ChildToParentResponseType : std::uint8_t {
  // The requested operation completed.
  kSuccess = 0,
  // The requested operation did not complete.
  kFailure = 1,
};
67
SendEmptyRequest(Channel & channel,uint32_t type)68 Result<void> SendEmptyRequest(Channel& channel, uint32_t type) {
69 ManagedMessage message = CF_EXPECT(CreateMessage(type, false, 0));
70 CF_EXPECT(channel.SendRequest(*message));
71 return {};
72 }
73
SendEmptyResponse(Channel & channel,uint32_t type)74 Result<void> SendEmptyResponse(Channel& channel, uint32_t type) {
75 ManagedMessage message = CF_EXPECT(CreateMessage(type, true, 0));
76 CF_EXPECT(channel.SendResponse(*message));
77 return {};
78 }
79
LogSubprocessExit(const std::string & name,pid_t pid,int wstatus)80 void LogSubprocessExit(const std::string& name, pid_t pid, int wstatus) {
81 LOG(INFO) << "Detected unexpected exit of monitored subprocess " << name;
82 if (WIFEXITED(wstatus)) {
83 LOG(INFO) << "Subprocess " << name << " (" << pid
84 << ") has exited with exit code " << WEXITSTATUS(wstatus);
85 } else if (WIFSIGNALED(wstatus)) {
86 int sig_num = WTERMSIG(wstatus);
87 LOG(ERROR) << "Subprocess " << name << " (" << pid
88 << ") was interrupted by a signal '" << strsignal(sig_num)
89 << "' (" << sig_num << ")";
90 } else {
91 LOG(INFO) << "subprocess " << name << " (" << pid
92 << ") has exited for unknown reasons";
93 }
94 }
95
LogSubprocessExit(const std::string & name,const siginfo_t & infop)96 void LogSubprocessExit(const std::string& name, const siginfo_t& infop) {
97 LOG(INFO) << "Detected unexpected exit of monitored subprocess " << name;
98 if (infop.si_code == CLD_EXITED) {
99 LOG(INFO) << "Subprocess " << name << " (" << infop.si_pid
100 << ") has exited with exit code " << infop.si_status;
101 } else if (infop.si_code == CLD_KILLED) {
102 LOG(ERROR) << "Subprocess " << name << " (" << infop.si_pid
103 << ") was interrupted by a signal '"
104 << strsignal(infop.si_status) << "' (" << infop.si_status << ")";
105 } else {
106 LOG(INFO) << "subprocess " << name << " (" << infop.si_pid
107 << ") has exited for unknown reasons (code = " << infop.si_code
108 << ", status = " << infop.si_status << ")";
109 }
110 }
111
MonitorLoop(std::atomic_bool & running,std::mutex & properties_mutex,const bool restart_subprocesses,std::vector<MonitorEntry> & monitored)112 Result<void> MonitorLoop(std::atomic_bool& running,
113 std::mutex& properties_mutex,
114 const bool restart_subprocesses,
115 std::vector<MonitorEntry>& monitored) {
116 while (running.load()) {
117 int wstatus;
118 pid_t pid = wait(&wstatus);
119 int error_num = errno;
120 CF_EXPECT(pid != -1, "Wait failed: " << strerror(error_num));
121 if (!WIFSIGNALED(wstatus) && !WIFEXITED(wstatus)) {
122 LOG(DEBUG) << "Unexpected status from wait: " << wstatus << " for pid "
123 << pid;
124 continue;
125 }
126 if (!running.load()) { // Avoid extra restarts near the end
127 break;
128 }
129 auto matches = [pid](const auto& it) { return it.proc->pid() == pid; };
130 std::unique_lock lock(properties_mutex);
131 auto it = std::find_if(monitored.begin(), monitored.end(), matches);
132 if (it == monitored.end()) {
133 LogSubprocessExit("(unknown)", pid, wstatus);
134 } else {
135 LogSubprocessExit(it->cmd->GetShortName(), it->proc->pid(), wstatus);
136 if (restart_subprocesses) {
137 auto options = SubprocessOptions().InGroup(true);
138 // in the future, cmd->Start might not run exec()
139 it->proc.reset(new Subprocess(it->cmd->Start(std::move(options))));
140 } else {
141 bool is_critical = it->is_critical;
142 monitored.erase(it);
143 if (running.load() && is_critical) {
144 LOG(ERROR) << "Stopping all monitored processes due to unexpected "
145 "exit of critical process";
146 running.store(false);
147 break;
148 }
149 }
150 }
151 }
152 return {};
153 }
154
StopSubprocesses(std::vector<MonitorEntry> & monitored)155 Result<void> StopSubprocesses(std::vector<MonitorEntry>& monitored) {
156 LOG(DEBUG) << "Stopping monitored subprocesses";
157 auto stop = [](const auto& it) {
158 auto stop_result = it.proc->Stop();
159 if (stop_result == StopperResult::kStopFailure) {
160 LOG(WARNING) << "Error in stopping \"" << it.cmd->GetShortName() << "\"";
161 return false;
162 }
163 siginfo_t infop;
164 auto success = it.proc->Wait(&infop, WEXITED);
165 if (success < 0) {
166 LOG(WARNING) << "Failed to wait for process " << it.cmd->GetShortName();
167 return false;
168 }
169 if (stop_result == StopperResult::kStopCrash) {
170 LogSubprocessExit(it.cmd->GetShortName(), infop);
171 }
172 return true;
173 };
174 // Processes were started in the order they appear in the vector, stop them in
175 // reverse order for symmetry.
176 size_t stopped = std::count_if(monitored.rbegin(), monitored.rend(), stop);
177 CF_EXPECT(stopped == monitored.size(), "Didn't stop all subprocesses");
178 return {};
179 }
180
SuspendResumeImpl(std::vector<MonitorEntry> & monitor_entries,std::mutex & properties_mutex,const SharedFD & channel_to_secure_env,const bool is_suspend,transport::SharedFdChannel & socket)181 Result<void> SuspendResumeImpl(std::vector<MonitorEntry>& monitor_entries,
182 std::mutex& properties_mutex,
183 const SharedFD& channel_to_secure_env,
184 const bool is_suspend,
185 transport::SharedFdChannel& socket) {
186 std::lock_guard lock(properties_mutex);
187 auto secure_env_itr = std::find_if(
188 monitor_entries.begin(), monitor_entries.end(), [](MonitorEntry& entry) {
189 auto prog_name = android::base::Basename(entry.cmd->Executable());
190 return (prog_name == "secure_env");
191 });
192 if (secure_env_itr != monitor_entries.end()) {
193 CF_EXPECT(channel_to_secure_env->IsOpen(),
194 "channel to secure_env is not open.");
195 run_cvd::ExtendedLauncherAction extended_action;
196 if (is_suspend) {
197 extended_action.mutable_suspend();
198 } else {
199 extended_action.mutable_resume();
200 }
201 CF_EXPECT(RunLauncherAction(channel_to_secure_env, extended_action,
202 std::nullopt));
203 }
204
205 for (const auto& entry : monitor_entries) {
206 if (!entry.cmd) {
207 LOG(ERROR) << "Monitor Entry has a nullptr for cmd.";
208 continue;
209 }
210 if (!entry.proc) {
211 LOG(ERROR) << "Monitor Entry has a nullptr for proc.";
212 continue;
213 }
214 auto prog_name = android::base::Basename(entry.cmd->Executable());
215 auto process_restart_bin =
216 android::base::Basename(ProcessRestarterBinary());
217 if (prog_name == "log_tee") {
218 // Don't stop log_tee, we want to continue processing logs while
219 // suspended.
220 continue;
221 }
222 if (prog_name == "wmediumd") {
223 // wmediumd should be running while openWRT is saved using the
224 // guest snapshot logic
225 continue;
226 }
227 if (prog_name == "secure_env") {
228 // secure_env was handled above in a customized way
229 continue;
230 }
231 if (android::base::StartsWith(prog_name, "cf_vhost_user_")) {
232 // vhost user backend processes need to continue handling requests from
233 // the VMM, which should send them the suspend signal.
234 continue;
235 }
236
237 if (process_restart_bin == prog_name) {
238 if (is_suspend) {
239 CF_EXPECT(entry.proc->SendSignal(SIGTSTP));
240 } else {
241 CF_EXPECT(entry.proc->SendSignal(SIGCONT));
242 }
243 continue;
244 }
245 if (is_suspend) {
246 CF_EXPECT(entry.proc->SendSignalToGroup(SIGTSTP));
247 } else {
248 CF_EXPECT(entry.proc->SendSignalToGroup(SIGCONT));
249 }
250 }
251 CF_EXPECT(SendEmptyResponse(socket, ChildToParentResponseType::kSuccess));
252 return {};
253 }
254
255 } // namespace
256
StartSubprocesses(ProcessMonitor::Properties & properties)257 Result<void> ProcessMonitor::StartSubprocesses(
258 ProcessMonitor::Properties& properties) {
259 LOG(DEBUG) << "Starting monitored subprocesses";
260 for (auto& monitored : properties.entries_) {
261 LOG(INFO) << monitored.cmd->GetShortName();
262 auto options = SubprocessOptions().InGroup(true);
263 std::string short_name = monitored.cmd->GetShortName();
264 auto last_slash = short_name.find_last_of('/');
265 if (last_slash != std::string::npos) {
266 short_name = short_name.substr(last_slash + 1);
267 }
268 if (Contains(properties_.strace_commands_, short_name)) {
269 options.Strace(properties.strace_log_dir_ + "/strace-" + short_name);
270 }
271 monitored.proc.reset(
272 new Subprocess(monitored.cmd->Start(std::move(options))));
273 CF_EXPECT(monitored.proc->Started(), "Failed to start subprocess");
274 }
275 return {};
276 }
277
ReadMonitorSocketLoop(std::atomic_bool & running)278 Result<void> ProcessMonitor::ReadMonitorSocketLoop(std::atomic_bool& running) {
279 LOG(DEBUG) << "Waiting for a `stop` message from the parent";
280 while (running.load()) {
281 ManagedMessage message = CF_EXPECT(child_channel_->ReceiveMessage());
282 if (message->command == ParentToChildMessageType::kStop) {
283 running.store(false);
284 // Wake up the wait() loop by giving it an exited child process
285 if (fork() == 0) {
286 std::exit(0);
287 }
288 // will break the for-loop as running is now false
289 continue;
290 }
291 if (message->command == ParentToChildMessageType::kHostSuspend) {
292 CF_EXPECT(SuspendHostProcessesImpl());
293 continue;
294 }
295 if (message->command == ParentToChildMessageType::kHostResume) {
296 CF_EXPECT(ResumeHostProcessesImpl());
297 continue;
298 }
299 }
300 return {};
301 }
302
SuspendHostProcessesImpl()303 Result<void> ProcessMonitor::SuspendHostProcessesImpl() {
304 CF_EXPECT(child_channel_.has_value());
305 CF_EXPECT(SuspendResumeImpl(properties_.entries_, properties_mutex_,
306 channel_to_secure_env_, /* is_suspend */ true,
307 *child_channel_),
308 "Failed suspend");
309 return {};
310 }
311
ResumeHostProcessesImpl()312 Result<void> ProcessMonitor::ResumeHostProcessesImpl() {
313 CF_EXPECT(child_channel_.has_value());
314 CF_EXPECT(SuspendResumeImpl(properties_.entries_, properties_mutex_,
315 channel_to_secure_env_, /* is_suspend */ false,
316 *child_channel_),
317 "Failed resume");
318 return {};
319 }
320
RestartSubprocesses(bool r)321 ProcessMonitor::Properties& ProcessMonitor::Properties::RestartSubprocesses(
322 bool r) & {
323 restart_subprocesses_ = r;
324 return *this;
325 }
326
AddCommand(MonitorCommand cmd)327 ProcessMonitor::Properties& ProcessMonitor::Properties::AddCommand(
328 MonitorCommand cmd) & {
329 entries_.emplace_back(std::move(cmd.command), cmd.is_critical);
330 return *this;
331 }
332
StraceCommands(std::set<std::string> strace)333 ProcessMonitor::Properties& ProcessMonitor::Properties::StraceCommands(
334 std::set<std::string> strace) & {
335 strace_commands_ = std::move(strace);
336 return *this;
337 }
338
StraceLogDir(std::string log_dir)339 ProcessMonitor::Properties& ProcessMonitor::Properties::StraceLogDir(
340 std::string log_dir) & {
341 strace_log_dir_ = std::move(log_dir);
342 return *this;
343 }
344
ProcessMonitor(ProcessMonitor::Properties && properties,const SharedFD & secure_env_fd)345 ProcessMonitor::ProcessMonitor(ProcessMonitor::Properties&& properties,
346 const SharedFD& secure_env_fd)
347 : properties_(std::move(properties)),
348 channel_to_secure_env_(secure_env_fd),
349 monitor_(-1) {}
350
StopMonitoredProcesses()351 Result<void> ProcessMonitor::StopMonitoredProcesses() {
352 CF_EXPECT(monitor_ != -1, "The monitor process has already exited.");
353 CF_EXPECT(parent_channel_.has_value(),
354 "The monitor socket is already closed");
355 CF_EXPECT(
356 SendEmptyRequest(*parent_channel_, ParentToChildMessageType::kStop));
357
358 pid_t last_monitor = monitor_;
359 monitor_ = -1;
360 parent_channel_.reset();
361 int wstatus;
362 CF_EXPECT(waitpid(last_monitor, &wstatus, 0) == last_monitor,
363 "Failed to wait for monitor process");
364 CF_EXPECT(!WIFSIGNALED(wstatus), "Monitor process exited due to a signal");
365 CF_EXPECT(WIFEXITED(wstatus), "Monitor process exited for unknown reasons");
366 CF_EXPECT(WEXITSTATUS(wstatus) == 0,
367 "Monitor process exited with code " << WEXITSTATUS(wstatus));
368 return {};
369 }
370
SuspendMonitoredProcesses()371 Result<void> ProcessMonitor::SuspendMonitoredProcesses() {
372 CF_EXPECT(monitor_ != -1, "The monitor process has already exited.");
373 CF_EXPECT(parent_channel_.has_value());
374 CF_EXPECT(SendEmptyRequest(*parent_channel_,
375 ParentToChildMessageType::kHostSuspend));
376
377 ManagedMessage response = CF_EXPECT(parent_channel_->ReceiveMessage());
378 CF_EXPECT(response->command == ChildToParentResponseType::kSuccess,
379 "On kHostSuspend, the child run_cvd returned kFailure.");
380 return {};
381 }
382
ResumeMonitoredProcesses()383 Result<void> ProcessMonitor::ResumeMonitoredProcesses() {
384 CF_EXPECT(monitor_ != -1, "The monitor process has already exited.");
385 CF_EXPECT(parent_channel_.has_value());
386 CF_EXPECT(SendEmptyRequest(*parent_channel_,
387 ParentToChildMessageType::kHostResume));
388
389 ManagedMessage response = CF_EXPECT(parent_channel_->ReceiveMessage());
390 CF_EXPECT(response->command == ChildToParentResponseType::kSuccess,
391 "On kHostResume, the child run_cvd returned kFailure.");
392 return {};
393 }
394
StartAndMonitorProcesses()395 Result<void> ProcessMonitor::StartAndMonitorProcesses() {
396 CF_EXPECT(monitor_ == -1, "The monitor process was already started");
397 CF_EXPECT(!parent_channel_.has_value(),
398 "Parent monitor socket was already opened");
399 SharedFD parent_sock;
400 SharedFD child_sock;
401 SharedFD::SocketPair(AF_UNIX, SOCK_STREAM, 0, &parent_sock, &child_sock);
402 monitor_ = fork();
403 if (monitor_ == 0) {
404 child_channel_ = transport::SharedFdChannel(child_sock, child_sock);
405 Result<void> monitor_result = MonitorRoutine();
406 if (!monitor_result.ok()) {
407 LOG(ERROR) << "Monitoring processes failed:\n"
408 << monitor_result.error().FormatForEnv();
409 }
410 std::exit(monitor_result.ok() ? 0 : 1);
411 } else {
412 parent_channel_ = transport::SharedFdChannel(parent_sock, parent_sock);
413 return {};
414 }
415 }
416
MonitorRoutine()417 Result<void> ProcessMonitor::MonitorRoutine() {
418 #ifdef __linux__
419 // Make this process a subreaper to reliably catch subprocess exits.
420 // See https://man7.org/linux/man-pages/man2/prctl.2.html
421 prctl(PR_SET_CHILD_SUBREAPER, 1);
422 prctl(PR_SET_PDEATHSIG, SIGHUP); // Die when parent dies
423 #endif
424
425 LOG(DEBUG) << "Monitoring subprocesses";
426 CF_EXPECT(StartSubprocesses(properties_));
427
428 std::atomic_bool running(true);
429
430 auto read_monitor_socket_loop =
431 [this](std::atomic_bool& running) -> Result<void> {
432 CF_EXPECT(this->ReadMonitorSocketLoop(running));
433 return {};
434 };
435 auto parent_comms = std::async(std::launch::async, read_monitor_socket_loop,
436 std::ref(running));
437
438 CF_EXPECT(MonitorLoop(running, properties_mutex_,
439 properties_.restart_subprocesses_,
440 properties_.entries_));
441 CF_EXPECT(parent_comms.get(), "Should have exited if monitoring stopped");
442
443 CF_EXPECT(StopSubprocesses(properties_.entries_));
444 LOG(DEBUG) << "Done monitoring subprocesses";
445 return {};
446 }
447
448 } // namespace cuttlefish
449