• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "host/libs/process_monitor/process_monitor.h"
18 
19 #ifdef __linux__
20 #include <sys/prctl.h>
21 #endif
22 
23 #include <sys/types.h>
24 #include <sys/wait.h>
25 
26 #include <errno.h>
27 #include <signal.h>
28 #include <stdio.h>
29 
30 #include <algorithm>
31 #include <atomic>
32 #include <future>
33 #include <memory>
34 #include <string>
35 #include <vector>
36 
37 #include <android-base/file.h>
38 #include <android-base/logging.h>
39 #include "android-base/strings.h"
40 
41 #include "common/libs/transport/channel.h"
42 #include "common/libs/transport/channel_sharedfd.h"
43 #include "common/libs/utils/contains.h"
44 #include "common/libs/utils/result.h"
45 #include "common/libs/utils/subprocess.h"
46 #include "host/libs/command_util/util.h"
47 #include "host/libs/config/known_paths.h"
48 
49 namespace cuttlefish {
50 namespace {
51 
52 using transport::Channel;
53 using transport::CreateMessage;
54 using transport::ManagedMessage;
55 
// Commands the parent process sends to the forked monitor process over the
// monitor socket. The numeric values travel on the wire as the message
// command, so they must stay stable.
enum ParentToChildMessageType : std::uint8_t {
  // Stop monitoring and shut down the monitored subprocesses.
  kStop = 1,
  // Resume the host processes that were previously suspended.
  kHostResume = 2,
  // Suspend the monitored host processes.
  kHostSuspend = 3,
  // Not referenced anywhere else in this file.
  kError = 4,
};
62 
// Status codes the monitor process sends back to the parent in response to a
// suspend or resume request.
enum ChildToParentResponseType : std::uint8_t {
  // The requested operation completed.
  kSuccess = 0,
  // The requested operation did not complete.
  kFailure = 1,
};
67 
SendEmptyRequest(Channel & channel,uint32_t type)68 Result<void> SendEmptyRequest(Channel& channel, uint32_t type) {
69   ManagedMessage message = CF_EXPECT(CreateMessage(type, false, 0));
70   CF_EXPECT(channel.SendRequest(*message));
71   return {};
72 }
73 
SendEmptyResponse(Channel & channel,uint32_t type)74 Result<void> SendEmptyResponse(Channel& channel, uint32_t type) {
75   ManagedMessage message = CF_EXPECT(CreateMessage(type, true, 0));
76   CF_EXPECT(channel.SendResponse(*message));
77   return {};
78 }
79 
LogSubprocessExit(const std::string & name,pid_t pid,int wstatus)80 void LogSubprocessExit(const std::string& name, pid_t pid, int wstatus) {
81   LOG(INFO) << "Detected unexpected exit of monitored subprocess " << name;
82   if (WIFEXITED(wstatus)) {
83     LOG(INFO) << "Subprocess " << name << " (" << pid
84               << ") has exited with exit code " << WEXITSTATUS(wstatus);
85   } else if (WIFSIGNALED(wstatus)) {
86     int sig_num = WTERMSIG(wstatus);
87     LOG(ERROR) << "Subprocess " << name << " (" << pid
88                << ") was interrupted by a signal '" << strsignal(sig_num)
89                << "' (" << sig_num << ")";
90   } else {
91     LOG(INFO) << "subprocess " << name << " (" << pid
92               << ") has exited for unknown reasons";
93   }
94 }
95 
LogSubprocessExit(const std::string & name,const siginfo_t & infop)96 void LogSubprocessExit(const std::string& name, const siginfo_t& infop) {
97   LOG(INFO) << "Detected unexpected exit of monitored subprocess " << name;
98   if (infop.si_code == CLD_EXITED) {
99     LOG(INFO) << "Subprocess " << name << " (" << infop.si_pid
100               << ") has exited with exit code " << infop.si_status;
101   } else if (infop.si_code == CLD_KILLED) {
102     LOG(ERROR) << "Subprocess " << name << " (" << infop.si_pid
103                << ") was interrupted by a signal '"
104                << strsignal(infop.si_status) << "' (" << infop.si_status << ")";
105   } else {
106     LOG(INFO) << "subprocess " << name << " (" << infop.si_pid
107               << ") has exited for unknown reasons (code = " << infop.si_code
108               << ", status = " << infop.si_status << ")";
109   }
110 }
111 
MonitorLoop(std::atomic_bool & running,std::mutex & properties_mutex,const bool restart_subprocesses,std::vector<MonitorEntry> & monitored)112 Result<void> MonitorLoop(std::atomic_bool& running,
113                          std::mutex& properties_mutex,
114                          const bool restart_subprocesses,
115                          std::vector<MonitorEntry>& monitored) {
116   while (running.load()) {
117     int wstatus;
118     pid_t pid = wait(&wstatus);
119     int error_num = errno;
120     CF_EXPECT(pid != -1, "Wait failed: " << strerror(error_num));
121     if (!WIFSIGNALED(wstatus) && !WIFEXITED(wstatus)) {
122       LOG(DEBUG) << "Unexpected status from wait: " << wstatus << " for pid "
123                  << pid;
124       continue;
125     }
126     if (!running.load()) {  // Avoid extra restarts near the end
127       break;
128     }
129     auto matches = [pid](const auto& it) { return it.proc->pid() == pid; };
130     std::unique_lock lock(properties_mutex);
131     auto it = std::find_if(monitored.begin(), monitored.end(), matches);
132     if (it == monitored.end()) {
133       LogSubprocessExit("(unknown)", pid, wstatus);
134     } else {
135       LogSubprocessExit(it->cmd->GetShortName(), it->proc->pid(), wstatus);
136       if (restart_subprocesses) {
137         auto options = SubprocessOptions().InGroup(true);
138         // in the future, cmd->Start might not run exec()
139         it->proc.reset(new Subprocess(it->cmd->Start(std::move(options))));
140       } else {
141         bool is_critical = it->is_critical;
142         monitored.erase(it);
143         if (running.load() && is_critical) {
144           LOG(ERROR) << "Stopping all monitored processes due to unexpected "
145                         "exit of critical process";
146           running.store(false);
147           break;
148         }
149       }
150     }
151   }
152   return {};
153 }
154 
StopSubprocesses(std::vector<MonitorEntry> & monitored)155 Result<void> StopSubprocesses(std::vector<MonitorEntry>& monitored) {
156   LOG(DEBUG) << "Stopping monitored subprocesses";
157   auto stop = [](const auto& it) {
158     auto stop_result = it.proc->Stop();
159     if (stop_result == StopperResult::kStopFailure) {
160       LOG(WARNING) << "Error in stopping \"" << it.cmd->GetShortName() << "\"";
161       return false;
162     }
163     siginfo_t infop;
164     auto success = it.proc->Wait(&infop, WEXITED);
165     if (success < 0) {
166       LOG(WARNING) << "Failed to wait for process " << it.cmd->GetShortName();
167       return false;
168     }
169     if (stop_result == StopperResult::kStopCrash) {
170       LogSubprocessExit(it.cmd->GetShortName(), infop);
171     }
172     return true;
173   };
174   // Processes were started in the order they appear in the vector, stop them in
175   // reverse order for symmetry.
176   size_t stopped = std::count_if(monitored.rbegin(), monitored.rend(), stop);
177   CF_EXPECT(stopped == monitored.size(), "Didn't stop all subprocesses");
178   return {};
179 }
180 
SuspendResumeImpl(std::vector<MonitorEntry> & monitor_entries,std::mutex & properties_mutex,const SharedFD & channel_to_secure_env,const bool is_suspend,transport::SharedFdChannel & socket)181 Result<void> SuspendResumeImpl(std::vector<MonitorEntry>& monitor_entries,
182                                std::mutex& properties_mutex,
183                                const SharedFD& channel_to_secure_env,
184                                const bool is_suspend,
185                                transport::SharedFdChannel& socket) {
186   std::lock_guard lock(properties_mutex);
187   auto secure_env_itr = std::find_if(
188       monitor_entries.begin(), monitor_entries.end(), [](MonitorEntry& entry) {
189         auto prog_name = android::base::Basename(entry.cmd->Executable());
190         return (prog_name == "secure_env");
191       });
192   if (secure_env_itr != monitor_entries.end()) {
193     CF_EXPECT(channel_to_secure_env->IsOpen(),
194               "channel to secure_env is not open.");
195     run_cvd::ExtendedLauncherAction extended_action;
196     if (is_suspend) {
197       extended_action.mutable_suspend();
198     } else {
199       extended_action.mutable_resume();
200     }
201     CF_EXPECT(RunLauncherAction(channel_to_secure_env, extended_action,
202                                 std::nullopt));
203   }
204 
205   for (const auto& entry : monitor_entries) {
206     if (!entry.cmd) {
207       LOG(ERROR) << "Monitor Entry has a nullptr for cmd.";
208       continue;
209     }
210     if (!entry.proc) {
211       LOG(ERROR) << "Monitor Entry has a nullptr for proc.";
212       continue;
213     }
214     auto prog_name = android::base::Basename(entry.cmd->Executable());
215     auto process_restart_bin =
216         android::base::Basename(ProcessRestarterBinary());
217     if (prog_name == "log_tee") {
218       // Don't stop log_tee, we want to continue processing logs while
219       // suspended.
220       continue;
221     }
222     if (prog_name == "wmediumd") {
223       // wmediumd should be running while openWRT is saved using the
224       // guest snapshot logic
225       continue;
226     }
227     if (prog_name == "secure_env") {
228       // secure_env was handled above in a customized way
229       continue;
230     }
231     if (android::base::StartsWith(prog_name, "cf_vhost_user_")) {
232       // vhost user backend processes need to continue handling requests from
233       // the VMM, which should send them the suspend signal.
234       continue;
235     }
236 
237     if (process_restart_bin == prog_name) {
238       if (is_suspend) {
239         CF_EXPECT(entry.proc->SendSignal(SIGTSTP));
240       } else {
241         CF_EXPECT(entry.proc->SendSignal(SIGCONT));
242       }
243       continue;
244     }
245     if (is_suspend) {
246       CF_EXPECT(entry.proc->SendSignalToGroup(SIGTSTP));
247     } else {
248       CF_EXPECT(entry.proc->SendSignalToGroup(SIGCONT));
249     }
250   }
251   CF_EXPECT(SendEmptyResponse(socket, ChildToParentResponseType::kSuccess));
252   return {};
253 }
254 
255 }  // namespace
256 
StartSubprocesses(ProcessMonitor::Properties & properties)257 Result<void> ProcessMonitor::StartSubprocesses(
258     ProcessMonitor::Properties& properties) {
259   LOG(DEBUG) << "Starting monitored subprocesses";
260   for (auto& monitored : properties.entries_) {
261     LOG(INFO) << monitored.cmd->GetShortName();
262     auto options = SubprocessOptions().InGroup(true);
263     std::string short_name = monitored.cmd->GetShortName();
264     auto last_slash = short_name.find_last_of('/');
265     if (last_slash != std::string::npos) {
266       short_name = short_name.substr(last_slash + 1);
267     }
268     if (Contains(properties_.strace_commands_, short_name)) {
269       options.Strace(properties.strace_log_dir_ + "/strace-" + short_name);
270     }
271     monitored.proc.reset(
272         new Subprocess(monitored.cmd->Start(std::move(options))));
273     CF_EXPECT(monitored.proc->Started(), "Failed to start subprocess");
274   }
275   return {};
276 }
277 
ReadMonitorSocketLoop(std::atomic_bool & running)278 Result<void> ProcessMonitor::ReadMonitorSocketLoop(std::atomic_bool& running) {
279   LOG(DEBUG) << "Waiting for a `stop` message from the parent";
280   while (running.load()) {
281     ManagedMessage message = CF_EXPECT(child_channel_->ReceiveMessage());
282     if (message->command == ParentToChildMessageType::kStop) {
283       running.store(false);
284       // Wake up the wait() loop by giving it an exited child process
285       if (fork() == 0) {
286         std::exit(0);
287       }
288       // will break the for-loop as running is now false
289       continue;
290     }
291     if (message->command == ParentToChildMessageType::kHostSuspend) {
292       CF_EXPECT(SuspendHostProcessesImpl());
293       continue;
294     }
295     if (message->command == ParentToChildMessageType::kHostResume) {
296       CF_EXPECT(ResumeHostProcessesImpl());
297       continue;
298     }
299   }
300   return {};
301 }
302 
SuspendHostProcessesImpl()303 Result<void> ProcessMonitor::SuspendHostProcessesImpl() {
304   CF_EXPECT(child_channel_.has_value());
305   CF_EXPECT(SuspendResumeImpl(properties_.entries_, properties_mutex_,
306                               channel_to_secure_env_, /* is_suspend */ true,
307                               *child_channel_),
308             "Failed suspend");
309   return {};
310 }
311 
ResumeHostProcessesImpl()312 Result<void> ProcessMonitor::ResumeHostProcessesImpl() {
313   CF_EXPECT(child_channel_.has_value());
314   CF_EXPECT(SuspendResumeImpl(properties_.entries_, properties_mutex_,
315                               channel_to_secure_env_, /* is_suspend */ false,
316                               *child_channel_),
317             "Failed resume");
318   return {};
319 }
320 
RestartSubprocesses(bool r)321 ProcessMonitor::Properties& ProcessMonitor::Properties::RestartSubprocesses(
322     bool r) & {
323   restart_subprocesses_ = r;
324   return *this;
325 }
326 
AddCommand(MonitorCommand cmd)327 ProcessMonitor::Properties& ProcessMonitor::Properties::AddCommand(
328     MonitorCommand cmd) & {
329   entries_.emplace_back(std::move(cmd.command), cmd.is_critical);
330   return *this;
331 }
332 
StraceCommands(std::set<std::string> strace)333 ProcessMonitor::Properties& ProcessMonitor::Properties::StraceCommands(
334     std::set<std::string> strace) & {
335   strace_commands_ = std::move(strace);
336   return *this;
337 }
338 
StraceLogDir(std::string log_dir)339 ProcessMonitor::Properties& ProcessMonitor::Properties::StraceLogDir(
340     std::string log_dir) & {
341   strace_log_dir_ = std::move(log_dir);
342   return *this;
343 }
344 
ProcessMonitor(ProcessMonitor::Properties && properties,const SharedFD & secure_env_fd)345 ProcessMonitor::ProcessMonitor(ProcessMonitor::Properties&& properties,
346                                const SharedFD& secure_env_fd)
347     : properties_(std::move(properties)),
348       channel_to_secure_env_(secure_env_fd),
349       monitor_(-1) {}
350 
StopMonitoredProcesses()351 Result<void> ProcessMonitor::StopMonitoredProcesses() {
352   CF_EXPECT(monitor_ != -1, "The monitor process has already exited.");
353   CF_EXPECT(parent_channel_.has_value(),
354             "The monitor socket is already closed");
355   CF_EXPECT(
356       SendEmptyRequest(*parent_channel_, ParentToChildMessageType::kStop));
357 
358   pid_t last_monitor = monitor_;
359   monitor_ = -1;
360   parent_channel_.reset();
361   int wstatus;
362   CF_EXPECT(waitpid(last_monitor, &wstatus, 0) == last_monitor,
363             "Failed to wait for monitor process");
364   CF_EXPECT(!WIFSIGNALED(wstatus), "Monitor process exited due to a signal");
365   CF_EXPECT(WIFEXITED(wstatus), "Monitor process exited for unknown reasons");
366   CF_EXPECT(WEXITSTATUS(wstatus) == 0,
367             "Monitor process exited with code " << WEXITSTATUS(wstatus));
368   return {};
369 }
370 
SuspendMonitoredProcesses()371 Result<void> ProcessMonitor::SuspendMonitoredProcesses() {
372   CF_EXPECT(monitor_ != -1, "The monitor process has already exited.");
373   CF_EXPECT(parent_channel_.has_value());
374   CF_EXPECT(SendEmptyRequest(*parent_channel_,
375                              ParentToChildMessageType::kHostSuspend));
376 
377   ManagedMessage response = CF_EXPECT(parent_channel_->ReceiveMessage());
378   CF_EXPECT(response->command == ChildToParentResponseType::kSuccess,
379             "On kHostSuspend, the child run_cvd returned kFailure.");
380   return {};
381 }
382 
ResumeMonitoredProcesses()383 Result<void> ProcessMonitor::ResumeMonitoredProcesses() {
384   CF_EXPECT(monitor_ != -1, "The monitor process has already exited.");
385   CF_EXPECT(parent_channel_.has_value());
386   CF_EXPECT(SendEmptyRequest(*parent_channel_,
387                              ParentToChildMessageType::kHostResume));
388 
389   ManagedMessage response = CF_EXPECT(parent_channel_->ReceiveMessage());
390   CF_EXPECT(response->command == ChildToParentResponseType::kSuccess,
391             "On kHostResume, the child run_cvd returned kFailure.");
392   return {};
393 }
394 
StartAndMonitorProcesses()395 Result<void> ProcessMonitor::StartAndMonitorProcesses() {
396   CF_EXPECT(monitor_ == -1, "The monitor process was already started");
397   CF_EXPECT(!parent_channel_.has_value(),
398             "Parent monitor socket was already opened");
399   SharedFD parent_sock;
400   SharedFD child_sock;
401   SharedFD::SocketPair(AF_UNIX, SOCK_STREAM, 0, &parent_sock, &child_sock);
402   monitor_ = fork();
403   if (monitor_ == 0) {
404     child_channel_ = transport::SharedFdChannel(child_sock, child_sock);
405     Result<void> monitor_result = MonitorRoutine();
406     if (!monitor_result.ok()) {
407       LOG(ERROR) << "Monitoring processes failed:\n"
408                  << monitor_result.error().FormatForEnv();
409     }
410     std::exit(monitor_result.ok() ? 0 : 1);
411   } else {
412     parent_channel_ = transport::SharedFdChannel(parent_sock, parent_sock);
413     return {};
414   }
415 }
416 
MonitorRoutine()417 Result<void> ProcessMonitor::MonitorRoutine() {
418 #ifdef __linux__
419   // Make this process a subreaper to reliably catch subprocess exits.
420   // See https://man7.org/linux/man-pages/man2/prctl.2.html
421   prctl(PR_SET_CHILD_SUBREAPER, 1);
422   prctl(PR_SET_PDEATHSIG, SIGHUP);  // Die when parent dies
423 #endif
424 
425   LOG(DEBUG) << "Monitoring subprocesses";
426   CF_EXPECT(StartSubprocesses(properties_));
427 
428   std::atomic_bool running(true);
429 
430   auto read_monitor_socket_loop =
431       [this](std::atomic_bool& running) -> Result<void> {
432     CF_EXPECT(this->ReadMonitorSocketLoop(running));
433     return {};
434   };
435   auto parent_comms = std::async(std::launch::async, read_monitor_socket_loop,
436                                  std::ref(running));
437 
438   CF_EXPECT(MonitorLoop(running, properties_mutex_,
439                         properties_.restart_subprocesses_,
440                         properties_.entries_));
441   CF_EXPECT(parent_comms.get(), "Should have exited if monitoring stopped");
442 
443   CF_EXPECT(StopSubprocesses(properties_.entries_));
444   LOG(DEBUG) << "Done monitoring subprocesses";
445   return {};
446 }
447 
448 }  // namespace cuttlefish
449