1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <sys/types.h>
18 #include <sys/wait.h>
19 
20 #include <assert.h>
21 #include <errno.h>
22 #include <signal.h>
23 #include <stdio.h>
24 
25 #include <map>
26 
27 #include <android-base/logging.h>
28 
29 #include "common/libs/fs/shared_select.h"
30 #include "host/commands/run_cvd/process_monitor.h"
31 
32 namespace cuttlefish {
33 
34 namespace {
35 
// Wakes up the restarter thread by writing a single byte to |fd|, the main
// side of the communication socket pair.
void NotifyThread(SharedFD fd) {
  // The restarter thread is (likely) blocked on a call to select. To make it
  // wake up and do some work we write something (anything, the content is not
  // important) into the main side of the socket pair so that the call to
  // select returns and the notification fd (restarter side of the socket
  // pair) is marked as ready to read.
  char buffer = 'a';
  if (fd->Write(&buffer, sizeof(buffer)) < 0) {
    // Without this wake-up the restarter thread keeps waiting on its previous
    // set of file descriptors, so make the failure visible instead of
    // silently dropping it.
    LOG(ERROR) << "Unable to notify the restarter thread: " << strerror(errno);
  }
}
45 
// Drains pending wake-up bytes from |fd|, the restarter side of the
// communication socket pair.
void ConsumeNotifications(SharedFD fd) {
  // Once the restarter thread has been woken up by a notification, calls to
  // select keep returning immediately until whatever was written on the main
  // side of the socket pair has been read. Several notifications can
  // accumulate before the restarter thread gets to consume them, so we try to
  // read more than a single notification at once. In the unlikely case of
  // more than kMaxNotificationsPerRead notifications accumulating we simply
  // read the first ones and go through another iteration of the restarter
  // thread loop.
  constexpr size_t kMaxNotificationsPerRead = 8;
  char drained[kMaxNotificationsPerRead];
  fd->Read(drained, kMaxNotificationsPerRead);
}
57 
58 }  // namespace
59 
// Creates the socket pair used to wake up the monitor thread and, if that
// succeeds, launches the thread running MonitorRoutine(). When the socket pair
// can't be created the error is logged and no thread is started.
ProcessMonitor::ProcessMonitor() {
  const bool channel_created = SharedFD::SocketPair(
      AF_LOCAL, SOCK_STREAM, 0, &thread_comm_main_, &thread_comm_monitor_);
  if (channel_created) {
    monitor_thread_ = std::thread([this]() { MonitorRoutine(); });
  } else {
    LOG(ERROR) << "Unable to create restarter communication socket pair: "
               << strerror(errno);
  }
}
69 
// Launches |cmd| (in its own group and with a control socket) and, when the
// launch succeeds, hands the command and the resulting subprocess over to
// MonitorExistingSubprocess() together with |callback|. A failed launch is
// logged and nothing is monitored.
void ProcessMonitor::StartSubprocess(Command cmd, OnSocketReadyCb callback) {
  SubprocessOptions launch_options;
  launch_options.InGroup(true);
  launch_options.WithControlSocket(true);

  auto subprocess = cmd.Start(launch_options);
  if (subprocess.Started()) {
    MonitorExistingSubprocess(std::move(cmd), std::move(subprocess), callback);
  } else {
    LOG(ERROR) << "Failed to start process";
  }
}
81 
// Registers an already running subprocess |proc| (created from |cmd|) with the
// monitor. |callback| is stored with the entry and is invoked by the monitor
// thread when the subprocess' control socket becomes readable.
void ProcessMonitor::MonitorExistingSubprocess(Command cmd, Subprocess proc,
                                               OnSocketReadyCb callback) {
  // Assemble the entry first; only the insertion into the shared list needs
  // to happen under the lock.
  MonitorEntry new_entry;
  new_entry.cmd.reset(new Command(std::move(cmd)));
  new_entry.proc.reset(new Subprocess(std::move(proc)));
  new_entry.on_control_socket_ready_cb = callback;
  {
    std::lock_guard<std::mutex> guard(processes_mutex_);
    monitored_processes_.push_back(std::move(new_entry));
  }
  // Wake the restarter thread up so that it starts monitoring this subprocess.
  // This is done after the lock has been released so that the restarter thread
  // is free to begin work as soon as select returns.
  NotifyThread(thread_comm_main_);
}
97 
// Stops all monitored subprocesses, waits for them and removes them from the
// monitor so they are not restarted.
//
// Returns true only if Stop() succeeded for every monitored subprocess.
bool ProcessMonitor::StopMonitoredProcesses() {
  // Because the mutex is held while this function executes, the restarter
  // thread is kept blocked and by the time it resumes execution there are no
  // more processes to monitor
  std::lock_guard<std::mutex> lock(processes_mutex_);
  bool result = true;
  // Processes were started in the order they appear in the vector, stop them in
  // reverse order for symmetry.
  for (auto entry_it = monitored_processes_.rbegin();
       entry_it != monitored_processes_.rend(); ++entry_it) {
    auto& entry = *entry_it;
    // Stop() must be attempted on every process, even after an earlier one
    // failed. Folding it into `result && Stop()` would short-circuit and skip
    // the remaining processes, which are nevertheless waited on below.
    if (!entry.proc->Stop()) {
      result = false;
    }
  }
  // Wait for all processes to actually exit.
  for (auto& entry : monitored_processes_) {
    // Most processes are being killed by signals, calling Wait(void) would be
    // too verbose on the logs.
    int wstatus = 0;
    auto ret = entry.proc->Wait(&wstatus, 0);
    if (ret < 0) {
      LOG(WARNING) << "Failed to wait for process "
                   << entry.cmd->GetShortName();
    }
  }
  // Clear the list to ensure they are not started again
  monitored_processes_.clear();
  return result;
}
126 
// Control socket callback that restarts the subprocess of |entry| once it has
// exited.
//
// If the subprocess wrote data on its control socket it is assumed to still be
// alive and nothing is restarted. Otherwise the subprocess is waited on, its
// exit status is logged and the command is started again.
//
// Always returns true, so the entry is kept in the monitor.
bool ProcessMonitor::RestartOnExitCb(MonitorEntry* entry) {
  // Make sure the process actually exited
  char buffer[16];
  auto bytes_read = entry->proc->control_socket()->Read(buffer, sizeof(buffer));
  if (bytes_read > 0) {
    LOG(WARNING) << "Subprocess " << entry->cmd->GetShortName() << " wrote "
                 << bytes_read
                 << " bytes on the control socket, this is unexpected";
    // The process may not have exited, continue monitoring without restarting
    return true;
  }

  LOG(INFO) << "Detected exit of monitored subprocess";
  // Make sure the subprocess isn't left in a zombie state, and that the
  // pid is logged
  int wstatus = 0;
  auto wait_ret = TEMP_FAILURE_RETRY(entry->proc->Wait(&wstatus, 0));
  if (wait_ret <= 0) {
    // No exit status is available in this case, so don't try to decode
    // wstatus. This is handled explicitly rather than through an assert so the
    // check is still there when asserts are compiled out.
    LOG(ERROR) << "Failed to wait for subprocess "
               << entry->cmd->GetShortName();
  } else if (WIFEXITED(wstatus)) {
    LOG(INFO) << "Subprocess " << entry->cmd->GetShortName() << " (" << wait_ret
              << ") has exited with exit code " << WEXITSTATUS(wstatus);
  } else if (WIFSIGNALED(wstatus)) {
    LOG(ERROR) << "Subprocess " << entry->cmd->GetShortName() << " ("
               << wait_ret
               << ") was interrupted by a signal: " << WTERMSIG(wstatus);
  } else {
    LOG(INFO) << "subprocess " << entry->cmd->GetShortName() << " (" << wait_ret
              << ") has exited for unknown reasons";
  }
  // NOTE(review): unlike StartSubprocess(), the restart does not request
  // InGroup(true) - confirm whether that difference is intentional.
  cuttlefish::SubprocessOptions options;
  options.WithControlSocket(true);
  entry->proc.reset(new Subprocess(entry->cmd->Start(options)));
  if (!entry->proc->Started()) {
    LOG(ERROR) << "Failed to restart subprocess "
               << entry->cmd->GetShortName();
  }
  return true;
}
162 
DoNotMonitorCb(MonitorEntry *)163 bool ProcessMonitor::DoNotMonitorCb(MonitorEntry*) { return false; }
164 
MonitorRoutine()165 void ProcessMonitor::MonitorRoutine() {
166   LOG(DEBUG) << "Started monitoring subprocesses";
167   do {
168     SharedFDSet read_set;
169     read_set.Set(thread_comm_monitor_);
170     {
171       std::lock_guard<std::mutex> lock(processes_mutex_);
172       for (auto& monitored_process : monitored_processes_) {
173         auto control_socket = monitored_process.proc->control_socket();
174         if (!control_socket->IsOpen()) {
175           LOG(ERROR) << "The control socket for "
176                      << monitored_process.cmd->GetShortName()
177                      << " is closed, it's effectively NOT being monitored";
178         }
179         read_set.Set(control_socket);
180       }
181     }
182     // We can't call select while holding the lock as it would lead to a
183     // deadlock (restarter thread waiting for notifications from main thread,
184     // main thread waiting for the lock)
185     int num_fds = cuttlefish::Select(&read_set, nullptr, nullptr, nullptr);
186     if (num_fds < 0) {
187       LOG(ERROR) << "Select call returned error on restarter thread: "
188                  << strerror(errno);
189     }
190     if (num_fds > 0) {
191       // Try the communication fd, it's the most likely to be set
192       if (read_set.IsSet(thread_comm_monitor_)) {
193         --num_fds;
194         ConsumeNotifications(thread_comm_monitor_);
195       }
196     }
197     {
198       std::lock_guard<std::mutex> lock(processes_mutex_);
199       // Keep track of the number of file descriptors ready for read, chances
200       // are we don't need to go over the entire list of subprocesses
201       auto it = monitored_processes_.begin();
202       while (it != monitored_processes_.end()) {
203         auto control_socket = it->proc->control_socket();
204         bool keep_monitoring = true;
205         if (read_set.IsSet(control_socket)) {
206           --num_fds;
207           keep_monitoring = it->on_control_socket_ready_cb(&(*it));
208         }
209         if (keep_monitoring) {
210           ++it;
211         } else {
212           it = monitored_processes_.erase(it);
213         }
214       }
215     }
216     assert(num_fds == 0);
217   } while (true);
218 }
219 
220 }  // namespace cuttlefish
221