1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <sys/types.h>
18 #include <sys/wait.h>
19
20 #include <assert.h>
21 #include <errno.h>
22 #include <signal.h>
23 #include <stdio.h>
24
25 #include <map>
26
27 #include <android-base/logging.h>
28
29 #include "common/libs/fs/shared_select.h"
30 #include "host/commands/run_cvd/process_monitor.h"
31
32 namespace cuttlefish {
33
34 namespace {
35
NotifyThread(SharedFD fd)36 void NotifyThread(SharedFD fd) {
37 // The restarter thread is (likely) blocked on a call to select, to make it
38 // wake up and do some work we write something (anything, the content is not
39 // important) into the main side of the socket pair so that the call to select
40 // returns and the notification fd (restarter side of the socket pair) is
41 // marked as ready to read.
42 char buffer = 'a';
43 fd->Write(&buffer, sizeof(buffer));
44 }
45
ConsumeNotifications(SharedFD fd)46 void ConsumeNotifications(SharedFD fd) {
47 // Once the starter thread is waken up due to a notification, the calls to
48 // select will continue to return immediately unless we read what was written
49 // on the main side of the socket pair. More than one notification can
50 // accumulate before the restarter thread consumes them, so we attempt to read
51 // more than it's written to consume them all at once. In the unlikely case of
52 // more than 8 notifications acummulating we simply read the first 8 and have
53 // another iteration on the restarter thread loop.
54 char buffer[8];
55 fd->Read(buffer, sizeof(buffer));
56 }
57
58 } // namespace
59
ProcessMonitor()60 ProcessMonitor::ProcessMonitor() {
61 if (!SharedFD::SocketPair(AF_LOCAL, SOCK_STREAM, 0, &thread_comm_main_,
62 &thread_comm_monitor_)) {
63 LOG(ERROR) << "Unable to create restarter communication socket pair: "
64 << strerror(errno);
65 return;
66 }
67 monitor_thread_ = std::thread([this]() { MonitorRoutine(); });
68 }
69
StartSubprocess(Command cmd,OnSocketReadyCb callback)70 void ProcessMonitor::StartSubprocess(Command cmd, OnSocketReadyCb callback) {
71 cuttlefish::SubprocessOptions options;
72 options.InGroup(true);
73 options.WithControlSocket(true);
74 auto proc = cmd.Start(options);
75 if (!proc.Started()) {
76 LOG(ERROR) << "Failed to start process";
77 return;
78 }
79 MonitorExistingSubprocess(std::move(cmd), std::move(proc), callback);
80 }
81
MonitorExistingSubprocess(Command cmd,Subprocess proc,OnSocketReadyCb callback)82 void ProcessMonitor::MonitorExistingSubprocess(Command cmd, Subprocess proc,
83 OnSocketReadyCb callback) {
84 {
85 std::lock_guard<std::mutex> lock(processes_mutex_);
86 monitored_processes_.push_back(MonitorEntry());
87 auto& entry = monitored_processes_.back();
88 entry.cmd.reset(new Command(std::move(cmd)));
89 entry.proc.reset(new Subprocess(std::move(proc)));
90 entry.on_control_socket_ready_cb = callback;
91 }
92 // Wake the restarter thread up so that it starts monitoring this subprocess
93 // Do this after releasing the lock so that the restarter thread is free to
94 // begin work as soon as select returns.
95 NotifyThread(thread_comm_main_);
96 }
97
StopMonitoredProcesses()98 bool ProcessMonitor::StopMonitoredProcesses() {
99 // Because the mutex is held while this function executes, the restarter
100 // thread is kept blocked and by the time it resumes execution there are no
101 // more processes to monitor
102 std::lock_guard<std::mutex> lock(processes_mutex_);
103 bool result = true;
104 // Processes were started in the order they appear in the vector, stop them in
105 // reverse order for symmetry.
106 for (auto entry_it = monitored_processes_.rbegin();
107 entry_it != monitored_processes_.rend(); ++entry_it) {
108 auto& entry = *entry_it;
109 result = result && entry.proc->Stop();
110 }
111 // Wait for all processes to actually exit.
112 for (auto& entry : monitored_processes_) {
113 // Most processes are being killed by signals, calling Wait(void) would be
114 // too verbose on the logs.
115 int wstatus;
116 auto ret = entry.proc->Wait(&wstatus, 0);
117 if (ret < 0) {
118 LOG(WARNING) << "Failed to wait for process "
119 << entry.cmd->GetShortName();
120 }
121 }
122 // Clear the list to ensure they are not started again
123 monitored_processes_.clear();
124 return result;
125 }
126
RestartOnExitCb(MonitorEntry * entry)127 bool ProcessMonitor::RestartOnExitCb(MonitorEntry* entry) {
128 // Make sure the process actually exited
129 char buffer[16];
130 auto bytes_read = entry->proc->control_socket()->Read(buffer, sizeof(buffer));
131 if (bytes_read > 0) {
132 LOG(WARNING) << "Subprocess " << entry->cmd->GetShortName() << " wrote "
133 << bytes_read
134 << " bytes on the control socket, this is unexpected";
135 // The process may not have exited, continue monitoring without restarting
136 return true;
137 }
138
139 LOG(INFO) << "Detected exit of monitored subprocess";
140 // Make sure the subprocess isn't left in a zombie state, and that the
141 // pid is logged
142 int wstatus;
143 auto wait_ret = TEMP_FAILURE_RETRY(entry->proc->Wait(&wstatus, 0));
144 // None of the error conditions specified on waitpid(2) apply
145 assert(wait_ret > 0);
146 if (WIFEXITED(wstatus)) {
147 LOG(INFO) << "Subprocess " << entry->cmd->GetShortName() << " (" << wait_ret
148 << ") has exited with exit code " << WEXITSTATUS(wstatus);
149 } else if (WIFSIGNALED(wstatus)) {
150 LOG(ERROR) << "Subprocess " << entry->cmd->GetShortName() << " ("
151 << wait_ret
152 << ") was interrupted by a signal: " << WTERMSIG(wstatus);
153 } else {
154 LOG(INFO) << "subprocess " << entry->cmd->GetShortName() << " (" << wait_ret
155 << ") has exited for unknown reasons";
156 }
157 cuttlefish::SubprocessOptions options;
158 options.WithControlSocket(true);
159 entry->proc.reset(new Subprocess(entry->cmd->Start(options)));
160 return true;
161 }
162
DoNotMonitorCb(MonitorEntry *)163 bool ProcessMonitor::DoNotMonitorCb(MonitorEntry*) { return false; }
164
MonitorRoutine()165 void ProcessMonitor::MonitorRoutine() {
166 LOG(DEBUG) << "Started monitoring subprocesses";
167 do {
168 SharedFDSet read_set;
169 read_set.Set(thread_comm_monitor_);
170 {
171 std::lock_guard<std::mutex> lock(processes_mutex_);
172 for (auto& monitored_process : monitored_processes_) {
173 auto control_socket = monitored_process.proc->control_socket();
174 if (!control_socket->IsOpen()) {
175 LOG(ERROR) << "The control socket for "
176 << monitored_process.cmd->GetShortName()
177 << " is closed, it's effectively NOT being monitored";
178 }
179 read_set.Set(control_socket);
180 }
181 }
182 // We can't call select while holding the lock as it would lead to a
183 // deadlock (restarter thread waiting for notifications from main thread,
184 // main thread waiting for the lock)
185 int num_fds = cuttlefish::Select(&read_set, nullptr, nullptr, nullptr);
186 if (num_fds < 0) {
187 LOG(ERROR) << "Select call returned error on restarter thread: "
188 << strerror(errno);
189 }
190 if (num_fds > 0) {
191 // Try the communication fd, it's the most likely to be set
192 if (read_set.IsSet(thread_comm_monitor_)) {
193 --num_fds;
194 ConsumeNotifications(thread_comm_monitor_);
195 }
196 }
197 {
198 std::lock_guard<std::mutex> lock(processes_mutex_);
199 // Keep track of the number of file descriptors ready for read, chances
200 // are we don't need to go over the entire list of subprocesses
201 auto it = monitored_processes_.begin();
202 while (it != monitored_processes_.end()) {
203 auto control_socket = it->proc->control_socket();
204 bool keep_monitoring = true;
205 if (read_set.IsSet(control_socket)) {
206 --num_fds;
207 keep_monitoring = it->on_control_socket_ready_cb(&(*it));
208 }
209 if (keep_monitoring) {
210 ++it;
211 } else {
212 it = monitored_processes_.erase(it);
213 }
214 }
215 }
216 assert(num_fds == 0);
217 } while (true);
218 }
219
220 } // namespace cuttlefish
221