From 5c9eb7c0a5302405738aa47eba8e3a0c5f2116b4 Mon Sep 17 00:00:00 2001 From: Tom Kirchner Date: Wed, 10 Mar 2021 17:10:04 -0800 Subject: [PATCH] apiserver: reap exited child processes We spawn some background processes, like thar-be-settings, when they might take some time and we don't want to hold up API clients. Unless we `wait` for those children when they exit, their process ID lives on as a zombie. This adds a SIGCHLD handler to apiserver to call wait when any child exits. --- sources/Cargo.lock | 5 ++- sources/api/apiserver/Cargo.toml | 1 + sources/api/apiserver/src/bin/apiserver.rs | 45 +++++++++++++++++-- .../api/apiserver/src/server/controller.rs | 3 +- 4 files changed, 48 insertions(+), 6 deletions(-) diff --git a/sources/Cargo.lock b/sources/Cargo.lock index 72febe5c06d..2d9475a57f5 100644 --- a/sources/Cargo.lock +++ b/sources/Cargo.lock @@ -424,6 +424,7 @@ dependencies = [ "semver 0.11.0", "serde", "serde_json", + "signal-hook", "simplelog", "snafu", "thar-be-updates", @@ -2855,9 +2856,9 @@ dependencies = [ [[package]] name = "signal-hook" -version = "0.3.4" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "780f5e3fe0c66f67197236097d89de1e86216f1f6fdeaf47c442f854ab46c240" +checksum = "6aa894ef3fade0ee7243422f4fbbd6c2b48e6de767e621d37ef65f2310f53cea" dependencies = [ "libc", "signal-hook-registry", diff --git a/sources/api/apiserver/Cargo.toml b/sources/api/apiserver/Cargo.toml index d77c0f749d8..a9e436bb112 100644 --- a/sources/api/apiserver/Cargo.toml +++ b/sources/api/apiserver/Cargo.toml @@ -25,6 +25,7 @@ percent-encoding = "2.1" semver = "0.11" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" +signal-hook = "0.3" simplelog = "0.9" snafu = "0.6" thar-be-updates = { path = "../thar-be-updates" } diff --git a/sources/api/apiserver/src/bin/apiserver.rs b/sources/api/apiserver/src/bin/apiserver.rs index 6d4ebeb492c..cd8716faef4 100644 --- a/sources/api/apiserver/src/bin/apiserver.rs +++ b/sources/api/apiserver/src/bin/apiserver.rs @@ -6,13 +6,18 @@ extern crate log; use libc::gid_t; -use nix::unistd::Gid; +use nix::{ + sys::wait::{waitpid, WaitPidFlag, WaitStatus}, + unistd::{Gid, Pid}, +}; +use signal_hook::{consts::SIGCHLD, iterator::Signals}; use simplelog::{Config as LogConfig, LevelFilter, SimpleLogger}; use snafu::{ensure, ResultExt}; use std::env; use std::path::Path; use std::process; use std::str::FromStr; +use std::thread; use apiserver::serve; @@ -32,6 +37,9 @@ mod error { #[snafu(display("{}", source))] Server { source: apiserver::server::Error }, + #[snafu(display("Failed to set up signal handler: {}", source))] + Signal { source: std::io::Error }, + #[snafu(display("Logger setup error: {}", source))] Logger { source: log::SetLoggerError }, } @@ -131,8 +139,7 @@ async fn run() -> Result<()> { let args = parse_args(env::args()); // SimpleLogger will send errors to stderr and anything less to stdout. - SimpleLogger::init(args.log_level, LogConfig::default()) - .context(error::Logger)?; + SimpleLogger::init(args.log_level, LogConfig::default()).context(error::Logger)?; // Make sure the datastore exists ensure!( @@ -140,6 +147,38 @@ async fn run() -> Result<()> { error::NonexistentDatastore ); + // We need to handle some actions asynchronously without holding up API clients, like invoking + // thar-be-settings to apply changes to the system, so we fork them off and don't wait for + // their status or output at that point. Instead, when we receive a SIGCHLD signal telling us + // that child processes have exited, we wait for them in this thread so that they don't become + // zombie processes. + let mut signals = Signals::new(&[SIGCHLD]).context(error::Signal)?; + thread::spawn(move || { + for signal in signals.forever() { + match signal { + SIGCHLD => { + // We loop as long as we have children to reap. + loop { + if let Ok(status) = waitpid(Pid::from_raw(-1), Some(WaitPidFlag::WNOHANG)) { + // You can get a CHLD signal in other scenarios, even if a child is + // still running; we only want to continue if we're making progress + // with reaping exited children. Otherwise, stop and wait for another + // signal. + if !matches!(status, WaitStatus::Exited(..)) { + break; + } + } else { + // There's not much we can do to recover from an error in waiting, so + // we'll just try again next loop. + break; + } + } + } + _ => {} // this space for rent! + } + } + }); + // Each request makes its own handle to the datastore; there's no locking or // synchronization yet. Therefore, only use 1 thread for safety. let threads = 1; diff --git a/sources/api/apiserver/src/server/controller.rs b/sources/api/apiserver/src/server/controller.rs index 625d7924b06..65db140d841 100644 --- a/sources/api/apiserver/src/server/controller.rs +++ b/sources/api/apiserver/src/server/controller.rs @@ -372,7 +372,8 @@ where .context(error::ConfigApplierStart)?; } - // Leave config applier to run in the background; we can't wait for it + // Leave config applier to run in the background; we can't wait for it. + // The apiserver binary waits for any exited children so they don't become zombies. Ok(()) }