From c2206e21e0f27fbb7610180fc9dc3cb2fe1c4c16 Mon Sep 17 00:00:00 2001 From: Thomas Lamprecht Date: Mon, 18 Jul 2022 14:11:01 +0200 Subject: [PATCH] api daemons: periodically unpark a tokio thread to ensure progress The underlying issue seems to be the case where the thread that runs the IO driver is polling its own tasks; while that happens the IO driver/poller won't run and thus work stealing won't happen, meaning that idle and parked threads will keep being parked even if there's pending work they could do. A promising solution for tokio is proposed in its issue tracker [0], but it hasn't been implemented yet. So, as a stop gap, spawn a separate thread that periodically spawns a no-op ready future in the runtime which would unpark a worker in the aforementioned case and thus should break the bogus idleness. Choose a 3s period for that without any overly elaborate reasoning; our main goal is to ensure we accept incoming connections and 3s is well below an HTTP timeout and leaves some room for high network latencies while not causing too many additional wakeups for systems that are really idling. [0]: https://github.com/tokio-rs/tokio/issues/4730#issuecomment-1147975074 Link: https://github.com/tokio-rs/tokio/issues/4730 Signed-off-by: Thomas Lamprecht --- src/bin/proxmox-backup-api.rs | 11 +++++++++++ src/bin/proxmox-backup-proxy.rs | 11 +++++++++++ 2 files changed, 22 insertions(+) diff --git a/src/bin/proxmox-backup-api.rs b/src/bin/proxmox-backup-api.rs index dda4b638..441b3427 100644 --- a/src/bin/proxmox-backup-api.rs +++ b/src/bin/proxmox-backup-api.rs @@ -170,6 +170,17 @@ async fn run() -> Result<(), Error> { bail!("unable to start daemon - {}", err); } + // stop gap for https://github.com/tokio-rs/tokio/issues/4730 where the thread holding the + // IO-driver may block progress completely if it starts polling its own tasks (blocks). 
+ // So, trigger a notify to parked threads, as we're immediately ready the woken up thread will + // acquire the IO driver, if blocked, before going to sleep, which allows progress again + // TODO: remove once tokio solves this at their level (see proposals in linked comments) + let rt_handle = tokio::runtime::Handle::current(); + std::thread::spawn(move || loop { + rt_handle.spawn(std::future::ready(())); + std::thread::sleep(std::time::Duration::from_secs(3)); + }); + server.await?; log::info!("server shutting down, waiting for active workers to complete"); proxmox_rest_server::last_worker_future().await?; diff --git a/src/bin/proxmox-backup-proxy.rs b/src/bin/proxmox-backup-proxy.rs index 8b9a0c9e..4c57b2dd 100644 --- a/src/bin/proxmox-backup-proxy.rs +++ b/src/bin/proxmox-backup-proxy.rs @@ -340,6 +340,17 @@ async fn run() -> Result<(), Error> { bail!("unable to start daemon - {}", err); } + // stop gap for https://github.com/tokio-rs/tokio/issues/4730 where the thread holding the + // IO-driver may block progress completely if it starts polling its own tasks (blocks). + // So, trigger a notify to parked threads, as we're immediately ready the woken up thread will + // acquire the IO driver, if blocked, before going to sleep, which allows progress again + // TODO: remove once tokio solves this at their level (see proposals in linked comments) + let rt_handle = tokio::runtime::Handle::current(); + std::thread::spawn(move || loop { + rt_handle.spawn(std::future::ready(())); + std::thread::sleep(Duration::from_secs(3)); + }); + start_task_scheduler(); start_stat_generator(); start_traffic_control_updater();