file-restore-daemon: add watchdog module

Add a watchdog that will automatically shut down the VM after 10
minutes, if no API call is received.

Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
This commit is contained in:
Stefan Reiter 2021-03-31 12:21:53 +02:00 committed by Thomas Lamprecht
parent dd9cef56fc
commit a26ebad5f9
5 changed files with 68 additions and 7 deletions

View File

@ -8,5 +8,8 @@ use proxmox::api::api;
pub struct RestoreDaemonStatus {
/// VM uptime in seconds
pub uptime: i64,
/// time left until auto-shutdown, keep in mind that this is useless when 'keep-timeout' is
/// not set, as then the status call will have reset the timer before returning the value
pub timeout: i64,
}

View File

@ -45,6 +45,8 @@ fn main() -> Result<(), Error> {
}
async fn run() -> Result<(), Error> {
watchdog_init();
let auth_config = Arc::new(
auth::ticket_auth().map_err(|err| format_err!("reading ticket file failed: {}", err))?,
);

View File

@ -8,6 +8,8 @@ use proxmox::list_subdirs_api_method;
use proxmox_backup::api2::types::*;
use super::{watchdog_remaining, watchdog_ping};
// NOTE: All API endpoints must have Permission::Superuser, as the configs for authentication do
// not exist within the restore VM. Safety is guaranteed by checking a ticket via a custom ApiAuth.
@ -27,22 +29,32 @@ fn read_uptime() -> Result<f32, Error> {
}
#[api(
input: {
properties: {
"keep-timeout": {
type: bool,
description: "If true, do not reset the watchdog timer on this API call.",
default: false,
optional: true,
},
},
},
access: {
description: "Permissions are handled outside restore VM.",
permission: &Permission::Superuser,
description: "Permissions are handled outside restore VM. This call can be made without a ticket, but keep-timeout is always assumed 'true' then.",
permission: &Permission::World,
},
returns: {
type: RestoreDaemonStatus,
}
)]
/// General status information
fn status(
_param: Value,
_info: &ApiMethod,
_rpcenv: &mut dyn RpcEnvironment,
) -> Result<RestoreDaemonStatus, Error> {
fn status(rpcenv: &mut dyn RpcEnvironment, keep_timeout: bool) -> Result<RestoreDaemonStatus, Error> {
if !keep_timeout && rpcenv.get_auth_id().is_some() {
watchdog_ping();
}
Ok(RestoreDaemonStatus {
uptime: read_uptime()? as i64,
timeout: watchdog_remaining(),
})
}

View File

@ -3,3 +3,6 @@ mod api;
pub use api::*;
pub mod auth;
mod watchdog;
pub use watchdog::*;

View File

@ -0,0 +1,41 @@
//! Tokio-based watchdog that shuts down the VM if not pinged for TIMEOUT
use std::sync::atomic::{AtomicI64, Ordering};
use proxmox::tools::time::epoch_i64;
const TIMEOUT: i64 = 600; // seconds
static TRIGGERED: AtomicI64 = AtomicI64::new(0);
fn handle_expired() -> ! {
use nix::sys::reboot;
println!("watchdog expired, shutting down");
let err = reboot::reboot(reboot::RebootMode::RB_POWER_OFF).unwrap_err();
println!("'reboot' syscall failed: {}", err);
std::process::exit(1);
}
async fn watchdog_loop() {
use tokio::time::{sleep, Duration};
loop {
let remaining = watchdog_remaining();
if remaining <= 0 {
handle_expired();
}
sleep(Duration::from_secs(remaining as u64)).await;
}
}
/// Initialize watchdog
pub fn watchdog_init() {
watchdog_ping();
tokio::spawn(watchdog_loop());
}
/// Trigger watchdog keepalive
pub fn watchdog_ping() {
TRIGGERED.fetch_max(epoch_i64(), Ordering::AcqRel);
}
/// Returns the remaining time before watchdog expiry in seconds
pub fn watchdog_remaining() -> i64 {
TIMEOUT - (epoch_i64() - TRIGGERED.load(Ordering::Acquire))
}