file-restore-daemon: add watchdog module
Add a watchdog that will automatically shut down the VM after 10 minutes if no API call is received.

Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
This commit is contained in:
parent
dd9cef56fc
commit
a26ebad5f9
|
@ -8,5 +8,8 @@ use proxmox::api::api;
|
||||||
pub struct RestoreDaemonStatus {
|
pub struct RestoreDaemonStatus {
|
||||||
/// VM uptime in seconds
|
/// VM uptime in seconds
|
||||||
pub uptime: i64,
|
pub uptime: i64,
|
||||||
|
/// time left until auto-shutdown, keep in mind that this is useless when 'keep-timeout' is
|
||||||
|
/// not set, as then the status call will have reset the timer before returning the value
|
||||||
|
pub timeout: i64,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -45,6 +45,8 @@ fn main() -> Result<(), Error> {
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn run() -> Result<(), Error> {
|
async fn run() -> Result<(), Error> {
|
||||||
|
watchdog_init();
|
||||||
|
|
||||||
let auth_config = Arc::new(
|
let auth_config = Arc::new(
|
||||||
auth::ticket_auth().map_err(|err| format_err!("reading ticket file failed: {}", err))?,
|
auth::ticket_auth().map_err(|err| format_err!("reading ticket file failed: {}", err))?,
|
||||||
);
|
);
|
||||||
|
|
|
@ -8,6 +8,8 @@ use proxmox::list_subdirs_api_method;
|
||||||
|
|
||||||
use proxmox_backup::api2::types::*;
|
use proxmox_backup::api2::types::*;
|
||||||
|
|
||||||
|
use super::{watchdog_remaining, watchdog_ping};
|
||||||
|
|
||||||
// NOTE: All API endpoints must have Permission::Superuser, as the configs for authentication do
|
// NOTE: All API endpoints must have Permission::Superuser, as the configs for authentication do
|
||||||
// not exist within the restore VM. Safety is guaranteed by checking a ticket via a custom ApiAuth.
|
// not exist within the restore VM. Safety is guaranteed by checking a ticket via a custom ApiAuth.
|
||||||
|
|
||||||
|
@ -27,22 +29,32 @@ fn read_uptime() -> Result<f32, Error> {
|
||||||
}
|
}
|
||||||
|
|
||||||
#[api(
|
#[api(
|
||||||
|
input: {
|
||||||
|
properties: {
|
||||||
|
"keep-timeout": {
|
||||||
|
type: bool,
|
||||||
|
description: "If true, do not reset the watchdog timer on this API call.",
|
||||||
|
default: false,
|
||||||
|
optional: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
access: {
|
access: {
|
||||||
description: "Permissions are handled outside restore VM.",
|
description: "Permissions are handled outside restore VM. This call can be made without a ticket, but keep-timeout is always assumed 'true' then.",
|
||||||
permission: &Permission::Superuser,
|
permission: &Permission::World,
|
||||||
},
|
},
|
||||||
returns: {
|
returns: {
|
||||||
type: RestoreDaemonStatus,
|
type: RestoreDaemonStatus,
|
||||||
}
|
}
|
||||||
)]
|
)]
|
||||||
/// General status information
|
/// General status information
|
||||||
fn status(
|
fn status(rpcenv: &mut dyn RpcEnvironment, keep_timeout: bool) -> Result<RestoreDaemonStatus, Error> {
|
||||||
_param: Value,
|
if !keep_timeout && rpcenv.get_auth_id().is_some() {
|
||||||
_info: &ApiMethod,
|
watchdog_ping();
|
||||||
_rpcenv: &mut dyn RpcEnvironment,
|
}
|
||||||
) -> Result<RestoreDaemonStatus, Error> {
|
|
||||||
Ok(RestoreDaemonStatus {
|
Ok(RestoreDaemonStatus {
|
||||||
uptime: read_uptime()? as i64,
|
uptime: read_uptime()? as i64,
|
||||||
|
timeout: watchdog_remaining(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -3,3 +3,6 @@ mod api;
|
||||||
pub use api::*;
|
pub use api::*;
|
||||||
|
|
||||||
pub mod auth;
|
pub mod auth;
|
||||||
|
|
||||||
|
mod watchdog;
|
||||||
|
pub use watchdog::*;
|
||||||
|
|
|
@ -0,0 +1,41 @@
|
||||||
|
//! Tokio-based watchdog that shuts down the VM if not pinged for TIMEOUT
|
||||||
|
use std::sync::atomic::{AtomicI64, Ordering};
|
||||||
|
use proxmox::tools::time::epoch_i64;
|
||||||
|
|
||||||
|
/// Number of seconds without a ping after which the watchdog expires (10 minutes).
const TIMEOUT: i64 = 10 * 60;

/// Epoch timestamp (in seconds) of the most recent watchdog ping; 0 until the first ping.
static TRIGGERED: AtomicI64 = AtomicI64::new(0);
|
||||||
|
|
||||||
|
fn handle_expired() -> ! {
|
||||||
|
use nix::sys::reboot;
|
||||||
|
println!("watchdog expired, shutting down");
|
||||||
|
let err = reboot::reboot(reboot::RebootMode::RB_POWER_OFF).unwrap_err();
|
||||||
|
println!("'reboot' syscall failed: {}", err);
|
||||||
|
std::process::exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn watchdog_loop() {
|
||||||
|
use tokio::time::{sleep, Duration};
|
||||||
|
loop {
|
||||||
|
let remaining = watchdog_remaining();
|
||||||
|
if remaining <= 0 {
|
||||||
|
handle_expired();
|
||||||
|
}
|
||||||
|
sleep(Duration::from_secs(remaining as u64)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Initialize watchdog
|
||||||
|
pub fn watchdog_init() {
|
||||||
|
watchdog_ping();
|
||||||
|
tokio::spawn(watchdog_loop());
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Trigger watchdog keepalive
|
||||||
|
pub fn watchdog_ping() {
|
||||||
|
TRIGGERED.fetch_max(epoch_i64(), Ordering::AcqRel);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the remaining time before watchdog expiry in seconds
|
||||||
|
pub fn watchdog_remaining() -> i64 {
|
||||||
|
TIMEOUT - (epoch_i64() - TRIGGERED.load(Ordering::Acquire))
|
||||||
|
}
|
Loading…
Reference in New Issue