file-restore-daemon: add watchdog module

Add a watchdog that automatically shuts down the VM if no API call is received for 10 minutes.

Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
2021-03-31 12:21:53 +02:00
parent dd9cef56fc
commit a26ebad5f9
5 changed files with 68 additions and 7 deletions
--- a/src/api2/types/file_restore.rs
+++ b/src/api2/types/file_restore.rs
@ -8,5 +8,8 @@ use proxmox::api::api;
 pub struct RestoreDaemonStatus {
    /// VM uptime in seconds
    pub uptime: i64,
    /// Time left until auto-shutdown, in seconds. Keep in mind that this value is only
    /// meaningful when 'keep-timeout' is set; otherwise the status call will have reset the
    /// timer before returning the value.
    pub timeout: i64,
 }
--- a/src/bin/proxmox-restore-daemon.rs
+++ b/src/bin/proxmox-restore-daemon.rs
@ -45,6 +45,8 @@ fn main() -> Result<(), Error> {
 }
 async fn run() -> Result<(), Error> {
    watchdog_init();
    let auth_config = Arc::new(
        auth::ticket_auth().map_err(|err| format_err!("reading ticket file failed: {}", err))?,
    );
--- a/src/bin/proxmox_restore_daemon/api.rs
+++ b/src/bin/proxmox_restore_daemon/api.rs
@ -8,6 +8,8 @@ use proxmox::list_subdirs_api_method;
 use proxmox_backup::api2::types::*;
 use super::{watchdog_remaining, watchdog_ping};
 // NOTE: All API endpoints except 'status' must have Permission::Superuser, as the configs for
 // authentication do not exist within the restore VM. Safety is guaranteed by checking a ticket
 // via a custom ApiAuth. 'status' uses Permission::World so it can be called without a ticket.
@ -27,22 +29,32 @@ fn read_uptime() -> Result<f32, Error> {
 }
 #[api(
    input: {
        properties: {
            "keep-timeout": {
                type: bool,
                description: "If true, do not reset the watchdog timer on this API call.",
                default: false,
                optional: true,
            },
        },
    },
    access: {
-        description: "Permissions are handled outside restore VM.",
+        description: "Permissions are handled outside restore VM. This call can be made without a ticket, but keep-timeout is always assumed 'true' then.",
-        permission: &Permission::Superuser,
+        permission: &Permission::World,
    },
    returns: {
        type: RestoreDaemonStatus,
    }
 )]
 /// General status information
-fn status(
+fn status(rpcenv: &mut dyn RpcEnvironment, keep_timeout: bool) -> Result<RestoreDaemonStatus, Error> {
-    _param: Value,
+    if !keep_timeout && rpcenv.get_auth_id().is_some() {
-    _info: &ApiMethod,
+        watchdog_ping();
-    _rpcenv: &mut dyn RpcEnvironment,
+    }
 ) -> Result<RestoreDaemonStatus, Error> {
    Ok(RestoreDaemonStatus {
        uptime: read_uptime()? as i64,
        timeout: watchdog_remaining(),
    })
 }
--- a/src/bin/proxmox_restore_daemon/mod.rs
+++ b/src/bin/proxmox_restore_daemon/mod.rs
@ -3,3 +3,6 @@ mod api;
 pub use api::*;
 pub mod auth;
 mod watchdog;
 pub use watchdog::*;
--- a/src/bin/proxmox_restore_daemon/watchdog.rs
+++ b/src/bin/proxmox_restore_daemon/watchdog.rs
@ -0,0 +1,41 @@
 //! Tokio-based watchdog that shuts down the VM if not pinged for TIMEOUT
 use std::sync::atomic::{AtomicI64, Ordering};
 use proxmox::tools::time::epoch_i64;
 /// Time without a ping after which the watchdog counts as expired.
 const TIMEOUT: i64 = 600; // seconds
 /// Largest `epoch_i64()` value recorded by `watchdog_ping`; starts at 0 and is first set by the
 /// ping issued in `watchdog_init`.
 static TRIGGERED: AtomicI64 = AtomicI64::new(0);
 /// Power off the VM because the watchdog ran out; never returns.
 ///
 /// If the power-off request itself fails, the error is logged and the process exits with
 /// status 1 instead.
 fn handle_expired() -> ! {
    use nix::sys::reboot::{reboot, RebootMode};

    println!("watchdog expired, shutting down");

    // The call is only expected to come back on failure, hence `unwrap_err`.
    let failure = reboot(RebootMode::RB_POWER_OFF).unwrap_err();
    println!("'reboot' syscall failed: {}", failure);

    std::process::exit(1)
 }
 /// Background task: sleep until the watchdog is due, re-check, and shut down once no time is
 /// left.
 ///
 /// A ping that arrives while sleeping extends the deadline, so the remaining time is
 /// re-evaluated after every wake-up rather than assumed to be zero.
 async fn watchdog_loop() {
    use tokio::time::{sleep, Duration};

    let mut secs_left = watchdog_remaining();
    while secs_left > 0 {
        // `secs_left` is strictly positive here, so the cast to u64 is lossless.
        sleep(Duration::from_secs(secs_left as u64)).await;
        secs_left = watchdog_remaining();
    }

    handle_expired();
 }
 /// Start the watchdog: record an initial ping, then spawn the monitoring task on the tokio
 /// runtime.
 pub fn watchdog_init() {
    // Ping first so the monitoring task starts out with a full timeout window.
    watchdog_ping();

    let monitor = watchdog_loop();
    tokio::spawn(monitor);
 }
 /// Keepalive: record the current time as the latest watchdog ping.
 pub fn watchdog_ping() {
    let now = epoch_i64();
    // `fetch_max` keeps the larger value, so a ping never moves the stored timestamp backwards.
    TRIGGERED.fetch_max(now, Ordering::AcqRel);
 }
 /// Seconds left until the watchdog expires; zero or negative once the timeout has passed.
 pub fn watchdog_remaining() -> i64 {
    let now = epoch_i64();
    let last_ping = TRIGGERED.load(Ordering::Acquire);
    let elapsed = now - last_ping;
    TIMEOUT - elapsed
 }