file-restore-daemon: add watchdog module

Add a watchdog that automatically shuts down the VM if no API call is received for 10 minutes.

Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
2021-03-31 12:21:53 +02:00
parent dd9cef56fc
commit a26ebad5f9
5 changed files with 68 additions and 7 deletions
--- a/src/api2/types/file_restore.rs
+++ b/src/api2/types/file_restore.rs
@ -8,5 +8,8 @@ use proxmox::api::api;
 pub struct RestoreDaemonStatus {
    /// VM uptime in seconds
    pub uptime: i64,
+    /// Seconds left until auto-shutdown. Only meaningful when 'keep-timeout' is set: otherwise
+    /// the status call itself resets the watchdog timer before this value is read.
+    pub timeout: i64,
 }

--- a/src/bin/proxmox-restore-daemon.rs
+++ b/src/bin/proxmox-restore-daemon.rs
@ -45,6 +45,8 @@ fn main() -> Result<(), Error> {
 }

 async fn run() -> Result<(), Error> {
+    watchdog_init();
+
    let auth_config = Arc::new(
        auth::ticket_auth().map_err(|err| format_err!("reading ticket file failed: {}", err))?,
    );
--- a/src/bin/proxmox_restore_daemon/api.rs
+++ b/src/bin/proxmox_restore_daemon/api.rs
@ -8,6 +8,8 @@ use proxmox::list_subdirs_api_method;

 use proxmox_backup::api2::types::*;

+use super::{watchdog_remaining, watchdog_ping};
+
 // NOTE: All API endpoints must have Permission::Superuser, as the configs for authentication do
 // not exist within the restore VM. Safety is guaranteed by checking a ticket via a custom ApiAuth.

@ -27,22 +29,32 @@ fn read_uptime() -> Result<f32, Error> {
 }

 #[api(
+    input: {
+        properties: {
+            "keep-timeout": {
+                type: bool,
+                description: "If true, do not reset the watchdog timer on this API call.",
+                default: false,
+                optional: true,
+            },
+        },
+    },
    access: {
-        description: "Permissions are handled outside restore VM.",
-        permission: &Permission::Superuser,
+        description: "Permissions are handled outside restore VM. This call can be made without a ticket, but keep-timeout is always assumed 'true' then.",
+        permission: &Permission::World,
    },
    returns: {
        type: RestoreDaemonStatus,
    }
 )]
 /// General status information
-fn status(
-    _param: Value,
-    _info: &ApiMethod,
-    _rpcenv: &mut dyn RpcEnvironment,
-) -> Result<RestoreDaemonStatus, Error> {
+fn status(rpcenv: &mut dyn RpcEnvironment, keep_timeout: bool) -> Result<RestoreDaemonStatus, Error> {
+    if !keep_timeout && rpcenv.get_auth_id().is_some() { // calls without an auth id never reset the timer
+        watchdog_ping(); // restart the auto-shutdown countdown
+    }
    Ok(RestoreDaemonStatus {
        uptime: read_uptime()? as i64,
+        timeout: watchdog_remaining(), // seconds until auto-shutdown, read after the optional ping above
    })
 }

--- a/src/bin/proxmox_restore_daemon/mod.rs
+++ b/src/bin/proxmox_restore_daemon/mod.rs
@ -3,3 +3,6 @@ mod api;
 pub use api::*;

 pub mod auth;
+
+mod watchdog;
+pub use watchdog::*;
--- a/src/bin/proxmox_restore_daemon/watchdog.rs
+++ b/src/bin/proxmox_restore_daemon/watchdog.rs
@ -0,0 +1,41 @@
+//! Tokio-based watchdog that shuts down the VM if not pinged for TIMEOUT
+use std::sync::atomic::{AtomicI64, Ordering};
+use proxmox::tools::time::epoch_i64;
+
+const TIMEOUT: i64 = 600; // seconds without a ping before the watchdog expires
+static TRIGGERED: AtomicI64 = AtomicI64::new(0); // epoch time of the last watchdog_ping()
+
+/// Powers off the VM once the watchdog has expired; never returns.
+///
+/// If the `reboot` syscall reports an error instead, the error is printed and the process
+/// exits with status 1.
+fn handle_expired() -> ! {
+    use nix::sys::reboot::{reboot, RebootMode};
+
+    println!("watchdog expired, shutting down");
+
+    // A successful power-off is not expected to return, so only the error case is handled.
+    let failure = reboot(RebootMode::RB_POWER_OFF).unwrap_err();
+    println!("'reboot' syscall failed: {}", failure);
+    std::process::exit(1)
+}
+
+/// Background task that sleeps until the watchdog deadline and shuts the VM down once it passed.
+///
+/// The remaining time is re-read after every wake-up, so a ping that arrived while sleeping
+/// just leads to another sleep for whatever time is left.
+async fn watchdog_loop() {
+    use tokio::time::{sleep, Duration};
+
+    loop {
+        let secs_left = watchdog_remaining();
+        if secs_left > 0 {
+            // Strictly positive here, so the cast to u64 cannot wrap.
+            sleep(Duration::from_secs(secs_left as u64)).await;
+        } else {
+            handle_expired();
+        }
+    }
+}
+
+/// Arms the watchdog and starts its background task.
+///
+/// The timer is pinged first so the countdown starts from now; afterwards `watchdog_loop` is
+/// spawned as a Tokio task whose handle is not kept.
+pub fn watchdog_init() {
+    watchdog_ping();
+    let _detached = tokio::spawn(watchdog_loop());
+}
+
+/// Feeds the watchdog by recording the current epoch time as the latest keepalive.
+///
+/// `fetch_max` is used, so the stored timestamp never moves backwards.
+pub fn watchdog_ping() {
+    let now = epoch_i64();
+    TRIGGERED.fetch_max(now, Ordering::AcqRel);
+}
+
+/// Seconds left until the watchdog expires; zero or negative once the deadline has passed.
+pub fn watchdog_remaining() -> i64 {
+    let now = epoch_i64();
+    let last_ping = TRIGGERED.load(Ordering::Acquire);
+    let elapsed = now - last_ping;
+    TIMEOUT - elapsed
+}