proxmox-backup/proxmox-file-restore/src/qemu_helper.rs
Thomas Lamprecht 1f8b29f578 file restore: scale per-round delay up dynamically
Avoids latency for restore VMs that come up fast but are not ready
yet in the first round, while not checking too often for slower ones;
in other words, we assume the start-up time distribution looks
roughly like a chi-squared (χ²) distribution with k=3.

With a delay of 25 ms * round we get at most 45 rounds, totalling
25.875 s of delay with a maximum between-round delay of 1.125 s,
which still provides an acceptable reaction time.

Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
2022-01-26 16:12:58 +01:00

//! Helper to start a QEMU VM for single file restore.
use std::fs::{File, OpenOptions};
use std::io::prelude::*;
use std::os::unix::io::AsRawFd;
use std::path::PathBuf;
use std::time::{Instant, Duration};

use anyhow::{bail, format_err, Error};
use tokio::time;

use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;

use proxmox_sys::fs::{create_path, file_read_string, make_tmp_file, CreateOptions};
use proxmox_sys::fd::fd_change_cloexec;
use proxmox_sys::logrotate::LogRotate;

use pbs_client::{VsockClient, DEFAULT_VSOCK_PORT};

use crate::{cpio, backup_user};
use super::SnapRestoreDetails;

const PBS_VM_NAME: &str = "pbs-restore-vm";
const MAX_CID_TRIES: u64 = 32;

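/// Create the `file-restore` log directory below the backup log directory (the parent owned by
/// the backup user, the directory itself by root) and return its path.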
fn create_restore_log_dir() -> Result<String, Error> {
    let logpath = format!("{}/file-restore", pbs_buildcfg::PROXMOX_BACKUP_LOG_DIR);

    proxmox_lang::try_block!({
        let backup_user = backup_user()?;
        let opts = CreateOptions::new()
            .owner(backup_user.uid)
            .group(backup_user.gid);

        let opts_root = CreateOptions::new()
            .owner(nix::unistd::ROOT)
            .group(nix::unistd::Gid::from_raw(0));

        create_path(pbs_buildcfg::PROXMOX_BACKUP_LOG_DIR, None, Some(opts))?;
        create_path(&logpath, None, Some(opts_root))?;
        Ok(())
    })
    .map_err(|err: Error| format_err!("unable to create file-restore log dir - {}", err))?;

    Ok(logpath)
}

fn validate_img_existance(debug: bool) -> Result<(), Error> {
    let kernel = PathBuf::from(pbs_buildcfg::PROXMOX_BACKUP_KERNEL_FN);
    let initramfs = PathBuf::from(if debug {
        pbs_buildcfg::PROXMOX_BACKUP_INITRAMFS_DBG_FN
    } else {
        pbs_buildcfg::PROXMOX_BACKUP_INITRAMFS_FN
    });

    if !kernel.exists() || !initramfs.exists() {
        bail!("cannot run file-restore VM: package 'proxmox-backup-restore-image' is not (correctly) installed");
    }
    Ok(())
}

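/// Check whether the given PID still belongs to one of our restore VMs and, if so, kill it.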
pub fn try_kill_vm(pid: i32) -> Result<(), Error> {
    let pid = Pid::from_raw(pid);
    if kill(pid, None).is_ok() {
        // process is running (and we could kill it), check if it is actually ours
        // (if it errors assume we raced with the process's death and ignore it)
        if let Ok(cmdline) = file_read_string(format!("/proc/{}/cmdline", pid)) {
            if cmdline.split('\0').any(|a| a == PBS_VM_NAME) {
                // yes, it's ours, kill it brutally with SIGKILL, no reason to take
                // any chances - in this state it's most likely broken anyway
                if let Err(err) = kill(pid, Signal::SIGKILL) {
                    bail!(
                        "reaping broken VM (pid {}) with SIGKILL failed: {}",
                        pid,
                        err
                    );
                }
            }
        }
    }

    Ok(())
}

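/// Copy the base initramfs to an unlinked temp file and append the connection ticket as an
/// additional cpio entry; returns the open file and a `/dev/fd/<n>` path pointing at it.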
async fn create_temp_initramfs(ticket: &str, debug: bool) -> Result<(File, String), Error> {
    use std::ffi::CString;
    use tokio::fs::File;

    let (tmp_file, tmp_path) =
        make_tmp_file("/tmp/file-restore-qemu.initramfs.tmp", CreateOptions::new())?;
    nix::unistd::unlink(&tmp_path)?;
    fd_change_cloexec(tmp_file.as_raw_fd(), false)?;

    let initramfs = if debug {
        pbs_buildcfg::PROXMOX_BACKUP_INITRAMFS_DBG_FN
    } else {
        pbs_buildcfg::PROXMOX_BACKUP_INITRAMFS_FN
    };

    let mut f = File::from_std(tmp_file);
    let mut base = File::open(initramfs).await?;
    tokio::io::copy(&mut base, &mut f).await?;
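
    // append the ticket as a regular, read-only file named "ticket"; the same ticket is later
    // passed to the VsockClient in start_vm to authenticate API requests against the VM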
    let name = CString::new("ticket").unwrap();
    cpio::append_file(
        &mut f,
        ticket.as_bytes(),
        &name,
        0,
        (libc::S_IFREG | 0o400) as u16,
        0,
        0,
        0,
        ticket.len() as u32,
    )
    .await?;
    cpio::append_trailer(&mut f).await?;

    let tmp_file = f.into_std().await;
    let path = format!("/dev/fd/{}", &tmp_file.as_raw_fd());

    Ok((tmp_file, path))
}

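/// Start the single-file-restore QEMU VM for the given snapshot and return `(pid, cid)` of the
/// running instance on success.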
pub async fn start_vm(
    // u16 so we can do wrapping_add without going too high
    mut cid: u16,
    details: &SnapRestoreDetails,
    files: impl Iterator<Item = String>,
    ticket: &str,
) -> Result<(i32, i32), Error> {
    if std::env::var("PBS_PASSWORD").is_err() {
        bail!("environment variable PBS_PASSWORD has to be set for QEMU VM restore");
    }

    let debug = if let Ok(val) = std::env::var("PBS_QEMU_DEBUG") {
        !val.is_empty()
    } else {
        false
    };

    validate_img_existance(debug)?;

    let pid;
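    // pass an unlinked temp file to QEMU via `/dev/fd/` (see `-pidfile` below); the fd stays
    // open here, so we can still read the daemonized child's PID from it after startup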
    let (mut pid_file, pid_path) =
        make_tmp_file("/tmp/file-restore-qemu.pid.tmp", CreateOptions::new())?;
    nix::unistd::unlink(&pid_path)?;
    fd_change_cloexec(pid_file.as_raw_fd(), false)?;

    let (_ramfs_pid, ramfs_path) = create_temp_initramfs(ticket, debug).await?;

    let logpath = create_restore_log_dir()?;
    let logfile = &format!("{}/qemu.log", logpath);
    let mut logrotate = LogRotate::new(logfile, false, Some(16), None)?;

    if let Err(err) = logrotate.do_rotate() {
        eprintln!("warning: logrotate for QEMU log file failed - {}", err);
    }

    let mut logfd = OpenOptions::new()
        .append(true)
        .create_new(true)
        .open(logfile)?;
    fd_change_cloexec(logfd.as_raw_fd(), false)?;

    // preface log file with start timestamp so one can see how long QEMU took to start
    writeln!(logfd, "[{}] PBS file restore VM log", {
        let now = proxmox_time::epoch_i64();
        proxmox_time::epoch_to_rfc3339(now)?
    },)?;

    let base_args = [
        "-chardev",
        &format!(
            "file,id=log,path=/dev/null,logfile=/dev/fd/{},logappend=on",
            logfd.as_raw_fd()
        ),
        "-serial",
        "chardev:log",
        "-vnc",
        "none",
        "-enable-kvm",
        "-kernel",
        pbs_buildcfg::PROXMOX_BACKUP_KERNEL_FN,
        "-initrd",
        &ramfs_path,
        "-append",
        &format!(
            "{} panic=1 zfs_arc_min=0 zfs_arc_max=0",
            if debug { "debug" } else { "quiet" }
        ),
        "-daemonize",
        "-pidfile",
        &format!("/dev/fd/{}", pid_file.as_raw_fd()),
        "-name",
        PBS_VM_NAME,
    ];

    // Generate drive arguments for all fidx files in backup snapshot
    let mut drives = Vec::new();
    let mut id = 0;
    for file in files {
        if !file.ends_with(".img.fidx") {
            continue;
        }
        drives.push("-drive".to_owned());
        let keyfile = if let Some(ref keyfile) = details.keyfile {
            format!(",,keyfile={}", keyfile)
        } else {
            "".to_owned()
        };
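        // note: ',,' produces a literal comma after QEMU's option parsing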
        drives.push(format!(
            "file=pbs:repository={},,snapshot={},,archive={}{},read-only=on,if=none,id=drive{}",
            details.repo, details.snapshot, file, keyfile, id
        ));

        // a PCI bus can only support 32 devices, so add a new one every 32
        let bus = (id / 32) + 2;
        if id % 32 == 0 {
            drives.push("-device".to_owned());
            drives.push(format!("pci-bridge,id=bridge{},chassis_nr={}", bus, bus));
        }

        drives.push("-device".to_owned());
        // drive serial is used by VM to map .fidx files to /dev paths
        let serial = file.strip_suffix(".img.fidx").unwrap_or(&file);
        drives.push(format!(
            "virtio-blk-pci,drive=drive{},serial={},bus=bridge{}",
            id, serial, bus
        ));
        id += 1;
    }

    let ram = if debug {
        1024
    } else {
        // add more RAM if many drives are given
        match id {
            f if f < 10 => 192,
            f if f < 20 => 256,
            _ => 384,
        }
    };

    // Try starting QEMU in a loop to retry if we fail because of a bad 'cid' value
    let mut attempts = 0;
    loop {
        let mut qemu_cmd = std::process::Command::new("qemu-system-x86_64");
        qemu_cmd.args(base_args.iter());
        qemu_cmd.arg("-m");
        qemu_cmd.arg(ram.to_string());
        qemu_cmd.args(&drives);
        qemu_cmd.arg("-device");
        qemu_cmd.arg(format!(
            "vhost-vsock-pci,guest-cid={},disable-legacy=on",
            cid
        ));

        if debug {
            let debug_args = [
                "-chardev",
                &format!(
                    "socket,id=debugser,path=/run/proxmox-backup/file-restore-serial-{}.sock,server,nowait",
                    cid
                ),
                "-serial",
                "chardev:debugser",
            ];
            qemu_cmd.args(debug_args.iter());
        }

        qemu_cmd.stdout(std::process::Stdio::null());
        qemu_cmd.stderr(std::process::Stdio::piped());

        let res = tokio::task::block_in_place(|| qemu_cmd.spawn()?.wait_with_output())?;

        if res.status.success() {
            // at this point QEMU is already daemonized and running, so if anything fails we
            // technically leave behind a zombie-VM... this shouldn't matter, as it will stop
            // itself soon enough (timer), and the following operations are unlikely to fail
            let mut pidstr = String::new();
            pid_file.read_to_string(&mut pidstr)?;
            pid = pidstr.trim_end().parse().map_err(|err| {
                format_err!("cannot parse PID returned by QEMU ('{}'): {}", &pidstr, err)
            })?;
            break;
        } else {
            let out = String::from_utf8_lossy(&res.stderr);
            if out.contains("unable to set guest cid: Address already in use") {
                attempts += 1;
                if attempts >= MAX_CID_TRIES {
                    bail!("CID '{}' in use, but max attempts reached, aborting", cid);
                }
                // CID in use, try next higher one
                eprintln!("CID '{}' in use by other VM, attempting next one", cid);
                // skip special-meaning low values
                cid = cid.wrapping_add(1).max(10);
            } else {
                eprint!("{}", out);
                bail!("Starting VM failed. See output above for more information.");
            }
        }
    }

    // QEMU has started successfully, now wait for virtio socket to become ready
    let pid_t = Pid::from_raw(pid);
    let start_poll = Instant::now();
    let mut round = 1;
    loop {
        let client = VsockClient::new(cid as i32, DEFAULT_VSOCK_PORT, Some(ticket.to_owned()));
        if let Ok(Ok(_)) =
            time::timeout(Duration::from_secs(2), client.get("api2/json/status", None)).await
        {
            if debug {
                eprintln!(
                    "Connect to '/run/proxmox-backup/file-restore-serial-{}.sock' for shell access",
                    cid
                )
            }
            return Ok((pid, cid as i32));
        }

        // check if QEMU process exited in between
        if kill(pid_t, None).is_err() {
            bail!("VM exited before connection could be established");
        }

        if Instant::now().duration_since(start_poll) > Duration::from_secs(25) {
            break;
        }
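
        // scale the per-round delay up dynamically: 25 ms * round reacts quickly to VMs that
        // come up fast while polling slower ones less often; with the 25 s deadline above this
        // means at most ~45 rounds, roughly 25.875 s total and a maximum between-round delay
        // of 1.125 s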
        time::sleep(Duration::from_millis(round * 25)).await;
        round += 1;
    }

    // start failed
    if let Err(err) = try_kill_vm(pid) {
        eprintln!("killing failed VM failed: {}", err);
    }
    bail!("starting VM timed out");
}