2021-03-31 10:21:52 +00:00
|
|
|
//! Daemon binary to run inside a micro-VM for secure single file restore of disk images
|
2021-09-01 12:37:11 +00:00
|
|
|
use std::fs::File;
|
|
|
|
use std::io::prelude::*;
|
2021-03-31 10:21:52 +00:00
|
|
|
use std::os::unix::{
|
|
|
|
io::{FromRawFd, RawFd},
|
|
|
|
net,
|
|
|
|
};
|
|
|
|
use std::path::Path;
|
2021-03-31 10:21:54 +00:00
|
|
|
use std::sync::{Arc, Mutex};
|
2021-03-31 10:21:52 +00:00
|
|
|
|
2021-09-01 12:37:11 +00:00
|
|
|
use anyhow::{bail, format_err, Error};
|
|
|
|
use lazy_static::lazy_static;
|
|
|
|
use log::{error, info};
|
2021-03-31 10:21:52 +00:00
|
|
|
use tokio::sync::mpsc;
|
|
|
|
use tokio_stream::wrappers::ReceiverStream;
|
|
|
|
|
2021-10-08 09:19:37 +00:00
|
|
|
use proxmox_router::RpcEnvironmentType;
|
2021-03-31 10:21:52 +00:00
|
|
|
|
2021-09-01 12:37:11 +00:00
|
|
|
use pbs_client::DEFAULT_VSOCK_PORT;
|
2021-10-05 09:01:05 +00:00
|
|
|
use proxmox_rest_server::{ApiConfig, RestServer};
|
2021-07-23 06:19:35 +00:00
|
|
|
|
2021-03-31 10:21:52 +00:00
|
|
|
mod proxmox_restore_daemon;
|
|
|
|
use proxmox_restore_daemon::*;
|
|
|
|
|
|
|
|
/// Maximum amount of pending requests. If saturated, virtio-vsock returns ETIMEDOUT immediately.
|
|
|
|
/// We should never have more than a few requests in queue, so use a low number.
|
|
|
|
pub const MAX_PENDING: usize = 32;
|
|
|
|
|
|
|
|
/// Marker file that will be present in the base initramfs; the daemon refuses to start without it
|
|
|
|
pub const VM_DETECT_FILE: &str = "/restore-vm-marker";
|
|
|
|
|
2021-03-31 10:21:54 +00:00
|
|
|
lazy_static! {
    /// Shared state of the attached disks, use it for accessing data on the attached snapshots.
    ///
    /// The value is produced by `DiskState::scan()`; a failing scan panics.
    pub static ref DISK_STATE: Arc<Mutex<DiskState>> = {
        let scanned = DiskState::scan().unwrap();
        Arc::new(Mutex::new(scanned))
    };
}
|
|
|
|
|
2021-03-31 10:21:52 +00:00
|
|
|
/// This is expected to be run by 'proxmox-file-restore' within a mini-VM
|
|
|
|
fn main() -> Result<(), Error> {
|
Set MMAP_THRESHOLD to a fixed value (128K)
glibc's malloc has a misguided heuristic to detect transient allocations that
will just result in allocation sizes below 32 MiB never using mmap.
That it turn means that those relatively big allocations are on the heap where
cleanup and returning memory to the OS is harder to do and easier to be blocked
by long living, small allocations at the top (end) of the heap.
Observing the malloc size distribution in a file-level backup run:
@size:
[0] 14 | |
[1] 25214 |@@@@@ |
[2, 4) 9090 |@ |
[4, 8) 12987 |@@ |
[8, 16) 93453 |@@@@@@@@@@@@@@@@@@@@ |
[16, 32) 30255 |@@@@@@ |
[32, 64) 237445 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[64, 128) 32692 |@@@@@@@ |
[128, 256) 22296 |@@@@ |
[256, 512) 16177 |@@@ |
[512, 1K) 5139 |@ |
[1K, 2K) 3352 | |
[2K, 4K) 214 | |
[4K, 8K) 1568 | |
[8K, 16K) 95 | |
[16K, 32K) 3457 | |
[32K, 64K) 3175 | |
[64K, 128K) 161 | |
[128K, 256K) 453 | |
[256K, 512K) 93 | |
[512K, 1M) 74 | |
[1M, 2M) 774 | |
[2M, 4M) 319 | |
[4M, 8M) 700 | |
[8M, 16M) 93 | |
[16M, 32M) 18 | |
We see that all allocations will be on the heap, and that while most
allocations are small, the relatively few big ones will still make up most of
the RSS and if blocked from being released back to the OS result in much higher
peak and average usage for the program than actually required.
Avoiding the "dynamic" mmap-threshold increasement algorithm and fixing it at
the original default of 128 KiB reduces RSS size by factor 10-20 when running
backups. As with memory mappings other mappings or the heap can never block
freeing the memory fully back to the OS.
But, the drawback of using mmap is more wasted space for unaligned or small
allocation sizes, and the fact that the kernel allegedly zeros out the data
before giving it to user space. The former doesn't really matter for us when
using it only for allocations bigger than 128 KiB, and the latter is a
trade-off, using 10 to 20 times less memory brings its own performance
improvement possibilities for the whole system after all ;-)
Signed-off-by: Dietmar Maurer <dietmar@proxmox.com>
[ Thomas: added to comment & commit message + extra-empty-line fixes ]
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
2022-01-26 06:10:59 +00:00
|
|
|
pbs_tools::setup_libc_malloc_opts();
|
|
|
|
|
2021-03-31 10:21:52 +00:00
|
|
|
if !Path::new(VM_DETECT_FILE).exists() {
|
2021-07-03 17:51:14 +00:00
|
|
|
bail!(
|
|
|
|
"This binary is not supposed to be run manually, use 'proxmox-file-restore' instead."
|
|
|
|
);
|
2021-03-31 10:21:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// don't have a real syslog (and no persistance), so use env_logger to print to a log file (via
|
|
|
|
// stdout to a serial terminal attached by QEMU)
|
2021-12-16 10:12:36 +00:00
|
|
|
env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info"))
|
2021-03-31 10:21:52 +00:00
|
|
|
.write_style(env_logger::WriteStyle::Never)
|
2021-07-03 19:28:17 +00:00
|
|
|
.format_timestamp_millis()
|
2021-03-31 10:21:52 +00:00
|
|
|
.init();
|
|
|
|
|
2021-07-23 06:24:25 +00:00
|
|
|
info!("setup basic system environment...");
|
|
|
|
setup_system_env().map_err(|err| format_err!("system environment setup failed: {}", err))?;
|
2021-07-03 19:03:13 +00:00
|
|
|
|
2021-03-31 10:21:54 +00:00
|
|
|
// scan all attached disks now, before starting the API
|
|
|
|
// this will panic and stop the VM if anything goes wrong
|
2021-07-03 17:51:32 +00:00
|
|
|
info!("scanning all disks...");
|
2021-03-31 10:21:54 +00:00
|
|
|
{
|
|
|
|
let _disk_state = DISK_STATE.lock().unwrap();
|
|
|
|
}
|
|
|
|
|
2021-07-03 17:51:32 +00:00
|
|
|
info!("disk scan complete, starting main runtime...");
|
|
|
|
|
2021-11-19 16:36:06 +00:00
|
|
|
proxmox_async::runtime::main(run())
|
2021-03-31 10:21:52 +00:00
|
|
|
}
|
|
|
|
|
2021-07-23 06:10:55 +00:00
|
|
|
/// ensure we have our /run dirs, system users and stuff like that setup
|
|
|
|
fn setup_system_env() -> Result<(), Error> {
|
|
|
|
// the API may save some stuff there, e.g., the memcon tracking file
|
|
|
|
// we do not care much, but it's way less headache to just create it
|
|
|
|
std::fs::create_dir_all("/run/proxmox-backup")?;
|
|
|
|
|
2021-07-23 06:19:35 +00:00
|
|
|
// we now ensure that all lock files are owned by the backup user, and as we reuse the
|
|
|
|
// specialized REST module from pbs api/daemon we have some checks there for user/acl stuff
|
|
|
|
// that gets locked, and thus needs the backup system user to work.
|
|
|
|
std::fs::create_dir_all("/etc")?;
|
|
|
|
let mut passwd = File::create("/etc/passwd")?;
|
|
|
|
writeln!(passwd, "root:x:0:0:root:/root:/bin/sh")?;
|
|
|
|
writeln!(passwd, "backup:x:34:34:backup:/var/backups:/usr/sbin/nologin")?;
|
|
|
|
|
|
|
|
let mut group = File::create("/etc/group")?;
|
|
|
|
writeln!(group, "root:x:0:")?;
|
|
|
|
writeln!(group, "backup:x:34:")?;
|
|
|
|
|
2021-07-23 06:10:55 +00:00
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
|
2021-09-21 05:58:47 +00:00
|
|
|
|
2021-03-31 10:21:52 +00:00
|
|
|
async fn run() -> Result<(), Error> {
|
2021-03-31 10:21:53 +00:00
|
|
|
watchdog_init();
|
|
|
|
|
2021-10-05 09:01:05 +00:00
|
|
|
let adaptor = StaticAuthAdapter::new()
|
|
|
|
.map_err(|err| format_err!("reading ticket file failed: {}", err))?;
|
|
|
|
|
|
|
|
let config = ApiConfig::new("", &ROUTER, RpcEnvironmentType::PUBLIC, adaptor)?;
|
2021-03-31 10:21:52 +00:00
|
|
|
let rest_server = RestServer::new(config);
|
|
|
|
|
|
|
|
let vsock_fd = get_vsock_fd()?;
|
|
|
|
let connections = accept_vsock_connections(vsock_fd);
|
|
|
|
let receiver_stream = ReceiverStream::new(connections);
|
|
|
|
let acceptor = hyper::server::accept::from_stream(receiver_stream);
|
|
|
|
|
|
|
|
hyper::Server::builder(acceptor).serve(rest_server).await?;
|
|
|
|
|
|
|
|
bail!("hyper server exited");
|
|
|
|
}
|
|
|
|
|
|
|
|
fn accept_vsock_connections(
|
|
|
|
vsock_fd: RawFd,
|
|
|
|
) -> mpsc::Receiver<Result<tokio::net::UnixStream, Error>> {
|
|
|
|
use nix::sys::socket::*;
|
|
|
|
let (sender, receiver) = mpsc::channel(MAX_PENDING);
|
|
|
|
|
|
|
|
tokio::spawn(async move {
|
|
|
|
loop {
|
|
|
|
let stream: Result<tokio::net::UnixStream, Error> = tokio::task::block_in_place(|| {
|
|
|
|
// we need to accept manually, as UnixListener aborts if socket type != AF_UNIX ...
|
|
|
|
let client_fd = accept(vsock_fd)?;
|
|
|
|
let stream = unsafe { net::UnixStream::from_raw_fd(client_fd) };
|
|
|
|
stream.set_nonblocking(true)?;
|
|
|
|
tokio::net::UnixStream::from_std(stream).map_err(|err| err.into())
|
|
|
|
});
|
|
|
|
|
|
|
|
match stream {
|
|
|
|
Ok(stream) => {
|
|
|
|
if sender.send(Ok(stream)).await.is_err() {
|
|
|
|
error!("connection accept channel was closed");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Err(err) => {
|
|
|
|
error!("error accepting vsock connetion: {}", err);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
});
|
|
|
|
|
|
|
|
receiver
|
|
|
|
}
|
|
|
|
|
|
|
|
fn get_vsock_fd() -> Result<RawFd, Error> {
|
|
|
|
use nix::sys::socket::*;
|
|
|
|
let sock_fd = socket(
|
|
|
|
AddressFamily::Vsock,
|
|
|
|
SockType::Stream,
|
|
|
|
SockFlag::empty(),
|
|
|
|
None,
|
|
|
|
)?;
|
|
|
|
let sock_addr = VsockAddr::new(libc::VMADDR_CID_ANY, DEFAULT_VSOCK_PORT as u32);
|
|
|
|
bind(sock_fd, &SockAddr::Vsock(sock_addr))?;
|
|
|
|
listen(sock_fd, MAX_PENDING)?;
|
|
|
|
Ok(sock_fd)
|
|
|
|
}
|