2020-07-29 11:29:13 +00:00
|
|
|
use std::collections::HashSet;
|
2020-09-01 09:17:13 +00:00
|
|
|
use std::sync::{Arc, Mutex};
|
|
|
|
use std::sync::atomic::{Ordering, AtomicUsize};
|
|
|
|
use std::time::Instant;
|
2020-10-20 08:08:24 +00:00
|
|
|
use nix::dir::Dir;
|
2020-07-29 11:29:13 +00:00
|
|
|
|
2020-08-25 15:30:27 +00:00
|
|
|
use anyhow::{bail, format_err, Error};
|
2020-06-24 11:11:45 +00:00
|
|
|
|
2020-09-26 08:23:44 +00:00
|
|
|
use crate::{
|
2020-10-20 09:10:05 +00:00
|
|
|
server::WorkerTask,
|
2020-09-26 08:23:44 +00:00
|
|
|
api2::types::*,
|
2020-10-20 09:10:05 +00:00
|
|
|
config::jobstate::Job,
|
|
|
|
config::verify::VerificationJobConfig,
|
2020-09-26 08:23:44 +00:00
|
|
|
backup::{
|
|
|
|
DataStore,
|
|
|
|
DataBlob,
|
|
|
|
BackupGroup,
|
|
|
|
BackupDir,
|
|
|
|
BackupInfo,
|
|
|
|
IndexFile,
|
|
|
|
CryptMode,
|
|
|
|
FileInfo,
|
|
|
|
ArchiveType,
|
|
|
|
archive_type,
|
|
|
|
},
|
2020-10-12 09:46:34 +00:00
|
|
|
server::UPID,
|
|
|
|
task::TaskState,
|
|
|
|
task_log,
|
|
|
|
tools::ParallelHandler,
|
2020-10-14 12:16:33 +00:00
|
|
|
tools::fs::lock_dir_noblock_shared,
|
2020-06-24 11:11:45 +00:00
|
|
|
};
|
|
|
|
|
2020-09-01 09:17:13 +00:00
|
|
|
fn verify_blob(datastore: Arc<DataStore>, backup_dir: &BackupDir, info: &FileInfo) -> Result<(), Error> {
|
2020-06-24 11:11:45 +00:00
|
|
|
|
2020-07-28 08:23:16 +00:00
|
|
|
let blob = datastore.load_blob(backup_dir, &info.filename)?;
|
2020-06-24 11:11:45 +00:00
|
|
|
|
2020-07-29 11:29:13 +00:00
|
|
|
let raw_size = blob.raw_size();
|
2020-06-24 11:11:45 +00:00
|
|
|
if raw_size != info.size {
|
|
|
|
bail!("wrong size ({} != {})", info.size, raw_size);
|
|
|
|
}
|
|
|
|
|
2020-07-28 08:23:16 +00:00
|
|
|
let csum = openssl::sha::sha256(blob.raw_data());
|
2020-06-24 11:11:45 +00:00
|
|
|
if csum != info.csum {
|
|
|
|
bail!("wrong index checksum");
|
|
|
|
}
|
|
|
|
|
2020-08-03 12:10:43 +00:00
|
|
|
match blob.crypt_mode()? {
|
|
|
|
CryptMode::Encrypt => Ok(()),
|
|
|
|
CryptMode::None => {
|
|
|
|
// digest already verified above
|
|
|
|
blob.decode(None, None)?;
|
|
|
|
Ok(())
|
|
|
|
},
|
|
|
|
CryptMode::SignOnly => bail!("Invalid CryptMode for blob"),
|
2020-06-24 11:11:45 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-09-07 15:30:33 +00:00
|
|
|
fn rename_corrupted_chunk(
|
|
|
|
datastore: Arc<DataStore>,
|
|
|
|
digest: &[u8;32],
|
2020-10-12 09:46:34 +00:00
|
|
|
worker: &dyn TaskState,
|
2020-09-07 15:30:33 +00:00
|
|
|
) {
|
|
|
|
let (path, digest_str) = datastore.chunk_path(digest);
|
|
|
|
|
|
|
|
let mut counter = 0;
|
|
|
|
let mut new_path = path.clone();
|
2020-09-08 10:29:53 +00:00
|
|
|
loop {
|
2020-09-07 15:30:33 +00:00
|
|
|
new_path.set_file_name(format!("{}.{}.bad", digest_str, counter));
|
2020-09-08 10:29:53 +00:00
|
|
|
if new_path.exists() && counter < 9 { counter += 1; } else { break; }
|
2020-09-07 15:30:33 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
match std::fs::rename(&path, &new_path) {
|
|
|
|
Ok(_) => {
|
2020-10-12 09:46:34 +00:00
|
|
|
task_log!(worker, "corrupted chunk renamed to {:?}", &new_path);
|
2020-09-07 15:30:33 +00:00
|
|
|
},
|
|
|
|
Err(err) => {
|
|
|
|
match err.kind() {
|
|
|
|
std::io::ErrorKind::NotFound => { /* ignored */ },
|
2020-10-12 09:46:34 +00:00
|
|
|
_ => task_log!(worker, "could not rename corrupted chunk {:?} - {}", &path, err)
|
2020-09-07 15:30:33 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
}
|
|
|
|
|
2020-06-26 06:14:45 +00:00
|
|
|
fn verify_index_chunks(
|
2020-09-01 09:17:13 +00:00
|
|
|
datastore: Arc<DataStore>,
|
|
|
|
index: Box<dyn IndexFile + Send>,
|
|
|
|
verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
|
|
|
|
corrupt_chunks: Arc<Mutex<HashSet<[u8; 32]>>>,
|
2020-08-10 11:25:08 +00:00
|
|
|
crypt_mode: CryptMode,
|
2020-10-12 09:46:34 +00:00
|
|
|
worker: Arc<dyn TaskState + Send + Sync>,
|
2020-06-26 06:14:45 +00:00
|
|
|
) -> Result<(), Error> {
|
|
|
|
|
2020-10-01 12:48:49 +00:00
|
|
|
let errors = Arc::new(AtomicUsize::new(0));
|
2020-06-26 06:14:45 +00:00
|
|
|
|
2020-09-01 09:17:13 +00:00
|
|
|
let start_time = Instant::now();
|
2020-06-26 06:14:45 +00:00
|
|
|
|
2020-09-01 09:17:13 +00:00
|
|
|
let mut read_bytes = 0;
|
|
|
|
let mut decoded_bytes = 0;
|
2020-08-25 06:52:24 +00:00
|
|
|
|
2020-09-26 09:14:37 +00:00
|
|
|
let worker2 = Arc::clone(&worker);
|
|
|
|
let datastore2 = Arc::clone(&datastore);
|
|
|
|
let corrupt_chunks2 = Arc::clone(&corrupt_chunks);
|
|
|
|
let verified_chunks2 = Arc::clone(&verified_chunks);
|
2020-10-01 12:48:49 +00:00
|
|
|
let errors2 = Arc::clone(&errors);
|
2020-09-26 09:14:37 +00:00
|
|
|
|
|
|
|
let decoder_pool = ParallelHandler::new(
|
|
|
|
"verify chunk decoder", 4,
|
|
|
|
move |(chunk, digest, size): (DataBlob, [u8;32], u64)| {
|
|
|
|
let chunk_crypt_mode = match chunk.crypt_mode() {
|
|
|
|
Err(err) => {
|
|
|
|
corrupt_chunks2.lock().unwrap().insert(digest);
|
2020-10-12 09:46:34 +00:00
|
|
|
task_log!(worker2, "can't verify chunk, unknown CryptMode - {}", err);
|
2020-09-26 09:14:37 +00:00
|
|
|
errors2.fetch_add(1, Ordering::SeqCst);
|
|
|
|
return Ok(());
|
|
|
|
},
|
|
|
|
Ok(mode) => mode,
|
|
|
|
};
|
|
|
|
|
|
|
|
if chunk_crypt_mode != crypt_mode {
|
2020-10-12 09:46:34 +00:00
|
|
|
task_log!(
|
|
|
|
worker2,
|
2020-09-26 09:14:37 +00:00
|
|
|
"chunk CryptMode {:?} does not match index CryptMode {:?}",
|
|
|
|
chunk_crypt_mode,
|
|
|
|
crypt_mode
|
2020-10-12 09:46:34 +00:00
|
|
|
);
|
2020-09-26 09:14:37 +00:00
|
|
|
errors2.fetch_add(1, Ordering::SeqCst);
|
|
|
|
}
|
|
|
|
|
|
|
|
if let Err(err) = chunk.verify_unencrypted(size as usize, &digest) {
|
|
|
|
corrupt_chunks2.lock().unwrap().insert(digest);
|
2020-10-12 09:46:34 +00:00
|
|
|
task_log!(worker2, "{}", err);
|
2020-09-26 09:14:37 +00:00
|
|
|
errors2.fetch_add(1, Ordering::SeqCst);
|
2020-10-12 09:46:34 +00:00
|
|
|
rename_corrupted_chunk(datastore2.clone(), &digest, &worker2);
|
2020-09-26 09:14:37 +00:00
|
|
|
} else {
|
|
|
|
verified_chunks2.lock().unwrap().insert(digest);
|
|
|
|
}
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
);
|
|
|
|
|
|
|
|
for pos in 0..index.index_count() {
|
2020-07-29 11:29:13 +00:00
|
|
|
|
2020-10-12 09:46:34 +00:00
|
|
|
worker.check_abort()?;
|
2020-09-02 07:50:17 +00:00
|
|
|
crate::tools::fail_on_shutdown()?;
|
2020-09-01 09:17:13 +00:00
|
|
|
|
2020-09-26 09:14:37 +00:00
|
|
|
let info = index.chunk_info(pos).unwrap();
|
|
|
|
let size = info.size();
|
2020-08-10 11:25:08 +00:00
|
|
|
|
2020-09-26 09:14:37 +00:00
|
|
|
if verified_chunks.lock().unwrap().contains(&info.digest) {
|
|
|
|
continue; // already verified
|
|
|
|
}
|
2020-09-01 09:17:13 +00:00
|
|
|
|
2020-09-26 09:14:37 +00:00
|
|
|
if corrupt_chunks.lock().unwrap().contains(&info.digest) {
|
|
|
|
let digest_str = proxmox::tools::digest_to_hex(&info.digest);
|
2020-10-12 09:46:34 +00:00
|
|
|
task_log!(worker, "chunk {} was marked as corrupt", digest_str);
|
2020-09-01 09:17:13 +00:00
|
|
|
errors.fetch_add(1, Ordering::SeqCst);
|
2020-09-26 09:14:37 +00:00
|
|
|
continue;
|
2020-08-10 11:25:08 +00:00
|
|
|
}
|
|
|
|
|
2020-09-26 09:14:37 +00:00
|
|
|
match datastore.load_chunk(&info.digest) {
|
|
|
|
Err(err) => {
|
|
|
|
corrupt_chunks.lock().unwrap().insert(info.digest);
|
2020-10-12 09:46:34 +00:00
|
|
|
task_log!(worker, "can't verify chunk, load failed - {}", err);
|
2020-09-26 09:14:37 +00:00
|
|
|
errors.fetch_add(1, Ordering::SeqCst);
|
2020-10-12 09:46:34 +00:00
|
|
|
rename_corrupted_chunk(datastore.clone(), &info.digest, &worker);
|
2020-09-26 09:14:37 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
Ok(chunk) => {
|
|
|
|
read_bytes += chunk.raw_size();
|
|
|
|
decoder_pool.send((chunk, info.digest, size))?;
|
|
|
|
decoded_bytes += size;
|
|
|
|
}
|
2020-07-29 11:29:13 +00:00
|
|
|
}
|
2020-06-26 06:14:45 +00:00
|
|
|
}
|
|
|
|
|
2020-09-26 09:14:37 +00:00
|
|
|
decoder_pool.complete()?;
|
|
|
|
|
2020-09-01 09:17:13 +00:00
|
|
|
let elapsed = start_time.elapsed().as_secs_f64();
|
|
|
|
|
|
|
|
let read_bytes_mib = (read_bytes as f64)/(1024.0*1024.0);
|
|
|
|
let decoded_bytes_mib = (decoded_bytes as f64)/(1024.0*1024.0);
|
|
|
|
|
|
|
|
let read_speed = read_bytes_mib/elapsed;
|
|
|
|
let decode_speed = decoded_bytes_mib/elapsed;
|
|
|
|
|
|
|
|
let error_count = errors.load(Ordering::SeqCst);
|
|
|
|
|
2020-10-12 09:46:34 +00:00
|
|
|
task_log!(
|
|
|
|
worker,
|
|
|
|
" verified {:.2}/{:.2} MiB in {:.2} seconds, speed {:.2}/{:.2} MiB/s ({} errors)",
|
|
|
|
read_bytes_mib,
|
|
|
|
decoded_bytes_mib,
|
|
|
|
elapsed,
|
|
|
|
read_speed,
|
|
|
|
decode_speed,
|
|
|
|
error_count,
|
|
|
|
);
|
2020-09-01 09:17:13 +00:00
|
|
|
|
|
|
|
if errors.load(Ordering::SeqCst) > 0 {
|
2020-07-30 07:09:03 +00:00
|
|
|
bail!("chunks could not be verified");
|
|
|
|
}
|
|
|
|
|
2020-06-26 06:14:45 +00:00
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
|
2020-07-29 11:29:13 +00:00
|
|
|
fn verify_fixed_index(
|
2020-09-01 09:17:13 +00:00
|
|
|
datastore: Arc<DataStore>,
|
2020-07-29 11:29:13 +00:00
|
|
|
backup_dir: &BackupDir,
|
|
|
|
info: &FileInfo,
|
2020-09-01 09:17:13 +00:00
|
|
|
verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
|
|
|
|
corrupt_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
|
2020-10-12 09:46:34 +00:00
|
|
|
worker: Arc<dyn TaskState + Send + Sync>,
|
2020-07-29 11:29:13 +00:00
|
|
|
) -> Result<(), Error> {
|
2020-06-24 11:11:45 +00:00
|
|
|
|
|
|
|
let mut path = backup_dir.relative_path();
|
|
|
|
path.push(&info.filename);
|
|
|
|
|
|
|
|
let index = datastore.open_fixed_reader(&path)?;
|
|
|
|
|
|
|
|
let (csum, size) = index.compute_csum();
|
|
|
|
if size != info.size {
|
|
|
|
bail!("wrong size ({} != {})", info.size, size);
|
|
|
|
}
|
|
|
|
|
|
|
|
if csum != info.csum {
|
|
|
|
bail!("wrong index checksum");
|
|
|
|
}
|
|
|
|
|
2020-10-12 09:46:34 +00:00
|
|
|
verify_index_chunks(
|
|
|
|
datastore,
|
|
|
|
Box::new(index),
|
|
|
|
verified_chunks,
|
|
|
|
corrupt_chunks,
|
|
|
|
info.chunk_crypt_mode(),
|
|
|
|
worker,
|
|
|
|
)
|
2020-06-24 11:11:45 +00:00
|
|
|
}
|
|
|
|
|
2020-07-29 11:29:13 +00:00
|
|
|
fn verify_dynamic_index(
|
2020-09-01 09:17:13 +00:00
|
|
|
datastore: Arc<DataStore>,
|
2020-07-29 11:29:13 +00:00
|
|
|
backup_dir: &BackupDir,
|
|
|
|
info: &FileInfo,
|
2020-09-01 09:17:13 +00:00
|
|
|
verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
|
|
|
|
corrupt_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
|
2020-10-12 09:46:34 +00:00
|
|
|
worker: Arc<dyn TaskState + Send + Sync>,
|
2020-07-29 11:29:13 +00:00
|
|
|
) -> Result<(), Error> {
|
|
|
|
|
2020-06-24 11:11:45 +00:00
|
|
|
let mut path = backup_dir.relative_path();
|
|
|
|
path.push(&info.filename);
|
|
|
|
|
|
|
|
let index = datastore.open_dynamic_reader(&path)?;
|
|
|
|
|
|
|
|
let (csum, size) = index.compute_csum();
|
|
|
|
if size != info.size {
|
|
|
|
bail!("wrong size ({} != {})", info.size, size);
|
|
|
|
}
|
|
|
|
|
|
|
|
if csum != info.csum {
|
|
|
|
bail!("wrong index checksum");
|
|
|
|
}
|
|
|
|
|
2020-10-12 09:46:34 +00:00
|
|
|
verify_index_chunks(
|
|
|
|
datastore,
|
|
|
|
Box::new(index),
|
|
|
|
verified_chunks,
|
|
|
|
corrupt_chunks,
|
|
|
|
info.chunk_crypt_mode(),
|
|
|
|
worker,
|
|
|
|
)
|
2020-06-24 11:11:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Verify a single backup snapshot
|
|
|
|
///
|
|
|
|
/// This checks all archives inside a backup snapshot.
|
|
|
|
/// Errors are logged to the worker log.
|
|
|
|
///
|
2020-06-25 10:55:34 +00:00
|
|
|
/// Returns
|
|
|
|
/// - Ok(true) if verify is successful
|
|
|
|
/// - Ok(false) if there were verification errors
|
|
|
|
/// - Err(_) if task was aborted
|
2020-07-29 11:29:13 +00:00
|
|
|
pub fn verify_backup_dir(
|
2020-09-01 09:17:13 +00:00
|
|
|
datastore: Arc<DataStore>,
|
2020-07-29 11:29:13 +00:00
|
|
|
backup_dir: &BackupDir,
|
2020-09-01 09:17:13 +00:00
|
|
|
verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
|
|
|
|
corrupt_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
|
2020-10-12 09:46:34 +00:00
|
|
|
worker: Arc<dyn TaskState + Send + Sync>,
|
|
|
|
upid: UPID,
|
2020-07-29 11:29:13 +00:00
|
|
|
) -> Result<bool, Error> {
|
2020-10-20 08:08:24 +00:00
|
|
|
let snap_lock = lock_dir_noblock_shared(
|
2020-10-14 12:16:33 +00:00
|
|
|
&datastore.snapshot_path(&backup_dir),
|
|
|
|
"snapshot",
|
|
|
|
"locked by another operation");
|
2020-10-20 08:08:24 +00:00
|
|
|
match snap_lock {
|
|
|
|
Ok(snap_lock) => verify_backup_dir_with_lock(
|
|
|
|
datastore,
|
2020-10-14 12:16:33 +00:00
|
|
|
backup_dir,
|
2020-10-20 08:08:24 +00:00
|
|
|
verified_chunks,
|
|
|
|
corrupt_chunks,
|
|
|
|
worker,
|
|
|
|
upid,
|
|
|
|
snap_lock
|
|
|
|
),
|
|
|
|
Err(err) => {
|
|
|
|
task_log!(
|
|
|
|
worker,
|
|
|
|
"SKIPPED: verify {}:{} - could not acquire snapshot lock: {}",
|
|
|
|
datastore.name(),
|
|
|
|
backup_dir,
|
|
|
|
err,
|
|
|
|
);
|
|
|
|
Ok(true)
|
|
|
|
}
|
2020-10-14 12:16:33 +00:00
|
|
|
}
|
2020-10-20 08:08:24 +00:00
|
|
|
}
|
2020-10-14 12:16:33 +00:00
|
|
|
|
2020-10-20 08:08:24 +00:00
|
|
|
/// See verify_backup_dir
|
|
|
|
pub fn verify_backup_dir_with_lock(
|
|
|
|
datastore: Arc<DataStore>,
|
|
|
|
backup_dir: &BackupDir,
|
|
|
|
verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
|
|
|
|
corrupt_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
|
|
|
|
worker: Arc<dyn TaskState + Send + Sync>,
|
|
|
|
upid: UPID,
|
|
|
|
_snap_lock: Dir,
|
|
|
|
) -> Result<bool, Error> {
|
2020-10-16 07:31:12 +00:00
|
|
|
let manifest = match datastore.load_manifest(&backup_dir) {
|
2020-07-31 08:25:30 +00:00
|
|
|
Ok((manifest, _)) => manifest,
|
2020-06-24 11:11:45 +00:00
|
|
|
Err(err) => {
|
2020-10-12 09:46:34 +00:00
|
|
|
task_log!(
|
|
|
|
worker,
|
|
|
|
"verify {}:{} - manifest load error: {}",
|
|
|
|
datastore.name(),
|
|
|
|
backup_dir,
|
|
|
|
err,
|
|
|
|
);
|
2020-06-25 10:55:34 +00:00
|
|
|
return Ok(false);
|
2020-06-24 11:11:45 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2020-10-12 09:46:34 +00:00
|
|
|
task_log!(worker, "verify {}:{}", datastore.name(), backup_dir);
|
2020-06-24 11:11:45 +00:00
|
|
|
|
|
|
|
let mut error_count = 0;
|
|
|
|
|
2020-09-15 08:19:23 +00:00
|
|
|
let mut verify_result = VerifyState::Ok;
|
2020-06-24 11:11:45 +00:00
|
|
|
for info in manifest.files() {
|
|
|
|
let result = proxmox::try_block!({
|
2020-10-12 09:46:34 +00:00
|
|
|
task_log!(worker, " check {}", info.filename);
|
2020-06-24 11:11:45 +00:00
|
|
|
match archive_type(&info.filename)? {
|
2020-07-30 07:09:04 +00:00
|
|
|
ArchiveType::FixedIndex =>
|
|
|
|
verify_fixed_index(
|
2020-09-01 09:17:13 +00:00
|
|
|
datastore.clone(),
|
2020-07-30 07:09:04 +00:00
|
|
|
&backup_dir,
|
|
|
|
info,
|
2020-09-01 09:17:13 +00:00
|
|
|
verified_chunks.clone(),
|
|
|
|
corrupt_chunks.clone(),
|
|
|
|
worker.clone(),
|
2020-07-30 07:09:04 +00:00
|
|
|
),
|
|
|
|
ArchiveType::DynamicIndex =>
|
|
|
|
verify_dynamic_index(
|
2020-09-01 09:17:13 +00:00
|
|
|
datastore.clone(),
|
2020-07-30 07:09:04 +00:00
|
|
|
&backup_dir,
|
|
|
|
info,
|
2020-09-01 09:17:13 +00:00
|
|
|
verified_chunks.clone(),
|
|
|
|
corrupt_chunks.clone(),
|
|
|
|
worker.clone(),
|
2020-07-30 07:09:04 +00:00
|
|
|
),
|
2020-09-01 09:17:13 +00:00
|
|
|
ArchiveType::Blob => verify_blob(datastore.clone(), &backup_dir, info),
|
2020-06-24 11:11:45 +00:00
|
|
|
}
|
|
|
|
});
|
2020-06-25 10:55:34 +00:00
|
|
|
|
2020-10-12 09:46:34 +00:00
|
|
|
worker.check_abort()?;
|
2020-09-02 07:50:17 +00:00
|
|
|
crate::tools::fail_on_shutdown()?;
|
2020-06-25 10:55:34 +00:00
|
|
|
|
2020-06-24 11:11:45 +00:00
|
|
|
if let Err(err) = result {
|
2020-10-12 09:46:34 +00:00
|
|
|
task_log!(
|
|
|
|
worker,
|
|
|
|
"verify {}:{}/{} failed: {}",
|
|
|
|
datastore.name(),
|
|
|
|
backup_dir,
|
|
|
|
info.filename,
|
|
|
|
err,
|
|
|
|
);
|
2020-06-24 11:11:45 +00:00
|
|
|
error_count += 1;
|
2020-09-15 08:19:23 +00:00
|
|
|
verify_result = VerifyState::Failed;
|
2020-06-24 11:11:45 +00:00
|
|
|
}
|
2020-08-25 15:30:27 +00:00
|
|
|
|
2020-06-24 11:11:45 +00:00
|
|
|
}
|
|
|
|
|
2020-08-25 15:30:27 +00:00
|
|
|
let verify_state = SnapshotVerifyState {
|
2020-09-15 08:19:23 +00:00
|
|
|
state: verify_result,
|
2020-10-12 09:46:34 +00:00
|
|
|
upid,
|
2020-08-25 15:30:27 +00:00
|
|
|
};
|
2020-10-16 07:31:12 +00:00
|
|
|
let verify_state = serde_json::to_value(verify_state)?;
|
|
|
|
datastore.update_manifest(&backup_dir, |manifest| {
|
|
|
|
manifest.unprotected["verify_state"] = verify_state;
|
|
|
|
}).map_err(|err| format_err!("unable to update manifest blob - {}", err))?;
|
2020-08-25 15:30:27 +00:00
|
|
|
|
2020-06-25 10:55:34 +00:00
|
|
|
Ok(error_count == 0)
|
2020-06-24 11:11:45 +00:00
|
|
|
}
|
|
|
|
|
2020-06-25 10:55:34 +00:00
|
|
|
/// Verify all backups inside a backup group
|
|
|
|
///
|
|
|
|
/// Errors are logged to the worker log.
|
|
|
|
///
|
|
|
|
/// Returns
|
2020-09-02 05:43:04 +00:00
|
|
|
/// - Ok((count, failed_dirs)) where failed_dirs had verification errors
|
2020-06-25 10:55:34 +00:00
|
|
|
/// - Err(_) if task was aborted
|
2020-09-01 11:33:04 +00:00
|
|
|
pub fn verify_backup_group(
|
|
|
|
datastore: Arc<DataStore>,
|
|
|
|
group: &BackupGroup,
|
|
|
|
verified_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
|
|
|
|
corrupt_chunks: Arc<Mutex<HashSet<[u8;32]>>>,
|
2020-09-02 05:43:04 +00:00
|
|
|
progress: Option<(usize, usize)>, // (done, snapshot_count)
|
2020-10-12 09:46:34 +00:00
|
|
|
worker: Arc<dyn TaskState + Send + Sync>,
|
|
|
|
upid: &UPID,
|
2020-09-02 05:43:04 +00:00
|
|
|
) -> Result<(usize, Vec<String>), Error> {
|
2020-06-24 11:11:45 +00:00
|
|
|
|
2020-07-30 07:09:05 +00:00
|
|
|
let mut errors = Vec::new();
|
2020-06-24 11:11:45 +00:00
|
|
|
let mut list = match group.list_backups(&datastore.base_path()) {
|
|
|
|
Ok(list) => list,
|
|
|
|
Err(err) => {
|
2020-10-12 09:46:34 +00:00
|
|
|
task_log!(
|
|
|
|
worker,
|
|
|
|
"verify group {}:{} - unable to list backups: {}",
|
|
|
|
datastore.name(),
|
|
|
|
group,
|
|
|
|
err,
|
|
|
|
);
|
2020-09-02 05:43:04 +00:00
|
|
|
return Ok((0, errors));
|
2020-06-24 11:11:45 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2020-10-12 09:46:34 +00:00
|
|
|
task_log!(worker, "verify group {}:{}", datastore.name(), group);
|
2020-06-24 11:11:45 +00:00
|
|
|
|
2020-09-02 05:43:04 +00:00
|
|
|
let (done, snapshot_count) = progress.unwrap_or((0, list.len()));
|
|
|
|
|
|
|
|
let mut count = 0;
|
2020-06-24 11:11:45 +00:00
|
|
|
BackupInfo::sort_list(&mut list, false); // newest first
|
|
|
|
for info in list {
|
2020-09-02 05:43:04 +00:00
|
|
|
count += 1;
|
2020-10-12 09:46:34 +00:00
|
|
|
if !verify_backup_dir(
|
|
|
|
datastore.clone(),
|
|
|
|
&info.backup_dir,
|
|
|
|
verified_chunks.clone(),
|
|
|
|
corrupt_chunks.clone(),
|
|
|
|
worker.clone(),
|
|
|
|
upid.clone(),
|
|
|
|
)? {
|
2020-07-30 07:09:05 +00:00
|
|
|
errors.push(info.backup_dir.to_string());
|
2020-06-24 11:11:45 +00:00
|
|
|
}
|
2020-09-02 05:43:04 +00:00
|
|
|
if snapshot_count != 0 {
|
|
|
|
let pos = done + count;
|
|
|
|
let percentage = ((pos as f64) * 100.0)/(snapshot_count as f64);
|
2020-10-12 09:46:34 +00:00
|
|
|
task_log!(
|
|
|
|
worker,
|
|
|
|
"percentage done: {:.2}% ({} of {} snapshots)",
|
|
|
|
percentage,
|
|
|
|
pos,
|
|
|
|
snapshot_count,
|
|
|
|
);
|
2020-09-02 05:43:04 +00:00
|
|
|
}
|
2020-06-24 11:11:45 +00:00
|
|
|
}
|
|
|
|
|
2020-09-02 05:43:04 +00:00
|
|
|
Ok((count, errors))
|
2020-06-24 11:11:45 +00:00
|
|
|
}
|
|
|
|
|
2020-06-25 10:55:34 +00:00
|
|
|
/// Verify all backups inside a datastore
|
|
|
|
///
|
|
|
|
/// Errors are logged to the worker log.
|
|
|
|
///
|
|
|
|
/// Returns
|
2020-07-30 07:09:05 +00:00
|
|
|
/// - Ok(failed_dirs) where failed_dirs had verification errors
|
2020-06-25 10:55:34 +00:00
|
|
|
/// - Err(_) if task was aborted
|
2020-10-12 09:46:34 +00:00
|
|
|
pub fn verify_all_backups(
|
|
|
|
datastore: Arc<DataStore>,
|
|
|
|
worker: Arc<dyn TaskState + Send + Sync>,
|
|
|
|
upid: &UPID,
|
|
|
|
) -> Result<Vec<String>, Error> {
|
2020-07-30 07:09:05 +00:00
|
|
|
let mut errors = Vec::new();
|
2020-06-24 11:11:45 +00:00
|
|
|
|
2020-08-25 06:38:47 +00:00
|
|
|
let mut list = match BackupGroup::list_groups(&datastore.base_path()) {
|
2020-09-10 06:54:29 +00:00
|
|
|
Ok(list) => list
|
|
|
|
.into_iter()
|
|
|
|
.filter(|group| !(group.backup_type() == "host" && group.backup_id() == "benchmark"))
|
|
|
|
.collect::<Vec<BackupGroup>>(),
|
2020-06-24 11:11:45 +00:00
|
|
|
Err(err) => {
|
2020-10-12 09:46:34 +00:00
|
|
|
task_log!(
|
|
|
|
worker,
|
|
|
|
"verify datastore {} - unable to list backups: {}",
|
|
|
|
datastore.name(),
|
|
|
|
err,
|
|
|
|
);
|
2020-07-30 07:09:05 +00:00
|
|
|
return Ok(errors);
|
2020-06-24 11:11:45 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2020-08-25 06:38:47 +00:00
|
|
|
list.sort_unstable();
|
|
|
|
|
2020-09-02 05:43:04 +00:00
|
|
|
let mut snapshot_count = 0;
|
|
|
|
for group in list.iter() {
|
|
|
|
snapshot_count += group.list_backups(&datastore.base_path())?.len();
|
|
|
|
}
|
|
|
|
|
2020-09-01 11:33:04 +00:00
|
|
|
// start with 16384 chunks (up to 65GB)
|
|
|
|
let verified_chunks = Arc::new(Mutex::new(HashSet::with_capacity(1024*16)));
|
|
|
|
|
|
|
|
// start with 64 chunks since we assume there are few corrupt ones
|
|
|
|
let corrupt_chunks = Arc::new(Mutex::new(HashSet::with_capacity(64)));
|
|
|
|
|
2020-10-12 09:46:34 +00:00
|
|
|
task_log!(worker, "verify datastore {} ({} snapshots)", datastore.name(), snapshot_count);
|
2020-06-24 11:11:45 +00:00
|
|
|
|
2020-09-02 05:43:04 +00:00
|
|
|
let mut done = 0;
|
2020-06-24 11:11:45 +00:00
|
|
|
for group in list {
|
2020-09-02 05:43:04 +00:00
|
|
|
let (count, mut group_errors) = verify_backup_group(
|
2020-09-01 11:33:04 +00:00
|
|
|
datastore.clone(),
|
|
|
|
&group,
|
|
|
|
verified_chunks.clone(),
|
|
|
|
corrupt_chunks.clone(),
|
2020-09-02 05:43:04 +00:00
|
|
|
Some((done, snapshot_count)),
|
2020-09-01 11:33:04 +00:00
|
|
|
worker.clone(),
|
2020-10-12 09:46:34 +00:00
|
|
|
upid,
|
2020-09-01 11:33:04 +00:00
|
|
|
)?;
|
2020-07-30 07:09:05 +00:00
|
|
|
errors.append(&mut group_errors);
|
2020-09-02 05:43:04 +00:00
|
|
|
|
|
|
|
done += count;
|
2020-06-24 11:11:45 +00:00
|
|
|
}
|
|
|
|
|
2020-07-30 07:09:05 +00:00
|
|
|
Ok(errors)
|
2020-06-24 11:11:45 +00:00
|
|
|
}
|
2020-10-20 09:10:05 +00:00
|
|
|
|
|
|
|
/// Runs a verification job.
|
|
|
|
pub fn do_verification_job(
|
|
|
|
mut job: Job,
|
|
|
|
verification_job: VerificationJobConfig,
|
|
|
|
userid: &Userid,
|
|
|
|
schedule: Option<String>,
|
|
|
|
) -> Result<String, Error> {
|
|
|
|
let datastore = DataStore::lookup_datastore(&verification_job.store)?;
|
|
|
|
|
|
|
|
let mut backups_to_verify = BackupInfo::list_backups(&datastore.base_path())?;
|
|
|
|
if verification_job.ignore_verified.unwrap_or(true) {
|
|
|
|
backups_to_verify.retain(|backup_info| {
|
|
|
|
let manifest = match datastore.load_manifest(&backup_info.backup_dir) {
|
|
|
|
Ok((manifest, _)) => manifest,
|
|
|
|
Err(_) => return false,
|
|
|
|
};
|
|
|
|
|
|
|
|
let raw_verify_state = manifest.unprotected["verify_state"].clone();
|
|
|
|
let last_state = match serde_json::from_value::<SnapshotVerifyState>(raw_verify_state) {
|
|
|
|
Ok(last_state) => last_state,
|
|
|
|
Err(_) => return true,
|
|
|
|
};
|
|
|
|
|
|
|
|
let now = proxmox::tools::time::epoch_i64();
|
|
|
|
let days_since_last_verify = (now - last_state.upid.starttime) / 86400;
|
|
|
|
verification_job.outdated_after.is_some()
|
|
|
|
&& days_since_last_verify > verification_job.outdated_after.unwrap()
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
let job_id = job.jobname().to_string();
|
|
|
|
let worker_type = job.jobtype().to_string();
|
|
|
|
let upid_str = WorkerTask::new_thread(
|
|
|
|
&worker_type,
|
|
|
|
Some(job.jobname().to_string()),
|
|
|
|
userid.clone(),
|
|
|
|
false,
|
|
|
|
move |worker| {
|
|
|
|
job.start(&worker.upid().to_string())?;
|
|
|
|
|
|
|
|
task_log!(worker,"Starting datastore verify job '{}'", job_id);
|
|
|
|
task_log!(worker,"verifying {} backups", backups_to_verify.len());
|
|
|
|
if let Some(event_str) = schedule {
|
|
|
|
task_log!(worker,"task triggered by schedule '{}'", event_str);
|
|
|
|
}
|
|
|
|
|
|
|
|
let verified_chunks = Arc::new(Mutex::new(HashSet::with_capacity(1024 * 16)));
|
|
|
|
let corrupt_chunks = Arc::new(Mutex::new(HashSet::with_capacity(64)));
|
|
|
|
let result = proxmox::try_block!({
|
|
|
|
let mut failed_dirs: Vec<String> = Vec::new();
|
|
|
|
|
|
|
|
for backup_info in backups_to_verify {
|
|
|
|
let verification_result = verify_backup_dir(
|
|
|
|
datastore.clone(),
|
|
|
|
&backup_info.backup_dir,
|
|
|
|
verified_chunks.clone(),
|
|
|
|
corrupt_chunks.clone(),
|
|
|
|
worker.clone(),
|
|
|
|
worker.upid().clone()
|
|
|
|
);
|
|
|
|
|
|
|
|
if let Ok(false) = verification_result {
|
|
|
|
failed_dirs.push(backup_info.backup_dir.to_string());
|
|
|
|
} // otherwise successful or aborted
|
|
|
|
}
|
|
|
|
|
|
|
|
if !failed_dirs.is_empty() {
|
|
|
|
task_log!(worker,"Failed to verify following snapshots:",);
|
|
|
|
for dir in failed_dirs {
|
|
|
|
task_log!(worker, "\t{}", dir)
|
|
|
|
}
|
|
|
|
bail!("verification failed - please check the log for details");
|
|
|
|
}
|
|
|
|
Ok(())
|
|
|
|
});
|
|
|
|
|
|
|
|
let status = worker.create_state(&result);
|
|
|
|
|
|
|
|
match job.finish(status) {
|
|
|
|
Err(err) => eprintln!(
|
|
|
|
"could not finish job state for {}: {}",
|
|
|
|
job.jobtype().to_string(),
|
|
|
|
err
|
|
|
|
),
|
|
|
|
Ok(_) => (),
|
|
|
|
}
|
|
|
|
|
|
|
|
result
|
|
|
|
},
|
|
|
|
)?;
|
|
|
|
Ok(upid_str)
|
|
|
|
}
|