datastore/api/backup: prepare for fix of #3935 by adding lock helpers

to avoid duplicate code, add helpers for locking groups and snapshots
to the BackupGroup and BackupDir traits respectively and refactor
existing code to use them.

this also adapts error handling by adding relevant context to each
locking helper call site. otherwise, we might loose valuable
information useful for debugging. note, however, that users that
relied on specific error messages will break.

Signed-off-by: Shannon Sterz <s.sterz@proxmox.com>
Acked-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
This commit is contained in:
Shannon Sterz 2025-03-26 12:44:11 +01:00 committed by Thomas Lamprecht
parent 27ba2c0318
commit e2c1866b13
8 changed files with 106 additions and 95 deletions

View File

@ -3,9 +3,11 @@ use std::os::unix::io::RawFd;
use std::path::PathBuf; use std::path::PathBuf;
use std::sync::Arc; use std::sync::Arc;
use anyhow::{bail, format_err, Error}; use anyhow::{bail, format_err, Context, Error};
use proxmox_sys::fs::{lock_dir_noblock, replace_file, CreateOptions}; use proxmox_sys::fs::{
lock_dir_noblock, lock_dir_noblock_shared, replace_file, CreateOptions, DirLockGuard,
};
use pbs_api_types::{ use pbs_api_types::{
Authid, BackupGroupDeleteStats, BackupNamespace, BackupType, GroupFilter, VerifyState, Authid, BackupGroupDeleteStats, BackupNamespace, BackupType, GroupFilter, VerifyState,
@ -199,9 +201,10 @@ impl BackupGroup {
/// Returns `BackupGroupDeleteStats`, containing the number of deleted snapshots /// Returns `BackupGroupDeleteStats`, containing the number of deleted snapshots
/// and number of protected snaphsots, which therefore were not removed. /// and number of protected snaphsots, which therefore were not removed.
pub fn destroy(&self) -> Result<BackupGroupDeleteStats, Error> { pub fn destroy(&self) -> Result<BackupGroupDeleteStats, Error> {
let _guard = self
.lock()
.with_context(|| format!("while destroying group '{self:?}'"))?;
let path = self.full_group_path(); let path = self.full_group_path();
let _guard =
proxmox_sys::fs::lock_dir_noblock(&path, "backup group", "possible running backup")?;
log::info!("removing backup group {:?}", path); log::info!("removing backup group {:?}", path);
let mut delete_stats = BackupGroupDeleteStats::default(); let mut delete_stats = BackupGroupDeleteStats::default();
@ -237,6 +240,15 @@ impl BackupGroup {
self.store self.store
.set_owner(&self.ns, self.as_ref(), auth_id, force) .set_owner(&self.ns, self.as_ref(), auth_id, force)
} }
/// Lock a group exclusively
pub fn lock(&self) -> Result<DirLockGuard, Error> {
lock_dir_noblock(
&self.full_group_path(),
"backup group",
"possible running backup",
)
}
} }
impl AsRef<pbs_api_types::BackupNamespace> for BackupGroup { impl AsRef<pbs_api_types::BackupNamespace> for BackupGroup {
@ -442,15 +454,33 @@ impl BackupDir {
.map_err(|err| format_err!("unable to acquire manifest lock {:?} - {}", &path, err)) .map_err(|err| format_err!("unable to acquire manifest lock {:?} - {}", &path, err))
} }
/// Lock this snapshot exclusively
pub fn lock(&self) -> Result<DirLockGuard, Error> {
lock_dir_noblock(
&self.full_path(),
"snapshot",
"backup is running or snapshot is in use",
)
}
/// Acquire a shared lock on this snapshot
pub fn lock_shared(&self) -> Result<DirLockGuard, Error> {
lock_dir_noblock_shared(
&self.full_path(),
"snapshot",
"backup is running or snapshot is in use, could not acquire shared lock",
)
}
/// Destroy the whole snapshot, bails if it's protected /// Destroy the whole snapshot, bails if it's protected
/// ///
/// Setting `force` to true skips locking and thus ignores if the backup is currently in use. /// Setting `force` to true skips locking and thus ignores if the backup is currently in use.
pub fn destroy(&self, force: bool) -> Result<(), Error> { pub fn destroy(&self, force: bool) -> Result<(), Error> {
let full_path = self.full_path();
let (_guard, _manifest_guard); let (_guard, _manifest_guard);
if !force { if !force {
_guard = lock_dir_noblock(&full_path, "snapshot", "possibly running or in use")?; _guard = self
.lock()
.with_context(|| format!("while destroying snapshot '{self:?}'"))?;
_manifest_guard = self.lock_manifest()?; _manifest_guard = self.lock_manifest()?;
} }
@ -458,6 +488,7 @@ impl BackupDir {
bail!("cannot remove protected snapshot"); // use special error type? bail!("cannot remove protected snapshot"); // use special error type?
} }
let full_path = self.full_path();
log::info!("removing backup snapshot {:?}", full_path); log::info!("removing backup snapshot {:?}", full_path);
std::fs::remove_dir_all(&full_path).map_err(|err| { std::fs::remove_dir_all(&full_path).map_err(|err| {
format_err!("removing backup snapshot {:?} failed - {}", full_path, err,) format_err!("removing backup snapshot {:?} failed - {}", full_path, err,)

View File

@ -5,7 +5,7 @@ use std::os::unix::io::AsRawFd;
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::sync::{Arc, LazyLock, Mutex}; use std::sync::{Arc, LazyLock, Mutex};
use anyhow::{bail, format_err, Error}; use anyhow::{bail, format_err, Context, Error};
use nix::unistd::{unlinkat, UnlinkatFlags}; use nix::unistd::{unlinkat, UnlinkatFlags};
use tracing::{info, warn}; use tracing::{info, warn};
@ -13,8 +13,8 @@ use proxmox_human_byte::HumanByte;
use proxmox_schema::ApiType; use proxmox_schema::ApiType;
use proxmox_sys::error::SysError; use proxmox_sys::error::SysError;
use proxmox_sys::fs::DirLockGuard;
use proxmox_sys::fs::{file_read_optional_string, replace_file, CreateOptions}; use proxmox_sys::fs::{file_read_optional_string, replace_file, CreateOptions};
use proxmox_sys::fs::{lock_dir_noblock, DirLockGuard};
use proxmox_sys::linux::procfs::MountInfo; use proxmox_sys::linux::procfs::MountInfo;
use proxmox_sys::process_locker::ProcessLockSharedGuard; use proxmox_sys::process_locker::ProcessLockSharedGuard;
use proxmox_worker_task::WorkerTaskContext; use proxmox_worker_task::WorkerTaskContext;
@ -774,41 +774,35 @@ impl DataStore {
/// ///
/// This also acquires an exclusive lock on the directory and returns the lock guard. /// This also acquires an exclusive lock on the directory and returns the lock guard.
pub fn create_locked_backup_group( pub fn create_locked_backup_group(
&self, self: &Arc<Self>,
ns: &BackupNamespace, ns: &BackupNamespace,
backup_group: &pbs_api_types::BackupGroup, backup_group: &pbs_api_types::BackupGroup,
auth_id: &Authid, auth_id: &Authid,
) -> Result<(Authid, DirLockGuard), Error> { ) -> Result<(Authid, DirLockGuard), Error> {
// create intermediate path first: let backup_group = self.backup_group(ns.clone(), backup_group.clone());
let mut full_path = self.base_path();
for ns in ns.components() {
full_path.push("ns");
full_path.push(ns);
}
full_path.push(backup_group.ty.as_str());
std::fs::create_dir_all(&full_path)?;
full_path.push(&backup_group.id); // create intermediate path first
let full_path = backup_group.full_group_path();
// create the last component now std::fs::create_dir_all(full_path.parent().ok_or_else(|| {
format_err!("could not construct parent path for group {backup_group:?}")
})?)?;
// now create the group, this allows us to check whether it existed before
match std::fs::create_dir(&full_path) { match std::fs::create_dir(&full_path) {
Ok(_) => { Ok(_) => {
let guard = lock_dir_noblock( let guard = backup_group.lock().with_context(|| {
&full_path, format!("while creating new locked backup group '{backup_group:?}'")
"backup group", })?;
"another backup is already running", self.set_owner(ns, backup_group.group(), auth_id, false)?;
)?; let owner = self.get_owner(ns, backup_group.group())?; // just to be sure
self.set_owner(ns, backup_group, auth_id, false)?;
let owner = self.get_owner(ns, backup_group)?; // just to be sure
Ok((owner, guard)) Ok((owner, guard))
} }
Err(ref err) if err.kind() == io::ErrorKind::AlreadyExists => { Err(ref err) if err.kind() == io::ErrorKind::AlreadyExists => {
let guard = lock_dir_noblock( let guard = backup_group.lock().with_context(|| {
&full_path, format!("while creating locked backup group '{backup_group:?}'")
"backup group", })?;
"another backup is already running", let owner = self.get_owner(ns, backup_group.group())?; // just to be sure
)?;
let owner = self.get_owner(ns, backup_group)?; // just to be sure
Ok((owner, guard)) Ok((owner, guard))
} }
Err(err) => bail!("unable to create backup group {:?} - {}", full_path, err), Err(err) => bail!("unable to create backup group {:?} - {}", full_path, err),
@ -819,29 +813,25 @@ impl DataStore {
/// ///
/// The BackupGroup directory needs to exist. /// The BackupGroup directory needs to exist.
pub fn create_locked_backup_dir( pub fn create_locked_backup_dir(
&self, self: &Arc<Self>,
ns: &BackupNamespace, ns: &BackupNamespace,
backup_dir: &pbs_api_types::BackupDir, backup_dir: &pbs_api_types::BackupDir,
) -> Result<(PathBuf, bool, DirLockGuard), Error> { ) -> Result<(PathBuf, bool, DirLockGuard), Error> {
let full_path = self.snapshot_path(ns, backup_dir); let backup_dir = self.backup_dir(ns.clone(), backup_dir.clone())?;
let relative_path = full_path.strip_prefix(self.base_path()).map_err(|err| { let relative_path = backup_dir.relative_path();
format_err!(
"failed to produce correct path for backup {backup_dir} in namespace {ns}: {err}"
)
})?;
let lock = || { match std::fs::create_dir(backup_dir.full_path()) {
lock_dir_noblock( Ok(_) => {
&full_path, let guard = backup_dir.lock().with_context(|| {
"snapshot", format!("while creating new locked snapshot '{backup_dir:?}'")
"internal error - tried creating snapshot that's already in use", })?;
) Ok((relative_path, true, guard))
}; }
match std::fs::create_dir(&full_path) {
Ok(_) => Ok((relative_path.to_owned(), true, lock()?)),
Err(ref e) if e.kind() == io::ErrorKind::AlreadyExists => { Err(ref e) if e.kind() == io::ErrorKind::AlreadyExists => {
Ok((relative_path.to_owned(), false, lock()?)) let guard = backup_dir
.lock()
.with_context(|| format!("while creating locked snapshot '{backup_dir:?}'"))?;
Ok((relative_path, false, guard))
} }
Err(e) => Err(e.into()), Err(e) => Err(e.into()),
} }
@ -1305,7 +1295,9 @@ impl DataStore {
bail!("snapshot {} does not exist!", backup_dir.dir()); bail!("snapshot {} does not exist!", backup_dir.dir());
} }
let _guard = lock_dir_noblock(&full_path, "snapshot", "possibly running or in use")?; let _guard = backup_dir.lock().with_context(|| {
format!("while updating the protection status of snapshot '{backup_dir:?}'")
})?;
let protected_path = backup_dir.protected_file(); let protected_path = backup_dir.protected_file();
if protection { if protection {

View File

@ -4,11 +4,9 @@ use std::path::Path;
use std::rc::Rc; use std::rc::Rc;
use std::sync::Arc; use std::sync::Arc;
use anyhow::{bail, Error}; use anyhow::{bail, Context, Error};
use nix::dir::Dir; use nix::dir::Dir;
use proxmox_sys::fs::lock_dir_noblock_shared;
use pbs_api_types::{ use pbs_api_types::{
print_store_and_ns, ArchiveType, BackupNamespace, Operation, CLIENT_LOG_BLOB_NAME, print_store_and_ns, ArchiveType, BackupNamespace, Operation, CLIENT_LOG_BLOB_NAME,
MANIFEST_BLOB_NAME, MANIFEST_BLOB_NAME,
@ -48,8 +46,9 @@ impl SnapshotReader {
bail!("snapshot {} does not exist!", snapshot.dir()); bail!("snapshot {} does not exist!", snapshot.dir());
} }
let locked_dir = let locked_dir = snapshot
lock_dir_noblock_shared(&snapshot_path, "snapshot", "locked by another operation")?; .lock_shared()
.with_context(|| format!("while trying to read snapshot '{snapshot:?}'"))?;
let datastore_name = datastore.name().to_string(); let datastore_name = datastore.name().to_string();
let manifest = match snapshot.load_manifest() { let manifest = match snapshot.load_manifest() {

View File

@ -1,4 +1,4 @@
use anyhow::{bail, format_err, Error}; use anyhow::{bail, format_err, Context, Error};
use nix::dir::Dir; use nix::dir::Dir;
use std::collections::HashMap; use std::collections::HashMap;
use std::sync::{Arc, Mutex}; use std::sync::{Arc, Mutex};
@ -8,7 +8,7 @@ use ::serde::Serialize;
use serde_json::{json, Value}; use serde_json::{json, Value};
use proxmox_router::{RpcEnvironment, RpcEnvironmentType}; use proxmox_router::{RpcEnvironment, RpcEnvironmentType};
use proxmox_sys::fs::{lock_dir_noblock_shared, replace_file, CreateOptions}; use proxmox_sys::fs::{replace_file, CreateOptions};
use pbs_api_types::Authid; use pbs_api_types::Authid;
use pbs_datastore::backup_info::{BackupDir, BackupInfo}; use pbs_datastore::backup_info::{BackupDir, BackupInfo};
@ -645,12 +645,12 @@ impl BackupEnvironment {
// Downgrade to shared lock, the backup itself is finished // Downgrade to shared lock, the backup itself is finished
drop(excl_snap_lock); drop(excl_snap_lock);
let snap_lock = lock_dir_noblock_shared( let snap_lock = self.backup_dir.lock_shared().with_context(|| {
&self.backup_dir.full_path(), format!(
"snapshot", "while trying to verify snapshot '{:?}' after completion",
"snapshot is already locked by another operation", self.backup_dir
)?; )
})?;
let worker_id = format!( let worker_id = format!(
"{}:{}/{}/{:08X}", "{}:{}/{}/{:08X}",
self.datastore.name(), self.datastore.name(),

View File

@ -1,6 +1,6 @@
//! Backup protocol (HTTP2 upgrade) //! Backup protocol (HTTP2 upgrade)
use anyhow::{bail, format_err, Error}; use anyhow::{bail, format_err, Context, Error};
use futures::*; use futures::*;
use hex::FromHex; use hex::FromHex;
use hyper::header::{HeaderValue, CONNECTION, UPGRADE}; use hyper::header::{HeaderValue, CONNECTION, UPGRADE};
@ -17,7 +17,6 @@ use proxmox_router::{
}; };
use proxmox_schema::*; use proxmox_schema::*;
use proxmox_sortable_macro::sortable; use proxmox_sortable_macro::sortable;
use proxmox_sys::fs::lock_dir_noblock_shared;
use pbs_api_types::{ use pbs_api_types::{
ArchiveType, Authid, BackupNamespace, BackupType, Operation, VerifyState, ArchiveType, Authid, BackupNamespace, BackupType, Operation, VerifyState,
@ -186,12 +185,10 @@ fn upgrade_to_backup_protocol(
} }
// lock last snapshot to prevent forgetting/pruning it during backup // lock last snapshot to prevent forgetting/pruning it during backup
let full_path = last.backup_dir.full_path(); let guard = last.backup_dir
Some(lock_dir_noblock_shared( .lock_shared()
&full_path, .with_context(|| format!("while locking last snapshot during backup '{last:?}'"))?;
"snapshot", Some(guard)
"base snapshot is already locked by another operation",
)?)
} else { } else {
None None
}; };

View File

@ -1,6 +1,6 @@
//! Backup reader/restore protocol (HTTP2 upgrade) //! Backup reader/restore protocol (HTTP2 upgrade)
use anyhow::{bail, format_err, Error}; use anyhow::{bail, format_err, Context, Error};
use futures::*; use futures::*;
use hex::FromHex; use hex::FromHex;
use hyper::header::{self, HeaderValue, CONNECTION, UPGRADE}; use hyper::header::{self, HeaderValue, CONNECTION, UPGRADE};
@ -16,7 +16,6 @@ use proxmox_router::{
}; };
use proxmox_schema::{BooleanSchema, ObjectSchema}; use proxmox_schema::{BooleanSchema, ObjectSchema};
use proxmox_sortable_macro::sortable; use proxmox_sortable_macro::sortable;
use proxmox_sys::fs::lock_dir_noblock_shared;
use pbs_api_types::{ use pbs_api_types::{
ArchiveType, Authid, Operation, BACKUP_ARCHIVE_NAME_SCHEMA, BACKUP_ID_SCHEMA, ArchiveType, Authid, Operation, BACKUP_ARCHIVE_NAME_SCHEMA, BACKUP_ID_SCHEMA,
@ -129,11 +128,9 @@ fn upgrade_to_backup_reader_protocol(
bail!("snapshot {} does not exist.", backup_dir.dir()); bail!("snapshot {} does not exist.", backup_dir.dir());
} }
let _guard = lock_dir_noblock_shared( let _guard = backup_dir
&backup_dir.full_path(), .lock_shared()
"snapshot", .with_context(|| format!("while reading snapshot '{backup_dir:?}'"))?;
"locked by another operation",
)?;
let path = datastore.base_path(); let path = datastore.base_path();

View File

@ -4,10 +4,9 @@ use std::sync::{Arc, Mutex};
use std::time::Instant; use std::time::Instant;
use anyhow::{bail, Error}; use anyhow::{bail, Error};
use nix::dir::Dir; use proxmox_sys::fs::DirLockGuard;
use tracing::{error, info, warn}; use tracing::{error, info, warn};
use proxmox_sys::fs::lock_dir_noblock_shared;
use proxmox_worker_task::WorkerTaskContext; use proxmox_worker_task::WorkerTaskContext;
use pbs_api_types::{ use pbs_api_types::{
@ -307,11 +306,8 @@ pub fn verify_backup_dir(
return Ok(true); return Ok(true);
} }
let snap_lock = lock_dir_noblock_shared( let snap_lock = backup_dir.lock_shared();
&backup_dir.full_path(),
"snapshot",
"locked by another operation",
);
match snap_lock { match snap_lock {
Ok(snap_lock) => { Ok(snap_lock) => {
verify_backup_dir_with_lock(verify_worker, backup_dir, upid, filter, snap_lock) verify_backup_dir_with_lock(verify_worker, backup_dir, upid, filter, snap_lock)
@ -334,7 +330,7 @@ pub fn verify_backup_dir_with_lock(
backup_dir: &BackupDir, backup_dir: &BackupDir,
upid: UPID, upid: UPID,
filter: Option<&dyn Fn(&BackupManifest) -> bool>, filter: Option<&dyn Fn(&BackupManifest) -> bool>,
_snap_lock: Dir, _snap_lock: DirLockGuard,
) -> Result<bool, Error> { ) -> Result<bool, Error> {
let datastore_name = verify_worker.datastore.name(); let datastore_name = verify_worker.datastore.name();
let backup_dir_name = backup_dir.dir(); let backup_dir_name = backup_dir.dir();

View File

@ -10,6 +10,7 @@ use std::time::Duration;
use anyhow::{bail, format_err, Context, Error}; use anyhow::{bail, format_err, Context, Error};
use futures::{future::FutureExt, select}; use futures::{future::FutureExt, select};
use hyper::http::StatusCode; use hyper::http::StatusCode;
use proxmox_sys::fs::DirLockGuard;
use serde_json::json; use serde_json::json;
use tracing::{info, warn}; use tracing::{info, warn};
@ -105,7 +106,7 @@ pub(crate) struct RemoteSourceReader {
} }
pub(crate) struct LocalSourceReader { pub(crate) struct LocalSourceReader {
pub(crate) _dir_lock: Arc<Mutex<proxmox_sys::fs::DirLockGuard>>, pub(crate) _dir_lock: Arc<Mutex<DirLockGuard>>,
pub(crate) path: PathBuf, pub(crate) path: PathBuf,
pub(crate) datastore: Arc<DataStore>, pub(crate) datastore: Arc<DataStore>,
} }
@ -478,13 +479,11 @@ impl SyncSource for LocalSource {
dir: &BackupDir, dir: &BackupDir,
) -> Result<Arc<dyn SyncSourceReader>, Error> { ) -> Result<Arc<dyn SyncSourceReader>, Error> {
let dir = self.store.backup_dir(ns.clone(), dir.clone())?; let dir = self.store.backup_dir(ns.clone(), dir.clone())?;
let dir_lock = proxmox_sys::fs::lock_dir_noblock_shared( let guard = dir
&dir.full_path(), .lock()
"snapshot", .with_context(|| format!("while reading snapshot '{dir:?}' for a sync job"))?;
"locked by another operation",
)?;
Ok(Arc::new(LocalSourceReader { Ok(Arc::new(LocalSourceReader {
_dir_lock: Arc::new(Mutex::new(dir_lock)), _dir_lock: Arc::new(Mutex::new(guard)),
path: dir.full_path(), path: dir.full_path(),
datastore: dir.datastore().clone(), datastore: dir.datastore().clone(),
})) }))