Mirror of https://git.proxmox.com/git/mirror_zfs (synced 2025-04-28 11:40:17 +00:00)
BRT: Rework structures and locks to be per-vdev
While the block cloning operation has been per-vdev from the beginning, before this change most of its data were protected by two pool-wide locks, which created a lot of lock contention in many workloads. This change makes most of the block cloning data structures per-vdev, which allows them to be locked separately. The only remaining pool-wide lock is spa_brt_lock, which protects the array of per-vdev pointers and in most cases is taken as reader.

This change also splits the per-vdev lock into three different ones: bv_pending_lock protects the AVL tree of pending operations in open context, bv_mos_entries_lock protects the BRT ZAP object while it is being prefetched, and bv_lock protects the rest of the per-vdev context during the TXG commit process. There should be no functional difference aside from some optimizations.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Pawel Jakub Dawidek <pjd@FreeBSD.org>
Reviewed-by: Brian Atkinson <batkinson@lanl.gov>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Closes #16740
parent 309ce6303f
commit fd6e8c1d2a
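The diff below touches zdb, the BRT and SPA headers, brt.c itself, and spa_misc.c. As a rough sketch of the locking pattern the message describes (hedged: the helper names and bodies are illustrative, not code from this commit, and it assumes per-vdev BRT structures are not freed while the pool is active), an open-context operation takes the pool-wide spa_brt_lock only briefly as reader to resolve the per-vdev structure, then serializes on that vdev's own bv_pending_lock:

/*
 * Hedged sketch of the post-commit locking pattern.  Types and field
 * names follow the structures in the diff below; the helpers are
 * illustrative only.
 */
static brt_vdev_t *
brt_vdev_sketch(spa_t *spa, uint64_t vdevid)
{
	/* The only pool-wide lock, taken as reader in the common case. */
	rw_enter(&spa->spa_brt_lock, RW_READER);
	brt_vdev_t *brtvd = NULL;
	if (vdevid < spa->spa_brt_nvdevs)
		brtvd = spa->spa_brt_vdevs[vdevid];
	rw_exit(&spa->spa_brt_lock);
	/* Assumes per-vdev BRTs live as long as the pool is active. */
	return (brtvd);
}

static void
brt_pending_add_sketch(brt_vdev_t *brtvd, const blkptr_t *bp, uint64_t txg)
{
	(void) bp;
	/* Per-vdev mutex; no pool-wide lock is held here. */
	mutex_enter(&brtvd->bv_pending_lock);
	/* ... insert bp into brtvd->bv_pending_tree[txg & TXG_MASK] ... */
	mutex_exit(&brtvd->bv_pending_lock);
}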
cmd/zdb/zdb.c
@@ -2119,9 +2119,6 @@ dump_brt(spa_t *spa)
 		return;
 	}
 
-	brt_t *brt = spa->spa_brt;
-	VERIFY(brt);
-
 	char count[32], used[32], saved[32];
 	zdb_nicebytes(brt_get_used(spa), used, sizeof (used));
 	zdb_nicebytes(brt_get_saved(spa), saved, sizeof (saved));
@@ -2132,11 +2129,8 @@ dump_brt(spa_t *spa)
 	if (dump_opt['T'] < 2)
 		return;
 
-	for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
-		brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
-		if (brtvd == NULL)
-			continue;
-
+	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
+		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
 		if (!brtvd->bv_initiated) {
 			printf("BRT: vdev %" PRIu64 ": empty\n", vdevid);
 			continue;
@@ -2160,20 +2154,21 @@ dump_brt(spa_t *spa)
 	if (!do_histo)
 		printf("\n%-16s %-10s\n", "DVA", "REFCNT");
 
-	for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
-		brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
-		if (brtvd == NULL || !brtvd->bv_initiated)
+	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
+		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
+		if (!brtvd->bv_initiated)
 			continue;
 
 		uint64_t counts[64] = {};
 
 		zap_cursor_t zc;
 		zap_attribute_t *za = zap_attribute_alloc();
-		for (zap_cursor_init(&zc, brt->brt_mos, brtvd->bv_mos_entries);
+		for (zap_cursor_init(&zc, spa->spa_meta_objset,
+		    brtvd->bv_mos_entries);
 		    zap_cursor_retrieve(&zc, za) == 0;
 		    zap_cursor_advance(&zc)) {
 			uint64_t refcnt;
-			VERIFY0(zap_lookup_uint64(brt->brt_mos,
+			VERIFY0(zap_lookup_uint64(spa->spa_meta_objset,
 			    brtvd->bv_mos_entries,
 			    (const uint64_t *)za->za_name, 1,
 			    za->za_integer_length, za->za_num_integers,
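The VERIFY0() call above continues past the hunk boundary. As a hedged sketch, the full loop presumably completes along these lines: the &refcnt terminator, the highbit64() log2 bucketing, the non-histogram print path, and the cursor teardown are all assumptions about the zdb code elided from this hunk.

/*
 * Hedged sketch of the complete iteration, with the parts elided from
 * the hunk above filled in by assumption.
 */
zap_cursor_t zc;
zap_attribute_t *za = zap_attribute_alloc();
uint64_t counts[64] = {};
for (zap_cursor_init(&zc, spa->spa_meta_objset, brtvd->bv_mos_entries);
    zap_cursor_retrieve(&zc, za) == 0;
    zap_cursor_advance(&zc)) {
	uint64_t refcnt;
	VERIFY0(zap_lookup_uint64(spa->spa_meta_objset,
	    brtvd->bv_mos_entries,
	    (const uint64_t *)za->za_name, 1,
	    za->za_integer_length, za->za_num_integers, &refcnt));
	if (do_histo)
		counts[highbit64(refcnt)]++;	/* log2 bucket (assumed) */
	/* ... else print the DVA and refcnt (elided) ... */
}
zap_cursor_fini(&zc);
zap_attribute_free(za);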
@@ -8227,14 +8222,11 @@ dump_mos_leaks(spa_t *spa)
 		}
 	}
 
-	if (spa->spa_brt != NULL) {
-		brt_t *brt = spa->spa_brt;
-		for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
-			brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
-			if (brtvd != NULL && brtvd->bv_initiated) {
-				mos_obj_refd(brtvd->bv_mos_brtvdev);
-				mos_obj_refd(brtvd->bv_mos_entries);
-			}
+	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
+		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
+		if (brtvd->bv_initiated) {
+			mos_obj_refd(brtvd->bv_mos_brtvdev);
+			mos_obj_refd(brtvd->bv_mos_entries);
 		}
 	}
 
include/sys/brt_impl.h
@@ -86,28 +86,38 @@ typedef struct brt_vdev_phys {
 	uint64_t	bvp_savedspace;
 } brt_vdev_phys_t;
 
-typedef struct brt_vdev {
+struct brt_vdev {
+	/*
+	 * Pending changes from open contexts.
+	 */
+	kmutex_t	bv_pending_lock;
+	avl_tree_t	bv_pending_tree[TXG_SIZE];
+	/*
+	 * Protects bv_mos_*.
+	 */
+	krwlock_t	bv_mos_entries_lock ____cacheline_aligned;
+	/*
+	 * Protects all the fields starting from bv_initiated.
+	 */
+	krwlock_t	bv_lock ____cacheline_aligned;
 	/*
 	 * VDEV id.
 	 */
-	uint64_t	bv_vdevid;
-	/*
-	 * Is the structure initiated?
-	 * (bv_entcount and bv_bitmap are allocated?)
-	 */
-	boolean_t	bv_initiated;
+	uint64_t	bv_vdevid ____cacheline_aligned;
 	/*
 	 * Object number in the MOS for the entcount array and brt_vdev_phys.
 	 */
 	uint64_t	bv_mos_brtvdev;
 	/*
-	 * Object number in the MOS for the entries table.
+	 * Object number in the MOS and dnode for the entries table.
 	 */
 	uint64_t	bv_mos_entries;
+	dnode_t		*bv_mos_entries_dnode;
 	/*
-	 * Entries to sync.
+	 * Is the structure initiated?
+	 * (bv_entcount and bv_bitmap are allocated?)
 	 */
-	avl_tree_t	bv_tree;
+	boolean_t	bv_initiated;
 	/*
 	 * Does the bv_entcount[] array needs byte swapping?
 	 */
@@ -120,6 +130,26 @@ typedef struct brt_vdev {
 	 * This is the array with BRT entry count per BRT_RANGESIZE.
 	 */
 	uint16_t	*bv_entcount;
+	/*
+	 * bv_entcount[] potentially can be a bit too big to sychronize it all
+	 * when we just changed few entcounts. The fields below allow us to
+	 * track updates to bv_entcount[] array since the last sync.
+	 * A single bit in the bv_bitmap represents as many entcounts as can
+	 * fit into a single BRT_BLOCKSIZE.
+	 * For example we have 65536 entcounts in the bv_entcount array
+	 * (so the whole array is 128kB). We updated bv_entcount[2] and
+	 * bv_entcount[5]. In that case only first bit in the bv_bitmap will
+	 * be set and we will write only first BRT_BLOCKSIZE out of 128kB.
+	 */
+	ulong_t		*bv_bitmap;
+	/*
+	 * bv_entcount[] needs updating on disk.
+	 */
+	boolean_t	bv_entcount_dirty;
+	/*
+	 * brt_vdev_phys needs updating on disk.
+	 */
+	boolean_t	bv_meta_dirty;
 	/*
 	 * Sum of all bv_entcount[]s.
 	 */
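The bv_bitmap comment in the hunk above implies simple index math. A hedged sketch of marking one entcount dirty follows; the 4K BRT_BLOCKSIZE is an assumed value purely for illustration (the real constant is defined elsewhere in brt_impl.h), and the helper name is hypothetical:

/*
 * Hedged sketch of the dirty-bitmap math described above.  With an
 * assumed 4K block and 2-byte entcounts, one bit covers 2048 entries,
 * so updating bv_entcount[2] and bv_entcount[5] sets only bit 0.
 */
#define	SKETCH_BRT_BLOCKSIZE	(4 << 10)
#define	SKETCH_BITS_PER_ULONG	(sizeof (ulong_t) * 8)

static void
brt_vdev_dirty_sketch(brt_vdev_t *brtvd, uint64_t idx)
{
	uint64_t bit = idx / (SKETCH_BRT_BLOCKSIZE / sizeof (uint16_t));
	brtvd->bv_bitmap[bit / SKETCH_BITS_PER_ULONG] |=
	    1UL << (bit % SKETCH_BITS_PER_ULONG);
	brtvd->bv_entcount_dirty = B_TRUE;
}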
@@ -133,45 +163,10 @@ typedef struct brt_vdev {
 	 */
 	uint64_t	bv_savedspace;
 	/*
-	 * brt_vdev_phys needs updating on disk.
+	 * Entries to sync.
 	 */
-	boolean_t	bv_meta_dirty;
-	/*
-	 * bv_entcount[] needs updating on disk.
-	 */
-	boolean_t	bv_entcount_dirty;
-	/*
-	 * bv_entcount[] potentially can be a bit too big to sychronize it all
-	 * when we just changed few entcounts. The fields below allow us to
-	 * track updates to bv_entcount[] array since the last sync.
-	 * A single bit in the bv_bitmap represents as many entcounts as can
-	 * fit into a single BRT_BLOCKSIZE.
-	 * For example we have 65536 entcounts in the bv_entcount array
-	 * (so the whole array is 128kB). We updated bv_entcount[2] and
-	 * bv_entcount[5]. In that case only first bit in the bv_bitmap will
-	 * be set and we will write only first BRT_BLOCKSIZE out of 128kB.
-	 */
-	ulong_t		*bv_bitmap;
-	uint64_t	bv_nblocks;
-} brt_vdev_t;
-
-/*
- * In-core brt
- */
-typedef struct brt {
-	krwlock_t	brt_lock;
-	spa_t		*brt_spa;
-#define	brt_mos		brt_spa->spa_meta_objset
-	uint64_t	brt_rangesize;
-	uint64_t	brt_usedspace;
-	uint64_t	brt_savedspace;
-	avl_tree_t	brt_pending_tree[TXG_SIZE];
-	kmutex_t	brt_pending_lock[TXG_SIZE];
-	/* Sum of all entries across all bv_trees. */
-	uint64_t	brt_nentries;
-	brt_vdev_t	*brt_vdevs;
-	uint64_t	brt_nvdevs;
-} brt_t;
+	avl_tree_t	bv_tree;
+};
 
 /* Size of bre_offset / sizeof (uint64_t). */
 #define	BRT_KEY_WORDS	(1)
@@ -188,7 +183,7 @@ typedef struct brt_entry {
 
 typedef struct brt_pending_entry {
 	blkptr_t	bpe_bp;
-	int		bpe_count;
+	uint64_t	bpe_count;
 	avl_node_t	bpe_node;
 } brt_pending_entry_t;
 
include/sys/spa.h
@@ -53,6 +53,7 @@ extern "C" {
 /*
  * Forward references that lots of things need.
  */
+typedef struct brt_vdev brt_vdev_t;
 typedef struct spa spa_t;
 typedef struct vdev vdev_t;
 typedef struct metaslab metaslab_t;
include/sys/spa_impl.h
@@ -412,8 +412,12 @@ struct spa {
 	uint64_t	spa_dedup_dspace;	/* Cache get_dedup_dspace() */
 	uint64_t	spa_dedup_checksum;	/* default dedup checksum */
 	uint64_t	spa_dspace;		/* dspace in normal class */
+	uint64_t	spa_rdspace;		/* raw (non-dedup) --//-- */
 	boolean_t	spa_active_ddt_prune;	/* ddt prune process active */
-	struct brt	*spa_brt;		/* in-core BRT */
+	brt_vdev_t	**spa_brt_vdevs;	/* array of per-vdev BRTs */
+	uint64_t	spa_brt_nvdevs;		/* number of vdevs in BRT */
+	uint64_t	spa_brt_rangesize;	/* pool's BRT range size */
+	krwlock_t	spa_brt_lock;		/* Protects brt_vdevs/nvdevs */
 	kmutex_t	spa_vdev_top_lock;	/* dueling offline/remove */
 	kmutex_t	spa_proc_lock;		/* protects spa_proc* */
 	kcondvar_t	spa_proc_cv;		/* spa_proc_state transitions */
module/zfs/brt.c: 783 lines changed (file diff suppressed because it is too large)
module/zfs/spa_misc.c
@@ -1870,13 +1870,7 @@ spa_get_slop_space(spa_t *spa)
 	if (spa->spa_dedup_dspace == ~0ULL)
 		spa_update_dspace(spa);
 
-	/*
-	 * spa_get_dspace() includes the space only logically "used" by
-	 * deduplicated data, so since it's not useful to reserve more
-	 * space with more deduplicated data, we subtract that out here.
-	 */
-	space =
-	    spa_get_dspace(spa) - spa->spa_dedup_dspace - brt_get_dspace(spa);
+	space = spa->spa_rdspace;
 	slop = MIN(space >> spa_slop_shift, spa_max_slop);
 
 	/*
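Since spa_rdspace already excludes dedup and clone savings, the removed subtraction collapses to a plain assignment. A hedged sketch of the resulting slop computation follows; the helper name is hypothetical, and the spa_min_slop floor and the default shift of 5 (1/32 of space) are assumptions about the surrounding spa_misc.c code, not part of this hunk:

/*
 * Hedged sketch of the simplified slop computation.  The min-slop
 * floor and the tunable defaults are assumed, not shown in the hunk.
 */
static uint64_t
spa_slop_sketch(uint64_t rdspace, int slop_shift, uint64_t max_slop,
    uint64_t min_slop)
{
	uint64_t slop = MIN(rdspace >> slop_shift, max_slop);
	/* E.g. a 1 TiB pool with shift 5: MIN(32 GiB, max_slop) = 32 GiB. */
	return (MAX(slop, MIN(rdspace >> 1, min_slop)));
}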
@@ -1912,8 +1906,7 @@ spa_get_checkpoint_space(spa_t *spa)
 void
 spa_update_dspace(spa_t *spa)
 {
-	spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
-	    ddt_get_dedup_dspace(spa) + brt_get_dspace(spa);
+	spa->spa_rdspace = metaslab_class_get_dspace(spa_normal_class(spa));
 	if (spa->spa_nonallocating_dspace > 0) {
 		/*
 		 * Subtract the space provided by all non-allocating vdevs that
@@ -1933,9 +1926,11 @@ spa_update_dspace(spa_t *spa)
 		 * doesn't matter that the data we are moving may be
 		 * allocated twice (on the old device and the new device).
 		 */
-		ASSERT3U(spa->spa_dspace, >=, spa->spa_nonallocating_dspace);
-		spa->spa_dspace -= spa->spa_nonallocating_dspace;
+		ASSERT3U(spa->spa_rdspace, >=, spa->spa_nonallocating_dspace);
+		spa->spa_rdspace -= spa->spa_nonallocating_dspace;
 	}
+	spa->spa_dspace = spa->spa_rdspace + ddt_get_dedup_dspace(spa) +
+	    brt_get_dspace(spa);
 }
 
 /*
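Restated, spa_update_dspace() now computes the raw figure first and derives the dedup-inclusive one from it, preserving the old value of spa_dspace. A hedged sketch of the invariant, with the class/DDT/BRT queries stood in as parameters and a hypothetical helper name:

/*
 * Hedged sketch of the invariant spa_update_dspace() maintains after
 * this change.  The parameters stand in for
 * metaslab_class_get_dspace(), ddt_get_dedup_dspace() and
 * brt_get_dspace().
 */
static void
spa_update_dspace_sketch(spa_t *spa, uint64_t normal_dspace,
    uint64_t dedup_dspace, uint64_t brt_dspace)
{
	spa->spa_rdspace = normal_dspace;
	if (spa->spa_nonallocating_dspace > 0) {
		ASSERT3U(spa->spa_rdspace, >=, spa->spa_nonallocating_dspace);
		spa->spa_rdspace -= spa->spa_nonallocating_dspace;
	}
	/* Dedup/clone logical savings count toward dspace, not rdspace. */
	spa->spa_dspace = spa->spa_rdspace + dedup_dspace + brt_dspace;
}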