mirror of
				https://git.proxmox.com/git/mirror_zfs
				synced 2025-10-25 20:28:24 +00:00 
			
		
		
		
	Switch refcount tracking from lists to AVL-trees.
With large number of tracked references list searches under the lock become too expensive, creating enormous lock contention. On my tests with ZFS_DEBUG enabled this increases write throughput with 32KB blocks from ~1.2GB/s to ~7.5GB/s. Reviewed-by: Brian Atkinson <batkinson@lanl.gov> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc. Closes #14970
This commit is contained in:
		
							parent
							
								
									8af1104f83
								
							
						
					
					
						commit
						d057807ede
					
				| @ -27,6 +27,7 @@ | ||||
| #define	_SYS_ZFS_REFCOUNT_H | ||||
| 
 | ||||
| #include <sys/inttypes.h> | ||||
| #include <sys/avl.h> | ||||
| #include <sys/list.h> | ||||
| #include <sys/zfs_context.h> | ||||
| 
 | ||||
| @ -43,19 +44,22 @@ extern "C" { | ||||
| 
 | ||||
| #ifdef	ZFS_DEBUG | ||||
| typedef struct reference { | ||||
| 	list_node_t ref_link; | ||||
| 	union { | ||||
| 		avl_node_t a; | ||||
| 		list_node_t l; | ||||
| 	} ref_link; | ||||
| 	const void *ref_holder; | ||||
| 	uint64_t ref_number; | ||||
| 	uint8_t *ref_removed; | ||||
| 	boolean_t ref_search; | ||||
| } reference_t; | ||||
| 
 | ||||
| typedef struct refcount { | ||||
| 	kmutex_t rc_mtx; | ||||
| 	boolean_t rc_tracked; | ||||
| 	list_t rc_list; | ||||
| 	list_t rc_removed; | ||||
| 	uint64_t rc_count; | ||||
| 	uint64_t rc_removed_count; | ||||
| 	kmutex_t rc_mtx; | ||||
| 	avl_tree_t rc_tree; | ||||
| 	list_t rc_removed; | ||||
| 	uint_t rc_removed_count; | ||||
| 	boolean_t rc_tracked; | ||||
| } zfs_refcount_t; | ||||
| 
 | ||||
| /*
 | ||||
|  | ||||
| @ -36,33 +36,40 @@ int reference_tracking_enable = B_FALSE; | ||||
| static uint_t reference_history = 3; /* tunable */ | ||||
| 
 | ||||
| static kmem_cache_t *reference_cache; | ||||
| static kmem_cache_t *reference_history_cache; | ||||
| 
 | ||||
| void | ||||
| zfs_refcount_init(void) | ||||
| { | ||||
| 	reference_cache = kmem_cache_create("reference_cache", | ||||
| 	    sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0); | ||||
| 
 | ||||
| 	reference_history_cache = kmem_cache_create("reference_history_cache", | ||||
| 	    sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0); | ||||
| } | ||||
| 
 | ||||
| void | ||||
| zfs_refcount_fini(void) | ||||
| { | ||||
| 	kmem_cache_destroy(reference_cache); | ||||
| 	kmem_cache_destroy(reference_history_cache); | ||||
| } | ||||
| 
 | ||||
| static int | ||||
| zfs_refcount_compare(const void *x1, const void *x2) | ||||
| { | ||||
| 	const reference_t *r1 = (const reference_t *)x1; | ||||
| 	const reference_t *r2 = (const reference_t *)x2; | ||||
| 
 | ||||
| 	int cmp1 = TREE_CMP(r1->ref_holder, r2->ref_holder); | ||||
| 	int cmp2 = TREE_CMP(r1->ref_number, r2->ref_number); | ||||
| 	int cmp = cmp1 ? cmp1 : cmp2; | ||||
| 	return ((cmp || r1->ref_search) ? cmp : TREE_PCMP(r1, r2)); | ||||
| } | ||||
| 
 | ||||
| void | ||||
| zfs_refcount_create(zfs_refcount_t *rc) | ||||
| { | ||||
| 	mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL); | ||||
| 	list_create(&rc->rc_list, sizeof (reference_t), | ||||
| 	    offsetof(reference_t, ref_link)); | ||||
| 	avl_create(&rc->rc_tree, zfs_refcount_compare, sizeof (reference_t), | ||||
| 	    offsetof(reference_t, ref_link.a)); | ||||
| 	list_create(&rc->rc_removed, sizeof (reference_t), | ||||
| 	    offsetof(reference_t, ref_link)); | ||||
| 	    offsetof(reference_t, ref_link.l)); | ||||
| 	rc->rc_count = 0; | ||||
| 	rc->rc_removed_count = 0; | ||||
| 	rc->rc_tracked = reference_tracking_enable; | ||||
| @ -86,16 +93,15 @@ void | ||||
| zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number) | ||||
| { | ||||
| 	reference_t *ref; | ||||
| 	void *cookie = NULL; | ||||
| 
 | ||||
| 	ASSERT3U(rc->rc_count, ==, number); | ||||
| 	while ((ref = list_remove_head(&rc->rc_list))) | ||||
| 	while ((ref = avl_destroy_nodes(&rc->rc_tree, &cookie)) != NULL) | ||||
| 		kmem_cache_free(reference_cache, ref); | ||||
| 	list_destroy(&rc->rc_list); | ||||
| 	avl_destroy(&rc->rc_tree); | ||||
| 
 | ||||
| 	while ((ref = list_remove_head(&rc->rc_removed))) { | ||||
| 		kmem_cache_free(reference_history_cache, ref->ref_removed); | ||||
| 	while ((ref = list_remove_head(&rc->rc_removed))) | ||||
| 		kmem_cache_free(reference_cache, ref); | ||||
| 	} | ||||
| 	list_destroy(&rc->rc_removed); | ||||
| 	mutex_destroy(&rc->rc_mtx); | ||||
| } | ||||
| @ -121,10 +127,10 @@ zfs_refcount_count(zfs_refcount_t *rc) | ||||
| int64_t | ||||
| zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder) | ||||
| { | ||||
| 	reference_t *ref = NULL; | ||||
| 	reference_t *ref; | ||||
| 	int64_t count; | ||||
| 
 | ||||
| 	if (!rc->rc_tracked) { | ||||
| 	if (likely(!rc->rc_tracked)) { | ||||
| 		count = atomic_add_64_nv(&(rc)->rc_count, number); | ||||
| 		ASSERT3U(count, >=, number); | ||||
| 		return (count); | ||||
| @ -133,8 +139,9 @@ zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder) | ||||
| 	ref = kmem_cache_alloc(reference_cache, KM_SLEEP); | ||||
| 	ref->ref_holder = holder; | ||||
| 	ref->ref_number = number; | ||||
| 	ref->ref_search = B_FALSE; | ||||
| 	mutex_enter(&rc->rc_mtx); | ||||
| 	list_insert_head(&rc->rc_list, ref); | ||||
| 	avl_add(&rc->rc_tree, ref); | ||||
| 	rc->rc_count += number; | ||||
| 	count = rc->rc_count; | ||||
| 	mutex_exit(&rc->rc_mtx); | ||||
| @ -151,7 +158,7 @@ zfs_refcount_add(zfs_refcount_t *rc, const void *holder) | ||||
| void | ||||
| zfs_refcount_add_few(zfs_refcount_t *rc, uint64_t number, const void *holder) | ||||
| { | ||||
| 	if (!rc->rc_tracked) | ||||
| 	if (likely(!rc->rc_tracked)) | ||||
| 		(void) zfs_refcount_add_many(rc, number, holder); | ||||
| 	else for (; number > 0; number--) | ||||
| 		(void) zfs_refcount_add(rc, holder); | ||||
| @ -161,34 +168,34 @@ int64_t | ||||
| zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number, | ||||
|     const void *holder) | ||||
| { | ||||
| 	reference_t *ref; | ||||
| 	reference_t *ref, s; | ||||
| 	int64_t count; | ||||
| 
 | ||||
| 	if (!rc->rc_tracked) { | ||||
| 	if (likely(!rc->rc_tracked)) { | ||||
| 		count = atomic_add_64_nv(&(rc)->rc_count, -number); | ||||
| 		ASSERT3S(count, >=, 0); | ||||
| 		return (count); | ||||
| 	} | ||||
| 
 | ||||
| 	s.ref_holder = holder; | ||||
| 	s.ref_number = number; | ||||
| 	s.ref_search = B_TRUE; | ||||
| 	mutex_enter(&rc->rc_mtx); | ||||
| 	ASSERT3U(rc->rc_count, >=, number); | ||||
| 	for (ref = list_head(&rc->rc_list); ref; | ||||
| 	    ref = list_next(&rc->rc_list, ref)) { | ||||
| 		if (ref->ref_holder == holder && ref->ref_number == number) { | ||||
| 			list_remove(&rc->rc_list, ref); | ||||
| 	ref = avl_find(&rc->rc_tree, &s, NULL); | ||||
| 	if (unlikely(ref == NULL)) { | ||||
| 		panic("No such hold %p on refcount %llx", holder, | ||||
| 		    (u_longlong_t)(uintptr_t)rc); | ||||
| 		return (-1); | ||||
| 	} | ||||
| 	avl_remove(&rc->rc_tree, ref); | ||||
| 	if (reference_history > 0) { | ||||
| 				ref->ref_removed = | ||||
| 				    kmem_cache_alloc(reference_history_cache, | ||||
| 				    KM_SLEEP); | ||||
| 		list_insert_head(&rc->rc_removed, ref); | ||||
| 				rc->rc_removed_count++; | ||||
| 				if (rc->rc_removed_count > reference_history) { | ||||
| 					ref = list_tail(&rc->rc_removed); | ||||
| 					list_remove(&rc->rc_removed, ref); | ||||
| 					kmem_cache_free(reference_history_cache, | ||||
| 					    ref->ref_removed); | ||||
| 		if (rc->rc_removed_count >= reference_history) { | ||||
| 			ref = list_remove_tail(&rc->rc_removed); | ||||
| 			kmem_cache_free(reference_cache, ref); | ||||
| 					rc->rc_removed_count--; | ||||
| 		} else { | ||||
| 			rc->rc_removed_count++; | ||||
| 		} | ||||
| 	} else { | ||||
| 		kmem_cache_free(reference_cache, ref); | ||||
| @ -198,11 +205,6 @@ zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number, | ||||
| 	mutex_exit(&rc->rc_mtx); | ||||
| 	return (count); | ||||
| } | ||||
| 	} | ||||
| 	panic("No such hold %p on refcount %llx", holder, | ||||
| 	    (u_longlong_t)(uintptr_t)rc); | ||||
| 	return (-1); | ||||
| } | ||||
| 
 | ||||
| int64_t | ||||
| zfs_refcount_remove(zfs_refcount_t *rc, const void *holder) | ||||
| @ -213,7 +215,7 @@ zfs_refcount_remove(zfs_refcount_t *rc, const void *holder) | ||||
| void | ||||
| zfs_refcount_remove_few(zfs_refcount_t *rc, uint64_t number, const void *holder) | ||||
| { | ||||
| 	if (!rc->rc_tracked) | ||||
| 	if (likely(!rc->rc_tracked)) | ||||
| 		(void) zfs_refcount_remove_many(rc, number, holder); | ||||
| 	else for (; number > 0; number--) | ||||
| 		(void) zfs_refcount_remove(rc, holder); | ||||
| @ -222,31 +224,38 @@ zfs_refcount_remove_few(zfs_refcount_t *rc, uint64_t number, const void *holder) | ||||
| void | ||||
| zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src) | ||||
| { | ||||
| 	int64_t count, removed_count; | ||||
| 	list_t list, removed; | ||||
| 	avl_tree_t tree; | ||||
| 	list_t removed; | ||||
| 	reference_t *ref; | ||||
| 	void *cookie = NULL; | ||||
| 	uint64_t count; | ||||
| 	uint_t removed_count; | ||||
| 
 | ||||
| 	list_create(&list, sizeof (reference_t), | ||||
| 	    offsetof(reference_t, ref_link)); | ||||
| 	avl_create(&tree, zfs_refcount_compare, sizeof (reference_t), | ||||
| 	    offsetof(reference_t, ref_link.a)); | ||||
| 	list_create(&removed, sizeof (reference_t), | ||||
| 	    offsetof(reference_t, ref_link)); | ||||
| 	    offsetof(reference_t, ref_link.l)); | ||||
| 
 | ||||
| 	mutex_enter(&src->rc_mtx); | ||||
| 	count = src->rc_count; | ||||
| 	removed_count = src->rc_removed_count; | ||||
| 	src->rc_count = 0; | ||||
| 	src->rc_removed_count = 0; | ||||
| 	list_move_tail(&list, &src->rc_list); | ||||
| 	avl_swap(&tree, &src->rc_tree); | ||||
| 	list_move_tail(&removed, &src->rc_removed); | ||||
| 	mutex_exit(&src->rc_mtx); | ||||
| 
 | ||||
| 	mutex_enter(&dst->rc_mtx); | ||||
| 	dst->rc_count += count; | ||||
| 	dst->rc_removed_count += removed_count; | ||||
| 	list_move_tail(&dst->rc_list, &list); | ||||
| 	if (avl_is_empty(&dst->rc_tree)) | ||||
| 		avl_swap(&dst->rc_tree, &tree); | ||||
| 	else while ((ref = avl_destroy_nodes(&tree, &cookie)) != NULL) | ||||
| 		avl_add(&dst->rc_tree, ref); | ||||
| 	list_move_tail(&dst->rc_removed, &removed); | ||||
| 	mutex_exit(&dst->rc_mtx); | ||||
| 
 | ||||
| 	list_destroy(&list); | ||||
| 	avl_destroy(&tree); | ||||
| 	list_destroy(&removed); | ||||
| } | ||||
| 
 | ||||
| @ -254,23 +263,19 @@ void | ||||
| zfs_refcount_transfer_ownership_many(zfs_refcount_t *rc, uint64_t number, | ||||
|     const void *current_holder, const void *new_holder) | ||||
| { | ||||
| 	reference_t *ref; | ||||
| 	boolean_t found = B_FALSE; | ||||
| 	reference_t *ref, s; | ||||
| 
 | ||||
| 	if (!rc->rc_tracked) | ||||
| 	if (likely(!rc->rc_tracked)) | ||||
| 		return; | ||||
| 
 | ||||
| 	s.ref_holder = current_holder; | ||||
| 	s.ref_number = number; | ||||
| 	s.ref_search = B_TRUE; | ||||
| 	mutex_enter(&rc->rc_mtx); | ||||
| 	for (ref = list_head(&rc->rc_list); ref; | ||||
| 	    ref = list_next(&rc->rc_list, ref)) { | ||||
| 		if (ref->ref_holder == current_holder && | ||||
| 		    ref->ref_number == number) { | ||||
| 	ref = avl_find(&rc->rc_tree, &s, NULL); | ||||
| 	ASSERT(ref); | ||||
| 	ref->ref_holder = new_holder; | ||||
| 			found = B_TRUE; | ||||
| 			break; | ||||
| 		} | ||||
| 	} | ||||
| 	ASSERT(found); | ||||
| 	avl_update(&rc->rc_tree, ref); | ||||
| 	mutex_exit(&rc->rc_mtx); | ||||
| } | ||||
| 
 | ||||
| @ -290,21 +295,23 @@ zfs_refcount_transfer_ownership(zfs_refcount_t *rc, const void *current_holder, | ||||
| boolean_t | ||||
| zfs_refcount_held(zfs_refcount_t *rc, const void *holder) | ||||
| { | ||||
| 	reference_t *ref; | ||||
| 	reference_t *ref, s; | ||||
| 	avl_index_t idx; | ||||
| 	boolean_t res; | ||||
| 
 | ||||
| 	if (!rc->rc_tracked) | ||||
| 	if (likely(!rc->rc_tracked)) | ||||
| 		return (zfs_refcount_count(rc) > 0); | ||||
| 
 | ||||
| 	s.ref_holder = holder; | ||||
| 	s.ref_number = 0; | ||||
| 	s.ref_search = B_TRUE; | ||||
| 	mutex_enter(&rc->rc_mtx); | ||||
| 	for (ref = list_head(&rc->rc_list); ref; | ||||
| 	    ref = list_next(&rc->rc_list, ref)) { | ||||
| 		if (ref->ref_holder == holder) { | ||||
| 	ref = avl_find(&rc->rc_tree, &s, &idx); | ||||
| 	if (likely(ref == NULL)) | ||||
| 		ref = avl_nearest(&rc->rc_tree, idx, AVL_AFTER); | ||||
| 	res = ref && ref->ref_holder == holder; | ||||
| 	mutex_exit(&rc->rc_mtx); | ||||
| 			return (B_TRUE); | ||||
| 		} | ||||
| 	} | ||||
| 	mutex_exit(&rc->rc_mtx); | ||||
| 	return (B_FALSE); | ||||
| 	return (res); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
| @ -315,21 +322,23 @@ zfs_refcount_held(zfs_refcount_t *rc, const void *holder) | ||||
| boolean_t | ||||
| zfs_refcount_not_held(zfs_refcount_t *rc, const void *holder) | ||||
| { | ||||
| 	reference_t *ref; | ||||
| 	reference_t *ref, s; | ||||
| 	avl_index_t idx; | ||||
| 	boolean_t res; | ||||
| 
 | ||||
| 	if (!rc->rc_tracked) | ||||
| 	if (likely(!rc->rc_tracked)) | ||||
| 		return (B_TRUE); | ||||
| 
 | ||||
| 	mutex_enter(&rc->rc_mtx); | ||||
| 	for (ref = list_head(&rc->rc_list); ref; | ||||
| 	    ref = list_next(&rc->rc_list, ref)) { | ||||
| 		if (ref->ref_holder == holder) { | ||||
| 	s.ref_holder = holder; | ||||
| 	s.ref_number = 0; | ||||
| 	s.ref_search = B_TRUE; | ||||
| 	ref = avl_find(&rc->rc_tree, &s, &idx); | ||||
| 	if (likely(ref == NULL)) | ||||
| 		ref = avl_nearest(&rc->rc_tree, idx, AVL_AFTER); | ||||
| 	res = ref == NULL || ref->ref_holder != holder; | ||||
| 	mutex_exit(&rc->rc_mtx); | ||||
| 			return (B_FALSE); | ||||
| 		} | ||||
| 	} | ||||
| 	mutex_exit(&rc->rc_mtx); | ||||
| 	return (B_TRUE); | ||||
| 	return (res); | ||||
| } | ||||
| 
 | ||||
| EXPORT_SYMBOL(zfs_refcount_create); | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 Alexander Motin
						Alexander Motin