mirror of
				https://git.proxmox.com/git/mirror_zfs
				synced 2025-10-25 14:19:53 +00:00 
			
		
		
		
	ZFS send should use spill block prefetched from send_reader_thread
Currently, even though send_reader_thread prefetches spill blocks, do_dump() does not use the prefetched data and instead issues its own blocking arc_read. This causes significant performance degradation when sending datasets with lots of spill blocks. For unmodified spill blocks, we now also create a send_range struct for each of them in send_reader_thread and issue prefetches for them. We piggyback them on the dnode send_range instead of enqueueing them so that we don't break the send_range_after check. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Alexander Motin <mav@FreeBSD.org> Signed-off-by: Chunwei Chen <david.chen@nutanix.com> Co-authored-by: david.chen <david.chen@nutanix.com> Closes #16701
This commit is contained in:
		
							parent
							
								
									7b6e9675da
								
							
						
					
					
						commit
						5945676bcc
					
				| @ -180,6 +180,8 @@ struct send_range { | |||||||
| 			 */ | 			 */ | ||||||
| 			dnode_phys_t		*dnp; | 			dnode_phys_t		*dnp; | ||||||
| 			blkptr_t		bp; | 			blkptr_t		bp; | ||||||
|  | 			/* Piggyback unmodified spill block */ | ||||||
|  | 			struct send_range	*spill_range; | ||||||
| 		} object; | 		} object; | ||||||
| 		struct srr { | 		struct srr { | ||||||
| 			uint32_t		datablksz; | 			uint32_t		datablksz; | ||||||
| @ -231,6 +233,8 @@ range_free(struct send_range *range) | |||||||
| 		size_t size = sizeof (dnode_phys_t) * | 		size_t size = sizeof (dnode_phys_t) * | ||||||
| 		    (range->sru.object.dnp->dn_extra_slots + 1); | 		    (range->sru.object.dnp->dn_extra_slots + 1); | ||||||
| 		kmem_free(range->sru.object.dnp, size); | 		kmem_free(range->sru.object.dnp, size); | ||||||
|  | 		if (range->sru.object.spill_range) | ||||||
|  | 			range_free(range->sru.object.spill_range); | ||||||
| 	} else if (range->type == DATA) { | 	} else if (range->type == DATA) { | ||||||
| 		mutex_enter(&range->sru.data.lock); | 		mutex_enter(&range->sru.data.lock); | ||||||
| 		while (range->sru.data.io_outstanding) | 		while (range->sru.data.io_outstanding) | ||||||
| @ -617,7 +621,7 @@ dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, | |||||||
| 	drrs->drr_length = blksz; | 	drrs->drr_length = blksz; | ||||||
| 	drrs->drr_toguid = dscp->dsc_toguid; | 	drrs->drr_toguid = dscp->dsc_toguid; | ||||||
| 
 | 
 | ||||||
| 	/* See comment in dump_dnode() for full details */ | 	/* See comment in piggyback_unmodified_spill() for full details */ | ||||||
| 	if (zfs_send_unmodified_spill_blocks && | 	if (zfs_send_unmodified_spill_blocks && | ||||||
| 	    (BP_GET_LOGICAL_BIRTH(bp) <= dscp->dsc_fromtxg)) { | 	    (BP_GET_LOGICAL_BIRTH(bp) <= dscp->dsc_fromtxg)) { | ||||||
| 		drrs->drr_flags |= DRR_SPILL_UNMODIFIED; | 		drrs->drr_flags |= DRR_SPILL_UNMODIFIED; | ||||||
| @ -793,35 +797,6 @@ dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, | |||||||
| 	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0) | 	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0) | ||||||
| 		return (SET_ERROR(EINTR)); | 		return (SET_ERROR(EINTR)); | ||||||
| 
 | 
 | ||||||
| 	/*
 |  | ||||||
| 	 * Send DRR_SPILL records for unmodified spill blocks.	This is useful |  | ||||||
| 	 * because changing certain attributes of the object (e.g. blocksize) |  | ||||||
| 	 * can cause old versions of ZFS to incorrectly remove a spill block. |  | ||||||
| 	 * Including these records in the stream forces an up to date version |  | ||||||
| 	 * to always be written ensuring they're never lost.  Current versions |  | ||||||
| 	 * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can |  | ||||||
| 	 * ignore these unmodified spill blocks. |  | ||||||
| 	 */ |  | ||||||
| 	if (zfs_send_unmodified_spill_blocks && |  | ||||||
| 	    (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) && |  | ||||||
| 	    (BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= dscp->dsc_fromtxg)) { |  | ||||||
| 		struct send_range record; |  | ||||||
| 		blkptr_t *bp = DN_SPILL_BLKPTR(dnp); |  | ||||||
| 
 |  | ||||||
| 		memset(&record, 0, sizeof (struct send_range)); |  | ||||||
| 		record.type = DATA; |  | ||||||
| 		record.object = object; |  | ||||||
| 		record.eos_marker = B_FALSE; |  | ||||||
| 		record.start_blkid = DMU_SPILL_BLKID; |  | ||||||
| 		record.end_blkid = record.start_blkid + 1; |  | ||||||
| 		record.sru.data.bp = *bp; |  | ||||||
| 		record.sru.data.obj_type = dnp->dn_type; |  | ||||||
| 		record.sru.data.datablksz = BP_GET_LSIZE(bp); |  | ||||||
| 
 |  | ||||||
| 		if (do_dump(dscp, &record) != 0) |  | ||||||
| 			return (SET_ERROR(EINTR)); |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	if (dscp->dsc_err != 0) | 	if (dscp->dsc_err != 0) | ||||||
| 		return (SET_ERROR(EINTR)); | 		return (SET_ERROR(EINTR)); | ||||||
| 
 | 
 | ||||||
| @ -911,6 +886,9 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) | |||||||
| 	case OBJECT: | 	case OBJECT: | ||||||
| 		err = dump_dnode(dscp, &range->sru.object.bp, range->object, | 		err = dump_dnode(dscp, &range->sru.object.bp, range->object, | ||||||
| 		    range->sru.object.dnp); | 		    range->sru.object.dnp); | ||||||
|  | 		/* Dump piggybacked unmodified spill block */ | ||||||
|  | 		if (!err && range->sru.object.spill_range) | ||||||
|  | 			err = do_dump(dscp, range->sru.object.spill_range); | ||||||
| 		return (err); | 		return (err); | ||||||
| 	case OBJECT_RANGE: { | 	case OBJECT_RANGE: { | ||||||
| 		ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); | 		ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); | ||||||
| @ -939,34 +917,7 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) | |||||||
| 
 | 
 | ||||||
| 		ASSERT3U(srdp->datablksz, ==, BP_GET_LSIZE(bp)); | 		ASSERT3U(srdp->datablksz, ==, BP_GET_LSIZE(bp)); | ||||||
| 		ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); | 		ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); | ||||||
| 		if (BP_GET_TYPE(bp) == DMU_OT_SA) { |  | ||||||
| 			arc_flags_t aflags = ARC_FLAG_WAIT; |  | ||||||
| 			zio_flag_t zioflags = ZIO_FLAG_CANFAIL; |  | ||||||
| 
 | 
 | ||||||
| 			if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) { |  | ||||||
| 				ASSERT(BP_IS_PROTECTED(bp)); |  | ||||||
| 				zioflags |= ZIO_FLAG_RAW; |  | ||||||
| 			} |  | ||||||
| 
 |  | ||||||
| 			zbookmark_phys_t zb; |  | ||||||
| 			ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID); |  | ||||||
| 			zb.zb_objset = dmu_objset_id(dscp->dsc_os); |  | ||||||
| 			zb.zb_object = range->object; |  | ||||||
| 			zb.zb_level = 0; |  | ||||||
| 			zb.zb_blkid = range->start_blkid; |  | ||||||
| 
 |  | ||||||
| 			arc_buf_t *abuf = NULL; |  | ||||||
| 			if (!dscp->dsc_dso->dso_dryrun && arc_read(NULL, spa, |  | ||||||
| 			    bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, |  | ||||||
| 			    zioflags, &aflags, &zb) != 0) |  | ||||||
| 				return (SET_ERROR(EIO)); |  | ||||||
| 
 |  | ||||||
| 			err = dump_spill(dscp, bp, zb.zb_object, |  | ||||||
| 			    (abuf == NULL ? NULL : abuf->b_data)); |  | ||||||
| 			if (abuf != NULL) |  | ||||||
| 				arc_buf_destroy(abuf, &abuf); |  | ||||||
| 			return (err); |  | ||||||
| 		} |  | ||||||
| 		if (send_do_embed(bp, dscp->dsc_featureflags)) { | 		if (send_do_embed(bp, dscp->dsc_featureflags)) { | ||||||
| 			err = dump_write_embedded(dscp, range->object, | 			err = dump_write_embedded(dscp, range->object, | ||||||
| 			    range->start_blkid * srdp->datablksz, | 			    range->start_blkid * srdp->datablksz, | ||||||
| @ -975,8 +926,9 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) | |||||||
| 		} | 		} | ||||||
| 		ASSERT(range->object > dscp->dsc_resume_object || | 		ASSERT(range->object > dscp->dsc_resume_object || | ||||||
| 		    (range->object == dscp->dsc_resume_object && | 		    (range->object == dscp->dsc_resume_object && | ||||||
|  | 		    (range->start_blkid == DMU_SPILL_BLKID || | ||||||
| 		    range->start_blkid * srdp->datablksz >= | 		    range->start_blkid * srdp->datablksz >= | ||||||
| 		    dscp->dsc_resume_offset)); | 		    dscp->dsc_resume_offset))); | ||||||
| 		/* it's a level-0 block of a regular object */ | 		/* it's a level-0 block of a regular object */ | ||||||
| 
 | 
 | ||||||
| 		mutex_enter(&srdp->lock); | 		mutex_enter(&srdp->lock); | ||||||
| @ -1006,8 +958,6 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) | |||||||
| 		ASSERT(dscp->dsc_dso->dso_dryrun || | 		ASSERT(dscp->dsc_dso->dso_dryrun || | ||||||
| 		    srdp->abuf != NULL || srdp->abd != NULL); | 		    srdp->abuf != NULL || srdp->abd != NULL); | ||||||
| 
 | 
 | ||||||
| 		uint64_t offset = range->start_blkid * srdp->datablksz; |  | ||||||
| 
 |  | ||||||
| 		char *data = NULL; | 		char *data = NULL; | ||||||
| 		if (srdp->abd != NULL) { | 		if (srdp->abd != NULL) { | ||||||
| 			data = abd_to_buf(srdp->abd); | 			data = abd_to_buf(srdp->abd); | ||||||
| @ -1016,6 +966,14 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) | |||||||
| 			data = srdp->abuf->b_data; | 			data = srdp->abuf->b_data; | ||||||
| 		} | 		} | ||||||
| 
 | 
 | ||||||
|  | 		if (BP_GET_TYPE(bp) == DMU_OT_SA) { | ||||||
|  | 			ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID); | ||||||
|  | 			err = dump_spill(dscp, bp, range->object, data); | ||||||
|  | 			return (err); | ||||||
|  | 		} | ||||||
|  | 
 | ||||||
|  | 		uint64_t offset = range->start_blkid * srdp->datablksz; | ||||||
|  | 
 | ||||||
| 		/*
 | 		/*
 | ||||||
| 		 * If we have large blocks stored on disk but the send flags | 		 * If we have large blocks stored on disk but the send flags | ||||||
| 		 * don't allow us to send large blocks, we split the data from | 		 * don't allow us to send large blocks, we split the data from | ||||||
| @ -1098,6 +1056,8 @@ range_alloc(enum type type, uint64_t object, uint64_t start_blkid, | |||||||
| 		range->sru.data.io_outstanding = 0; | 		range->sru.data.io_outstanding = 0; | ||||||
| 		range->sru.data.io_err = 0; | 		range->sru.data.io_err = 0; | ||||||
| 		range->sru.data.io_compressed = B_FALSE; | 		range->sru.data.io_compressed = B_FALSE; | ||||||
|  | 	} else if (type == OBJECT) { | ||||||
|  | 		range->sru.object.spill_range = NULL; | ||||||
| 	} | 	} | ||||||
| 	return (range); | 	return (range); | ||||||
| } | } | ||||||
| @ -1742,6 +1702,45 @@ enqueue_range(struct send_reader_thread_arg *srta, bqueue_t *q, dnode_t *dn, | |||||||
| 	bqueue_enqueue(q, range, datablksz); | 	bqueue_enqueue(q, range, datablksz); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | /*
 | ||||||
|  |  * Send DRR_SPILL records for unmodified spill blocks.	This is useful | ||||||
|  |  * because changing certain attributes of the object (e.g. blocksize) | ||||||
|  |  * can cause old versions of ZFS to incorrectly remove a spill block. | ||||||
|  |  * Including these records in the stream forces an up to date version | ||||||
|  |  * to always be written ensuring they're never lost.  Current versions | ||||||
|  |  * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can | ||||||
|  |  * ignore these unmodified spill blocks. | ||||||
|  |  * | ||||||
|  |  * We piggyback the spill_range to dnode range instead of enqueueing it | ||||||
|  |  * so send_range_after won't complain. | ||||||
|  |  */ | ||||||
|  | static uint64_t | ||||||
|  | piggyback_unmodified_spill(struct send_reader_thread_arg *srta, | ||||||
|  |     struct send_range *range) | ||||||
|  | { | ||||||
|  | 	ASSERT3U(range->type, ==, OBJECT); | ||||||
|  | 
 | ||||||
|  | 	dnode_phys_t *dnp = range->sru.object.dnp; | ||||||
|  | 	uint64_t fromtxg = srta->smta->to_arg->fromtxg; | ||||||
|  | 
 | ||||||
|  | 	if (!zfs_send_unmodified_spill_blocks || | ||||||
|  | 	    !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) || | ||||||
|  | 	    !(BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= fromtxg)) | ||||||
|  | 		return (0); | ||||||
|  | 
 | ||||||
|  | 	blkptr_t *bp = DN_SPILL_BLKPTR(dnp); | ||||||
|  | 	struct send_range *spill_range = range_alloc(DATA, range->object, | ||||||
|  | 	    DMU_SPILL_BLKID, DMU_SPILL_BLKID+1, B_FALSE); | ||||||
|  | 	spill_range->sru.data.bp = *bp; | ||||||
|  | 	spill_range->sru.data.obj_type = dnp->dn_type; | ||||||
|  | 	spill_range->sru.data.datablksz = BP_GET_LSIZE(bp); | ||||||
|  | 
 | ||||||
|  | 	issue_data_read(srta, spill_range); | ||||||
|  | 	range->sru.object.spill_range = spill_range; | ||||||
|  | 
 | ||||||
|  | 	return (BP_GET_LSIZE(bp)); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| /*
 | /*
 | ||||||
|  * This thread is responsible for two things: First, it retrieves the correct |  * This thread is responsible for two things: First, it retrieves the correct | ||||||
|  * blkptr in the to ds if we need to send the data because of something from |  * blkptr in the to ds if we need to send the data because of something from | ||||||
| @ -1773,17 +1772,20 @@ send_reader_thread(void *arg) | |||||||
| 	uint64_t last_obj_exists = B_TRUE; | 	uint64_t last_obj_exists = B_TRUE; | ||||||
| 	while (!range->eos_marker && !srta->cancel && smta->error == 0 && | 	while (!range->eos_marker && !srta->cancel && smta->error == 0 && | ||||||
| 	    err == 0) { | 	    err == 0) { | ||||||
|  | 		uint64_t spill = 0; | ||||||
| 		switch (range->type) { | 		switch (range->type) { | ||||||
| 		case DATA: | 		case DATA: | ||||||
| 			issue_data_read(srta, range); | 			issue_data_read(srta, range); | ||||||
| 			bqueue_enqueue(outq, range, range->sru.data.datablksz); | 			bqueue_enqueue(outq, range, range->sru.data.datablksz); | ||||||
| 			range = get_next_range_nofree(inq, range); | 			range = get_next_range_nofree(inq, range); | ||||||
| 			break; | 			break; | ||||||
| 		case HOLE: |  | ||||||
| 		case OBJECT: | 		case OBJECT: | ||||||
|  | 			spill = piggyback_unmodified_spill(srta, range); | ||||||
|  | 			zfs_fallthrough; | ||||||
|  | 		case HOLE: | ||||||
| 		case OBJECT_RANGE: | 		case OBJECT_RANGE: | ||||||
| 		case REDACT: // Redacted blocks must exist
 | 		case REDACT: // Redacted blocks must exist
 | ||||||
| 			bqueue_enqueue(outq, range, sizeof (*range)); | 			bqueue_enqueue(outq, range, sizeof (*range) + spill); | ||||||
| 			range = get_next_range_nofree(inq, range); | 			range = get_next_range_nofree(inq, range); | ||||||
| 			break; | 			break; | ||||||
| 		case PREVIOUSLY_REDACTED: { | 		case PREVIOUSLY_REDACTED: { | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 Chunwei Chen
						Chunwei Chen