mirror of
				https://git.proxmox.com/git/mirror_zfs
				synced 2025-10-26 00:05:17 +00:00 
			
		
		
		
	ZFS send should use spill block prefetched from send_reader_thread
Currently, even though send_reader_thread prefetches spill block, do_dump() will not use it and issues its own blocking arc_read. This causes significant performance degradation when sending datasets with lots of spill blocks. For unmodified spill blocks, we also create send_range struct for them in send_reader_thread and issue prefetches for them. We piggyback them on the dnode send_range instead of enqueueing them so we don't break send_range_after check. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Alexander Motin <mav@FreeBSD.org> Signed-off-by: Chunwei Chen <david.chen@nutanix.com> Co-authored-by: david.chen <david.chen@nutanix.com> Closes #16701
This commit is contained in:
		
							parent
							
								
									7b6e9675da
								
							
						
					
					
						commit
						5945676bcc
					
				| @ -180,6 +180,8 @@ struct send_range { | ||||
| 			 */ | ||||
| 			dnode_phys_t		*dnp; | ||||
| 			blkptr_t		bp; | ||||
| 			/* Piggyback unmodified spill block */ | ||||
| 			struct send_range	*spill_range; | ||||
| 		} object; | ||||
| 		struct srr { | ||||
| 			uint32_t		datablksz; | ||||
| @ -231,6 +233,8 @@ range_free(struct send_range *range) | ||||
| 		size_t size = sizeof (dnode_phys_t) * | ||||
| 		    (range->sru.object.dnp->dn_extra_slots + 1); | ||||
| 		kmem_free(range->sru.object.dnp, size); | ||||
| 		if (range->sru.object.spill_range) | ||||
| 			range_free(range->sru.object.spill_range); | ||||
| 	} else if (range->type == DATA) { | ||||
| 		mutex_enter(&range->sru.data.lock); | ||||
| 		while (range->sru.data.io_outstanding) | ||||
| @ -617,7 +621,7 @@ dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, | ||||
| 	drrs->drr_length = blksz; | ||||
| 	drrs->drr_toguid = dscp->dsc_toguid; | ||||
| 
 | ||||
| 	/* See comment in dump_dnode() for full details */ | ||||
| 	/* See comment in piggyback_unmodified_spill() for full details */ | ||||
| 	if (zfs_send_unmodified_spill_blocks && | ||||
| 	    (BP_GET_LOGICAL_BIRTH(bp) <= dscp->dsc_fromtxg)) { | ||||
| 		drrs->drr_flags |= DRR_SPILL_UNMODIFIED; | ||||
| @ -793,35 +797,6 @@ dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, | ||||
| 	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0) | ||||
| 		return (SET_ERROR(EINTR)); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Send DRR_SPILL records for unmodified spill blocks.	This is useful | ||||
| 	 * because changing certain attributes of the object (e.g. blocksize) | ||||
| 	 * can cause old versions of ZFS to incorrectly remove a spill block. | ||||
| 	 * Including these records in the stream forces an up to date version | ||||
| 	 * to always be written ensuring they're never lost.  Current versions | ||||
| 	 * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can | ||||
| 	 * ignore these unmodified spill blocks. | ||||
| 	 */ | ||||
| 	if (zfs_send_unmodified_spill_blocks && | ||||
| 	    (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) && | ||||
| 	    (BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= dscp->dsc_fromtxg)) { | ||||
| 		struct send_range record; | ||||
| 		blkptr_t *bp = DN_SPILL_BLKPTR(dnp); | ||||
| 
 | ||||
| 		memset(&record, 0, sizeof (struct send_range)); | ||||
| 		record.type = DATA; | ||||
| 		record.object = object; | ||||
| 		record.eos_marker = B_FALSE; | ||||
| 		record.start_blkid = DMU_SPILL_BLKID; | ||||
| 		record.end_blkid = record.start_blkid + 1; | ||||
| 		record.sru.data.bp = *bp; | ||||
| 		record.sru.data.obj_type = dnp->dn_type; | ||||
| 		record.sru.data.datablksz = BP_GET_LSIZE(bp); | ||||
| 
 | ||||
| 		if (do_dump(dscp, &record) != 0) | ||||
| 			return (SET_ERROR(EINTR)); | ||||
| 	} | ||||
| 
 | ||||
| 	if (dscp->dsc_err != 0) | ||||
| 		return (SET_ERROR(EINTR)); | ||||
| 
 | ||||
| @ -911,6 +886,9 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) | ||||
| 	case OBJECT: | ||||
| 		err = dump_dnode(dscp, &range->sru.object.bp, range->object, | ||||
| 		    range->sru.object.dnp); | ||||
| 		/* Dump piggybacked unmodified spill block */ | ||||
| 		if (!err && range->sru.object.spill_range) | ||||
| 			err = do_dump(dscp, range->sru.object.spill_range); | ||||
| 		return (err); | ||||
| 	case OBJECT_RANGE: { | ||||
| 		ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); | ||||
| @ -939,34 +917,7 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) | ||||
| 
 | ||||
| 		ASSERT3U(srdp->datablksz, ==, BP_GET_LSIZE(bp)); | ||||
| 		ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); | ||||
| 		if (BP_GET_TYPE(bp) == DMU_OT_SA) { | ||||
| 			arc_flags_t aflags = ARC_FLAG_WAIT; | ||||
| 			zio_flag_t zioflags = ZIO_FLAG_CANFAIL; | ||||
| 
 | ||||
| 			if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) { | ||||
| 				ASSERT(BP_IS_PROTECTED(bp)); | ||||
| 				zioflags |= ZIO_FLAG_RAW; | ||||
| 			} | ||||
| 
 | ||||
| 			zbookmark_phys_t zb; | ||||
| 			ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID); | ||||
| 			zb.zb_objset = dmu_objset_id(dscp->dsc_os); | ||||
| 			zb.zb_object = range->object; | ||||
| 			zb.zb_level = 0; | ||||
| 			zb.zb_blkid = range->start_blkid; | ||||
| 
 | ||||
| 			arc_buf_t *abuf = NULL; | ||||
| 			if (!dscp->dsc_dso->dso_dryrun && arc_read(NULL, spa, | ||||
| 			    bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, | ||||
| 			    zioflags, &aflags, &zb) != 0) | ||||
| 				return (SET_ERROR(EIO)); | ||||
| 
 | ||||
| 			err = dump_spill(dscp, bp, zb.zb_object, | ||||
| 			    (abuf == NULL ? NULL : abuf->b_data)); | ||||
| 			if (abuf != NULL) | ||||
| 				arc_buf_destroy(abuf, &abuf); | ||||
| 			return (err); | ||||
| 		} | ||||
| 		if (send_do_embed(bp, dscp->dsc_featureflags)) { | ||||
| 			err = dump_write_embedded(dscp, range->object, | ||||
| 			    range->start_blkid * srdp->datablksz, | ||||
| @ -975,8 +926,9 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) | ||||
| 		} | ||||
| 		ASSERT(range->object > dscp->dsc_resume_object || | ||||
| 		    (range->object == dscp->dsc_resume_object && | ||||
| 		    (range->start_blkid == DMU_SPILL_BLKID || | ||||
| 		    range->start_blkid * srdp->datablksz >= | ||||
| 		    dscp->dsc_resume_offset)); | ||||
| 		    dscp->dsc_resume_offset))); | ||||
| 		/* it's a level-0 block of a regular object */ | ||||
| 
 | ||||
| 		mutex_enter(&srdp->lock); | ||||
| @ -1006,8 +958,6 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) | ||||
| 		ASSERT(dscp->dsc_dso->dso_dryrun || | ||||
| 		    srdp->abuf != NULL || srdp->abd != NULL); | ||||
| 
 | ||||
| 		uint64_t offset = range->start_blkid * srdp->datablksz; | ||||
| 
 | ||||
| 		char *data = NULL; | ||||
| 		if (srdp->abd != NULL) { | ||||
| 			data = abd_to_buf(srdp->abd); | ||||
| @ -1016,6 +966,14 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) | ||||
| 			data = srdp->abuf->b_data; | ||||
| 		} | ||||
| 
 | ||||
| 		if (BP_GET_TYPE(bp) == DMU_OT_SA) { | ||||
| 			ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID); | ||||
| 			err = dump_spill(dscp, bp, range->object, data); | ||||
| 			return (err); | ||||
| 		} | ||||
| 
 | ||||
| 		uint64_t offset = range->start_blkid * srdp->datablksz; | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * If we have large blocks stored on disk but the send flags | ||||
| 		 * don't allow us to send large blocks, we split the data from | ||||
| @ -1098,6 +1056,8 @@ range_alloc(enum type type, uint64_t object, uint64_t start_blkid, | ||||
| 		range->sru.data.io_outstanding = 0; | ||||
| 		range->sru.data.io_err = 0; | ||||
| 		range->sru.data.io_compressed = B_FALSE; | ||||
| 	} else if (type == OBJECT) { | ||||
| 		range->sru.object.spill_range = NULL; | ||||
| 	} | ||||
| 	return (range); | ||||
| } | ||||
| @ -1742,6 +1702,45 @@ enqueue_range(struct send_reader_thread_arg *srta, bqueue_t *q, dnode_t *dn, | ||||
| 	bqueue_enqueue(q, range, datablksz); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Send DRR_SPILL records for unmodified spill blocks.	This is useful | ||||
|  * because changing certain attributes of the object (e.g. blocksize) | ||||
|  * can cause old versions of ZFS to incorrectly remove a spill block. | ||||
|  * Including these records in the stream forces an up to date version | ||||
|  * to always be written ensuring they're never lost.  Current versions | ||||
|  * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can | ||||
|  * ignore these unmodified spill blocks. | ||||
|  * | ||||
|  * We piggyback the spill_range to dnode range instead of enqueueing it | ||||
|  * so send_range_after won't complain. | ||||
|  */ | ||||
| static uint64_t | ||||
| piggyback_unmodified_spill(struct send_reader_thread_arg *srta, | ||||
|     struct send_range *range) | ||||
| { | ||||
| 	ASSERT3U(range->type, ==, OBJECT); | ||||
| 
 | ||||
| 	dnode_phys_t *dnp = range->sru.object.dnp; | ||||
| 	uint64_t fromtxg = srta->smta->to_arg->fromtxg; | ||||
| 
 | ||||
| 	if (!zfs_send_unmodified_spill_blocks || | ||||
| 	    !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) || | ||||
| 	    !(BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= fromtxg)) | ||||
| 		return (0); | ||||
| 
 | ||||
| 	blkptr_t *bp = DN_SPILL_BLKPTR(dnp); | ||||
| 	struct send_range *spill_range = range_alloc(DATA, range->object, | ||||
| 	    DMU_SPILL_BLKID, DMU_SPILL_BLKID+1, B_FALSE); | ||||
| 	spill_range->sru.data.bp = *bp; | ||||
| 	spill_range->sru.data.obj_type = dnp->dn_type; | ||||
| 	spill_range->sru.data.datablksz = BP_GET_LSIZE(bp); | ||||
| 
 | ||||
| 	issue_data_read(srta, spill_range); | ||||
| 	range->sru.object.spill_range = spill_range; | ||||
| 
 | ||||
| 	return (BP_GET_LSIZE(bp)); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * This thread is responsible for two things: First, it retrieves the correct | ||||
|  * blkptr in the to ds if we need to send the data because of something from | ||||
| @ -1773,17 +1772,20 @@ send_reader_thread(void *arg) | ||||
| 	uint64_t last_obj_exists = B_TRUE; | ||||
| 	while (!range->eos_marker && !srta->cancel && smta->error == 0 && | ||||
| 	    err == 0) { | ||||
| 		uint64_t spill = 0; | ||||
| 		switch (range->type) { | ||||
| 		case DATA: | ||||
| 			issue_data_read(srta, range); | ||||
| 			bqueue_enqueue(outq, range, range->sru.data.datablksz); | ||||
| 			range = get_next_range_nofree(inq, range); | ||||
| 			break; | ||||
| 		case HOLE: | ||||
| 		case OBJECT: | ||||
| 			spill = piggyback_unmodified_spill(srta, range); | ||||
| 			zfs_fallthrough; | ||||
| 		case HOLE: | ||||
| 		case OBJECT_RANGE: | ||||
| 		case REDACT: // Redacted blocks must exist
 | ||||
| 			bqueue_enqueue(outq, range, sizeof (*range)); | ||||
| 			bqueue_enqueue(outq, range, sizeof (*range) + spill); | ||||
| 			range = get_next_range_nofree(inq, range); | ||||
| 			break; | ||||
| 		case PREVIOUSLY_REDACTED: { | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 Chunwei Chen
						Chunwei Chen