mirror of
				https://git.proxmox.com/git/mirror_zfs
				synced 2025-10-26 16:21:07 +00:00 
			
		
		
		
	abd_iter_page: rework to handle multipage scatterlists
Previously, abd_iter_page() would assume that every scatterlist would contain a single page (compound or not), because that's all we ever create in abd_alloc_chunks(). However, scatterlists can contain multiple pages of arbitrary provenance, and if we get one of those, we'd get all the math wrong. This reworks things to handle multiple pages in a scatterlist, by properly finding the right page within it for the given offset, and understanding better where the end of the page is and not crossing it. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reported-by: Brian Atkinson <batkinson@lanl.gov> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Brian Atkinson <batkinson@lanl.gov> Signed-off-by: Rob Norris <rob.norris@klarasystems.com> Closes #16108
This commit is contained in:
		
							parent
							
								
									9f83eec039
								
							
						
					
					
						commit
						f4f156157d
					
				| @ -1015,10 +1015,50 @@ abd_cache_reap_now(void) | |||||||
| } | } | ||||||
| 
 | 
 | ||||||
| #if defined(_KERNEL) | #if defined(_KERNEL) | ||||||
|  | 
 | ||||||
| /*
 | /*
 | ||||||
|  * Yield the next page struct and data offset and size within it, without |  * This is abd_iter_page(), the function underneath abd_iterate_page_func(). | ||||||
|  |  * It yields the next page struct and data offset and size within it, without | ||||||
|  * mapping it into the address space. |  * mapping it into the address space. | ||||||
|  */ |  */ | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * "Compound pages" are a group of pages that can be referenced from a single | ||||||
|  |  * struct page *. It's organised as a "head" page, followed by a series of | ||||||
|  |  * "tail" pages. | ||||||
|  |  * | ||||||
|  |  * In OpenZFS, compound pages are allocated using the __GFP_COMP flag, which we | ||||||
|  |  * get from scatter ABDs and SPL vmalloc slabs (ie >16K allocations). So a | ||||||
|  |  * great many of the IO buffers we get are going to be of this type. | ||||||
|  |  * | ||||||
|  |  * The tail pages are just regular PAGESIZE pages, and can be safely used | ||||||
|  |  * as-is. However, the head page has length covering itself and all the tail | ||||||
|  |  * pages. If the ABD chunk spans multiple pages, then we can use the head page | ||||||
|  |  * and a >PAGESIZE length, which is far more efficient. | ||||||
|  |  * | ||||||
|  |  * Before kernel 4.5 however, compound page heads were refcounted separately | ||||||
|  |  * from tail pages, such that moving back to the head page would require us to | ||||||
|  |  * take a reference to it and release it once we're completely finished with | ||||||
|  |  * it. In practice, that means when our caller is done with the ABD, which we | ||||||
|  |  * have no insight into from here. Rather than contort this API to track head | ||||||
|  |  * page references on such ancient kernels, we disable this special compound | ||||||
|  |  * page handling before 4.5, instead just treating each page within it as a | ||||||
|  |  * regular PAGESIZE page (which it is). This is slightly less efficient, but | ||||||
|  |  * makes everything far simpler. | ||||||
|  |  * | ||||||
|  |  * The below test sets/clears ABD_ITER_COMPOUND_PAGES to enable/disable the | ||||||
|  |  * special handling, and also defines the ABD_ITER_PAGE_SIZE(page) macro to | ||||||
|  |  * understand compound pages, or not, as required. | ||||||
|  |  */ | ||||||
|  | #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) | ||||||
|  | #define	ABD_ITER_COMPOUND_PAGES		1 | ||||||
|  | #define	ABD_ITER_PAGE_SIZE(page)	\ | ||||||
|  | 	(PageCompound(page) ? page_size(page) : PAGESIZE) | ||||||
|  | #else | ||||||
|  | #undef ABD_ITER_COMPOUND_PAGES | ||||||
|  | #define	ABD_ITER_PAGE_SIZE(page)	(PAGESIZE) | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
| void | void | ||||||
| abd_iter_page(struct abd_iter *aiter) | abd_iter_page(struct abd_iter *aiter) | ||||||
| { | { | ||||||
| @ -1032,6 +1072,12 @@ abd_iter_page(struct abd_iter *aiter) | |||||||
| 	struct page *page; | 	struct page *page; | ||||||
| 	size_t doff, dsize; | 	size_t doff, dsize; | ||||||
| 
 | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * Find the page, and the start of the data within it. This is computed | ||||||
|  | 	 * differently for linear and scatter ABDs; linear is referenced by | ||||||
|  | 	 * virtual memory location, while scatter is referenced by page | ||||||
|  | 	 * pointer. | ||||||
|  | 	 */ | ||||||
| 	if (abd_is_linear(aiter->iter_abd)) { | 	if (abd_is_linear(aiter->iter_abd)) { | ||||||
| 		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); | 		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); | ||||||
| 
 | 
 | ||||||
| @ -1044,57 +1090,24 @@ abd_iter_page(struct abd_iter *aiter) | |||||||
| 
 | 
 | ||||||
| 		/* offset of address within the page */ | 		/* offset of address within the page */ | ||||||
| 		doff = offset_in_page(paddr); | 		doff = offset_in_page(paddr); | ||||||
| 
 |  | ||||||
| 		/* total data remaining in abd from this position */ |  | ||||||
| 		dsize = aiter->iter_abd->abd_size - aiter->iter_offset; |  | ||||||
| 	} else { | 	} else { | ||||||
| 		ASSERT(!abd_is_gang(aiter->iter_abd)); | 		ASSERT(!abd_is_gang(aiter->iter_abd)); | ||||||
| 
 | 
 | ||||||
| 		/* current scatter page */ | 		/* current scatter page */ | ||||||
| 		page = sg_page(aiter->iter_sg); | 		page = nth_page(sg_page(aiter->iter_sg), | ||||||
|  | 		    aiter->iter_offset >> PAGE_SHIFT); | ||||||
| 
 | 
 | ||||||
| 		/* position within page */ | 		/* position within page */ | ||||||
| 		doff = aiter->iter_offset; | 		doff = aiter->iter_offset & (PAGESIZE - 1); | ||||||
| 
 |  | ||||||
| 		/* remaining data in scatterlist */ |  | ||||||
| 		dsize = MIN(aiter->iter_sg->length - aiter->iter_offset, |  | ||||||
| 		    aiter->iter_abd->abd_size - aiter->iter_pos); |  | ||||||
| 	} | 	} | ||||||
| 	ASSERT(page); |  | ||||||
| 
 | 
 | ||||||
| #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) | #ifdef ABD_ITER_COMPOUND_PAGES | ||||||
| 	if (PageTail(page)) { | 	if (PageTail(page)) { | ||||||
| 		/*
 | 		/*
 | ||||||
| 		 * This page is part of a "compound page", which is a group of | 		 * If this is a compound tail page, move back to the head, and | ||||||
| 		 * pages that can be referenced from a single struct page *. | 		 * adjust the offset to match. This may let us yield a much | ||||||
| 		 * Its organised as a "head" page, followed by a series of | 		 * larger amount of data from a single logical page, and so | ||||||
| 		 * "tail" pages. | 		 * leave our caller with fewer pages to process. | ||||||
| 		 * |  | ||||||
| 		 * In OpenZFS, compound pages are allocated using the |  | ||||||
| 		 * __GFP_COMP flag, which we get from scatter ABDs and SPL |  | ||||||
| 		 * vmalloc slabs (ie >16K allocations). So a great many of the |  | ||||||
| 		 * IO buffers we get are going to be of this type. |  | ||||||
| 		 * |  | ||||||
| 		 * The tail pages are just regular PAGE_SIZE pages, and can be |  | ||||||
| 		 * safely used as-is. However, the head page has length |  | ||||||
| 		 * covering itself and all the tail pages. If this ABD chunk |  | ||||||
| 		 * spans multiple pages, then we can use the head page and a |  | ||||||
| 		 * >PAGE_SIZE length, which is far more efficient. |  | ||||||
| 		 * |  | ||||||
| 		 * To do this, we need to adjust the offset to be counted from |  | ||||||
| 		 * the head page. struct page for compound pages are stored |  | ||||||
| 		 * contiguously, so we can just adjust by a simple offset. |  | ||||||
| 		 * |  | ||||||
| 		 * Before kernel 4.5, compound page heads were refcounted |  | ||||||
| 		 * separately, such that moving back to the head page would |  | ||||||
| 		 * require us to take a reference to it and releasing it once |  | ||||||
| 		 * we're completely finished with it. In practice, that means |  | ||||||
| 		 * when our caller is done with the ABD, which we have no |  | ||||||
| 		 * insight into from here. Rather than contort this API to |  | ||||||
| 		 * track head page references on such ancient kernels, we just |  | ||||||
| 		 * compile this block out and use the tail pages directly. This |  | ||||||
| 		 * is slightly less efficient, but makes everything far |  | ||||||
| 		 * simpler. |  | ||||||
| 		 */ | 		 */ | ||||||
| 		struct page *head = compound_head(page); | 		struct page *head = compound_head(page); | ||||||
| 		doff += ((page - head) * PAGESIZE); | 		doff += ((page - head) * PAGESIZE); | ||||||
| @ -1102,12 +1115,27 @@ abd_iter_page(struct abd_iter *aiter) | |||||||
| 	} | 	} | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| 	/* final page and position within it */ | 	ASSERT(page); | ||||||
|  | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * Compute the maximum amount of data we can take from this page. This | ||||||
|  | 	 * is the smaller of: | ||||||
|  | 	 * - the remaining space in the page | ||||||
|  | 	 * - the remaining space in this scatterlist entry (which may not cover | ||||||
|  | 	 *   the entire page) | ||||||
|  | 	 * - the remaining space in the abd (which may not cover the entire | ||||||
|  | 	 *   scatterlist entry) | ||||||
|  | 	 */ | ||||||
|  | 	dsize = MIN(ABD_ITER_PAGE_SIZE(page) - doff, | ||||||
|  | 	    aiter->iter_abd->abd_size - aiter->iter_pos); | ||||||
|  | 	if (!abd_is_linear(aiter->iter_abd)) | ||||||
|  | 		dsize = MIN(dsize, aiter->iter_sg->length - aiter->iter_offset); | ||||||
|  | 	ASSERT3U(dsize, >, 0); | ||||||
|  | 
 | ||||||
|  | 	/* final iterator outputs */ | ||||||
| 	aiter->iter_page = page; | 	aiter->iter_page = page; | ||||||
| 	aiter->iter_page_doff = doff; | 	aiter->iter_page_doff = doff; | ||||||
| 
 | 	aiter->iter_page_dsize = dsize; | ||||||
| 	/* amount of data in the chunk, up to the end of the page */ |  | ||||||
| 	aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff); |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 Rob N
						Rob N