mirror of
				https://git.kernel.org/pub/scm/linux/kernel/git/chenhuacai/linux-loongson
				synced 2025-10-31 03:13:59 +00:00 
			
		
		
		
	 63e6c5b810
			
		
	
	
		63e6c5b810
		
	
	
	
	
		
			
			A number of our chips like loads and stores to be paired. A small kernel
module testcase shows the improvement from pairing loads and stores in
copy_4K_page:
POWER6: +9%
POWER7: +1.5%
#include <linux/module.h>
#include <linux/mm.h>
#define ITERATIONS 10000000
/*
 * Benchmark copy_4K_page(): allocate one destination and one source page,
 * copy source to destination ITERATIONS times, and log the elapsed time
 * between the two getnstimeofday() samples.
 *
 * Both pages are released before returning, so the module holds no
 * resources once it is loaded.
 *
 * Returns 0 on success, or -ENOMEM if either page cannot be allocated.
 */
static int __init copypage_init(void)
{
	struct timespec before, after;
	unsigned long i;
	struct page *destpage, *srcpage;
	char *dest, *src;

	destpage = alloc_page(GFP_KERNEL);
	if (!destpage)
		return -ENOMEM;

	srcpage = alloc_page(GFP_KERNEL);
	if (!srcpage) {
		/* Do not leak the page we already hold. */
		free_page((unsigned long)page_address(destpage));
		return -ENOMEM;
	}

	dest = page_address(destpage);
	src = page_address(srcpage);

	getnstimeofday(&before);
	for (i = 0; i < ITERATIONS; i++)
		copy_4K_page(dest, src);
	getnstimeofday(&after);

	free_page((unsigned long)dest);
	free_page((unsigned long)src);

	printk(KERN_DEBUG "copy_4K_page loop took %lu ns\n",
		(after.tv_sec - before.tv_sec) * NSEC_PER_SEC +
		(after.tv_nsec - before.tv_nsec));

	return 0;
}
/*
 * Module unload hook. There is nothing to undo here: copypage_init()
 * frees both test pages before it returns.
 */
static void __exit copypage_exit(void)
{
}
/* Register copypage_init() as the load hook, so the benchmark runs on load. */
module_init(copypage_init)
/* Register copypage_exit() as the unload hook; its body is empty. */
module_exit(copypage_exit)
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Anton Blanchard");
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
		
	
			
		
			
				
	
	
		
			108 lines
		
	
	
		
			2.0 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			108 lines
		
	
	
		
			2.0 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
| /*
 | |
|  * Copyright (C) 2008 Mark Nelson, IBM Corp.
 | |
|  *
 | |
|  * This program is free software; you can redistribute it and/or
 | |
|  * modify it under the terms of the GNU General Public License
 | |
|  * as published by the Free Software Foundation; either version
 | |
|  * 2 of the License, or (at your option) any later version.
 | |
|  */
 | |
| #include <asm/processor.h>
 | |
| #include <asm/ppc_asm.h>
 | |
| #include <asm/asm-offsets.h>
 | |
| 
 | |
|         .section        ".toc","aw"
 | |
| PPC64_CACHES:	/* TOC entry for ppc64_caches; loaded via @toc(r2) in copy_4K_page */
 | |
|         .tc             ppc64_caches[TC],ppc64_caches
 | |
|         .section        ".text"
 | |
| 
 | |
| 
 | |
| _GLOBAL(copy_4K_page)	/* copy one 4K page: stores go through r3 (dest), loads through r4 (src) */
 | |
| 	li	r5,4096		/* 4K page size */
 | |
| BEGIN_FTR_SECTION	/* section is tied to CPU_FTR_CP_USE_DCBTZ, see END_FTR_SECTION_IFSET below */
 | |
| 	ld      r10,PPC64_CACHES@toc(r2)	/* r10 = value of the PPC64_CACHES TOC entry */
 | |
| 	lwz	r11,DCACHEL1LOGLINESIZE(r10)	/* log2 of cache line size */
 | |
| 	lwz     r12,DCACHEL1LINESIZE(r10)	/* get cache line size */
 | |
| 	li	r9,0	/* r9 = byte offset of the current cache line within the page */
 | |
| 	srd	r8,r5,r11	/* r8 = 4096 >> log2(line size) = cache lines per page */
 | |
| 
 | |
| 	mtctr	r8	/* CTR = number of cache lines to set up */
 | |
| .Lsetup:
 | |
| 	dcbt	r9,r4	/* touch (prefetch) the source line at offset r9 */
 | |
| 	dcbz	r9,r3	/* zero the destination line at offset r9 */
 | |
| 	add	r9,r9,r12	/* step to the next cache line */
 | |
| 	bdnz	.Lsetup	/* decrement CTR, loop while non-zero */
 | |
| END_FTR_SECTION_IFSET(CPU_FTR_CP_USE_DCBTZ)
 | |
| 	addi	r3,r3,-8	/* bias dest by -8: stores below use offsets 8..128 so stdu can advance by 128 */
 | |
| 	srdi    r8,r5,7		/* page is copied in 128 byte strides */
 | |
| 	addi	r8,r8,-1	/* one stride copied outside loop */
 | |
| 
 | |
| 	mtctr	r8	/* CTR = 4096/128 - 1 = 31 loop iterations */
 | |
| /* Prologue: preload the first 32 source bytes into r5-r8. */
 | |
| 	ld	r5,0(r4)
 | |
| 	ld	r6,8(r4)
 | |
| 	ld	r7,16(r4)
 | |
| 	ldu	r8,24(r4)	/* also advances src by 24, so later loads use offsets 8..128 */
 | |
| 1:	std	r5,8(r3)	/* loop body: 16 stores (128 bytes) interleaved in pairs with 16 loads */
 | |
| 	std	r6,16(r3)
 | |
| 	ld	r9,8(r4)
 | |
| 	ld	r10,16(r4)
 | |
| 	std	r7,24(r3)
 | |
| 	std	r8,32(r3)
 | |
| 	ld	r11,24(r4)
 | |
| 	ld	r12,32(r4)
 | |
| 	std	r9,40(r3)
 | |
| 	std	r10,48(r3)
 | |
| 	ld	r5,40(r4)
 | |
| 	ld	r6,48(r4)
 | |
| 	std	r11,56(r3)
 | |
| 	std	r12,64(r3)
 | |
| 	ld	r7,56(r4)
 | |
| 	ld	r8,64(r4)
 | |
| 	std	r5,72(r3)
 | |
| 	std	r6,80(r3)
 | |
| 	ld	r9,72(r4)
 | |
| 	ld	r10,80(r4)
 | |
| 	std	r7,88(r3)
 | |
| 	std	r8,96(r3)
 | |
| 	ld	r11,88(r4)
 | |
| 	ld	r12,96(r4)
 | |
| 	std	r9,104(r3)
 | |
| 	std	r10,112(r3)
 | |
| 	ld	r5,104(r4)	/* r5-r8 are reloaded here for the start of the next stride */
 | |
| 	ld	r6,112(r4)
 | |
| 	std	r11,120(r3)
 | |
| 	stdu	r12,128(r3)	/* last store of the stride; advances dest by 128 */
 | |
| 	ld	r7,120(r4)
 | |
| 	ldu	r8,128(r4)	/* last load of the stride; advances src by 128 */
 | |
| 	bdnz	1b	/* decrement CTR, loop while non-zero */
 | |
| /* Final stride: store the 32 bytes already in r5-r8, then load and store only the remaining 96 bytes. */
 | |
| 	std	r5,8(r3)
 | |
| 	std	r6,16(r3)
 | |
| 	ld	r9,8(r4)
 | |
| 	ld	r10,16(r4)
 | |
| 	std	r7,24(r3)
 | |
| 	std	r8,32(r3)
 | |
| 	ld	r11,24(r4)
 | |
| 	ld	r12,32(r4)
 | |
| 	std	r9,40(r3)
 | |
| 	std	r10,48(r3)
 | |
| 	ld	r5,40(r4)
 | |
| 	ld	r6,48(r4)
 | |
| 	std	r11,56(r3)
 | |
| 	std	r12,64(r3)
 | |
| 	ld	r7,56(r4)
 | |
| 	ld	r8,64(r4)
 | |
| 	std	r5,72(r3)
 | |
| 	std	r6,80(r3)
 | |
| 	ld	r9,72(r4)
 | |
| 	ld	r10,80(r4)
 | |
| 	std	r7,88(r3)
 | |
| 	std	r8,96(r3)
 | |
| 	ld	r11,88(r4)
 | |
| 	ld	r12,96(r4)	/* last load: source offset 4088, the final doubleword of the page */
 | |
| 	std	r9,104(r3)
 | |
| 	std	r10,112(r3)
 | |
| 	std	r11,120(r3)
 | |
| 	std	r12,128(r3)	/* last store: destination offset 4088 */
 | |
| 	blr	/* return to caller */
 |