mirror of
https://git.proxmox.com/git/mirror_ubuntu-kernels.git
synced 2025-11-17 20:35:37 +00:00
This is to get the changes from:68674f94ff("x86: don't use REP_GOOD or ERMS for small memory copies")20f3337d35("x86: don't use REP_GOOD or ERMS for small memory clearing") This also make the 'perf bench mem' files stop referring to the erms versions that gone away with the above patches. That addresses these perf tools build warning: Warning: Kernel ABI header at 'tools/arch/x86/lib/memcpy_64.S' differs from latest version at 'arch/x86/lib/memcpy_64.S' diff -u tools/arch/x86/lib/memcpy_64.S arch/x86/lib/memcpy_64.S Warning: Kernel ABI header at 'tools/arch/x86/lib/memset_64.S' differs from latest version at 'arch/x86/lib/memset_64.S' diff -u tools/arch/x86/lib/memset_64.S arch/x86/lib/memset_64.S Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
118 lines
2.4 KiB
ArmAsm
118 lines
2.4 KiB
ArmAsm
/* SPDX-License-Identifier: GPL-2.0 */
|
|
/* Copyright 2002 Andi Kleen, SuSE Labs */
|
|
|
|
#include <linux/linkage.h>
|
|
#include <asm/cpufeatures.h>
|
|
#include <asm/alternative.h>
|
|
#include <asm/export.h>
|
|
|
|
.section .noinstr.text, "ax"
|
|
|
|
/*
|
|
* ISO C memset - set a memory block to a byte value. This function uses fast
|
|
* string to get better performance than the original function. The code is
|
|
* simpler and shorter than the original function as well.
|
|
*
|
|
* rdi destination
|
|
* rsi value (char)
|
|
* rdx count (bytes)
|
|
*
|
|
* rax original destination
|
|
*
|
|
* The FSRS alternative should be done inline (avoiding the call and
|
|
* the disgusting return handling), but that would require some help
|
|
* from the compiler for better calling conventions.
|
|
*
|
|
* The 'rep stosb' itself is small enough to replace the call, but all
|
|
* the register moves blow up the code. And two of them are "needed"
|
|
* only for the return value that is the same as the source input,
|
|
* which the compiler could/should do much better anyway.
|
|
*/
|
|
SYM_FUNC_START(__memset)
|
|
ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS
|
|
|
|
movq %rdi,%r9
|
|
movb %sil,%al
|
|
movq %rdx,%rcx
|
|
rep stosb
|
|
movq %r9,%rax
|
|
RET
|
|
SYM_FUNC_END(__memset)
|
|
EXPORT_SYMBOL(__memset)
|
|
|
|
SYM_FUNC_ALIAS(memset, __memset)
|
|
EXPORT_SYMBOL(memset)
|
|
|
|
SYM_FUNC_START_LOCAL(memset_orig)
|
|
movq %rdi,%r10
|
|
|
|
/* expand byte value */
|
|
movzbl %sil,%ecx
|
|
movabs $0x0101010101010101,%rax
|
|
imulq %rcx,%rax
|
|
|
|
/* align dst */
|
|
movl %edi,%r9d
|
|
andl $7,%r9d
|
|
jnz .Lbad_alignment
|
|
.Lafter_bad_alignment:
|
|
|
|
movq %rdx,%rcx
|
|
shrq $6,%rcx
|
|
jz .Lhandle_tail
|
|
|
|
.p2align 4
|
|
.Lloop_64:
|
|
decq %rcx
|
|
movq %rax,(%rdi)
|
|
movq %rax,8(%rdi)
|
|
movq %rax,16(%rdi)
|
|
movq %rax,24(%rdi)
|
|
movq %rax,32(%rdi)
|
|
movq %rax,40(%rdi)
|
|
movq %rax,48(%rdi)
|
|
movq %rax,56(%rdi)
|
|
leaq 64(%rdi),%rdi
|
|
jnz .Lloop_64
|
|
|
|
/* Handle tail in loops. The loops should be faster than hard
|
|
to predict jump tables. */
|
|
.p2align 4
|
|
.Lhandle_tail:
|
|
movl %edx,%ecx
|
|
andl $63&(~7),%ecx
|
|
jz .Lhandle_7
|
|
shrl $3,%ecx
|
|
.p2align 4
|
|
.Lloop_8:
|
|
decl %ecx
|
|
movq %rax,(%rdi)
|
|
leaq 8(%rdi),%rdi
|
|
jnz .Lloop_8
|
|
|
|
.Lhandle_7:
|
|
andl $7,%edx
|
|
jz .Lende
|
|
.p2align 4
|
|
.Lloop_1:
|
|
decl %edx
|
|
movb %al,(%rdi)
|
|
leaq 1(%rdi),%rdi
|
|
jnz .Lloop_1
|
|
|
|
.Lende:
|
|
movq %r10,%rax
|
|
RET
|
|
|
|
.Lbad_alignment:
|
|
cmpq $7,%rdx
|
|
jbe .Lhandle_7
|
|
movq %rax,(%rdi) /* unaligned store */
|
|
movq $8,%r8
|
|
subq %r9,%r8
|
|
addq %r8,%rdi
|
|
subq %r8,%rdx
|
|
jmp .Lafter_bad_alignment
|
|
.Lfinal:
|
|
SYM_FUNC_END(memset_orig)
|