linux-loongson/tools/testing/selftests/arm64/fp/za-test.S
Catalin Marinas 5a4332062e Merge branches 'for-next/gcs', 'for-next/probes', 'for-next/asm-offsets', 'for-next/tlb', 'for-next/misc', 'for-next/mte', 'for-next/sysreg', 'for-next/stacktrace', 'for-next/hwcap3', 'for-next/kselftest', 'for-next/crc32', 'for-next/guest-cca', 'for-next/haft' and 'for-next/scs', remote-tracking branch 'arm64/for-next/perf' into for-next/core
* arm64/for-next/perf:
  perf: Switch back to struct platform_driver::remove()
  perf: arm_pmuv3: Add support for Samsung Mongoose PMU
  dt-bindings: arm: pmu: Add Samsung Mongoose core compatible
  perf/dwc_pcie: Fix typos in event names
  perf/dwc_pcie: Add support for Ampere SoCs
  ARM: pmuv3: Add missing write_pmuacr()
  perf/marvell: Marvell PEM performance monitor support
  perf/arm_pmuv3: Add PMUv3.9 per counter EL0 access control
  perf/dwc_pcie: Convert the events with mixed case to lowercase
  perf/cxlpmu: Support missing events in 3.1 spec
  perf: imx_perf: add support for i.MX91 platform
  dt-bindings: perf: fsl-imx-ddr: Add i.MX91 compatible
  drivers perf: remove unused field pmu_node

* for-next/gcs: (42 commits)
  : arm64 Guarded Control Stack user-space support
  kselftest/arm64: Fix missing printf() argument in gcs/gcs-stress.c
  arm64/gcs: Fix outdated ptrace documentation
  kselftest/arm64: Ensure stable names for GCS stress test results
  kselftest/arm64: Validate that GCS push and write permissions work
  kselftest/arm64: Enable GCS for the FP stress tests
  kselftest/arm64: Add a GCS stress test
  kselftest/arm64: Add GCS signal tests
  kselftest/arm64: Add test coverage for GCS mode locking
  kselftest/arm64: Add a GCS test program built with the system libc
  kselftest/arm64: Add very basic GCS test program
  kselftest/arm64: Always run signals tests with GCS enabled
  kselftest/arm64: Allow signals tests to specify an expected si_code
  kselftest/arm64: Add framework support for GCS to signal handling tests
  kselftest/arm64: Add GCS as a detected feature in the signal tests
  kselftest/arm64: Verify the GCS hwcap
  arm64: Add Kconfig for Guarded Control Stack (GCS)
  arm64/ptrace: Expose GCS via ptrace and core files
  arm64/signal: Expose GCS state in signal frames
  arm64/signal: Set up and restore the GCS context for signal handlers
  arm64/mm: Implement map_shadow_stack()
  ...

* for-next/probes:
  : Various arm64 uprobes/kprobes cleanups
  arm64: insn: Simulate nop instruction for better uprobe performance
  arm64: probes: Remove probe_opcode_t
  arm64: probes: Cleanup kprobes endianness conversions
  arm64: probes: Move kprobes-specific fields
  arm64: probes: Fix uprobes for big-endian kernels
  arm64: probes: Fix simulate_ldr*_literal()
  arm64: probes: Remove broken LDR (literal) uprobe support

* for-next/asm-offsets:
  : arm64 asm-offsets.c cleanup (remove unused offsets)
  arm64: asm-offsets: remove PREEMPT_DISABLE_OFFSET
  arm64: asm-offsets: remove DMA_{TO,FROM}_DEVICE
  arm64: asm-offsets: remove VM_EXEC and PAGE_SZ
  arm64: asm-offsets: remove MM_CONTEXT_ID
  arm64: asm-offsets: remove COMPAT_{RT_,SIGFRAME_REGS_OFFSET
  arm64: asm-offsets: remove VMA_VM_*
  arm64: asm-offsets: remove TSK_ACTIVE_MM

* for-next/tlb:
  : TLB flushing optimisations
  arm64: optimize flush tlb kernel range
  arm64: tlbflush: add __flush_tlb_range_limit_excess()

* for-next/misc:
  : Miscellaneous patches
  arm64: tls: Fix context-switching of tpidrro_el0 when kpti is enabled
  arm64/ptrace: Clarify documentation of VL configuration via ptrace
  acpi/arm64: remove unnecessary cast
  arm64/mm: Change protval as 'pteval_t' in map_range()
  arm64: uprobes: Optimize cache flushes for xol slot
  acpi/arm64: Adjust error handling procedure in gtdt_parse_timer_block()
  arm64: fix .data.rel.ro size assertion when CONFIG_LTO_CLANG
  arm64/ptdump: Test both PTE_TABLE_BIT and PTE_VALID for block mappings
  arm64/mm: Sanity check PTE address before runtime P4D/PUD folding
  arm64/mm: Drop setting PTE_TYPE_PAGE in pte_mkcont()
  ACPI: GTDT: Tighten the check for the array of platform timer structures
  arm64/fpsimd: Fix a typo
  arm64: Expose ID_AA64ISAR1_EL1.XS to sanitised feature consumers
  arm64: Return early when break handler is found on linked-list
  arm64/mm: Re-organize arch_make_huge_pte()
  arm64/mm: Drop _PROT_SECT_DEFAULT
  arm64: Add command-line override for ID_AA64MMFR0_EL1.ECV
  arm64: head: Drop SWAPPER_TABLE_SHIFT
  arm64: cpufeature: add POE to cpucap_is_possible()
  arm64/mm: Change pgattr_change_is_safe() arguments as pteval_t

* for-next/mte:
  : Various MTE improvements
  selftests: arm64: add hugetlb mte tests
  hugetlb: arm64: add mte support

* for-next/sysreg:
  : arm64 sysreg updates
  arm64/sysreg: Update ID_AA64MMFR1_EL1 to DDI0601 2024-09

* for-next/stacktrace:
  : arm64 stacktrace improvements
  arm64: preserve pt_regs::stackframe during exec*()
  arm64: stacktrace: unwind exception boundaries
  arm64: stacktrace: split unwind_consume_stack()
  arm64: stacktrace: report recovered PCs
  arm64: stacktrace: report source of unwind data
  arm64: stacktrace: move dump_backtrace() to kunwind_stack_walk()
  arm64: use a common struct frame_record
  arm64: pt_regs: swap 'unused' and 'pmr' fields
  arm64: pt_regs: rename "pmr_save" -> "pmr"
  arm64: pt_regs: remove stale big-endian layout
  arm64: pt_regs: assert pt_regs is a multiple of 16 bytes

* for-next/hwcap3:
  : Add AT_HWCAP3 support for arm64 (also wire up AT_HWCAP4)
  arm64: Support AT_HWCAP3
  binfmt_elf: Wire up AT_HWCAP3 at AT_HWCAP4

* for-next/kselftest: (30 commits)
  : arm64 kselftest fixes/cleanups
  kselftest/arm64: Try harder to generate different keys during PAC tests
  kselftest/arm64: Don't leak pipe fds in pac.exec_sign_all()
  kselftest/arm64: Corrupt P0 in the irritator when testing SSVE
  kselftest/arm64: Add FPMR coverage to fp-ptrace
  kselftest/arm64: Expand the set of ZA writes fp-ptrace does
  kselftets/arm64: Use flag bits for features in fp-ptrace assembler code
  kselftest/arm64: Enable build of PAC tests with LLVM=1
  kselftest/arm64: Check that SVCR is 0 in signal handlers
  kselftest/arm64: Fix printf() compiler warnings in the arm64 syscall-abi.c tests
  kselftest/arm64: Fix printf() warning in the arm64 MTE prctl() test
  kselftest/arm64: Fix printf() compiler warnings in the arm64 fp tests
  kselftest/arm64: Fix build with stricter assemblers
  kselftest/arm64: Test signal handler state modification in fp-stress
  kselftest/arm64: Provide a SIGUSR1 handler in the kernel mode FP stress test
  kselftest/arm64: Implement irritators for ZA and ZT
  kselftest/arm64: Remove unused ADRs from irritator handlers
  kselftest/arm64: Correct misleading comments on fp-stress irritators
  kselftest/arm64: Poll less often while waiting for fp-stress children
  kselftest/arm64: Increase frequency of signal delivery in fp-stress
  kselftest/arm64: Fix encoding for SVE B16B16 test
  ...

* for-next/crc32:
  : Optimise CRC32 using PMULL instructions
  arm64/crc32: Implement 4-way interleave using PMULL
  arm64/crc32: Reorganize bit/byte ordering macros
  arm64/lib: Handle CRC-32 alternative in C code

* for-next/guest-cca:
  : Support for running Linux as a guest in Arm CCA
  arm64: Document Arm Confidential Compute
  virt: arm-cca-guest: TSM_REPORT support for realms
  arm64: Enable memory encrypt for Realms
  arm64: mm: Avoid TLBI when marking pages as valid
  arm64: Enforce bounce buffers for realm DMA
  efi: arm64: Map Device with Prot Shared
  arm64: rsi: Map unprotected MMIO as decrypted
  arm64: rsi: Add support for checking whether an MMIO is protected
  arm64: realm: Query IPA size from the RMM
  arm64: Detect if in a realm and set RIPAS RAM
  arm64: rsi: Add RSI definitions

* for-next/haft:
  : Support for arm64 FEAT_HAFT
  arm64: pgtable: Warn unexpected pmdp_test_and_clear_young()
  arm64: Enable ARCH_HAS_NONLEAF_PMD_YOUNG
  arm64: Add support for FEAT_HAFT
  arm64: setup: name 'tcr2' register
  arm64/sysreg: Update ID_AA64MMFR1_EL1 register

* for-next/scs:
  : Dynamic shadow call stack fixes
  arm64/scs: Drop unused prototype __pi_scs_patch_vmlinux()
  arm64/scs: Deal with 64-bit relative offsets in FDE frames
  arm64/scs: Fix handling of DWARF augmentation data in CIE/FDE frames
2024-11-14 12:07:16 +00:00

401 lines
7.0 KiB
ArmAsm

// SPDX-License-Identifier: GPL-2.0-only
// Copyright (C) 2021 ARM Limited.
// Original author: Mark Brown <broonie@kernel.org>
//
// Scalable Matrix Extension ZA context switch test
// Repeatedly writes unique test patterns into each ZA tile
// and reads them back to verify integrity.
//
// for x in `seq 1 NR_CPUS`; do sve-test & pids=$pids\ $! ; done
// (leave it running for as long as you want...)
// kill $pids
#include <asm/unistd.h>
#include "assembler.h"
#include "asm-offsets.h"
#include "sme-inst.h"
.arch_extension sve
#define MAXVL 2048
#define MAXVL_B (MAXVL / 8)
// Declare some storage space to shadow ZA register contents and a
// scratch buffer for a vector.
.pushsection .text
.data
.align 4
zaref:
.space MAXVL_B * MAXVL_B
scratch:
.space MAXVL_B
.popsection
// Trivial memory copy: copy x2 bytes, starting at address x1, to address x0.
// Clobbers x0-x3
function memcpy
cmp x2, #0
b.eq 1f
0: ldrb w3, [x1], #1
strb w3, [x0], #1
subs x2, x2, #1
b.ne 0b
1: ret
endfunction
// Generate a test pattern for storage in ZA
// x0: pid
// x1: row in ZA
// x2: generation
// These values are used to constuct a 32-bit pattern that is repeated in the
// scratch buffer as many times as will fit:
// bits 31:28 generation number (increments once per test_loop)
// bits 27:16 pid
// bits 15: 8 row number
// bits 7: 0 32-bit lane index
function pattern
mov w3, wzr
bfi w3, w0, #16, #12 // PID
bfi w3, w1, #8, #8 // Row
bfi w3, w2, #28, #4 // Generation
ldr x0, =scratch
mov w1, #MAXVL_B / 4
0: str w3, [x0], #4
add w3, w3, #1 // Lane
subs w1, w1, #1
b.ne 0b
ret
endfunction
// Get the address of shadow data for ZA horizontal vector xn
.macro _adrza xd, xn, nrtmp
ldr \xd, =zaref
rdsvl \nrtmp, 1
madd \xd, x\nrtmp, \xn, \xd
.endm
// Set up test pattern in a ZA horizontal vector
// x0: pid
// x1: row number
// x2: generation
function setup_za
mov x4, x30
mov x12, x1 // Use x12 for vector select
bl pattern // Get pattern in scratch buffer
_adrza x0, x12, 2 // Shadow buffer pointer to x0 and x5
mov x5, x0
ldr x1, =scratch
bl memcpy // length set up in x2 by _adrza
_ldr_za 12, 5 // load vector w12 from pointer x5
ret x4
endfunction
// Trivial memory compare: compare x2 bytes starting at address x0 with
// bytes starting at address x1.
// Returns only if all bytes match; otherwise, the program is aborted.
// Clobbers x0-x5.
function memcmp
cbz x2, 2f
stp x0, x1, [sp, #-0x20]!
str x2, [sp, #0x10]
mov x5, #0
0: ldrb w3, [x0, x5]
ldrb w4, [x1, x5]
add x5, x5, #1
cmp w3, w4
b.ne 1f
subs x2, x2, #1
b.ne 0b
1: ldr x2, [sp, #0x10]
ldp x0, x1, [sp], #0x20
b.ne barf
2: ret
endfunction
// Verify that a ZA vector matches its shadow in memory, else abort
// x0: row number
// Clobbers x0-x7 and x12.
function check_za
mov x3, x30
mov x12, x0
_adrza x5, x0, 6 // pointer to expected value in x5
mov x4, x0
ldr x7, =scratch // x7 is scratch
mov x0, x7 // Poison scratch
mov x1, x6
bl memfill_ae
_str_za 12, 7 // save vector w12 to pointer x7
mov x0, x5
mov x1, x7
mov x2, x6
mov x30, x3
b memcmp
endfunction
// Modify the live SME register state, signal return will undo our changes
function irritator_handler
// Increment the irritation signal count (x23):
ldr x0, [x2, #ucontext_regs + 8 * 23]
add x0, x0, #1
str x0, [x2, #ucontext_regs + 8 * 23]
// This will reset ZA to all bits 0
smstop
smstart_za
ret
endfunction
function tickle_handler
// Increment the signal count (x23):
ldr x0, [x2, #ucontext_regs + 8 * 23]
add x0, x0, #1
str x0, [x2, #ucontext_regs + 8 * 23]
ret
endfunction
function terminate_handler
mov w21, w0
mov x20, x2
puts "Terminated by signal "
mov w0, w21
bl putdec
puts ", no error, iterations="
ldr x0, [x20, #ucontext_regs + 8 * 22]
bl putdec
puts ", signals="
ldr x0, [x20, #ucontext_regs + 8 * 23]
bl putdecn
mov x0, #0
mov x8, #__NR_exit
svc #0
endfunction
// w0: signal number
// x1: sa_action
// w2: sa_flags
// Clobbers x0-x6,x8
function setsignal
str x30, [sp, #-((sa_sz + 15) / 16 * 16 + 16)]!
mov w4, w0
mov x5, x1
mov w6, w2
add x0, sp, #16
mov x1, #sa_sz
bl memclr
mov w0, w4
add x1, sp, #16
str w6, [x1, #sa_flags]
str x5, [x1, #sa_handler]
mov x2, #0
mov x3, #sa_mask_sz
mov x8, #__NR_rt_sigaction
svc #0
cbz w0, 1f
puts "sigaction failure\n"
b .Labort
1: ldr x30, [sp], #((sa_sz + 15) / 16 * 16 + 16)
ret
endfunction
// Main program entry point
.globl _start
function _start
enable_gcs
mov x23, #0 // signal count
mov w0, #SIGINT
adr x1, terminate_handler
mov w2, #SA_SIGINFO
bl setsignal
mov w0, #SIGTERM
adr x1, terminate_handler
mov w2, #SA_SIGINFO
bl setsignal
mov w0, #SIGUSR1
adr x1, irritator_handler
mov w2, #SA_SIGINFO
orr w2, w2, #SA_NODEFER
bl setsignal
mov w0, #SIGUSR2
adr x1, tickle_handler
mov w2, #SA_SIGINFO
orr w2, w2, #SA_NODEFER
bl setsignal
puts "Streaming mode "
smstart_za
// Sanity-check and report the vector length
rdsvl 19, 8
cmp x19, #128
b.lo 1f
cmp x19, #2048
b.hi 1f
tst x19, #(8 - 1)
b.eq 2f
1: puts "bad vector length: "
mov x0, x19
bl putdecn
b .Labort
2: puts "vector length:\t"
mov x0, x19
bl putdec
puts " bits\n"
// Obtain our PID, to ensure test pattern uniqueness between processes
mov x8, #__NR_getpid
svc #0
mov x20, x0
puts "PID:\t"
mov x0, x20
bl putdecn
mov x22, #0 // generation number, increments per iteration
.Ltest_loop:
rdsvl 0, 8
cmp x0, x19
b.ne vl_barf
rdsvl 21, 1 // Set up ZA & shadow with test pattern
0: mov x0, x20
sub x1, x21, #1
mov x2, x22
bl setup_za
subs x21, x21, #1
b.ne 0b
mov x8, #__NR_sched_yield // encourage preemption
1:
svc #0
mrs x0, S3_3_C4_C2_2 // SVCR should have ZA=1,SM=0
and x1, x0, #3
cmp x1, #2
b.ne svcr_barf
rdsvl 21, 1 // Verify that the data made it through
rdsvl 24, 1 // Verify that the data made it through
0: sub x0, x24, x21
bl check_za
subs x21, x21, #1
bne 0b
add x22, x22, #1 // Everything still working
b .Ltest_loop
.Labort:
mov x0, #0
mov x1, #SIGABRT
mov x8, #__NR_kill
svc #0
endfunction
function barf
// fpsimd.c acitivty log dump hack
// ldr w0, =0xdeadc0de
// mov w8, #__NR_exit
// svc #0
// end hack
mrs x13, S3_3_C4_C2_2
smstop
mov x10, x0 // expected data
mov x11, x1 // actual data
mov x12, x2 // data size
puts "Mismatch: PID="
mov x0, x20
bl putdec
puts ", iteration="
mov x0, x22
bl putdec
puts ", row="
mov x0, x21
bl putdecn
puts "\tExpected ["
mov x0, x10
mov x1, x12
bl dumphex
puts "]\n\tGot ["
mov x0, x11
mov x1, x12
bl dumphex
puts "]\n"
puts "\tSVCR: "
mov x0, x13
bl putdecn
mov x8, #__NR_getpid
svc #0
// fpsimd.c acitivty log dump hack
// ldr w0, =0xdeadc0de
// mov w8, #__NR_exit
// svc #0
// ^ end of hack
mov x1, #SIGABRT
mov x8, #__NR_kill
svc #0
// mov x8, #__NR_exit
// mov x1, #1
// svc #0
endfunction
function vl_barf
mov x10, x0
puts "Bad active VL: "
mov x0, x10
bl putdecn
mov x8, #__NR_exit
mov x1, #1
svc #0
endfunction
function svcr_barf
mov x10, x0
puts "Bad SVCR: "
mov x0, x10
bl putdecn
mov x8, #__NR_exit
mov x1, #1
svc #0
endfunction