mirror of
				https://github.com/qemu/qemu.git
				synced 2025-10-31 12:07:31 +00:00 
			
		
		
		
	 23a5e3859f
			
		
	
	
		23a5e3859f
		
	
	
	
	
		
			
			This is SMOPA, SUMOPA, USMOPA_s, UMOPA, for both Int8 and Int16. Reviewed-by: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> Message-id: 20220708151540.18136-28-richard.henderson@linaro.org Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
		
			
				
	
	
		
			1202 lines
		
	
	
		
			43 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			1202 lines
		
	
	
		
			43 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*
 | |
|  * ARM SME Operations
 | |
|  *
 | |
|  * Copyright (c) 2022 Linaro, Ltd.
 | |
|  *
 | |
|  * This library is free software; you can redistribute it and/or
 | |
|  * modify it under the terms of the GNU Lesser General Public
 | |
|  * License as published by the Free Software Foundation; either
 | |
|  * version 2.1 of the License, or (at your option) any later version.
 | |
|  *
 | |
|  * This library is distributed in the hope that it will be useful,
 | |
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | |
|  * Lesser General Public License for more details.
 | |
|  *
 | |
|  * You should have received a copy of the GNU Lesser General Public
 | |
|  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 | |
|  */
 | |
| 
 | |
| #include "qemu/osdep.h"
 | |
| #include "cpu.h"
 | |
| #include "internals.h"
 | |
| #include "tcg/tcg-gvec-desc.h"
 | |
| #include "exec/helper-proto.h"
 | |
| #include "exec/cpu_ldst.h"
 | |
| #include "exec/exec-all.h"
 | |
| #include "qemu/int128.h"
 | |
| #include "fpu/softfloat.h"
 | |
| #include "vec_internal.h"
 | |
| #include "sve_ldst_internal.h"
 | |
| 
 | |
| /* ResetSVEState */
 | |
| void arm_reset_sve_state(CPUARMState *env)
 | |
| {
 | |
|     memset(env->vfp.zregs, 0, sizeof(env->vfp.zregs));
 | |
|     /* Recall that FFR is stored as pregs[16]. */
 | |
|     memset(env->vfp.pregs, 0, sizeof(env->vfp.pregs));
 | |
|     vfp_set_fpcr(env, 0x0800009f);
 | |
| }
 | |
| 
 | |
| void helper_set_pstate_sm(CPUARMState *env, uint32_t i)
 | |
| {
 | |
|     if (i == FIELD_EX64(env->svcr, SVCR, SM)) {
 | |
|         return;
 | |
|     }
 | |
|     env->svcr ^= R_SVCR_SM_MASK;
 | |
|     arm_reset_sve_state(env);
 | |
| }
 | |
| 
 | |
| void helper_set_pstate_za(CPUARMState *env, uint32_t i)
 | |
| {
 | |
|     if (i == FIELD_EX64(env->svcr, SVCR, ZA)) {
 | |
|         return;
 | |
|     }
 | |
|     env->svcr ^= R_SVCR_ZA_MASK;
 | |
| 
 | |
|     /*
 | |
|      * ResetSMEState.
 | |
|      *
 | |
|      * SetPSTATE_ZA zeros on enable and disable.  We can zero this only
 | |
|      * on enable: while disabled, the storage is inaccessible and the
 | |
|      * value does not matter.  We're not saving the storage in vmstate
 | |
|      * when disabled either.
 | |
|      */
 | |
|     if (i) {
 | |
|         memset(env->zarray, 0, sizeof(env->zarray));
 | |
|     }
 | |
| }
 | |
| 
 | |
| void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl)
 | |
| {
 | |
|     uint32_t i;
 | |
| 
 | |
|     /*
 | |
|      * Special case clearing the entire ZA space.
 | |
|      * This falls into the CONSTRAINED UNPREDICTABLE zeroing of any
 | |
|      * parts of the ZA storage outside of SVL.
 | |
|      */
 | |
|     if (imm == 0xff) {
 | |
|         memset(env->zarray, 0, sizeof(env->zarray));
 | |
|         return;
 | |
|     }
 | |
| 
 | |
|     /*
 | |
|      * Recall that ZAnH.D[m] is spread across ZA[n+8*m],
 | |
|      * so each row is discontiguous within ZA[].
 | |
|      */
 | |
|     for (i = 0; i < svl; i++) {
 | |
|         if (imm & (1 << (i % 8))) {
 | |
|             memset(&env->zarray[i], 0, svl);
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 | |
| 
 | |
/*
 * Array index of element i of a vertical tile slice, when the ZA
 * storage is viewed as an array of elements of some type T.
 *
 * The formula does not depend on the size of T.  Tiles are interleaved,
 * so for an N-byte T consecutive rows of one tile sit N storage rows
 * apart.  Dividing the byte offset by N to obtain an array index
 * cancels against multiplying by N to step from one tile row to the
 * next.
 */
#define tile_vslice_index(i) (sizeof(ARMVectorReg) * (i))
 | |
| 
 | |
/*
 * Byte offset within the ZA storage of the vertical-slice data that is
 * byteoff bytes into the slice.  byteoff is assumed to be a multiple of
 * the element size.
 *
 * As with the index form, the result is independent of the element
 * size because the tiles are interleaved:
 *  - with 1-byte elements every storage row holds one byte of the
 *    slice, so (counting from 0) byte 8 is in storage row 8, at
 *    offset (8 * row-size-in-bytes);
 *  - with 8-byte elements every row holds 8 bytes of the slice but 8
 *    tiles are interleaved, so byte 8 is in row 1 of the tile, which
 *    is again storage row 8, at the same offset.
 * Other element sizes work out the same way.
 */
#define tile_vslice_offset(byteoff) (sizeof(ARMVectorReg) * (byteoff))
 | |
| 
 | |
| 
 | |
| /*
 | |
|  * Move Zreg vector to ZArray column.
 | |
|  */
 | |
| #define DO_MOVA_C(NAME, TYPE, H)                                        \
 | |
| void HELPER(NAME)(void *za, void *vn, void *vg, uint32_t desc)          \
 | |
| {                                                                       \
 | |
|     int i, oprsz = simd_oprsz(desc);                                    \
 | |
|     for (i = 0; i < oprsz; ) {                                          \
 | |
|         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
 | |
|         do {                                                            \
 | |
|             if (pg & 1) {                                               \
 | |
|                 *(TYPE *)(za + tile_vslice_offset(i)) = *(TYPE *)(vn + H(i)); \
 | |
|             }                                                           \
 | |
|             i += sizeof(TYPE);                                          \
 | |
|             pg >>= sizeof(TYPE);                                        \
 | |
|         } while (i & 15);                                               \
 | |
|     }                                                                   \
 | |
| }
 | |
| 
 | |
| DO_MOVA_C(sme_mova_cz_b, uint8_t, H1)
 | |
| DO_MOVA_C(sme_mova_cz_h, uint16_t, H1_2)
 | |
| DO_MOVA_C(sme_mova_cz_s, uint32_t, H1_4)
 | |
| 
 | |
| void HELPER(sme_mova_cz_d)(void *za, void *vn, void *vg, uint32_t desc)
 | |
| {
 | |
|     int i, oprsz = simd_oprsz(desc) / 8;
 | |
|     uint8_t *pg = vg;
 | |
|     uint64_t *n = vn;
 | |
|     uint64_t *a = za;
 | |
| 
 | |
|     for (i = 0; i < oprsz; i++) {
 | |
|         if (pg[H1(i)] & 1) {
 | |
|             a[tile_vslice_index(i)] = n[i];
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 | |
| void HELPER(sme_mova_cz_q)(void *za, void *vn, void *vg, uint32_t desc)
 | |
| {
 | |
|     int i, oprsz = simd_oprsz(desc) / 16;
 | |
|     uint16_t *pg = vg;
 | |
|     Int128 *n = vn;
 | |
|     Int128 *a = za;
 | |
| 
 | |
|     /*
 | |
|      * Int128 is used here simply to copy 16 bytes, and to simplify
 | |
|      * the address arithmetic.
 | |
|      */
 | |
|     for (i = 0; i < oprsz; i++) {
 | |
|         if (pg[H2(i)] & 1) {
 | |
|             a[tile_vslice_index(i)] = n[i];
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 | |
| #undef DO_MOVA_C
 | |
| 
 | |
| /*
 | |
|  * Move ZArray column to Zreg vector.
 | |
|  */
 | |
| #define DO_MOVA_Z(NAME, TYPE, H)                                        \
 | |
| void HELPER(NAME)(void *vd, void *za, void *vg, uint32_t desc)          \
 | |
| {                                                                       \
 | |
|     int i, oprsz = simd_oprsz(desc);                                    \
 | |
|     for (i = 0; i < oprsz; ) {                                          \
 | |
|         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
 | |
|         do {                                                            \
 | |
|             if (pg & 1) {                                               \
 | |
|                 *(TYPE *)(vd + H(i)) = *(TYPE *)(za + tile_vslice_offset(i)); \
 | |
|             }                                                           \
 | |
|             i += sizeof(TYPE);                                          \
 | |
|             pg >>= sizeof(TYPE);                                        \
 | |
|         } while (i & 15);                                               \
 | |
|     }                                                                   \
 | |
| }
 | |
| 
 | |
| DO_MOVA_Z(sme_mova_zc_b, uint8_t, H1)
 | |
| DO_MOVA_Z(sme_mova_zc_h, uint16_t, H1_2)
 | |
| DO_MOVA_Z(sme_mova_zc_s, uint32_t, H1_4)
 | |
| 
 | |
| void HELPER(sme_mova_zc_d)(void *vd, void *za, void *vg, uint32_t desc)
 | |
| {
 | |
|     int i, oprsz = simd_oprsz(desc) / 8;
 | |
|     uint8_t *pg = vg;
 | |
|     uint64_t *d = vd;
 | |
|     uint64_t *a = za;
 | |
| 
 | |
|     for (i = 0; i < oprsz; i++) {
 | |
|         if (pg[H1(i)] & 1) {
 | |
|             d[i] = a[tile_vslice_index(i)];
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 | |
| void HELPER(sme_mova_zc_q)(void *vd, void *za, void *vg, uint32_t desc)
 | |
| {
 | |
|     int i, oprsz = simd_oprsz(desc) / 16;
 | |
|     uint16_t *pg = vg;
 | |
|     Int128 *d = vd;
 | |
|     Int128 *a = za;
 | |
| 
 | |
|     /*
 | |
|      * Int128 is used here simply to copy 16 bytes, and to simplify
 | |
|      * the address arithmetic.
 | |
|      */
 | |
|     for (i = 0; i < oprsz; i++, za += sizeof(ARMVectorReg)) {
 | |
|         if (pg[H2(i)] & 1) {
 | |
|             d[i] = a[tile_vslice_index(i)];
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 | |
| #undef DO_MOVA_Z
 | |
| 
 | |
/*
 * Clear elements in a tile slice comprising len bytes.
 *
 * ClearFn implementations zero len bytes of slice data, starting off
 * bytes into the slice that begins at ptr.
 */

typedef void ClearFn(void *ptr, size_t off, size_t len);

/* Horizontal slice: the bytes are contiguous, so one memset suffices. */
static void clear_horizontal(void *ptr, size_t off, size_t len)
{
    uint8_t *start = (uint8_t *)ptr + off;

    memset(start, 0, len);
}
 | |
| 
 | |
/* Zero len bytes of a vertical slice of 1-byte elements, from byte off. */
static void clear_vertical_b(void *vptr, size_t off, size_t len)
{
    size_t done = 0;

    while (done < len) {
        uint8_t *elt = vptr + tile_vslice_offset(off + done);

        *elt = 0;
        done++;
    }
}
 | |
| 
 | |
/* Zero len bytes of a vertical slice of 2-byte elements, from byte off. */
static void clear_vertical_h(void *vptr, size_t off, size_t len)
{
    size_t done = 0;

    while (done < len) {
        uint16_t *elt = vptr + tile_vslice_offset(off + done);

        *elt = 0;
        done += 2;
    }
}
 | |
| 
 | |
/* Zero len bytes of a vertical slice of 4-byte elements, from byte off. */
static void clear_vertical_s(void *vptr, size_t off, size_t len)
{
    size_t done = 0;

    while (done < len) {
        uint32_t *elt = vptr + tile_vslice_offset(off + done);

        *elt = 0;
        done += 4;
    }
}
 | |
| 
 | |
/* Zero len bytes of a vertical slice of 8-byte elements, from byte off. */
static void clear_vertical_d(void *vptr, size_t off, size_t len)
{
    size_t done = 0;

    while (done < len) {
        uint64_t *elt = vptr + tile_vslice_offset(off + done);

        *elt = 0;
        done += 8;
    }
}
 | |
| 
 | |
/* Zero len bytes of a vertical slice of 16-byte elements, from byte off. */
static void clear_vertical_q(void *vptr, size_t off, size_t len)
{
    size_t done = 0;

    while (done < len) {
        void *elt = vptr + tile_vslice_offset(off + done);

        memset(elt, 0, 16);
        done += 16;
    }
}
 | |
| 
 | |
/*
 * Copy elements from an array into a tile slice comprising len bytes.
 *
 * CopyFn implementations take len bytes of linear data at src and
 * store them into the slice that begins at dst.
 */

typedef void CopyFn(void *dst, const void *src, size_t len);

/* Horizontal slice: source and destination are both contiguous. */
static void copy_horizontal(void *dst, const void *src, size_t len)
{
    (void)memcpy(dst, src, len);
}
 | |
| 
 | |
/* Scatter len bytes of 1-byte elements from vsrc into a vertical slice. */
static void copy_vertical_b(void *vdst, const void *vsrc, size_t len)
{
    const uint8_t *in = vsrc;
    uint8_t *out = vdst;
    size_t n = 0;

    while (n < len) {
        out[tile_vslice_index(n)] = in[n];
        n++;
    }
}
 | |
| 
 | |
/* Scatter len bytes of 2-byte elements from vsrc into a vertical slice. */
static void copy_vertical_h(void *vdst, const void *vsrc, size_t len)
{
    const size_t count = len / 2;
    const uint16_t *in = vsrc;
    uint16_t *out = vdst;
    size_t n = 0;

    while (n < count) {
        out[tile_vslice_index(n)] = in[n];
        n++;
    }
}
 | |
| 
 | |
/* Scatter len bytes of 4-byte elements from vsrc into a vertical slice. */
static void copy_vertical_s(void *vdst, const void *vsrc, size_t len)
{
    const size_t count = len / 4;
    const uint32_t *in = vsrc;
    uint32_t *out = vdst;
    size_t n = 0;

    while (n < count) {
        out[tile_vslice_index(n)] = in[n];
        n++;
    }
}
 | |
| 
 | |
/* Scatter len bytes of 8-byte elements from vsrc into a vertical slice. */
static void copy_vertical_d(void *vdst, const void *vsrc, size_t len)
{
    const size_t count = len / 8;
    const uint64_t *in = vsrc;
    uint64_t *out = vdst;
    size_t n = 0;

    while (n < count) {
        out[tile_vslice_index(n)] = in[n];
        n++;
    }
}
 | |
| 
 | |
/* Scatter len bytes of 16-byte elements from vsrc into a vertical slice. */
static void copy_vertical_q(void *vdst, const void *vsrc, size_t len)
{
    size_t done = 0;

    while (done < len) {
        void *elt = vdst + tile_vslice_offset(done);

        memcpy(elt, vsrc + done, 16);
        done += 16;
    }
}
 | |
| 
 | |
| /*
 | |
|  * Host and TLB primitives for vertical tile slice addressing.
 | |
|  */
 | |
| 
 | |
| #define DO_LD(NAME, TYPE, HOST, TLB)                                        \
 | |
| static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
 | |
| {                                                                           \
 | |
|     TYPE val = HOST(host);                                                  \
 | |
|     *(TYPE *)(za + tile_vslice_offset(off)) = val;                          \
 | |
| }                                                                           \
 | |
| static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
 | |
|                         intptr_t off, target_ulong addr, uintptr_t ra)      \
 | |
| {                                                                           \
 | |
|     TYPE val = TLB(env, useronly_clean_ptr(addr), ra);                      \
 | |
|     *(TYPE *)(za + tile_vslice_offset(off)) = val;                          \
 | |
| }
 | |
| 
 | |
| #define DO_ST(NAME, TYPE, HOST, TLB)                                        \
 | |
| static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
 | |
| {                                                                           \
 | |
|     TYPE val = *(TYPE *)(za + tile_vslice_offset(off));                     \
 | |
|     HOST(host, val);                                                        \
 | |
| }                                                                           \
 | |
| static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
 | |
|                         intptr_t off, target_ulong addr, uintptr_t ra)      \
 | |
| {                                                                           \
 | |
|     TYPE val = *(TYPE *)(za + tile_vslice_offset(off));                     \
 | |
|     TLB(env, useronly_clean_ptr(addr), val, ra);                            \
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * The ARMVectorReg elements are stored in host-endian 64-bit units.
 | |
|  * For 128-bit quantities, the sequence defined by the Elem[] pseudocode
 | |
|  * corresponds to storing the two 64-bit pieces in little-endian order.
 | |
|  */
 | |
| #define DO_LDQ(HNAME, VNAME, BE, HOST, TLB)                                 \
 | |
| static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
 | |
| {                                                                           \
 | |
|     uint64_t val0 = HOST(host), val1 = HOST(host + 8);                      \
 | |
|     uint64_t *ptr = za + off;                                               \
 | |
|     ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1;                   \
 | |
| }                                                                           \
 | |
| static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
 | |
| {                                                                           \
 | |
|     HNAME##_host(za, tile_vslice_offset(off), host);                        \
 | |
| }                                                                           \
 | |
| static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
 | |
|                                target_ulong addr, uintptr_t ra)             \
 | |
| {                                                                           \
 | |
|     uint64_t val0 = TLB(env, useronly_clean_ptr(addr), ra);                 \
 | |
|     uint64_t val1 = TLB(env, useronly_clean_ptr(addr + 8), ra);             \
 | |
|     uint64_t *ptr = za + off;                                               \
 | |
|     ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1;                   \
 | |
| }                                                                           \
 | |
| static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
 | |
|                                target_ulong addr, uintptr_t ra)             \
 | |
| {                                                                           \
 | |
|     HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
 | |
| }
 | |
| 
 | |
| #define DO_STQ(HNAME, VNAME, BE, HOST, TLB)                                 \
 | |
| static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
 | |
| {                                                                           \
 | |
|     uint64_t *ptr = za + off;                                               \
 | |
|     HOST(host, ptr[BE]);                                                    \
 | |
|     HOST(host + 1, ptr[!BE]);                                               \
 | |
| }                                                                           \
 | |
| static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
 | |
| {                                                                           \
 | |
|     HNAME##_host(za, tile_vslice_offset(off), host);                        \
 | |
| }                                                                           \
 | |
| static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
 | |
|                                target_ulong addr, uintptr_t ra)             \
 | |
| {                                                                           \
 | |
|     uint64_t *ptr = za + off;                                               \
 | |
|     TLB(env, useronly_clean_ptr(addr), ptr[BE], ra);                        \
 | |
|     TLB(env, useronly_clean_ptr(addr + 8), ptr[!BE], ra);                   \
 | |
| }                                                                           \
 | |
| static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
 | |
|                                target_ulong addr, uintptr_t ra)             \
 | |
| {                                                                           \
 | |
|     HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
 | |
| }
 | |
| 
 | |
| DO_LD(ld1b, uint8_t, ldub_p, cpu_ldub_data_ra)
 | |
| DO_LD(ld1h_be, uint16_t, lduw_be_p, cpu_lduw_be_data_ra)
 | |
| DO_LD(ld1h_le, uint16_t, lduw_le_p, cpu_lduw_le_data_ra)
 | |
| DO_LD(ld1s_be, uint32_t, ldl_be_p, cpu_ldl_be_data_ra)
 | |
| DO_LD(ld1s_le, uint32_t, ldl_le_p, cpu_ldl_le_data_ra)
 | |
| DO_LD(ld1d_be, uint64_t, ldq_be_p, cpu_ldq_be_data_ra)
 | |
| DO_LD(ld1d_le, uint64_t, ldq_le_p, cpu_ldq_le_data_ra)
 | |
| 
 | |
| DO_LDQ(sve_ld1qq_be, sme_ld1q_be, 1, ldq_be_p, cpu_ldq_be_data_ra)
 | |
| DO_LDQ(sve_ld1qq_le, sme_ld1q_le, 0, ldq_le_p, cpu_ldq_le_data_ra)
 | |
| 
 | |
| DO_ST(st1b, uint8_t, stb_p, cpu_stb_data_ra)
 | |
| DO_ST(st1h_be, uint16_t, stw_be_p, cpu_stw_be_data_ra)
 | |
| DO_ST(st1h_le, uint16_t, stw_le_p, cpu_stw_le_data_ra)
 | |
| DO_ST(st1s_be, uint32_t, stl_be_p, cpu_stl_be_data_ra)
 | |
| DO_ST(st1s_le, uint32_t, stl_le_p, cpu_stl_le_data_ra)
 | |
| DO_ST(st1d_be, uint64_t, stq_be_p, cpu_stq_be_data_ra)
 | |
| DO_ST(st1d_le, uint64_t, stq_le_p, cpu_stq_le_data_ra)
 | |
| 
 | |
| DO_STQ(sve_st1qq_be, sme_st1q_be, 1, stq_be_p, cpu_stq_be_data_ra)
 | |
| DO_STQ(sve_st1qq_le, sme_st1q_le, 0, stq_le_p, cpu_stq_le_data_ra)
 | |
| 
 | |
| #undef DO_LD
 | |
| #undef DO_ST
 | |
| #undef DO_LDQ
 | |
| #undef DO_STQ
 | |
| 
 | |
| /*
 | |
|  * Common helper for all contiguous predicated loads.
 | |
|  */
 | |
| 
 | |
| static inline QEMU_ALWAYS_INLINE
 | |
| void sme_ld1(CPUARMState *env, void *za, uint64_t *vg,
 | |
|              const target_ulong addr, uint32_t desc, const uintptr_t ra,
 | |
|              const int esz, uint32_t mtedesc, bool vertical,
 | |
|              sve_ldst1_host_fn *host_fn,
 | |
|              sve_ldst1_tlb_fn *tlb_fn,
 | |
|              ClearFn *clr_fn,
 | |
|              CopyFn *cpy_fn)
 | |
| {
 | |
|     const intptr_t reg_max = simd_oprsz(desc);
 | |
|     const intptr_t esize = 1 << esz;
 | |
|     intptr_t reg_off, reg_last;
 | |
|     SVEContLdSt info;
 | |
|     void *host;
 | |
|     int flags;
 | |
| 
 | |
|     /* Find the active elements.  */
 | |
|     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
 | |
|         /* The entire predicate was false; no load occurs.  */
 | |
|         clr_fn(za, 0, reg_max);
 | |
|         return;
 | |
|     }
 | |
| 
 | |
|     /* Probe the page(s).  Exit with exception for any invalid page. */
 | |
|     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra);
 | |
| 
 | |
|     /* Handle watchpoints for all active elements. */
 | |
|     sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
 | |
|                               BP_MEM_READ, ra);
 | |
| 
 | |
|     /*
 | |
|      * Handle mte checks for all active elements.
 | |
|      * Since TBI must be set for MTE, !mtedesc => !mte_active.
 | |
|      */
 | |
|     if (mtedesc) {
 | |
|         sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
 | |
|                                 mtedesc, ra);
 | |
|     }
 | |
| 
 | |
|     flags = info.page[0].flags | info.page[1].flags;
 | |
|     if (unlikely(flags != 0)) {
 | |
| #ifdef CONFIG_USER_ONLY
 | |
|         g_assert_not_reached();
 | |
| #else
 | |
|         /*
 | |
|          * At least one page includes MMIO.
 | |
|          * Any bus operation can fail with cpu_transaction_failed,
 | |
|          * which for ARM will raise SyncExternal.  Perform the load
 | |
|          * into scratch memory to preserve register state until the end.
 | |
|          */
 | |
|         ARMVectorReg scratch = { };
 | |
| 
 | |
|         reg_off = info.reg_off_first[0];
 | |
|         reg_last = info.reg_off_last[1];
 | |
|         if (reg_last < 0) {
 | |
|             reg_last = info.reg_off_split;
 | |
|             if (reg_last < 0) {
 | |
|                 reg_last = info.reg_off_last[0];
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         do {
 | |
|             uint64_t pg = vg[reg_off >> 6];
 | |
|             do {
 | |
|                 if ((pg >> (reg_off & 63)) & 1) {
 | |
|                     tlb_fn(env, &scratch, reg_off, addr + reg_off, ra);
 | |
|                 }
 | |
|                 reg_off += esize;
 | |
|             } while (reg_off & 63);
 | |
|         } while (reg_off <= reg_last);
 | |
| 
 | |
|         cpy_fn(za, &scratch, reg_max);
 | |
|         return;
 | |
| #endif
 | |
|     }
 | |
| 
 | |
|     /* The entire operation is in RAM, on valid pages. */
 | |
| 
 | |
|     reg_off = info.reg_off_first[0];
 | |
|     reg_last = info.reg_off_last[0];
 | |
|     host = info.page[0].host;
 | |
| 
 | |
|     if (!vertical) {
 | |
|         memset(za, 0, reg_max);
 | |
|     } else if (reg_off) {
 | |
|         clr_fn(za, 0, reg_off);
 | |
|     }
 | |
| 
 | |
|     while (reg_off <= reg_last) {
 | |
|         uint64_t pg = vg[reg_off >> 6];
 | |
|         do {
 | |
|             if ((pg >> (reg_off & 63)) & 1) {
 | |
|                 host_fn(za, reg_off, host + reg_off);
 | |
|             } else if (vertical) {
 | |
|                 clr_fn(za, reg_off, esize);
 | |
|             }
 | |
|             reg_off += esize;
 | |
|         } while (reg_off <= reg_last && (reg_off & 63));
 | |
|     }
 | |
| 
 | |
|     /*
 | |
|      * Use the slow path to manage the cross-page misalignment.
 | |
|      * But we know this is RAM and cannot trap.
 | |
|      */
 | |
|     reg_off = info.reg_off_split;
 | |
|     if (unlikely(reg_off >= 0)) {
 | |
|         tlb_fn(env, za, reg_off, addr + reg_off, ra);
 | |
|     }
 | |
| 
 | |
|     reg_off = info.reg_off_first[1];
 | |
|     if (unlikely(reg_off >= 0)) {
 | |
|         reg_last = info.reg_off_last[1];
 | |
|         host = info.page[1].host;
 | |
| 
 | |
|         do {
 | |
|             uint64_t pg = vg[reg_off >> 6];
 | |
|             do {
 | |
|                 if ((pg >> (reg_off & 63)) & 1) {
 | |
|                     host_fn(za, reg_off, host + reg_off);
 | |
|                 } else if (vertical) {
 | |
|                     clr_fn(za, reg_off, esize);
 | |
|                 }
 | |
|                 reg_off += esize;
 | |
|             } while (reg_off & 63);
 | |
|         } while (reg_off <= reg_last);
 | |
|     }
 | |
| }
 | |
| 
 | |
| static inline QEMU_ALWAYS_INLINE
 | |
| void sme_ld1_mte(CPUARMState *env, void *za, uint64_t *vg,
 | |
|                  target_ulong addr, uint32_t desc, uintptr_t ra,
 | |
|                  const int esz, bool vertical,
 | |
|                  sve_ldst1_host_fn *host_fn,
 | |
|                  sve_ldst1_tlb_fn *tlb_fn,
 | |
|                  ClearFn *clr_fn,
 | |
|                  CopyFn *cpy_fn)
 | |
| {
 | |
|     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
 | |
|     int bit55 = extract64(addr, 55, 1);
 | |
| 
 | |
|     /* Remove mtedesc from the normal sve descriptor. */
 | |
|     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
 | |
| 
 | |
|     /* Perform gross MTE suppression early. */
 | |
|     if (!tbi_check(desc, bit55) ||
 | |
|         tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
 | |
|         mtedesc = 0;
 | |
|     }
 | |
| 
 | |
|     sme_ld1(env, za, vg, addr, desc, ra, esz, mtedesc, vertical,
 | |
|             host_fn, tlb_fn, clr_fn, cpy_fn);
 | |
| }
 | |
| 
 | |
| #define DO_LD(L, END, ESZ)                                                 \
 | |
| void HELPER(sme_ld1##L##END##_h)(CPUARMState *env, void *za, void *vg,     \
 | |
|                                  target_ulong addr, uint32_t desc)         \
 | |
| {                                                                          \
 | |
|     sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false,               \
 | |
|             sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb,           \
 | |
|             clear_horizontal, copy_horizontal);                            \
 | |
| }                                                                          \
 | |
| void HELPER(sme_ld1##L##END##_v)(CPUARMState *env, void *za, void *vg,     \
 | |
|                                  target_ulong addr, uint32_t desc)         \
 | |
| {                                                                          \
 | |
|     sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true,                \
 | |
|             sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb,             \
 | |
|             clear_vertical_##L, copy_vertical_##L);                        \
 | |
| }                                                                          \
 | |
| void HELPER(sme_ld1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
 | |
|                                      target_ulong addr, uint32_t desc)     \
 | |
| {                                                                          \
 | |
|     sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false,              \
 | |
|                 sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb,       \
 | |
|                 clear_horizontal, copy_horizontal);                        \
 | |
| }                                                                          \
 | |
| void HELPER(sme_ld1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
 | |
|                                      target_ulong addr, uint32_t desc)     \
 | |
| {                                                                          \
 | |
|     sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true,               \
 | |
|                 sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb,         \
 | |
|                 clear_vertical_##L, copy_vertical_##L);                    \
 | |
| }
 | |
| 
 | |
| DO_LD(b, , MO_8)
 | |
| DO_LD(h, _be, MO_16)
 | |
| DO_LD(h, _le, MO_16)
 | |
| DO_LD(s, _be, MO_32)
 | |
| DO_LD(s, _le, MO_32)
 | |
| DO_LD(d, _be, MO_64)
 | |
| DO_LD(d, _le, MO_64)
 | |
| DO_LD(q, _be, MO_128)
 | |
| DO_LD(q, _le, MO_128)
 | |
| 
 | |
| #undef DO_LD
 | |
| 
 | |
| /*
 | |
|  * Common helper for all contiguous predicated stores.
 | |
|  */
 | |
| 
 | |
| static inline QEMU_ALWAYS_INLINE
 | |
| void sme_st1(CPUARMState *env, void *za, uint64_t *vg,
 | |
|              const target_ulong addr, uint32_t desc, const uintptr_t ra,
 | |
|              const int esz, uint32_t mtedesc, bool vertical,
 | |
|              sve_ldst1_host_fn *host_fn,
 | |
|              sve_ldst1_tlb_fn *tlb_fn)
 | |
| {
 | |
|     const intptr_t reg_max = simd_oprsz(desc);
 | |
|     const intptr_t esize = 1 << esz;
 | |
|     intptr_t reg_off, reg_last;
 | |
|     SVEContLdSt info;
 | |
|     void *host;
 | |
|     int flags;
 | |
| 
 | |
|     /* Find the active elements.  */
 | |
|     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
 | |
|         /* The entire predicate was false; no store occurs.  */
 | |
|         return;
 | |
|     }
 | |
| 
 | |
|     /* Probe the page(s).  Exit with exception for any invalid page. */
 | |
|     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra);
 | |
| 
 | |
|     /* Handle watchpoints for all active elements. */
 | |
|     sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
 | |
|                               BP_MEM_WRITE, ra);
 | |
| 
 | |
|     /*
 | |
|      * Handle mte checks for all active elements.
 | |
|      * Since TBI must be set for MTE, !mtedesc => !mte_active.
 | |
|      */
 | |
|     if (mtedesc) {
 | |
|         sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
 | |
|                                 mtedesc, ra);
 | |
|     }
 | |
| 
 | |
|     flags = info.page[0].flags | info.page[1].flags;
 | |
|     if (unlikely(flags != 0)) {
 | |
| #ifdef CONFIG_USER_ONLY
 | |
|         g_assert_not_reached();
 | |
| #else
 | |
|         /*
 | |
|          * At least one page includes MMIO.
 | |
|          * Any bus operation can fail with cpu_transaction_failed,
 | |
|          * which for ARM will raise SyncExternal.  We cannot avoid
 | |
|          * this fault and will leave with the store incomplete.
 | |
|          */
 | |
|         reg_off = info.reg_off_first[0];
 | |
|         reg_last = info.reg_off_last[1];
 | |
|         if (reg_last < 0) {
 | |
|             reg_last = info.reg_off_split;
 | |
|             if (reg_last < 0) {
 | |
|                 reg_last = info.reg_off_last[0];
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         do {
 | |
|             uint64_t pg = vg[reg_off >> 6];
 | |
|             do {
 | |
|                 if ((pg >> (reg_off & 63)) & 1) {
 | |
|                     tlb_fn(env, za, reg_off, addr + reg_off, ra);
 | |
|                 }
 | |
|                 reg_off += esize;
 | |
|             } while (reg_off & 63);
 | |
|         } while (reg_off <= reg_last);
 | |
|         return;
 | |
| #endif
 | |
|     }
 | |
| 
 | |
|     reg_off = info.reg_off_first[0];
 | |
|     reg_last = info.reg_off_last[0];
 | |
|     host = info.page[0].host;
 | |
| 
 | |
|     while (reg_off <= reg_last) {
 | |
|         uint64_t pg = vg[reg_off >> 6];
 | |
|         do {
 | |
|             if ((pg >> (reg_off & 63)) & 1) {
 | |
|                 host_fn(za, reg_off, host + reg_off);
 | |
|             }
 | |
|             reg_off += 1 << esz;
 | |
|         } while (reg_off <= reg_last && (reg_off & 63));
 | |
|     }
 | |
| 
 | |
|     /*
 | |
|      * Use the slow path to manage the cross-page misalignment.
 | |
|      * But we know this is RAM and cannot trap.
 | |
|      */
 | |
|     reg_off = info.reg_off_split;
 | |
|     if (unlikely(reg_off >= 0)) {
 | |
|         tlb_fn(env, za, reg_off, addr + reg_off, ra);
 | |
|     }
 | |
| 
 | |
|     reg_off = info.reg_off_first[1];
 | |
|     if (unlikely(reg_off >= 0)) {
 | |
|         reg_last = info.reg_off_last[1];
 | |
|         host = info.page[1].host;
 | |
| 
 | |
|         do {
 | |
|             uint64_t pg = vg[reg_off >> 6];
 | |
|             do {
 | |
|                 if ((pg >> (reg_off & 63)) & 1) {
 | |
|                     host_fn(za, reg_off, host + reg_off);
 | |
|                 }
 | |
|                 reg_off += 1 << esz;
 | |
|             } while (reg_off & 63);
 | |
|         } while (reg_off <= reg_last);
 | |
|     }
 | |
| }
 | |
| 
 | |
| static inline QEMU_ALWAYS_INLINE
 | |
| void sme_st1_mte(CPUARMState *env, void *za, uint64_t *vg, target_ulong addr,
 | |
|                  uint32_t desc, uintptr_t ra, int esz, bool vertical,
 | |
|                  sve_ldst1_host_fn *host_fn,
 | |
|                  sve_ldst1_tlb_fn *tlb_fn)
 | |
| {
 | |
|     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
 | |
|     int bit55 = extract64(addr, 55, 1);
 | |
| 
 | |
|     /* Remove mtedesc from the normal sve descriptor. */
 | |
|     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
 | |
| 
 | |
|     /* Perform gross MTE suppression early. */
 | |
|     if (!tbi_check(desc, bit55) ||
 | |
|         tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
 | |
|         mtedesc = 0;
 | |
|     }
 | |
| 
 | |
|     sme_st1(env, za, vg, addr, desc, ra, esz, mtedesc,
 | |
|             vertical, host_fn, tlb_fn);
 | |
| }
 | |
| 
 | |
| #define DO_ST(L, END, ESZ)                                                 \
 | |
| void HELPER(sme_st1##L##END##_h)(CPUARMState *env, void *za, void *vg,     \
 | |
|                                  target_ulong addr, uint32_t desc)         \
 | |
| {                                                                          \
 | |
|     sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false,               \
 | |
|             sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb);          \
 | |
| }                                                                          \
 | |
| void HELPER(sme_st1##L##END##_v)(CPUARMState *env, void *za, void *vg,     \
 | |
|                                  target_ulong addr, uint32_t desc)         \
 | |
| {                                                                          \
 | |
|     sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true,                \
 | |
|             sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb);            \
 | |
| }                                                                          \
 | |
| void HELPER(sme_st1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
 | |
|                                      target_ulong addr, uint32_t desc)     \
 | |
| {                                                                          \
 | |
|     sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false,              \
 | |
|                 sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb);      \
 | |
| }                                                                          \
 | |
| void HELPER(sme_st1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
 | |
|                                      target_ulong addr, uint32_t desc)     \
 | |
| {                                                                          \
 | |
|     sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true,               \
 | |
|                 sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb);        \
 | |
| }
 | |
| 
 | |
| DO_ST(b, , MO_8)
 | |
| DO_ST(h, _be, MO_16)
 | |
| DO_ST(h, _le, MO_16)
 | |
| DO_ST(s, _be, MO_32)
 | |
| DO_ST(s, _le, MO_32)
 | |
| DO_ST(d, _be, MO_64)
 | |
| DO_ST(d, _le, MO_64)
 | |
| DO_ST(q, _be, MO_128)
 | |
| DO_ST(q, _le, MO_128)
 | |
| 
 | |
| #undef DO_ST
 | |
| 
 | |
| void HELPER(sme_addha_s)(void *vzda, void *vzn, void *vpn,
 | |
|                          void *vpm, uint32_t desc)
 | |
| {
 | |
|     intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
 | |
|     uint64_t *pn = vpn, *pm = vpm;
 | |
|     uint32_t *zda = vzda, *zn = vzn;
 | |
| 
 | |
|     for (row = 0; row < oprsz; ) {
 | |
|         uint64_t pa = pn[row >> 4];
 | |
|         do {
 | |
|             if (pa & 1) {
 | |
|                 for (col = 0; col < oprsz; ) {
 | |
|                     uint64_t pb = pm[col >> 4];
 | |
|                     do {
 | |
|                         if (pb & 1) {
 | |
|                             zda[tile_vslice_index(row) + H4(col)] += zn[H4(col)];
 | |
|                         }
 | |
|                         pb >>= 4;
 | |
|                     } while (++col & 15);
 | |
|                 }
 | |
|             }
 | |
|             pa >>= 4;
 | |
|         } while (++row & 15);
 | |
|     }
 | |
| }
 | |
| 
 | |
| void HELPER(sme_addha_d)(void *vzda, void *vzn, void *vpn,
 | |
|                          void *vpm, uint32_t desc)
 | |
| {
 | |
|     intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
 | |
|     uint8_t *pn = vpn, *pm = vpm;
 | |
|     uint64_t *zda = vzda, *zn = vzn;
 | |
| 
 | |
|     for (row = 0; row < oprsz; ++row) {
 | |
|         if (pn[H1(row)] & 1) {
 | |
|             for (col = 0; col < oprsz; ++col) {
 | |
|                 if (pm[H1(col)] & 1) {
 | |
|                     zda[tile_vslice_index(row) + col] += zn[col];
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 | |
| void HELPER(sme_addva_s)(void *vzda, void *vzn, void *vpn,
 | |
|                          void *vpm, uint32_t desc)
 | |
| {
 | |
|     intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
 | |
|     uint64_t *pn = vpn, *pm = vpm;
 | |
|     uint32_t *zda = vzda, *zn = vzn;
 | |
| 
 | |
|     for (row = 0; row < oprsz; ) {
 | |
|         uint64_t pa = pn[row >> 4];
 | |
|         do {
 | |
|             if (pa & 1) {
 | |
|                 uint32_t zn_row = zn[H4(row)];
 | |
|                 for (col = 0; col < oprsz; ) {
 | |
|                     uint64_t pb = pm[col >> 4];
 | |
|                     do {
 | |
|                         if (pb & 1) {
 | |
|                             zda[tile_vslice_index(row) + H4(col)] += zn_row;
 | |
|                         }
 | |
|                         pb >>= 4;
 | |
|                     } while (++col & 15);
 | |
|                 }
 | |
|             }
 | |
|             pa >>= 4;
 | |
|         } while (++row & 15);
 | |
|     }
 | |
| }
 | |
| 
 | |
| void HELPER(sme_addva_d)(void *vzda, void *vzn, void *vpn,
 | |
|                          void *vpm, uint32_t desc)
 | |
| {
 | |
|     intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
 | |
|     uint8_t *pn = vpn, *pm = vpm;
 | |
|     uint64_t *zda = vzda, *zn = vzn;
 | |
| 
 | |
|     for (row = 0; row < oprsz; ++row) {
 | |
|         if (pn[H1(row)] & 1) {
 | |
|             uint64_t zn_row = zn[row];
 | |
|             for (col = 0; col < oprsz; ++col) {
 | |
|                 if (pm[H1(col)] & 1) {
 | |
|                     zda[tile_vslice_index(row) + col] += zn_row;
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 | |
| void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn,
 | |
|                          void *vpm, void *vst, uint32_t desc)
 | |
| {
 | |
|     intptr_t row, col, oprsz = simd_maxsz(desc);
 | |
|     uint32_t neg = simd_data(desc) << 31;
 | |
|     uint16_t *pn = vpn, *pm = vpm;
 | |
|     float_status fpst;
 | |
| 
 | |
|     /*
 | |
|      * Make a copy of float_status because this operation does not
 | |
|      * update the cumulative fp exception status.  It also produces
 | |
|      * default nans.
 | |
|      */
 | |
|     fpst = *(float_status *)vst;
 | |
|     set_default_nan_mode(true, &fpst);
 | |
| 
 | |
|     for (row = 0; row < oprsz; ) {
 | |
|         uint16_t pa = pn[H2(row >> 4)];
 | |
|         do {
 | |
|             if (pa & 1) {
 | |
|                 void *vza_row = vza + tile_vslice_offset(row);
 | |
|                 uint32_t n = *(uint32_t *)(vzn + H1_4(row)) ^ neg;
 | |
| 
 | |
|                 for (col = 0; col < oprsz; ) {
 | |
|                     uint16_t pb = pm[H2(col >> 4)];
 | |
|                     do {
 | |
|                         if (pb & 1) {
 | |
|                             uint32_t *a = vza_row + H1_4(col);
 | |
|                             uint32_t *m = vzm + H1_4(col);
 | |
|                             *a = float32_muladd(n, *m, *a, 0, vst);
 | |
|                         }
 | |
|                         col += 4;
 | |
|                         pb >>= 4;
 | |
|                     } while (col & 15);
 | |
|                 }
 | |
|             }
 | |
|             row += 4;
 | |
|             pa >>= 4;
 | |
|         } while (row & 15);
 | |
|     }
 | |
| }
 | |
| 
 | |
| void HELPER(sme_fmopa_d)(void *vza, void *vzn, void *vzm, void *vpn,
 | |
|                          void *vpm, void *vst, uint32_t desc)
 | |
| {
 | |
|     intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
 | |
|     uint64_t neg = (uint64_t)simd_data(desc) << 63;
 | |
|     uint64_t *za = vza, *zn = vzn, *zm = vzm;
 | |
|     uint8_t *pn = vpn, *pm = vpm;
 | |
|     float_status fpst = *(float_status *)vst;
 | |
| 
 | |
|     set_default_nan_mode(true, &fpst);
 | |
| 
 | |
|     for (row = 0; row < oprsz; ++row) {
 | |
|         if (pn[H1(row)] & 1) {
 | |
|             uint64_t *za_row = &za[tile_vslice_index(row)];
 | |
|             uint64_t n = zn[row] ^ neg;
 | |
| 
 | |
|             for (col = 0; col < oprsz; ++col) {
 | |
|                 if (pm[H1(col)] & 1) {
 | |
|                     uint64_t *a = &za_row[col];
 | |
|                     *a = float64_muladd(n, zm[col], *a, 0, &fpst);
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 | |
/*
 * Adjust PAIR, two 16-bit values packed into 32 bits, for predication
 * and negation:
 *   - NEG is xor-ed into PAIR first;
 *   - the low half is kept only when bit 0 of PG is set;
 *   - the high half is kept only when bit 2 of PG is set.
 * Halves that are not kept read as zero in the result.
 *
 * The pseudocode uses a conditional negate after the conditional zero;
 * negating unconditionally up front and masking afterwards yields the
 * same value and is simpler.
 */
static inline uint32_t f16mop_adj_pair(uint32_t pair, uint32_t pg, uint32_t neg)
{
    uint32_t keep = 0;

    if (pg & 1) {
        keep |= 0x0000ffffu;
    }
    if (pg & 4) {
        keep |= 0xffff0000u;
    }
    return (pair ^ neg) & keep;
}
 | |
| 
 | |
| static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2,
 | |
|                           float_status *s_std, float_status *s_odd)
 | |
| {
 | |
|     float64 e1r = float16_to_float64(e1 & 0xffff, true, s_std);
 | |
|     float64 e1c = float16_to_float64(e1 >> 16, true, s_std);
 | |
|     float64 e2r = float16_to_float64(e2 & 0xffff, true, s_std);
 | |
|     float64 e2c = float16_to_float64(e2 >> 16, true, s_std);
 | |
|     float64 t64;
 | |
|     float32 t32;
 | |
| 
 | |
|     /*
 | |
|      * The ARM pseudocode function FPDot performs both multiplies
 | |
|      * and the add with a single rounding operation.  Emulate this
 | |
|      * by performing the first multiply in round-to-odd, then doing
 | |
|      * the second multiply as fused multiply-add, and rounding to
 | |
|      * float32 all in one step.
 | |
|      */
 | |
|     t64 = float64_mul(e1r, e2r, s_odd);
 | |
|     t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std);
 | |
| 
 | |
|     /* This conversion is exact, because we've already rounded. */
 | |
|     t32 = float64_to_float32(t64, s_std);
 | |
| 
 | |
|     /* The final accumulation step is not fused. */
 | |
|     return float32_add(sum, t32, s_std);
 | |
| }
 | |
| 
 | |
| void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn,
 | |
|                          void *vpm, void *vst, uint32_t desc)
 | |
| {
 | |
|     intptr_t row, col, oprsz = simd_maxsz(desc);
 | |
|     uint32_t neg = simd_data(desc) * 0x80008000u;
 | |
|     uint16_t *pn = vpn, *pm = vpm;
 | |
|     float_status fpst_odd, fpst_std;
 | |
| 
 | |
|     /*
 | |
|      * Make a copy of float_status because this operation does not
 | |
|      * update the cumulative fp exception status.  It also produces
 | |
|      * default nans.  Make a second copy with round-to-odd -- see above.
 | |
|      */
 | |
|     fpst_std = *(float_status *)vst;
 | |
|     set_default_nan_mode(true, &fpst_std);
 | |
|     fpst_odd = fpst_std;
 | |
|     set_float_rounding_mode(float_round_to_odd, &fpst_odd);
 | |
| 
 | |
|     for (row = 0; row < oprsz; ) {
 | |
|         uint16_t prow = pn[H2(row >> 4)];
 | |
|         do {
 | |
|             void *vza_row = vza + tile_vslice_offset(row);
 | |
|             uint32_t n = *(uint32_t *)(vzn + H1_4(row));
 | |
| 
 | |
|             n = f16mop_adj_pair(n, prow, neg);
 | |
| 
 | |
|             for (col = 0; col < oprsz; ) {
 | |
|                 uint16_t pcol = pm[H2(col >> 4)];
 | |
|                 do {
 | |
|                     if (prow & pcol & 0b0101) {
 | |
|                         uint32_t *a = vza_row + H1_4(col);
 | |
|                         uint32_t m = *(uint32_t *)(vzm + H1_4(col));
 | |
| 
 | |
|                         m = f16mop_adj_pair(m, pcol, 0);
 | |
|                         *a = f16_dotadd(*a, n, m, &fpst_std, &fpst_odd);
 | |
| 
 | |
|                         col += 4;
 | |
|                         pcol >>= 4;
 | |
|                     }
 | |
|                 } while (col & 15);
 | |
|             }
 | |
|             row += 4;
 | |
|             prow >>= 4;
 | |
|         } while (row & 15);
 | |
|     }
 | |
| }
 | |
| 
 | |
| void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, void *vpn,
 | |
|                         void *vpm, uint32_t desc)
 | |
| {
 | |
|     intptr_t row, col, oprsz = simd_maxsz(desc);
 | |
|     uint32_t neg = simd_data(desc) * 0x80008000u;
 | |
|     uint16_t *pn = vpn, *pm = vpm;
 | |
| 
 | |
|     for (row = 0; row < oprsz; ) {
 | |
|         uint16_t prow = pn[H2(row >> 4)];
 | |
|         do {
 | |
|             void *vza_row = vza + tile_vslice_offset(row);
 | |
|             uint32_t n = *(uint32_t *)(vzn + H1_4(row));
 | |
| 
 | |
|             n = f16mop_adj_pair(n, prow, neg);
 | |
| 
 | |
|             for (col = 0; col < oprsz; ) {
 | |
|                 uint16_t pcol = pm[H2(col >> 4)];
 | |
|                 do {
 | |
|                     if (prow & pcol & 0b0101) {
 | |
|                         uint32_t *a = vza_row + H1_4(col);
 | |
|                         uint32_t m = *(uint32_t *)(vzm + H1_4(col));
 | |
| 
 | |
|                         m = f16mop_adj_pair(m, pcol, 0);
 | |
|                         *a = bfdotadd(*a, n, m);
 | |
| 
 | |
|                         col += 4;
 | |
|                         pcol >>= 4;
 | |
|                     }
 | |
|                 } while (col & 15);
 | |
|             }
 | |
|             row += 4;
 | |
|             prow >>= 4;
 | |
|         } while (row & 15);
 | |
|     }
 | |
| }
 | |
| 
 | |
/*
 * Per-element integer outer-product step: called as
 * fn(n, m, a, p, neg) and returns the new accumulator value.
 */
typedef uint64_t IMOPFn(uint64_t, uint64_t, uint64_t, uint8_t, bool);

/*
 * Common driver for the integer outer-product helpers.
 *
 * Walks the tile in 64-bit units (simd_oprsz(desc) / 8 per row and per
 * column).  Every unit is passed through @fn together with zn[row],
 * zm[col], the AND of the row predicate byte and the column predicate
 * byte, and the flag taken from simd_data(desc); the result is stored
 * back.  Nothing is skipped here: predication is left entirely to @fn.
 */
static inline void do_imopa(uint64_t *za, uint64_t *zn, uint64_t *zm,
                            uint8_t *pn, uint8_t *pm,
                            uint32_t desc, IMOPFn *fn)
{
    const intptr_t nelem = simd_oprsz(desc) / 8;
    const bool negate = simd_data(desc);
    intptr_t r, c;

    for (r = 0; r < nelem; r++) {
        const uint8_t row_pred = pn[H1(r)];
        const uint64_t n = zn[r];
        uint64_t *acc = &za[tile_vslice_index(r)];

        for (c = 0; c < nelem; c++) {
            const uint8_t col_pred = pm[H1(c)];

            acc[c] = fn(n, zm[c], acc[c], row_pred & col_pred, negate);
        }
    }
}
 | |
| 
 | |
/*
 * Define NAME(n, m, a, p, neg): an 8-bit, four-way dot product into
 * each of the two 32-bit halves of A.
 *
 * N and M are each treated as eight bytes, converted to NTYPE and
 * MTYPE respectively.  N is first masked with expand_pred_b(p).
 * Bytes 0-3 contribute to the low 32 bits of A and bytes 4-7 to the
 * high 32 bits; the sums are subtracted when NEG is set and added
 * otherwise, with 32-bit wraparound.
 */
#define DEF_IMOP_32(NAME, NTYPE, MTYPE) \
static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
{                                                                           \
    uint32_t lo = (uint32_t)a, hi = (uint32_t)(a >> 32);                    \
    uint32_t dot_lo = 0, dot_hi = 0;                                        \
    int i;                                                                  \
    /* Apply P to N as a mask, making the inactive elements 0. */           \
    n &= expand_pred_b(p);                                                  \
    for (i = 0; i < 32; i += 8) {                                           \
        dot_lo += (NTYPE)(n >> i) * (MTYPE)(m >> i);                        \
    }                                                                       \
    for (i = 32; i < 64; i += 8) {                                          \
        dot_hi += (NTYPE)(n >> i) * (MTYPE)(m >> i);                        \
    }                                                                       \
    if (neg) {                                                              \
        lo -= dot_lo;                                                       \
        hi -= dot_hi;                                                       \
    } else {                                                                \
        lo += dot_lo;                                                       \
        hi += dot_hi;                                                       \
    }                                                                       \
    return ((uint64_t)hi << 32) | lo;                                       \
}
 | |
| 
 | |
| #define DEF_IMOP_64(NAME, NTYPE, MTYPE) \
 | |
| static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
 | |
| {                                                                           \
 | |
|     uint64_t sum = 0;                                                       \
 | |
|     /* Apply P to N as a mask, making the inactive elements 0. */           \
 | |
|     n &= expand_pred_h(p);                                                  \
 | |
|     sum += (NTYPE)(n >> 0) * (MTYPE)(m >> 0);                               \
 | |
|     sum += (NTYPE)(n >> 16) * (MTYPE)(m >> 16);                             \
 | |
|     sum += (NTYPE)(n >> 32) * (MTYPE)(m >> 32);                             \
 | |
|     sum += (NTYPE)(n >> 48) * (MTYPE)(m >> 48);                             \
 | |
|     return neg ? a - sum : a + sum;                                         \
 | |
| }
 | |
| 
 | |
| DEF_IMOP_32(smopa_s, int8_t, int8_t)
 | |
| DEF_IMOP_32(umopa_s, uint8_t, uint8_t)
 | |
| DEF_IMOP_32(sumopa_s, int8_t, uint8_t)
 | |
| DEF_IMOP_32(usmopa_s, uint8_t, int8_t)
 | |
| 
 | |
| DEF_IMOP_64(smopa_d, int16_t, int16_t)
 | |
| DEF_IMOP_64(umopa_d, uint16_t, uint16_t)
 | |
| DEF_IMOP_64(sumopa_d, int16_t, uint16_t)
 | |
| DEF_IMOP_64(usmopa_d, uint16_t, int16_t)
 | |
| 
 | |
| #define DEF_IMOPH(NAME) \
 | |
|     void HELPER(sme_##NAME)(void *vza, void *vzn, void *vzm, void *vpn,      \
 | |
|                             void *vpm, uint32_t desc)                        \
 | |
|     { do_imopa(vza, vzn, vzm, vpn, vpm, desc, NAME); }
 | |
| 
 | |
| DEF_IMOPH(smopa_s)
 | |
| DEF_IMOPH(umopa_s)
 | |
| DEF_IMOPH(sumopa_s)
 | |
| DEF_IMOPH(usmopa_s)
 | |
| DEF_IMOPH(smopa_d)
 | |
| DEF_IMOPH(umopa_d)
 | |
| DEF_IMOPH(sumopa_d)
 | |
| DEF_IMOPH(usmopa_d)
 |