// ARM Neon intrinsic specification.
//
// This file contains the specification for a number of
// intrinsics that allows us to generate them along with
// their test cases.
//
// To the syntax of the file - it's not very intelligently parsed!
//
// # Comments
// start with AT LEAST two, or four or more slashes so // is a
// comment, /////// is too.
//
// # Sections
// Sections start with EXACTLY three slashes followed
// by AT LEAST one space. Sections are used for two things:
//
// 1) they serve as the doc comment for the given intrinsics.
// 2) they reset all variables (name, fn, etc.)
//
// # Variables
//
// name - The prefix of the function, suffixes are auto
//        generated by the type they get passed.
//
// fn - The function to call in rust-land.
//
// aarch64 - The intrinsic to check on aarch64 architecture.
//           If this is given but no arm intrinsic is provided,
//           the function will exclusively be generated for
//           aarch64.
//           This is used to generate both aarch64 specific and
//           shared intrinsics by first only specifying the aarch64
//           variant then the arm variant.
//
// arm - The arm v7 intrinsics used to check for arm code
//       generation. All neon functions available in arm are
//       also available in aarch64. If no aarch64 intrinsic was
//       set they are assumed to be the same.
//       Intrinsics ending with a `.` will have a size suffix
//       added (such as `i8` or `i64`) that is not sign specific.
//       Intrinsics ending with a `.s` will have a size suffix
//       added (such as `s8` or `u64`) that is sign specific.
//
// a - First input for tests, it gets scaled to the size of
//     the type.
//
// b - Second input for tests, it gets scaled to the size of
//     the type.
//
// # Special values
//
// TRUE - 'true' all bits are set to 1
// FALSE - 'false' all bits are set to 0
// FF - same as 'true'
// MIN - minimal value (either 0 or the lowest negative number)
// MAX - maximal value, prone to overflow
//
// # validate <values>
// Validates a and b against the expected result of the test.
// The special values 'TRUE' and 'FALSE' can be used to
// represent the correct NEON representation of true or
// false values. They too get scaled to the type.
//
// Validate needs to be called before generate as it sets
// up the rules for validation that get generated for each
// type.
//
// # generate <types>
// The generate command generates the intrinsics, it uses the
// variables set and can be called multiple times while overwriting
// some of the variables.
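
//
// For orientation only (this block is an ordinary comment and is not parsed as
// spec input): an entry such as the `vand` one below is expected to expand into
// Rust intrinsics roughly of the following shape, one per generated type, plus one
// test per `validate` line. The exact attributes and test harness come from the
// generator, so treat this as a sketch rather than the authoritative output:
//
//     #[inline]
//     #[target_feature(enable = "neon")]
//     #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
//     #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))]
//     #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(and))]
//     pub unsafe fn vand_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
//         simd_and(a, b)
//     }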

/// Vector bitwise and
name = vand
fn = simd_and
arm = vand
aarch64 = and
a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00
b = 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F
validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00
b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
validate 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
generate int*_t, uint*_t, int64x*_t, uint64x*_t

/// Vector bitwise or (immediate, inclusive)
name = vorr
fn = simd_or
arm = vorr
aarch64 = orr
a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
generate int*_t, uint*_t, int64x*_t, uint64x*_t

/// Vector bitwise exclusive or (vector)
name = veor
fn = simd_xor
arm = veor
aarch64 = eor
a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
generate int*_t, uint*_t, int64x*_t, uint64x*_t

////////////////////
// Absolute difference between the arguments
////////////////////

/// Absolute difference between the arguments
name = vabd
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
validate 15, 13, 11, 9, 7, 5, 3, 1, 1, 3, 5, 7, 9, 11, 13, 15

arm = vabd.s
aarch64 = sabd
link-arm = vabds._EXT_
link-aarch64 = sabd._EXT_
generate int*_t

arm = vabd.s
aarch64 = uabd
link-arm = vabdu._EXT_
link-aarch64 = uabd._EXT_
generate uint*_t

/// Floating-point absolute difference between the arguments
name = vabd
a = 1.0, 2.0, 5.0, -4.0
b = 9.0, 3.0, 2.0, 8.0
validate 8.0, 1.0, 3.0, 12.0

aarch64 = fabd
link-aarch64 = fabd._EXT_
generate float64x*_t

arm = vabd.s
aarch64 = fabd
link-arm = vabds._EXT_
link-aarch64 = fabd._EXT_
generate float*_t

////////////////////
// Absolute difference Long
////////////////////

/// Unsigned Absolute difference Long
name = vabdl
multi_fn = simd_cast, {vabd-unsigned-noext, a, b}
a = 1, 2, 3, 4, 4, 3, 2, 1
b = 10, 10, 10, 10, 10, 10, 10, 10
validate 9, 8, 7, 6, 6, 7, 8, 9

arm = vabdl.s
aarch64 = uabdl
generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint32x2_t:uint32x2_t:uint64x2_t
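
//
// Illustrative only (ordinary comment, not parsed): a `multi_fn` line composes the
// function body instead of naming a single `fn`. For the 8-bit case above, the
// `vabd-unsigned-noext` placeholder resolves to `vabd_u8`, so the generated body is
// expected to look roughly like this sketch; exact names come from the generator:
//
//     pub unsafe fn vabdl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t {
//         simd_cast(vabd_u8(a, b))
//     }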

/// Signed Absolute difference Long
name = vabdl
multi_fn = simd_cast, c:uint8x8_t, {vabd-signed-noext, a, b}
multi_fn = simd_cast, c
a = 1, 2, 3, 4, 4, 3, 2, 1
b = 10, 10, 10, 10, 10, 10, 10, 10
validate 9, 8, 7, 6, 6, 7, 8, 9

arm = vabdl.s
aarch64 = sabdl
generate int8x8_t:int8x8_t:int16x8_t

/// Signed Absolute difference Long
name = vabdl
multi_fn = simd_cast, c:uint16x4_t, {vabd-signed-noext, a, b}
multi_fn = simd_cast, c
a = 1, 2, 11, 12
b = 10, 10, 10, 10
validate 9, 8, 1, 2

arm = vabdl.s
aarch64 = sabdl
generate int16x4_t:int16x4_t:int32x4_t

/// Signed Absolute difference Long
name = vabdl
multi_fn = simd_cast, c:uint32x2_t, {vabd-signed-noext, a, b}
multi_fn = simd_cast, c
a = 1, 11
b = 10, 10
validate 9, 1

arm = vabdl.s
aarch64 = sabdl
generate int32x2_t:int32x2_t:int64x2_t

/// Unsigned Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle8!, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_cast, {vabd_u8, c, d}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10
validate 1, 0, 1, 2, 3, 4, 5, 6

aarch64 = uabdl
generate uint8x16_t:uint8x16_t:uint16x8_t
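
//
// Worked example for the `_high` variant above (comment only): the two shuffles
// select the upper halves, so c = 9..16 and d = 10 throughout, and the expected
// result is |9-10|, |10-10|, ..., |16-10| = 1, 0, 1, 2, 3, 4, 5, 6, which is what
// the `validate` line lists.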

/// Unsigned Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle4!, c:uint16x4_t, a, a, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, d:uint16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_cast, {vabd_u16, c, d}
a = 1, 2, 3, 4, 8, 9, 11, 12
b = 10, 10, 10, 10, 10, 10, 10, 10
validate 2, 1, 1, 2

aarch64 = uabdl
generate uint16x8_t:uint16x8_t:uint32x4_t

/// Unsigned Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle2!, c:uint32x2_t, a, a, [2, 3]
multi_fn = simd_shuffle2!, d:uint32x2_t, b, b, [2, 3]
multi_fn = simd_cast, {vabd_u32, c, d}
a = 1, 2, 3, 4
b = 10, 10, 10, 10
validate 7, 6

aarch64 = uabdl
generate uint32x4_t:uint32x4_t:uint64x2_t

/// Signed Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle8!, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_cast, e:uint8x8_t, {vabd_s8, c, d}
multi_fn = simd_cast, e
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10
validate 1, 0, 1, 2, 3, 4, 5, 6

aarch64 = sabdl
generate int8x16_t:int8x16_t:int16x8_t

/// Signed Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle4!, c:int16x4_t, a, a, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, d:int16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_cast, e:uint16x4_t, {vabd_s16, c, d}
multi_fn = simd_cast, e
a = 1, 2, 3, 4, 9, 10, 11, 12
b = 10, 10, 10, 10, 10, 10, 10, 10
validate 1, 0, 1, 2

aarch64 = sabdl
generate int16x8_t:int16x8_t:int32x4_t

/// Signed Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle2!, c:int32x2_t, a, a, [2, 3]
multi_fn = simd_shuffle2!, d:int32x2_t, b, b, [2, 3]
multi_fn = simd_cast, e:uint32x2_t, {vabd_s32, c, d}
multi_fn = simd_cast, e
a = 1, 2, 3, 4
b = 10, 10, 10, 10
validate 7, 6

aarch64 = sabdl
generate int32x4_t:int32x4_t:int64x2_t

////////////////////
// equality
////////////////////

/// Compare bitwise Equal (vector)
name = vceq
fn = simd_eq
a = MIN, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, MAX
b = MIN, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, MAX
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
a = MIN, MIN, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, MAX
b = MIN, MAX, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, MIN
validate TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE

aarch64 = cmeq
generate uint64x*_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t

arm = vceq.
generate uint*_t, int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, poly8x8_t:uint8x8_t, poly8x16_t:uint8x16_t

/// Floating-point compare equal
name = vceq
fn = simd_eq
a = 1.2, 3.4, 5.6, 7.8
b = 1.2, 3.4, 5.6, 7.8
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = fcmeq
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

arm = vceq.
// we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Signed compare bitwise equal to zero
name = vceqz
fn = simd_eq
a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE

aarch64 = cmeq
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly8x8_t:uint8x8_t, poly8x16_t:uint8x16_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t

/// Unsigned compare bitwise equal to zero
name = vceqz
fn = simd_eq
a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE

aarch64 = cmeq
generate uint*_t, uint64x*_t

/// Floating-point compare bitwise equal to zero
name = vceqz
fn = simd_eq
a = 0.0, 1.2, 3.4, 5.6
fixed = 0.0, 0.0, 0.0, 0.0
validate TRUE, FALSE, FALSE, FALSE

aarch64 = fcmeq
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

/// Signed compare bitwise Test bits nonzero
name = vtst
multi_fn = simd_and, c:in_t, a, b
multi_fn = fixed, d:in_t
multi_fn = simd_ne, c, transmute(d)
a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
b = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmtst
generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t

arm = vtst
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, poly8x8_t:uint8x8_t, poly8x16_t:uint8x16_t
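
//
// Illustrative only (ordinary comment, not parsed): the three multi_fn lines above
// compose a per-lane "test bits" operation, roughly (a & b) != 0, with `fixed`
// supplying the all-zero comparison vector. A rough sketch of the expected 8-bit
// shape, with exact types and attributes left to the generator:
//
//     pub unsafe fn vtst_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t {
//         let c = simd_and(a, b);
//         let d = int8x8_t(0, 0, 0, 0, 0, 0, 0, 0);
//         simd_ne(c, transmute(d))
//     }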

/// Unsigned compare bitwise Test bits nonzero
name = vtst
multi_fn = simd_and, c:in_t, a, b
multi_fn = fixed, d:in_t
multi_fn = simd_ne, c, transmute(d)
a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
b = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmtst
generate uint64x*_t

arm = vtst
generate uint*_t

////////////////////
// Floating-point absolute value
////////////////////

/// Floating-point absolute value
name = vabs
fn = simd_fabs
a = -0.1, -2.2, -3.3, -6.6
validate 0.1, 2.2, 3.3, 6.6
aarch64 = fabs
generate float64x1_t:float64x1_t, float64x2_t:float64x2_t

arm = vabs
generate float32x2_t:float32x2_t, float32x4_t:float32x4_t

////////////////////
// greater than
////////////////////

/// Compare signed greater than
name = vcgt
fn = simd_gt
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
aarch64 = cmgt
generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

arm = vcgt.s
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t

/// Compare unsigned higher
name = vcgt
fn = simd_gt
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmhi
generate uint64x*_t

arm = vcgt.s
generate uint*_t

/// Floating-point compare greater than
name = vcgt
fn = simd_gt
a = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9
b = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = fcmgt
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

arm = vcgt.s
// we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

////////////////////
// less than
////////////////////

/// Compare signed less than
name = vclt
fn = simd_lt
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
aarch64 = cmgt
generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

arm = vcgt.s
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t

/// Compare unsigned less than
name = vclt
fn = simd_lt
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmhi
generate uint64x*_t

arm = vcgt.s
generate uint*_t

/// Floating-point compare less than
name = vclt
fn = simd_lt
a = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8
b = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = fcmgt
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

arm = vcgt.s
// we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

////////////////////
// less than or equal
////////////////////

/// Compare signed less than or equal
name = vcle
fn = simd_le
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmge
generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

arm = vcge.s
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t

/// Compare unsigned less than or equal
name = vcle
fn = simd_le
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmhs
generate uint64x*_t

arm = vcge.s
generate uint*_t

/// Floating-point compare less than or equal
name = vcle
fn = simd_le
a = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8
b = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
aarch64 = fcmge
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

// we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t
arm = vcge.s
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

////////////////////
// greater than or equal
////////////////////

/// Compare signed greater than or equal
name = vcge
fn = simd_ge
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmge
generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

arm = vcge.s
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t

/// Compare unsigned greater than or equal
name = vcge
fn = simd_ge
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmhs
generate uint64x*_t

arm = vcge.s
generate uint*_t

/// Floating-point compare greater than or equal
name = vcge
fn = simd_ge
a = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9
b = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = fcmge
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

arm = vcge.s
// we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Compare signed greater than or equal to zero
name = vcgez
fn = simd_ge
a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmge
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

/// Floating-point compare greater than or equal to zero
name = vcgez
fn = simd_ge
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
validate FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = fcmge
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

/// Compare signed greater than zero
name = vcgtz
fn = simd_gt
a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmgt
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

/// Floating-point compare greater than zero
name = vcgtz
fn = simd_gt
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
validate FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = fcmgt
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

/// Compare signed less than or equal to zero
name = vclez
fn = simd_le
a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE

aarch64 = cmgt
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

/// Floating-point compare less than or equal to zero
name = vclez
fn = simd_le
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE

aarch64 = fcmle
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

/// Compare signed less than zero
name = vcltz
fn = simd_lt
a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE

aarch64 = sshr
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

/// Floating-point compare less than zero
name = vcltz
fn = simd_lt
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
validate TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE

aarch64 = fcmlt
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

/// Count leading sign bits
name = vcls
a = MIN, -1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, MAX
validate 0, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, 0

arm = vcls.s
aarch64 = cls
link-arm = vcls._EXT_
link-aarch64 = cls._EXT_
generate int*_t
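
//
// Note (comment only): BITS and BITS_M1 are further special values used by some
// validate lines; like TRUE and FALSE they appear to scale with the lane type,
// BITS being the lane width in bits (8, 16, 32, ...) and BITS_M1 that width minus
// one, which matches the expected vcls results above and the vclz results below.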

/// Signed count leading zero bits
name = vclz
multi_fn = self-signed-ext, a
a = MIN, -1, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, MAX
validate 0, 0, BITS, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, 1

arm = vclz.
aarch64 = clz
generate int*_t

/// Unsigned count leading zero bits
name = vclz
multi_fn = transmute, {self-signed-ext, transmute(a)}
a = MIN, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, MAX
validate BITS, BITS, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, 0

arm = vclz.
aarch64 = clz
generate uint*_t

/// Floating-point absolute compare greater than
name = vcagt
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8
validate TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE

aarch64 = facgt
link-aarch64 = facgt._EXT2_._EXT_
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

arm = vacgt.s
link-arm = vacgt._EXT2_._EXT_
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Floating-point absolute compare greater than or equal
name = vcage
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8
validate TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE

aarch64 = facge
link-aarch64 = facge._EXT2_._EXT_
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

arm = vacge.s
link-arm = vacge._EXT2_._EXT_
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Floating-point absolute compare less than
name = vcalt
multi_fn = vcagt-self-noext, b, a
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8
validate FALSE, FALSE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE

aarch64 = facgt
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

arm = vacgt.s
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Floating-point absolute compare less than or equal
name = vcale
multi_fn = vcage-self-noext, b, a
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8
validate FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE

aarch64 = facge
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

arm = vacge.s
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 0:1
validate MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16

aarch64 = mov
generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x2_t, int32x4_t, int64x2_t
generate uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x2_t, uint32x4_t, uint64x2_t
generate poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t, poly64x2_t
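
//
// Illustrative only (ordinary comment, not parsed): constn = LANE1:LANE2 gives the
// generated functions two const parameters, and the n = 0:1 test above copies
// lane 1 of b into lane 0 of a. The generated body is built from the ins/match
// templates; semantically it is equivalent to this sketch:
//
//     pub unsafe fn vcopy_lane_s8<const LANE1: i32, const LANE2: i32>(
//         a: int8x8_t,
//         b: int8x8_t,
//     ) -> int8x8_t {
//         let x: i8 = simd_extract(b, LANE2 as u32);
//         simd_insert(a, LANE1 as u32, x)
//     }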

/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
a = 1., 2., 3., 4.
b = 0., 0.5, 0., 0.
n = 0:1
validate 0.5, 2., 3., 4.

aarch64 = mov
generate float32x2_t, float32x4_t, float64x2_t

/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle-in_len-!, a:in_t, a, a, {asc-0-in_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in_len-LANE2}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 0:1
validate MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16

aarch64 = mov
generate int8x8_t:int8x16_t:int8x8_t, int16x4_t:int16x8_t:int16x4_t, int32x2_t:int32x4_t:int32x2_t
generate uint8x8_t:uint8x16_t:uint8x8_t, uint16x4_t:uint16x8_t:uint16x4_t, uint32x2_t:uint32x4_t:uint32x2_t
generate poly8x8_t:poly8x16_t:poly8x8_t, poly16x4_t:poly16x8_t:poly16x4_t

/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle-in_len-!, a:in_t, a, a, {asc-0-in_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in_len-LANE2}
a = 1., 2., 3., 4.
b = 0., 0.5, 0., 0.
n = 0:1
validate 0.5, 2., 3., 4.

aarch64 = mov
generate float32x2_t:float32x4_t:float32x2_t

/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 0:1
validate MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16

aarch64 = mov
generate int8x16_t:int8x8_t:int8x16_t, int16x8_t:int16x4_t:int16x8_t, int32x4_t:int32x2_t:int32x4_t
generate uint8x16_t:uint8x8_t:uint8x16_t, uint16x8_t:uint16x4_t:uint16x8_t, uint32x4_t:uint32x2_t:uint32x4_t
generate poly8x16_t:poly8x8_t:poly8x16_t, poly16x8_t:poly16x4_t:poly16x8_t

/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1:0
validate 1, MAX

aarch64 = zip1
generate int64x2_t:int64x1_t:int64x2_t, uint64x2_t:uint64x1_t:uint64x2_t, poly64x2_t:poly64x1_t:poly64x2_t

/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2}
a = 1., 2., 3., 4.
b = 0.5, 0., 0., 0.
n = 1:0
validate 1., 0.5, 3., 4.

aarch64 = mov
generate float32x4_t:float32x2_t:float32x4_t
aarch64 = zip1
generate float64x2_t:float64x1_t:float64x2_t

/// Create a new vector from a 64-bit pattern
name = vcreate
out-suffix
multi_fn = transmute, a
a = 1
validate 1, 0, 0, 0, 0, 0, 0, 0

aarch64 = nop
arm = nop
generate u64:int8x8_t, u64:int16x4_t, u64:int32x2_t, u64:int64x1_t
generate u64:uint8x8_t, u64:uint16x4_t, u64:uint32x2_t, u64:uint64x1_t
generate u64:poly8x8_t, u64:poly16x4_t
target = aes
generate u64:poly64x1_t

/// Create a new vector from a 64-bit pattern
name = vcreate
out-suffix
multi_fn = transmute, a
a = 0
validate 0., 0.

aarch64 = nop
generate u64:float64x1_t
arm = nop
generate u64:float32x2_t

/// Fixed-point convert to floating-point
name = vcvt
double-suffixes
fn = simd_cast
a = 1, 2, 3, 4
validate 1., 2., 3., 4.

aarch64 = scvtf
generate int64x1_t:float64x1_t, int64x2_t:float64x2_t
aarch64 = ucvtf
generate uint64x1_t:float64x1_t, uint64x2_t:float64x2_t

arm = vcvt
aarch64 = scvtf
generate int32x2_t:float32x2_t, int32x4_t:float32x4_t
aarch64 = ucvtf
generate uint32x2_t:float32x2_t, uint32x4_t:float32x4_t

/// Floating-point convert to higher precision long
name = vcvt
double-suffixes
fn = simd_cast
a = -1.2, 1.2
validate -1.2f32 as f64, 1.2f32 as f64

aarch64 = fcvtl
generate float32x2_t:float64x2_t

/// Floating-point convert to higher precision long
name = vcvt_high
noq-double-suffixes
multi_fn = simd_shuffle2!, b:float32x2_t, a, a, [2, 3]
multi_fn = simd_cast, b
a = -1.2, 1.2, 2.3, 3.4
validate 2.3f32 as f64, 3.4f32 as f64

aarch64 = fcvtl
generate float32x4_t:float64x2_t

/// Floating-point convert to lower precision narrow
name = vcvt
double-suffixes
fn = simd_cast
a = -1.2, 1.2
validate -1.2f64 as f32, 1.2f64 as f32

aarch64 = fcvtn
generate float64x2_t:float32x2_t

/// Floating-point convert to lower precision narrow
name = vcvt_high
noq-double-suffixes
multi_fn = simd_shuffle4!, a, {simd_cast, b}, [0, 1, 2, 3]
a = -1.2, 1.2
b = -2.3, 3.4
validate -1.2, 1.2, -2.3f64 as f32, 3.4f64 as f32

aarch64 = fcvtn
generate float32x2_t:float64x2_t:float32x4_t

/// Floating-point convert to lower precision narrow, rounding to odd
name = vcvtx
double-suffixes
a = -1.0, 2.0
validate -1.0, 2.0

aarch64 = fcvtxn
link-aarch64 = fcvtxn._EXT2_._EXT_
generate float64x2_t:float32x2_t

/// Floating-point convert to lower precision narrow, rounding to odd
name = vcvtx_high
noq-double-suffixes
multi_fn = simd_shuffle4!, a, {vcvtx-noq_doubleself-noext, b}, [0, 1, 2, 3]
a = -1.0, 2.0
b = -3.0, 4.0
validate -1.0, 2.0, -3.0, 4.0

aarch64 = fcvtxn
generate float32x2_t:float64x2_t:float32x4_t

/// Fixed-point convert to floating-point
name = vcvt
double-n-suffixes
constn = N
multi_fn = static_assert-N-1-bits
a = 1, 2, 3, 4
n = 2
validate 0.25, 0.5, 0.75, 1.
arm-aarch64-separate

aarch64 = scvtf
link-aarch64 = vcvtfxs2fp._EXT2_._EXT_
const-aarch64 = N
generate int64x1_t:float64x1_t, int64x2_t:float64x2_t, i32:f32, i64:f64

aarch64 = ucvtf
link-aarch64 = vcvtfxu2fp._EXT2_._EXT_
const-aarch64 = N
generate uint64x1_t:float64x1_t, uint64x2_t:float64x2_t, u32:f32, u64:f64

aarch64 = scvtf
link-aarch64 = vcvtfxs2fp._EXT2_._EXT_
arm = vcvt
link-arm = vcvtfxs2fp._EXT2_._EXT_
const-arm = N:i32

generate int32x2_t:float32x2_t, int32x4_t:float32x4_t

aarch64 = ucvtf
link-aarch64 = vcvtfxu2fp._EXT2_._EXT_
arm = vcvt
link-arm = vcvtfxu2fp._EXT2_._EXT_
const-arm = N:i32
generate uint32x2_t:float32x2_t, uint32x4_t:float32x4_t
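
//
// Worked example for the `_n` conversions above (comment only): N is the number of
// fractional bits, so this direction divides the integer input by 2^N. With n = 2
// the inputs 1, 2, 3, 4 become 0.25, 0.5, 0.75, 1.0, matching `validate`, and the
// float-to-fixed direction below multiplies by 2^N instead.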

/// Floating-point convert to fixed-point, rounding toward zero
name = vcvt
double-n-suffixes
constn = N
multi_fn = static_assert-N-1-bits
a = 0.25, 0.5, 0.75, 1.
n = 2
validate 1, 2, 3, 4
arm-aarch64-separate

aarch64 = fcvtzs
link-aarch64 = vcvtfp2fxs._EXT2_._EXT_
const-aarch64 = N
generate float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64

aarch64 = fcvtzu
link-aarch64 = vcvtfp2fxu._EXT2_._EXT_
const-aarch64 = N
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64

aarch64 = fcvtzs
link-aarch64 = vcvtfp2fxs._EXT2_._EXT_
arm = vcvt
link-arm = vcvtfp2fxs._EXT2_._EXT_
const-arm = N:i32
generate float32x2_t:int32x2_t, float32x4_t:int32x4_t

aarch64 = fcvtzu
link-aarch64 = vcvtfp2fxu._EXT2_._EXT_
arm = vcvt
link-arm = vcvtfp2fxu._EXT2_._EXT_
const-arm = N:i32
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Fixed-point convert to floating-point
name = vcvt
double-suffixes
multi_fn = a as out_t
a = 1
validate 1.

aarch64 = scvtf
generate i32:f32, i64:f64
aarch64 = ucvtf
generate u32:f32, u64:f64

/// Floating-point convert to fixed-point, rounding toward zero
name = vcvt
double-suffixes
multi_fn = a as out_t
a = 1.
validate 1

aarch64 = fcvtzs
generate f32:i32, f64:i64
aarch64 = fcvtzu
generate f32:u32, f64:u64

/// Floating-point convert to signed fixed-point, rounding toward zero
name = vcvt
double-suffixes
link-aarch64 = llvm.fptosi.sat._EXT2_._EXT_
a = -1.1, 2.1, -2.9, 3.9
validate -1, 2, -2, 3

aarch64 = fcvtzs
generate float64x1_t:int64x1_t, float64x2_t:int64x2_t

link-arm = llvm.fptosi.sat._EXT2_._EXT_
arm = vcvt
generate float32x2_t:int32x2_t, float32x4_t:int32x4_t

/// Floating-point convert to unsigned fixed-point, rounding toward zero
name = vcvt
double-suffixes
link-aarch64 = llvm.fptoui.sat._EXT2_._EXT_
a = 1.1, 2.1, 2.9, 3.9
validate 1, 2, 2, 3

aarch64 = fcvtzu
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

link-arm = llvm.fptoui.sat._EXT2_._EXT_
arm = vcvt
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Floating-point convert to signed integer, rounding to nearest with ties to away
name = vcvta
double-suffixes
a = -1.1, 2.1, -2.9, 3.9
validate -1, 2, -3, 4

aarch64 = fcvtas
link-aarch64 = fcvtas._EXT2_._EXT_
generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t

/// Floating-point convert to integer, rounding to nearest with ties to away
name = vcvta
double-suffixes
a = 2.9
validate 3

aarch64 = fcvtas
link-aarch64 = fcvtas._EXT2_._EXT_
generate f32:i32, f64:i64

aarch64 = fcvtau
link-aarch64 = fcvtau._EXT2_._EXT_
generate f32:u32, f64:u64

/// Floating-point convert to signed integer, rounding to nearest with ties to even
name = vcvtn
double-suffixes
a = -1.5, 2.1, -2.9, 3.9
validate -2, 2, -3, 4

aarch64 = fcvtns
link-aarch64 = fcvtns._EXT2_._EXT_
generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64

/// Floating-point convert to signed integer, rounding toward minus infinity
name = vcvtm
double-suffixes
a = -1.1, 2.1, -2.9, 3.9
validate -2, 2, -3, 3

aarch64 = fcvtms
link-aarch64 = fcvtms._EXT2_._EXT_
generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64

/// Floating-point convert to signed integer, rounding toward plus infinity
name = vcvtp
double-suffixes
a = -1.1, 2.1, -2.9, 3.9
validate -1, 3, -2, 4

aarch64 = fcvtps
link-aarch64 = fcvtps._EXT2_._EXT_
generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64

/// Floating-point convert to unsigned integer, rounding to nearest with ties to away
name = vcvta
double-suffixes
a = 1.1, 2.1, 2.9, 3.9
validate 1, 2, 3, 4

aarch64 = fcvtau
link-aarch64 = fcvtau._EXT2_._EXT_
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

/// Floating-point convert to unsigned integer, rounding to nearest with ties to even
name = vcvtn
double-suffixes
a = 1.5, 2.1, 2.9, 3.9
validate 2, 2, 3, 4

aarch64 = fcvtnu
link-aarch64 = fcvtnu._EXT2_._EXT_
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64

/// Floating-point convert to unsigned integer, rounding toward minus infinity
name = vcvtm
double-suffixes
a = 1.1, 2.1, 2.9, 3.9
validate 1, 2, 2, 3

aarch64 = fcvtmu
link-aarch64 = fcvtmu._EXT2_._EXT_
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64

/// Floating-point convert to unsigned integer, rounding toward plus infinity
name = vcvtp
double-suffixes
a = 1.1, 2.1, 2.9, 3.9
validate 2, 3, 3, 4

aarch64 = fcvtpu
link-aarch64 = fcvtpu._EXT2_._EXT_
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64

/// Set all vector lanes to the same value
name = vdup
lane-suffixes
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32}
a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16
n = HFLEN
validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

aarch64 = dup
generate poly64x2_t, poly64x1_t:poly64x2_t

arm = vdup.l
generate int*_t
generate int8x16_t:int8x8_t, int16x8_t:int16x4_t, int32x4_t:int32x2_t
generate int8x8_t:int8x16_t, int16x4_t:int16x8_t, int32x2_t:int32x4_t

generate uint*_t
generate uint8x16_t:uint8x8_t, uint16x8_t:uint16x4_t, uint32x4_t:uint32x2_t
generate uint8x8_t:uint8x16_t, uint16x4_t:uint16x8_t, uint32x2_t:uint32x4_t

generate poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t
generate poly8x16_t:poly8x8_t, poly16x8_t:poly16x4_t
generate poly8x8_t:poly8x16_t, poly16x4_t:poly16x8_t
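
//
// Illustrative only (ordinary comment, not parsed): n = HFLEN appears to pick the
// lane at half the vector length for the tests (the inputs above hold 1 at every
// such index), and the generated functions broadcast lane N. A semantically
// equivalent sketch for the 8-lane case, with the real body built from the dup
// template:
//
//     pub unsafe fn vdup_lane_s8<const N: i32>(a: int8x8_t) -> int8x8_t {
//         vdup_n_s8(simd_extract(a, N as u32))
//     }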

/// Set all vector lanes to the same value
name = vdup
lane-suffixes
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32}
a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16
n = HFLEN
validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

aarch64 = dup
arm = vmov
generate int64x2_t, int64x1_t:int64x2_t, uint64x2_t, uint64x1_t:uint64x2_t

/// Set all vector lanes to the same value
name = vdup
lane-suffixes
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32}
a = 1., 1., 1., 4.
n = HFLEN
validate 1., 1., 1., 1.

aarch64 = dup
generate float64x2_t, float64x1_t:float64x2_t

arm = vdup.l
generate float*_t, float32x4_t:float32x2_t, float32x2_t:float32x4_t

/// Set all vector lanes to the same value
name = vdup
lane-suffixes
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = a
a = 0
n = HFLEN
validate 0

aarch64 = nop
generate poly64x1_t

arm = nop
generate int64x1_t, uint64x1_t

/// Set all vector lanes to the same value
name = vdup
lane-suffixes
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = a
a = 0.
n = HFLEN
validate 0.

aarch64 = nop
generate float64x1_t

/// Set all vector lanes to the same value
name = vdup
lane-suffixes
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = transmute--<element_t _>, {simd_extract, a, N as u32}
a = 0, 1
n = HFLEN
validate 1

aarch64 = nop
generate poly64x2_t:poly64x1_t

arm = vmov
generate int64x2_t:int64x1_t, uint64x2_t:uint64x1_t

/// Set all vector lanes to the same value
name = vdup
lane-suffixes
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = transmute--<element_t _>, {simd_extract, a, N as u32}
a = 0., 1.
n = HFLEN
validate 1.

aarch64 = nop
generate float64x2_t:float64x1_t

/// Set all vector lanes to the same value
name = vdup
lane-suffixes
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = simd_extract, a, N as u32
a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16
n = HFLEN
validate 1

aarch64 = nop
generate int8x8_t:i8, int8x16_t:i8, int16x4_t:i16, int16x8_t:i16, int32x2_t:i32, int32x4_t:i32, int64x1_t:i64, int64x2_t:i64
generate uint8x8_t:u8, uint8x16_t:u8, uint16x4_t:u16, uint16x8_t:u16, uint32x2_t:u32, uint32x4_t:u32, uint64x1_t:u64, uint64x2_t:u64
generate poly8x8_t:p8, poly8x16_t:p8, poly16x4_t:p16, poly16x8_t:p16

/// Set all vector lanes to the same value
name = vdup
lane-suffixes
constn = N
multi_fn = static_assert_imm-in_exp_len-N
multi_fn = simd_extract, a, N as u32
a = 1., 1., 1., 4.
n = HFLEN
validate 1.

aarch64 = nop
generate float32x2_t:f32, float32x4_t:f32, float64x1_t:f64, float64x2_t:f64

/// Extract vector from pair of vectors
name = vext
constn = N
multi_fn = static_assert_imm-out_exp_len-N
multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len}
a = 0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15
b = 9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11
n = HFLEN
validate 8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 18, 19

arm = "vext.8"
aarch64 = ext
generate int*_t, uint*_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t
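
//
// Worked example for vext (comment only): with n = HFLEN (half the lane count) the
// shuffle takes lanes N..len of a followed by lanes 0..N of b; for the 16-lane case
// above that is a[8..16] followed by b[0..8], which is exactly the `validate` line.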

/// Extract vector from pair of vectors
name = vext
constn = N
multi_fn = static_assert_imm-out_exp_len-N
multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len}
a = 0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15
b = 9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11
n = HFLEN
validate 8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 18, 19

aarch64 = ext
generate poly64x2_t

arm = vmov
generate int64x2_t, uint64x2_t

/// Extract vector from pair of vectors
name = vext
constn = N
multi_fn = static_assert_imm-out_exp_len-N
multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len}
a = 0., 2., 2., 3.
b = 3., 4., 5., 6.,
n = HFLEN
validate 2., 3., 3., 4.

aarch64 = ext
generate float64x2_t

arm = "vext.8"
generate float*_t

/// Multiply-add to accumulator
name = vmla
multi_fn = simd_add, a, {simd_mul, b, c}
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21

arm = vmla.
aarch64 = mla
generate int*_t, uint*_t

/// Floating-point multiply-add to accumulator
name = vmla
multi_fn = simd_add, a, {simd_mul, b, c}
a = 0., 1., 2., 3.
b = 2., 2., 2., 2.
c = 3., 3., 3., 3.
validate 6., 7., 8., 9.

aarch64 = fmul
generate float64x*_t

arm = vmla.
generate float*_t

/// Vector multiply accumulate with scalar
name = vmla
n-suffix
multi_fn = vmla-self-noext, a, b, {vdup-nself-noext, c}
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3
validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21

aarch64 = mla
arm = vmla.
generate int16x4_t:int16x4_t:i16:int16x4_t, int16x8_t:int16x8_t:i16:int16x8_t, int32x2_t:int32x2_t:i32:int32x2_t, int32x4_t:int32x4_t:i32:int32x4_t
generate uint16x4_t:uint16x4_t:u16:uint16x4_t, uint16x8_t:uint16x8_t:u16:uint16x8_t, uint32x2_t:uint32x2_t:u32:uint32x2_t, uint32x4_t:uint32x4_t:u32:uint32x4_t

/// Vector multiply accumulate with scalar
name = vmla
n-suffix
multi_fn = vmla-self-noext, a, b, {vdup-nself-noext, c}
a = 0., 1., 2., 3.
b = 2., 2., 2., 2.
c = 3.
validate 6., 7., 8., 9.

aarch64 = fmul
arm = vmla.
generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t

/// Vector multiply accumulate with scalar
name = vmla
in2-lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1
validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21

aarch64 = mla
arm = vmla.
generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t
generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t
generate uint16x4_t, uint16x4_t:uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t
generate uint32x2_t, uint32x2_t:uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t

/// Vector multiply accumulate with scalar
name = vmla
in2-lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 0., 1., 2., 3.
b = 2., 2., 2., 2.
c = 0., 3., 0., 0.
n = 1
validate 6., 7., 8., 9.

aarch64 = fmul
arm = vmla.
generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t

/// Signed multiply-add long
name = vmlal
multi_fn = simd_add, a, {vmull-self-noext, b, c}
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21

arm = vmlal.s
aarch64 = smlal
generate int16x8_t:int8x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t

/// Unsigned multiply-add long
name = vmlal
multi_fn = simd_add, a, {vmull-self-noext, b, c}
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21

arm = vmlal.s
aarch64 = umlal
generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t

/// Vector widening multiply accumulate with scalar
name = vmlal
n-suffix
multi_fn = vmlal-self-noext, a, b, {vdup-nself-noext, c}
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3
validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21

arm = vmlal.s
aarch64 = smlal
generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t
aarch64 = umlal
generate uint32x4_t:uint16x4_t:u16:uint32x4_t, uint64x2_t:uint32x2_t:u32:uint64x2_t

/// Vector widening multiply accumulate with scalar
name = vmlal_lane
in2-suffix
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmlal-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1
validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21

arm = vmlal.s
aarch64 = smlal
generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int32x4_t:int16x4_t:int16x8_t:int32x4_t
generate int64x2_t:int32x2_t:int32x2_t:int64x2_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t
aarch64 = umlal
generate uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x4_t:uint16x8_t:uint32x4_t
generate uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x2_t:uint32x4_t:uint64x2_t

/// Signed multiply-add long
name = vmlal_high
no-q
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right}
multi_fn = vmlal-noqself-noext, a, b, c
a = 8, 7, 6, 5, 4, 3, 2, 1
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate 8, 9, 10, 11, 12, 13, 14, 15

aarch64 = smlal2
generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t

/// Unsigned multiply-add long
name = vmlal_high
no-q
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right}
multi_fn = vmlal-noqself-noext, a, b, c
a = 8, 7, 6, 5, 4, 3, 2, 1
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate 8, 9, 10, 11, 12, 13, 14, 15

aarch64 = umlal2
generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t

/// Multiply-add long
name = vmlal_high_n
no-q
multi_fn = vmlal_high-noqself-noext, a, b, {vdupq_n-noqself-noext, c}
a = 8, 7, 6, 5, 4, 3, 2, 1
b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
c = 2
validate 8, 9, 10, 11, 12, 13, 14, 15

aarch64 = smlal2
generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t
aarch64 = umlal2
generate uint32x4_t:uint16x8_t:u16:uint32x4_t, uint64x2_t:uint32x4_t:u32:uint64x2_t

/// Multiply-add long
name = vmlal_high_lane
in2-suffix
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmlal_high-noqself-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 8, 7, 6, 5, 4, 3, 2, 1
b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
c = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1
validate 8, 9, 10, 11, 12, 13, 14, 15

aarch64 = smlal2
generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t
generate int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t
aarch64 = umlal2
generate uint32x4_t:uint16x8_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t
generate uint64x2_t:uint32x4_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t

/// Multiply-subtract from accumulator
name = vmls
multi_fn = simd_sub, a, {simd_mul, b, c}
a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

arm = vmls.
aarch64 = mls
generate int*_t, uint*_t

/// Floating-point multiply-subtract from accumulator
name = vmls
multi_fn = simd_sub, a, {simd_mul, b, c}
a = 6., 7., 8., 9.
b = 2., 2., 2., 2.
c = 3., 3., 3., 3.
validate 0., 1., 2., 3.

aarch64 = fmul
generate float64x*_t

arm = vmls.
generate float*_t

/// Vector multiply subtract with scalar
name = vmls
n-suffix
multi_fn = vmls-self-noext, a, b, {vdup-nself-noext, c}
a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3
validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

aarch64 = mls
arm = vmls.
generate int16x4_t:int16x4_t:i16:int16x4_t, int16x8_t:int16x8_t:i16:int16x8_t, int32x2_t:int32x2_t:i32:int32x2_t, int32x4_t:int32x4_t:i32:int32x4_t
generate uint16x4_t:uint16x4_t:u16:uint16x4_t, uint16x8_t:uint16x8_t:u16:uint16x8_t, uint32x2_t:uint32x2_t:u32:uint32x2_t, uint32x4_t:uint32x4_t:u32:uint32x4_t

/// Vector multiply subtract with scalar
name = vmls
n-suffix
multi_fn = vmls-self-noext, a, b, {vdup-nself-noext, c}
a = 6., 7., 8., 9.
b = 2., 2., 2., 2.
c = 3.
validate 0., 1., 2., 3.

aarch64 = fmul
arm = vmls.
generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t

/// Vector multiply subtract with scalar
name = vmls
in2-lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1
validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

aarch64 = mls
arm = vmls.
generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t
generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t
generate uint16x4_t, uint16x4_t:uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t
generate uint32x2_t, uint32x2_t:uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t

/// Vector multiply subtract with scalar
name = vmls
in2-lane-suffixes
constn = LANE
multi_fn = static_assert_imm-in2_exp_len-LANE
multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
a = 6., 7., 8., 9.
b = 2., 2., 2., 2.
c = 0., 3., 0., 0.
n = 1
validate 0., 1., 2., 3.

aarch64 = fmul
arm = vmls.
generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t

/// Signed multiply-subtract long
name = vmlsl
multi_fn = simd_sub, a, {vmull-self-noext, b, c}
a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

arm = vmlsl.s
aarch64 = smlsl
generate int16x8_t:int8x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t

/// Unsigned multiply-subtract long
name = vmlsl
multi_fn = simd_sub, a, {vmull-self-noext, b, c}
a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

arm = vmlsl.s
aarch64 = umlsl
generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t

/// Vector widening multiply subtract with scalar
name = vmlsl
n-suffix
multi_fn = vmlsl-self-noext, a, b, {vdup-nself-noext, c}
a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
|
|
c = 3
|
|
validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
|
|
arm = vmlsl.s
|
|
aarch64 = smlsl
|
|
generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t
|
|
aarch64 = umlsl
|
|
generate uint32x4_t:uint16x4_t:u16:uint32x4_t, uint64x2_t:uint32x2_t:u32:uint64x2_t
|
|
|
|
/// Vector widening multiply subtract with scalar
|
|
name = vmlsl_lane
|
|
in2-suffix
|
|
constn = LANE
|
|
multi_fn = static_assert_imm-in2_exp_len-LANE
|
|
multi_fn = vmlsl-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
|
|
a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
|
|
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
|
|
c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
|
n = 1
|
|
validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
|
|
arm = vmlsl.s
|
|
aarch64 = smlsl
|
|
generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int32x4_t:int16x4_t:int16x8_t:int32x4_t
|
|
generate int64x2_t:int32x2_t:int32x2_t:int64x2_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t
|
|
aarch64 = umlsl
|
|
generate uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x4_t:uint16x8_t:uint32x4_t
|
|
generate uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x2_t:uint32x4_t:uint64x2_t
|
|
|
|
/// Signed multiply-subtract long
|
|
name = vmlsl_high
|
|
no-q
|
|
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
|
|
multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right}
|
|
multi_fn = vmlsl-noqself-noext, a, b, c
|
|
a = 14, 15, 16, 17, 18, 19, 20, 21
|
|
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
|
|
c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
|
|
fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
validate 14, 13, 12, 11, 10, 9, 8, 7
|
|
|
|
aarch64 = smlsl2
|
|
generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t
|
|
|
|
/// Unsigned multiply-subtract long
|
|
name = vmlsl_high
|
|
no-q
|
|
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
|
|
multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right}
|
|
multi_fn = vmlsl-noqself-noext, a, b, c
|
|
a = 14, 15, 16, 17, 18, 19, 20, 21
|
|
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
|
|
c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
|
|
fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
validate 14, 13, 12, 11, 10, 9, 8, 7
|
|
|
|
aarch64 = umlsl2
|
|
generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t
|
|
|
|
/// Multiply-subtract long
|
|
name = vmlsl_high_n
|
|
no-q
|
|
multi_fn = vmlsl_high-noqself-noext, a, b, {vdupq_n-noqself-noext, c}
|
|
a = 14, 15, 16, 17, 18, 19, 20, 21
|
|
b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
|
|
c = 2
|
|
validate 14, 13, 12, 11, 10, 9, 8, 7
|
|
|
|
aarch64 = smlsl2
|
|
generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t
|
|
aarch64 = umlsl2
|
|
generate uint32x4_t:uint16x8_t:u16:uint32x4_t, uint64x2_t:uint32x4_t:u32:uint64x2_t
|
|
|
|
/// Multiply-subtract long
|
|
name = vmlsl_high_lane
|
|
in2-suffix
|
|
constn = LANE
|
|
multi_fn = static_assert_imm-in2_exp_len-LANE
|
|
multi_fn = vmlsl_high-noqself-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}}
|
|
a = 14, 15, 16, 17, 18, 19, 20, 21
|
|
b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
|
|
c = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
|
n = 1
|
|
validate 14, 13, 12, 11, 10, 9, 8, 7
|
|
|
|
aarch64 = smlsl2
|
|
generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t
|
|
generate int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t
|
|
aarch64 = umlsl2
|
|
generate uint32x4_t:uint16x8_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t
|
|
generate uint64x2_t:uint32x4_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t
|
|
|
|
/// Extract narrow
|
|
name = vmovn_high
|
|
no-q
|
|
multi_fn = simd_cast, c:in_t0, b
|
|
multi_fn = simd_shuffle-out_len-!, a, c, {asc-0-out_len}
|
|
a = 0, 1, 2, 3, 2, 3, 4, 5
|
|
b = 2, 3, 4, 5, 12, 13, 14, 15
|
|
validate 0, 1, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 12, 13, 14, 15
|
|
|
|
aarch64 = xtn2
|
|
generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t
|
|
generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t
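// Note: vmovn_high narrows each lane of b to half the width (truncation via
// simd_cast) and appends the result to a, so
//     result = concat(a, narrow(b))
// producing the full-width 16-lane vector shown in the validate line.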
|
|
|
|
/// Negate
|
|
name = vneg
|
|
fn = simd_neg
|
|
a = 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 8
|
|
validate 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7, -8
|
|
|
|
aarch64 = neg
|
|
generate int64x*_t
|
|
|
|
arm = vneg.s
|
|
generate int*_t
|
|
|
|
/// Negate
|
|
name = vneg
|
|
fn = simd_neg
|
|
a = 0., 1., -1., 2., -2., 3., -3., 4.
|
|
validate 0., -1., 1., -2., 2., -3., 3., -4.
|
|
|
|
aarch64 = fneg
|
|
generate float64x*_t
|
|
|
|
arm = vneg.s
|
|
generate float*_t
|
|
|
|
/// Signed saturating negate
|
|
name = vqneg
|
|
a = MIN, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7
|
|
validate MAX, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7
|
|
link-arm = vqneg._EXT_
|
|
link-aarch64 = sqneg._EXT_
|
|
|
|
aarch64 = sqneg
|
|
generate int64x*_t
|
|
|
|
arm = vqneg.s
|
|
generate int*_t
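// Note: unlike plain vneg, the saturating form cannot overflow: negating MIN,
// which has no positive counterpart in two's complement, yields MAX instead
// of wrapping, which is exactly what the MIN/MAX test values above check.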
|
|
|
|
/// Saturating subtract
|
|
name = vqsub
|
|
a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42
|
|
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
validate 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26
|
|
|
|
arm = vqsub.s
|
|
aarch64 = uqsub
|
|
link-arm = llvm.usub.sat._EXT_
|
|
link-aarch64 = uqsub._EXT_
|
|
generate uint*_t, uint64x*_t
|
|
|
|
arm = vqsub.s
|
|
aarch64 = sqsub
|
|
link-arm = llvm.ssub.sat._EXT_
|
|
link-aarch64 = sqsub._EXT_
|
|
generate int*_t, int64x*_t
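// A minimal usage sketch of one generated variant (assuming the usual
// core::arch::aarch64 signatures; illustration only, not spec syntax):
//     unsafe {
//         let a = vdup_n_u8(1);
//         let b = vdup_n_u8(3);
//         // every lane is 0: 1 - 3 saturates at the minimum
//         // instead of wrapping around to 254
//         let r = vqsub_u8(a, b);
//     }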
|
|
|
|
/// Saturating subtract
|
|
name = vqsub
|
|
multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a
|
|
multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
|
|
multi_fn = simd_extract, {vqsub-in_ntt-noext, a, b}, 0
|
|
a = 42
|
|
b = 1
|
|
validate 41
|
|
|
|
aarch64 = sqsub
|
|
generate i8, i16
|
|
aarch64 = uqsub
|
|
generate u8, u16
|
|
|
|
/// Saturating subtract
|
|
name = vqsub
|
|
a = 42
|
|
b = 1
|
|
validate 41
|
|
|
|
aarch64 = uqsub
|
|
link-aarch64 = uqsub._EXT_
|
|
generate u32, u64
|
|
|
|
aarch64 = sqsub
|
|
link-aarch64 = sqsub._EXT_
|
|
generate i32, i64
|
|
|
|
/// Halving add
|
|
name = vhadd
|
|
a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42
|
|
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
validate 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29
|
|
|
|
arm = vhadd.s
|
|
aarch64 = uhadd
|
|
link-aarch64 = uhadd._EXT_
|
|
link-arm = vhaddu._EXT_
|
|
generate uint*_t
|
|
|
|
arm = vhadd.s
|
|
aarch64 = shadd
|
|
link-aarch64 = shadd._EXT_
|
|
link-arm = vhadds._EXT_
|
|
generate int*_t
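// Note: halving add computes (a[i] + b[i]) >> 1 per lane without losing the
// carry of the intermediate sum, e.g. (42 + 1) >> 1 = 21 in the test above.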
|
|
|
|
/// Reverse bit order
|
|
name = vrbit
|
|
a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
|
|
validate 0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120
|
|
|
|
aarch64 = rbit
|
|
link-aarch64 = rbit._EXT_
|
|
|
|
generate int8x8_t, int8x16_t
|
|
|
|
/// Reverse bit order
|
|
name = vrbit
|
|
multi_fn = transmute, {vrbit-signed-noext, transmute(a)}
|
|
a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
|
|
validate 0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120
|
|
|
|
aarch64 = rbit
|
|
|
|
generate uint8x8_t, uint8x16_t, poly8x8_t, poly8x16_t
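// Note: vrbit reverses the bit order within each 8-bit element, e.g.
//     2 = 0b0000_0010  ->  0b0100_0000 = 64
// which is the second a/validate pair above.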
|
|
|
|
/// Rounding halving add
|
|
name = vrhadd
|
|
a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42
|
|
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
validate 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29
|
|
|
|
arm = vrhadd.s
|
|
aarch64 = urhadd
|
|
link-arm = vrhaddu._EXT_
|
|
link-aarch64 = urhadd._EXT_
|
|
generate uint*_t
|
|
|
|
arm = vrhadd.s
|
|
aarch64 = srhadd
|
|
link-arm = vrhadds._EXT_
|
|
link-aarch64 = srhadd._EXT_
|
|
generate int*_t
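// Note: the rounding form adds one before halving, (a[i] + b[i] + 1) >> 1,
// so (42 + 1 + 1) >> 1 = 22 here versus 21 for the plain halving add above.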
|
|
|
|
/// Floating-point round to integral exact, using current rounding mode
|
|
name = vrndx
|
|
a = -1.5, 0.5, 1.5, 2.5
|
|
validate -2.0, 0.0, 2.0, 2.0
|
|
|
|
aarch64 = frintx
|
|
link-aarch64 = llvm.rint._EXT_
|
|
generate float*_t, float64x*_t
|
|
|
|
/// Floating-point round to integral, to nearest with ties to away
|
|
name = vrnda
|
|
a = -1.5, 0.5, 1.5, 2.5
|
|
validate -2.0, 1.0, 2.0, 3.0
|
|
|
|
aarch64 = frinta
|
|
link-aarch64 = llvm.round._EXT_
|
|
generate float*_t, float64x*_t
|
|
|
|
/// Floating-point round to integral, to nearest with ties to even
|
|
name = vrndn
|
|
a = -1.5, 0.5, 1.5, 2.5
|
|
validate -2.0, 0.0, 2.0, 2.0
|
|
|
|
link-aarch64 = frintn._EXT_
|
|
aarch64 = frintn
|
|
generate float64x*_t
|
|
|
|
target = fp-armv8
|
|
arm = vrintn
|
|
link-arm = vrintn._EXT_
|
|
generate float*_t
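// Note: "ties to even" is visible in the test vector: -1.5 -> -2.0 and
// 2.5 -> 2.0 (both round to the even neighbour), while 0.5 -> 0.0.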
|
|
|
|
/// Floating-point round to integral, toward minus infinity
|
|
name = vrndm
|
|
a = -1.5, 0.5, 1.5, 2.5
|
|
validate -2.0, 0.0, 1.0, 2.0
|
|
|
|
aarch64 = frintm
|
|
link-aarch64 = llvm.floor._EXT_
|
|
generate float*_t, float64x*_t
|
|
|
|
/// Floating-point round to integral, toward plus infinity
|
|
name = vrndp
|
|
a = -1.5, 0.5, 1.5, 2.5
|
|
validate -1.0, 1.0, 2.0, 3.0
|
|
|
|
aarch64 = frintp
|
|
link-aarch64 = llvm.ceil._EXT_
|
|
generate float*_t, float64x*_t
|
|
|
|
/// Floating-point round to integral, toward zero
|
|
name = vrnd
|
|
a = -1.5, 0.5, 1.5, 2.5
|
|
validate -1.0, 0.0, 1.0, 2.0
|
|
|
|
aarch64 = frintz
|
|
link-aarch64 = llvm.trunc._EXT_
|
|
generate float*_t, float64x*_t
|
|
|
|
/// Floating-point round to integral, using current rounding mode
|
|
name = vrndi
|
|
a = -1.5, 0.5, 1.5, 2.5
|
|
validate -2.0, 0.0, 2.0, 2.0
|
|
|
|
aarch64 = frinti
|
|
link-aarch64 = llvm.nearbyint._EXT_
|
|
generate float*_t, float64x*_t
|
|
|
|
/// Saturating add
|
|
name = vqadd
|
|
a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42
|
|
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
validate 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58
|
|
|
|
arm = vqadd.s
|
|
aarch64 = uqadd
|
|
link-arm = llvm.uadd.sat._EXT_
|
|
link-aarch64 = uqadd._EXT_
|
|
generate uint*_t, uint64x*_t
|
|
|
|
arm = vqadd.s
|
|
aarch64 = sqadd
|
|
link-arm = llvm.sadd.sat._EXT_
|
|
link-aarch64 = sqadd._EXT_
|
|
generate int*_t, int64x*_t
|
|
|
|
/// Saturating add
|
|
name = vqadd
|
|
multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a
|
|
multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
|
|
multi_fn = simd_extract, {vqadd-in_ntt-noext, a, b}, 0
|
|
a = 42
|
|
b = 1
|
|
validate 43
|
|
|
|
aarch64 = sqadd
|
|
generate i8, i16
|
|
aarch64 = uqadd
|
|
generate u8, u16
|
|
|
|
/// Saturating add
|
|
name = vqadd
|
|
a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42
|
|
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
validate 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58
|
|
|
|
aarch64 = uqadd
|
|
link-aarch64 = uqadd._EXT_
|
|
generate u32, u64
|
|
|
|
aarch64 = sqadd
|
|
link-aarch64 = sqadd._EXT_
|
|
generate i32, i64
|
|
|
|
/// Load multiple single-element structures to one, two, three, or four registers
|
|
name = vld1
|
|
out-suffix
|
|
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
|
|
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
|
|
load_fn
|
|
|
|
aarch64 = ld1
|
|
link-aarch64 = ld1x2._EXT2_
|
|
arm = vld1
|
|
link-arm = vld1x2._EXT2_
|
|
generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t, *const i64:int64x1x2_t
|
|
generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t, *const i64:int64x2x2_t
|
|
|
|
link-aarch64 = ld1x3._EXT2_
|
|
link-arm = vld1x3._EXT2_
|
|
generate *const i8:int8x8x3_t, *const i16:int16x4x3_t, *const i32:int32x2x3_t, *const i64:int64x1x3_t
|
|
generate *const i8:int8x16x3_t, *const i16:int16x8x3_t, *const i32:int32x4x3_t, *const i64:int64x2x3_t
|
|
|
|
link-aarch64 = ld1x4._EXT2_
|
|
link-arm = vld1x4._EXT2_
|
|
generate *const i8:int8x8x4_t, *const i16:int16x4x4_t, *const i32:int32x2x4_t, *const i64:int64x1x4_t
|
|
generate *const i8:int8x16x4_t, *const i16:int16x8x4_t, *const i32:int32x4x4_t, *const i64:int64x2x4_t
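// A minimal usage sketch for one generated variant (assuming the usual
// core::arch::aarch64 signature of vld1_s8_x2; illustration only):
//     let data: [i8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
//     let v = unsafe { vld1_s8_x2(data.as_ptr()) };
//     // v.0 holds the first 8 bytes, v.1 holds the next 8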
|
|
|
|
/// Load multiple single-element structures to one, two, three, or four registers
|
|
name = vld1
|
|
out-suffix
|
|
multi_fn = transmute, {vld1-outsigned-noext, transmute(a)}
|
|
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
|
|
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
|
|
|
|
load_fn
|
|
aarch64 = ld1
|
|
arm = vld1
|
|
generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t, *const u64:uint64x1x2_t
|
|
generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t, *const u64:uint64x2x2_t
|
|
generate *const u8:uint8x8x3_t, *const u16:uint16x4x3_t, *const u32:uint32x2x3_t, *const u64:uint64x1x3_t
|
|
generate *const u8:uint8x16x3_t, *const u16:uint16x8x3_t, *const u32:uint32x4x3_t, *const u64:uint64x2x3_t
|
|
generate *const u8:uint8x8x4_t, *const u16:uint16x4x4_t, *const u32:uint32x2x4_t, *const u64:uint64x1x4_t
|
|
generate *const u8:uint8x16x4_t, *const u16:uint16x8x4_t, *const u32:uint32x4x4_t, *const u64:uint64x2x4_t
|
|
generate *const p8:poly8x8x2_t, *const p8:poly8x8x3_t, *const p8:poly8x8x4_t
|
|
generate *const p8:poly8x16x2_t, *const p8:poly8x16x3_t, *const p8:poly8x16x4_t
|
|
generate *const p16:poly16x4x2_t, *const p16:poly16x4x3_t, *const p16:poly16x4x4_t
|
|
generate *const p16:poly16x8x2_t, *const p16:poly16x8x3_t, *const p16:poly16x8x4_t
|
|
target = aes
|
|
generate *const p64:poly64x1x2_t
|
|
arm = ldr
|
|
generate *const p64:poly64x1x3_t, *const p64:poly64x1x4_t
|
|
generate *const p64:poly64x2x2_t, *const p64:poly64x2x3_t, *const p64:poly64x2x4_t
|
|
|
|
/// Load multiple single-element structures to one, two, three, or four registers
|
|
name = vld1
|
|
out-suffix
|
|
a = 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.
|
|
validate 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.
|
|
load_fn
|
|
|
|
aarch64 = ld1
|
|
link-aarch64 = ld1x2._EXT2_
|
|
generate *const f64:float64x1x2_t, *const f64:float64x2x2_t
|
|
|
|
link-aarch64 = ld1x3._EXT2_
|
|
generate *const f64:float64x1x3_t, *const f64:float64x2x3_t
|
|
|
|
link-aarch64 = ld1x4._EXT2_
|
|
generate *const f64:float64x1x4_t, *const f64:float64x2x4_t
|
|
|
|
arm = vld1
|
|
link-aarch64 = ld1x2._EXT2_
|
|
link-arm = vld1x2._EXT2_
|
|
generate *const f32:float32x2x2_t, *const f32:float32x4x2_t
|
|
|
|
link-aarch64 = ld1x3._EXT2_
|
|
link-arm = vld1x3._EXT2_
|
|
generate *const f32:float32x2x3_t, *const f32:float32x4x3_t
|
|
|
|
link-aarch64 = ld1x4._EXT2_
|
|
link-arm = vld1x4._EXT2_
|
|
generate *const f32:float32x2x4_t, *const f32:float32x4x4_t
|
|
|
|
/// Load multiple 2-element structures to two registers
|
|
name = vld2
|
|
out-nox
|
|
a = 0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17
|
|
validate 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
|
|
load_fn
|
|
|
|
aarch64 = ld2
|
|
link-aarch64 = ld2._EXTv2_
|
|
arm = vld2
|
|
link-arm = vld2._EXTpi82_
|
|
//generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t, *const i64:int64x1x2_t
|
|
//generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t, *const i64:int64x2x2_t
|
|
|
|
/// Load multiple 2-element structures to two registers
|
|
name = vld2
|
|
out-nox
|
|
multi_fn = transmute, {vld2-outsignednox-noext, transmute(a)}
|
|
a = 0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17
|
|
validate 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
|
|
load_fn
|
|
|
|
aarch64 = ld2
|
|
arm = vld2
|
|
//generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t, *const u64:uint64x1x2_t
|
|
//generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t, *const u64:uint64x2x2_t
|
|
//generate *const p8:poly8x8x2_t, *const p16:poly16x4x2_t, *const p8:poly8x16x2_t, *const p16:poly16x8x2_t
|
|
|
|
/// Load multiple 2-element structures to two registers
|
|
name = vld2
|
|
out-nox
|
|
a = 0., 1., 2., 2., 3., 2., 4., 3., 5., 2., 6., 3., 7., 4., 8., 5., 9.
|
|
validate 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5., 6., 7., 8., 9.
|
|
load_fn
|
|
|
|
aarch64 = ld2
|
|
link-aarch64 = ld2._EXTv2_
|
|
//generate *const f64:float64x1x2_t, *const f64:float64x2x2_t
|
|
|
|
arm = vld2
|
|
link-arm = vld2._EXTpi82_
|
|
//generate *const f32:float32x2x2_t, *const f32:float32x4x2_t
|
|
|
|
/// Load single 2-element structure and replicate to all lanes of two registers
|
|
name = vld2
|
|
out-dup-nox
|
|
a = 0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17
|
|
validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
|
|
load_fn
|
|
|
|
arm = vld2dup
|
|
link-arm = vld2dup._EXTpi82_
|
|
aarch64 = ld2r
|
|
link-aarch64 = ld2r._EXT2_
|
|
//generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t, *const i64:int64x1x2_t
|
|
//generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t, *const i64:int64x2x2_t
|
|
|
|
/// Load single 2-element structure and replicate to all lanes of two registers
|
|
name = vld2
|
|
out-dup-nox
|
|
multi_fn = transmute, {vld2-outsigneddupnox-noext, transmute(a)}
|
|
a = 0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17
|
|
validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
|
|
load_fn
|
|
|
|
arm = vld2dup
|
|
aarch64 = ld2r
|
|
//generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t, *const u64:uint64x1x2_t
|
|
//generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t, *const u64:uint64x2x2_t
|
|
//generate *const p8:poly8x8x2_t, *const p16:poly16x4x2_t, *const p8:poly8x16x2_t, *const p16:poly16x8x2_t
|
|
|
|
/// Load single 2-element structure and replicate to all lanes of two registers
|
|
name = vld2
|
|
out-dup-nox
|
|
a = 0., 1., 1., 2., 3., 1., 4., 3., 5.
|
|
validate 1., 1., 1., 1., 1., 1., 1., 1.
|
|
load_fn
|
|
|
|
aarch64 = ld2r
|
|
link-aarch64 = ld2r._EXT2_
|
|
//generate *const f64:float64x1x2_t, *const f64:float64x2x2_t
|
|
|
|
arm = vld2dup
|
|
link-arm = vld2dup._EXTpi82_
|
|
//generate *const f32:float32x2x2_t, *const f32:float32x4x2_t
|
|
|
|
/// Load multiple 2-element structures to two registers
|
|
name = vld2
|
|
out-lane-nox
|
|
multi_fn = static_assert_imm-in_exp_len-LANE
|
|
constn = LANE
|
|
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
|
|
b = 0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26
|
|
n = 0
|
|
validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26
|
|
load_fn
|
|
arm-aarch64-separate
|
|
|
|
aarch64 = ld2lane
|
|
const-aarch64 = LANE
|
|
link-aarch64 = ld2lane._EXTpi82_
|
|
//generate *const i64:int64x1x2_t:int64x1x2_t, *const i64:int64x2x2_t:int64x2x2_t
|
|
|
|
arm = vld2lane
|
|
const-arm = LANE
|
|
link-arm = vld2lane._EXTpi82_
|
|
//generate *const i8:int8x8x2_t:int8x8x2_t, *const i16:int16x4x2_t:int16x4x2_t, *const i32:int32x2x2_t:int32x2x2_t
|
|
//generate *const i8:int8x16x2_t:int8x16x2_t, *const i16:int16x8x2_t:int16x8x2_t, *const i32:int32x4x2_t:int32x4x2_t
|
|
|
|
/// Load multiple 2-element structures to two registers
|
|
name = vld2
|
|
out-lane-nox
|
|
multi_fn = static_assert_imm-in_exp_len-LANE
|
|
multi_fn = transmute, {vld2-outsignedlanenox-::<LANE>, transmute(a), transmute(b)}
|
|
constn = LANE
|
|
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8
|
|
b = 0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26
|
|
n = 0
|
|
validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26
|
|
load_fn
|
|
arm-aarch64-separate
|
|
|
|
aarch64 = ld2lane
|
|
const-aarch64 = LANE
|
|
|
|
target = aes
|
|
//generate *const p64:poly64x1x2_t:poly64x1x2_t, *const p64:poly64x2x2_t:poly64x2x2_t
|
|
|
|
target = default
|
|
//generate *const u64:uint64x1x2_t:uint64x1x2_t, *const u64:uint64x2x2_t:uint64x2x2_t
|
|
|
|
arm = vld2lane
|
|
const-arm = LANE
|
|
//generate *const u8:uint8x8x2_t:uint8x8x2_t, *const u16:uint16x4x2_t:uint16x4x2_t, *const u32:uint32x2x2_t:uint32x2x2_t
|
|
//generate *const u8:uint8x16x2_t:uint8x16x2_t, *const u16:uint16x8x2_t:uint16x8x2_t, *const u32:uint32x4x2_t:uint32x4x2_t
|
|
//generate *const p8:poly8x8x2_t:poly8x8x2_t, *const p16:poly16x4x2_t:poly16x4x2_t
|
|
//generate *const p8:poly8x16x2_t:poly8x16x2_t, *const p16:poly16x8x2_t:poly16x8x2_t
|
|
|
|
/// Load multiple 2-element structures to two registers
|
|
name = vld2
|
|
out-lane-nox
|
|
multi_fn = static_assert_imm-in_exp_len-LANE
|
|
constn = LANE
|
|
a = 0., 1., 2., 3., 4., 5., 6., 7., 8.
|
|
b = 0., 2., 2., 14., 2., 16., 17., 18.
|
|
n = 0
|
|
validate 1., 2., 2., 14., 2., 16., 17., 18.
|
|
load_fn
|
|
arm-aarch64-separate
|
|
|
|
aarch64 = ld2lane
|
|
const-aarch64 = LANE
|
|
link-aarch64 = ld2lane._EXTpi82_
|
|
//generate *const f64:float64x1x2_t:float64x1x2_t, *const f64:float64x2x2_t:float64x2x2_t
|
|
|
|
arm = vld2lane
|
|
const-arm = LANE
|
|
link-arm = vld2lane._EXTpi82_
|
|
//generate *const f32:float32x2x2_t:float32x2x2_t, *const f32:float32x4x2_t:float32x4x2_t
|
|
|
|
/// Store multiple single-element structures from one, two, three, or four registers
|
|
name = vst1
|
|
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
|
|
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
|
|
store_fn
|
|
arm-aarch64-separate
|
|
|
|
aarch64 = st1
|
|
link-aarch64 = st1x2._EXT3_
|
|
arm = vst1
|
|
link-arm = vst1x2._EXTr3_
|
|
generate *mut i8:int8x8x2_t:void, *mut i16:int16x4x2_t:void, *mut i32:int32x2x2_t:void, *mut i64:int64x1x2_t:void
|
|
generate *mut i8:int8x16x2_t:void, *mut i16:int16x8x2_t:void, *mut i32:int32x4x2_t:void, *mut i64:int64x2x2_t:void
|
|
|
|
link-aarch64 = st1x3._EXT3_
|
|
link-arm = vst1x3._EXTr3_
|
|
generate *mut i8:int8x8x3_t:void, *mut i16:int16x4x3_t:void, *mut i32:int32x2x3_t:void, *mut i64:int64x1x3_t:void
|
|
generate *mut i8:int8x16x3_t:void, *mut i16:int16x8x3_t:void, *mut i32:int32x4x3_t:void, *mut i64:int64x2x3_t:void
|
|
|
|
link-aarch64 = st1x4._EXT3_
|
|
link-arm = vst1x4._EXTr3_
|
|
generate *mut i8:int8x8x4_t:void, *mut i16:int16x4x4_t:void, *mut i32:int32x2x4_t:void, *mut i64:int64x1x4_t:void
|
|
generate *mut i8:int8x16x4_t:void, *mut i16:int16x8x4_t:void, *mut i32:int32x4x4_t:void, *mut i64:int64x2x4_t:void
|
|
|
|
/// Store multiple single-element structures from one, two, three, or four registers
|
|
name = vst1
|
|
multi_fn = vst1-signed-noext, transmute(a), transmute(b)
|
|
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
|
|
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
|
|
|
|
store_fn
|
|
aarch64 = st1
|
|
arm = vst1
|
|
generate *mut u8:uint8x8x2_t:void, *mut u16:uint16x4x2_t:void, *mut u32:uint32x2x2_t:void, *mut u64:uint64x1x2_t:void
|
|
generate *mut u8:uint8x16x2_t:void, *mut u16:uint16x8x2_t:void, *mut u32:uint32x4x2_t:void, *mut u64:uint64x2x2_t:void
|
|
generate *mut u8:uint8x8x3_t:void, *mut u16:uint16x4x3_t:void, *mut u32:uint32x2x3_t:void, *mut u64:uint64x1x3_t:void
|
|
generate *mut u8:uint8x16x3_t:void, *mut u16:uint16x8x3_t:void, *mut u32:uint32x4x3_t:void, *mut u64:uint64x2x3_t:void
|
|
generate *mut u8:uint8x8x4_t:void, *mut u16:uint16x4x4_t:void, *mut u32:uint32x2x4_t:void, *mut u64:uint64x1x4_t:void
|
|
generate *mut u8:uint8x16x4_t:void, *mut u16:uint16x8x4_t:void, *mut u32:uint32x4x4_t:void, *mut u64:uint64x2x4_t:void
|
|
generate *mut p8:poly8x8x2_t:void, *mut p8:poly8x8x3_t:void, *mut p8:poly8x8x4_t:void
|
|
generate *mut p8:poly8x16x2_t:void, *mut p8:poly8x16x3_t:void, *mut p8:poly8x16x4_t:void
|
|
generate *mut p16:poly16x4x2_t:void, *mut p16:poly16x4x3_t:void, *mut p16:poly16x4x4_t:void
|
|
generate *mut p16:poly16x8x2_t:void, *mut p16:poly16x8x3_t:void, *mut p16:poly16x8x4_t:void
|
|
|
|
/// Store multiple single-element structures from one, two, three, or four registers
|
|
name = vst1
|
|
a = 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.
|
|
validate 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.
|
|
store_fn
|
|
arm-aarch64-separate
|
|
|
|
aarch64 = st1
|
|
link-aarch64 = st1x2._EXT3_
|
|
generate *mut f64:float64x1x2_t:void, *mut f64:float64x2x2_t:void
|
|
|
|
link-aarch64 = st1x3._EXT3_
|
|
generate *mut f64:float64x1x3_t:void, *mut f64:float64x2x3_t:void
|
|
|
|
link-aarch64 = st1x4._EXT3_
|
|
generate *mut f64:float64x1x4_t:void, *mut f64:float64x2x4_t:void
|
|
|
|
arm = vst1
|
|
link-aarch64 = st1x2._EXT3_
|
|
link-arm = vst1x2._EXTr3_
|
|
generate *mut f32:float32x2x2_t:void, *mut f32:float32x4x2_t:void
|
|
|
|
link-aarch64 = st1x3._EXT3_
|
|
link-arm = vst1x3._EXTr3_
|
|
generate *mut f32:float32x2x3_t:void, *mut f32:float32x4x3_t:void
|
|
|
|
link-aarch64 = st1x4._EXT3_
|
|
link-arm = vst1x4._EXTr3_
|
|
generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void
|
|
|
|
/// Multiply
|
|
name = vmul
|
|
a = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
|
|
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
validate 1, 4, 3, 8, 5, 12, 7, 16, 9, 20, 11, 24, 13, 28, 15, 32
|
|
arm = vmul.
|
|
aarch64 = mul
|
|
fn = simd_mul
|
|
generate int*_t, uint*_t
|
|
|
|
/// Polynomial multiply
|
|
name = vmul
|
|
a = 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3
|
|
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
validate 1, 6, 3, 12, 5, 10, 7, 24, 9, 30, 11, 20, 13, 18, 15, 48
|
|
|
|
aarch64 = pmul
|
|
link-aarch64 = pmul._EXT_
|
|
arm = vmul
|
|
link-arm = vmulp._EXT_
|
|
generate poly8x8_t, poly8x16_t
|
|
|
|
/// Multiply
|
|
name = vmul
|
|
fn = simd_mul
|
|
a = 1.0, 2.0, 1.0, 2.0
|
|
b = 2.0, 3.0, 4.0, 5.0
|
|
validate 2.0, 6.0, 4.0, 10.0
|
|
|
|
aarch64 = fmul
|
|
generate float64x*_t
|
|
|
|
arm = vmul.
|
|
generate float*_t
|
|
|
|
/// Vector multiply by scalar
|
|
name = vmul
|
|
out-n-suffix
|
|
multi_fn = simd_mul, a, {vdup-nout-noext, b}
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 2
|
|
validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32
|
|
|
|
arm = vmul
|
|
aarch64 = mul
|
|
generate int16x4_t:i16:int16x4_t, int16x8_t:i16:int16x8_t, int32x2_t:i32:int32x2_t, int32x4_t:i32:int32x4_t
|
|
generate uint16x4_t:u16:uint16x4_t, uint16x8_t:u16:uint16x8_t, uint32x2_t:u32:uint32x2_t, uint32x4_t:u32:uint32x4_t
|
|
|
|
/// Vector multiply by scalar
|
|
name = vmul
|
|
out-n-suffix
|
|
multi_fn = simd_mul, a, {vdup-nout-noext, b}
|
|
a = 1., 2., 3., 4.
|
|
b = 2.
|
|
validate 2., 4., 6., 8.
|
|
|
|
aarch64 = fmul
|
|
generate float64x1_t:f64:float64x1_t, float64x2_t:f64:float64x2_t
|
|
|
|
arm = vmul
|
|
generate float32x2_t:f32:float32x2_t, float32x4_t:f32:float32x4_t
|
|
|
|
/// Multiply
|
|
name = vmul
|
|
lane-suffixes
|
|
constn = LANE
|
|
multi_fn = static_assert_imm-in_exp_len-LANE
|
|
multi_fn = simd_mul, a, {simd_shuffle-out_len-!, b, b, {dup-out_len-LANE as u32}}
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
|
n = 1
|
|
validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32
|
|
|
|
aarch64 = mul
|
|
arm = vmul
|
|
generate int16x4_t, int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x4_t:int16x8_t, int16x8_t
|
|
generate int32x2_t, int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x2_t:int32x4_t, int32x4_t
|
|
generate uint16x4_t, uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t
|
|
generate uint32x2_t, uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t
|
|
|
|
/// Floating-point multiply
|
|
name = vmul
|
|
lane-suffixes
|
|
constn = LANE
|
|
multi_fn = static_assert_imm-in_exp_len-LANE
|
|
multi_fn = simd_mul, a, {transmute--<element_t _>, {simd_extract, b, LANE as u32}}
|
|
a = 1., 2., 3., 4.
|
|
b = 2., 0., 0., 0.
|
|
n = 0
|
|
validate 2., 4., 6., 8.
|
|
|
|
aarch64 = fmul
|
|
generate float64x1_t, float64x1_t:float64x2_t:float64x1_t
|
|
|
|
/// Floating-point multiply
|
|
name = vmul
|
|
lane-suffixes
|
|
constn = LANE
|
|
multi_fn = static_assert_imm-in_exp_len-LANE
|
|
multi_fn = simd_mul, a, {simd_shuffle-out_len-!, b, b, {dup-out_len-LANE as u32}}
|
|
a = 1., 2., 3., 4.
|
|
b = 2., 0., 0., 0.
|
|
n = 0
|
|
validate 2., 4., 6., 8.
|
|
|
|
aarch64 = fmul
|
|
generate float64x2_t:float64x1_t:float64x2_t, float64x2_t
|
|
|
|
arm = vmul
|
|
generate float32x2_t, float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x2_t:float32x4_t, float32x4_t
|
|
|
|
/// Floating-point multiply
|
|
name = vmuls_lane
|
|
constn = LANE
|
|
multi_fn = static_assert_imm-in_exp_len-LANE
|
|
multi_fn = simd_extract, b:f32, b, LANE as u32
|
|
multi_fn = a * b
|
|
a = 1.
|
|
b = 2., 0., 0., 0.
|
|
n = 0
|
|
validate 2.
|
|
aarch64 = fmul
|
|
generate f32:float32x2_t:f32, f32:float32x4_t:f32
|
|
|
|
/// Floating-point multiply
|
|
name = vmuld_lane
|
|
constn = LANE
|
|
multi_fn = static_assert_imm-in_exp_len-LANE
|
|
multi_fn = simd_extract, b:f64, b, LANE as u32
|
|
multi_fn = a * b
|
|
a = 1.
|
|
b = 2., 0.
|
|
n = 0
|
|
validate 2.
|
|
aarch64 = fmul
|
|
generate f64:float64x1_t:f64, f64:float64x2_t:f64
|
|
|
|
/// Signed multiply long
|
|
name = vmull
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
|
|
validate 1, 4, 3, 8, 5, 12, 7, 16, 9, 20, 11, 24, 13, 28, 15, 32
|
|
|
|
arm = vmull.s
|
|
aarch64 = smull
|
|
link-arm = vmulls._EXT_
|
|
link-aarch64 = smull._EXT_
|
|
generate int8x8_t:int8x8_t:int16x8_t, int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t
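// Note: each lane is widened before the multiply, so the product cannot
// overflow the double-width output lane:
//     result[i] = widen(a[i]) * widen(b[i])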
|
|
|
|
/// Signed multiply long
|
|
name = vmull_high
|
|
no-q
|
|
multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right}
|
|
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
|
|
multi_fn = vmull-noqself-noext, a, b
|
|
a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
|
|
fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
validate 9, 20, 11, 24, 13, 28, 15, 32
|
|
|
|
aarch64 = smull2
|
|
generate int8x16_t:int8x16_t:int16x8_t, int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t
|
|
|
|
/// Unsigned multiply long
|
|
name = vmull
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8
|
|
b = 1, 2, 1, 2, 1, 2, 1, 2
|
|
validate 1, 4, 3, 8, 5, 12, 7, 16
|
|
|
|
arm = vmull.s
|
|
aarch64 = umull
|
|
link-arm = vmullu._EXT_
|
|
link-aarch64 = umull._EXT_
|
|
generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint32x2_t:uint32x2_t:uint64x2_t
|
|
|
|
/// Unsigned multiply long
|
|
name = vmull_high
|
|
no-q
|
|
multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right}
|
|
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
|
|
multi_fn = vmull-noqself-noext, a, b
|
|
a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
|
|
fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
validate 9, 20, 11, 24, 13, 28, 15, 32
|
|
|
|
aarch64 = umull2
|
|
generate uint8x16_t:uint8x16_t:uint16x8_t, uint16x8_t:uint16x8_t:uint32x4_t, uint32x4_t:uint32x4_t:uint64x2_t
|
|
|
|
/// Polynomial multiply long
|
|
name = vmull
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8
|
|
b = 1, 3, 1, 3, 1, 3, 1, 3
|
|
validate 1, 6, 3, 12, 5, 10, 7, 24
|
|
|
|
arm = vmull.s
|
|
aarch64 = pmull
|
|
link-arm = vmullp._EXT_
|
|
link-aarch64 = pmull._EXT_
|
|
generate poly8x8_t:poly8x8_t:poly16x8_t
|
|
|
|
/// Polynomial multiply long
|
|
name = vmull
|
|
no-q
|
|
a = 15
|
|
b = 3
|
|
validate 17
|
|
target = aes
|
|
|
|
aarch64 = pmull
|
|
link-aarch64 = pmull64:p64:p64:p64:int8x16_t
|
|
// Because of the current state of LLVM support, vmull_p64 is only available on aarch64
|
|
// arm = vmull
|
|
// link-arm = vmullp.v2i64:int64x1_t:int64x1_t:int64x1_t:int64x2_t
|
|
generate p64:p64:p128
|
|
|
|
|
|
/// Polynomial multiply long
|
|
name = vmull_high
|
|
no-q
|
|
multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right}
|
|
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right}
|
|
multi_fn = vmull-noqself-noext, a, b
|
|
a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3
|
|
fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
validate 9, 30, 11, 20, 13, 18, 15, 48
|
|
|
|
aarch64 = pmull
|
|
generate poly8x16_t:poly8x16_t:poly16x8_t
|
|
|
|
/// Polynomial multiply long
|
|
name = vmull_high
|
|
no-q
|
|
multi_fn = vmull-noqself-noext, {simd_extract, a, 1}, {simd_extract, b, 1}
|
|
a = 1, 15
|
|
b = 1, 3
|
|
validate 17
|
|
target = aes
|
|
|
|
aarch64 = pmull
|
|
generate poly64x2_t:poly64x2_t:p128
|
|
|
|
/// Vector long multiply with scalar
|
|
name = vmull_n
|
|
no-q
|
|
multi_fn = vmull-in0-noext, a, {vdup-nin0-noext, b}
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8
|
|
b = 2
|
|
validate 2, 4, 6, 8, 10, 12, 14, 16
|
|
|
|
arm = vmull
|
|
aarch64 = smull
|
|
generate int16x4_t:i16:int32x4_t, int32x2_t:i32:int64x2_t
|
|
aarch64 = umull
|
|
generate uint16x4_t:u16:uint32x4_t, uint32x2_t:u32:uint64x2_t
|
|
|
|
/// Vector long multiply by scalar
|
|
name = vmull_lane
|
|
constn = LANE
|
|
multi_fn = static_assert_imm-in_exp_len-LANE
|
|
multi_fn = vmull-in0-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}}
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
|
n = 1
|
|
validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32
|
|
|
|
arm = vmull
|
|
aarch64 = smull
|
|
generate int16x4_t:int16x4_t:int32x4_t, int16x4_t:int16x8_t:int32x4_t
|
|
generate int32x2_t:int32x2_t:int64x2_t, int32x2_t:int32x4_t:int64x2_t
|
|
aarch64 = umull
|
|
generate uint16x4_t:uint16x4_t:uint32x4_t, uint16x4_t:uint16x8_t:uint32x4_t
|
|
generate uint32x2_t:uint32x2_t:uint64x2_t, uint32x2_t:uint32x4_t:uint64x2_t
|
|
|
|
/// Multiply long
|
|
name = vmull_high_n
|
|
no-q
|
|
multi_fn = vmull_high-noqself-noext, a, {vdup-nin0-noext, b}
|
|
a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 2
|
|
validate 18, 20, 22, 24, 26, 28, 30, 32
|
|
|
|
aarch64 = smull2
|
|
generate int16x8_t:i16:int32x4_t, int32x4_t:i32:int64x2_t
|
|
aarch64 = umull2
|
|
generate uint16x8_t:u16:uint32x4_t, uint32x4_t:u32:uint64x2_t
|
|
|
|
/// Multiply long
|
|
name = vmull_high_lane
|
|
constn = LANE
|
|
multi_fn = static_assert_imm-in_exp_len-LANE
|
|
multi_fn = vmull_high-noqself-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}}
|
|
a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
|
n = 1
|
|
validate 18, 20, 22, 24, 26, 28, 30, 32
|
|
|
|
aarch64 = smull2
|
|
generate int16x8_t:int16x4_t:int32x4_t, int16x8_t:int16x8_t:int32x4_t
|
|
generate int32x4_t:int32x2_t:int64x2_t, int32x4_t:int32x4_t:int64x2_t
|
|
aarch64 = umull2
|
|
generate uint16x8_t:uint16x4_t:uint32x4_t, uint16x8_t:uint16x8_t:uint32x4_t
|
|
generate uint32x4_t:uint32x2_t:uint64x2_t, uint32x4_t:uint32x4_t:uint64x2_t
|
|
|
|
/// Floating-point multiply extended
|
|
name = vmulx
|
|
a = 1., 2., 3., 4.
|
|
b = 2., 2., 2., 2.
|
|
validate 2., 4., 6., 8.
|
|
|
|
aarch64 = fmulx
|
|
link-aarch64 = fmulx._EXT_
|
|
generate float*_t, float64x*_t
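// Note: fmulx behaves like an ordinary multiply for the values tested here;
// it differs from fmul only in the special cases defined by the architecture
// (for example 0.0 * infinity returns 2.0 with the appropriate sign instead
// of NaN), which is why this maps to the dedicated fmulx intrinsic.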
|
|
|
|
/// Floating-point multiply extended
|
|
name = vmulx
|
|
lane-suffixes
|
|
constn = LANE
|
|
multi_fn = static_assert_imm-in_exp_len-LANE
|
|
multi_fn = vmulx-in0-noext, a, {transmute--<element_t _>, {simd_extract, b, LANE as u32}}
|
|
a = 1.
|
|
b = 2., 0.
|
|
n = 0
|
|
validate 2.
|
|
|
|
aarch64 = fmulx
|
|
generate float64x1_t, float64x1_t:float64x2_t:float64x1_t
|
|
|
|
/// Floating-point multiply extended
|
|
name = vmulx
|
|
lane-suffixes
|
|
constn = LANE
|
|
multi_fn = static_assert_imm-in_exp_len-LANE
|
|
multi_fn = vmulx-in0-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}}
|
|
a = 1., 2., 3., 4.
|
|
b = 2., 0., 0., 0.
|
|
n = 0
|
|
validate 2., 4., 6., 8.
|
|
|
|
aarch64 = fmulx
|
|
generate float32x2_t, float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x2_t:float32x4_t, float32x4_t
|
|
generate float64x2_t:float64x1_t:float64x2_t, float64x2_t
|
|
|
|
/// Floating-point multiply extended
|
|
name = vmulx
|
|
a = 2.
|
|
b = 3.
|
|
validate 6.
|
|
|
|
aarch64 = fmulx
|
|
link-aarch64 = fmulx._EXT_
|
|
generate f32, f64
|
|
|
|
/// Floating-point multiply extended
|
|
name = vmulx
|
|
lane-suffixes
|
|
constn = LANE
|
|
multi_fn = static_assert_imm-in_exp_len-LANE
|
|
multi_fn = vmulx-out-noext, a, {simd_extract, b, LANE as u32}
|
|
|
|
a = 2.
|
|
b = 3., 0., 0., 0.
|
|
n = 0
|
|
validate 6.
|
|
|
|
aarch64 = fmulx
|
|
generate f32:float32x2_t:f32, f32:float32x4_t:f32, f64:float64x1_t:f64, f64:float64x2_t:f64
|
|
|
|
/// Floating-point fused Multiply-Add to accumulator (vector)
|
|
name = vfma
|
|
multi_fn = vfma-self-_, b, c, a
|
|
a = 8.0, 18.0, 12.0, 10.0
|
|
b = 6.0, 4.0, 7.0, 8.0
|
|
c = 2.0, 3.0, 4.0, 5.0
|
|
validate 20.0, 30.0, 40.0, 50.0
|
|
|
|
link-aarch64 = llvm.fma._EXT_
|
|
aarch64 = fmadd
|
|
generate float64x1_t
|
|
aarch64 = fmla
|
|
generate float64x2_t
|
|
|
|
target = vfp4
|
|
arm = vfma
|
|
link-arm = llvm.fma._EXT_
|
|
generate float*_t
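// Note: the llvm.fma intrinsic takes (x, y, addend), so the multi_fn above
// passes b, c, a to compute a + b * c per lane, e.g. 8.0 + 6.0*2.0 = 20.0.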
|
|
|
|
/// Floating-point fused Multiply-Add to accumulator (vector)
|
|
name = vfma
|
|
n-suffix
|
|
multi_fn = vfma-self-noext, a, b, {vdup-nselfvfp4-noext, c}
|
|
a = 2.0, 3.0, 4.0, 5.0
|
|
b = 6.0, 4.0, 7.0, 8.0
|
|
c = 8.0
|
|
validate 50.0, 35.0, 60.0, 69.0
|
|
|
|
aarch64 = fmadd
|
|
generate float64x1_t:float64x1_t:f64:float64x1_t
|
|
aarch64 = fmla
|
|
generate float64x2_t:float64x2_t:f64:float64x2_t
|
|
|
|
target = vfp4
|
|
arm = vfma
|
|
generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t
|
|
|
|
/// Floating-point fused multiply-add to accumulator
|
|
name = vfma
|
|
in2-lane-suffixes
|
|
constn = LANE
|
|
multi_fn = static_assert_imm-in2_exp_len-LANE
|
|
multi_fn = vfma-out-noext, a, b, {vdup-nout-noext, {simd_extract, c, LANE as u32}}
|
|
a = 2., 3., 4., 5.
|
|
b = 6., 4., 7., 8.
|
|
c = 2., 0., 0., 0.
|
|
n = 0
|
|
validate 14., 11., 18., 21.
|
|
|
|
aarch64 = fmla
|
|
generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t
|
|
aarch64 = fmadd
|
|
generate float64x1_t
|
|
aarch64 = fmla
|
|
generate float64x1_t:float64x1_t:float64x2_t:float64x1_t, float64x2_t:float64x2_t:float64x1_t:float64x2_t, float64x2_t
|
|
|
|
/// Floating-point fused multiply-add to accumulator
|
|
name = vfma
|
|
in2-lane-suffixes
|
|
constn = LANE
|
|
multi_fn = static_assert_imm-in2_exp_len-LANE
|
|
multi_fn = simd_extract, c:out_t, c, LANE as u32
|
|
multi_fn = vfma-in2lane-_, b, c, a
|
|
a = 2.
|
|
b = 6.
|
|
c = 3., 0., 0., 0.
|
|
n = 0
|
|
validate 20.
|
|
|
|
aarch64 = fmla
|
|
link-aarch64 = llvm.fma._EXT_:f32:f32:f32:f32
|
|
generate f32:f32:float32x2_t:f32, f32:f32:float32x4_t:f32
|
|
link-aarch64 = llvm.fma._EXT_:f64:f64:f64:f64
|
|
aarch64 = fmadd
|
|
generate f64:f64:float64x1_t:f64
|
|
aarch64 = fmla
|
|
generate f64:f64:float64x2_t:f64
|
|
|
|
/// Floating-point fused multiply-subtract from accumulator
|
|
name = vfms
|
|
multi_fn = simd_neg, b:in_t, b
|
|
multi_fn = vfma-self-noext, a, b, c
|
|
a = 20.0, 30.0, 40.0, 50.0
|
|
b = 6.0, 4.0, 7.0, 8.0
|
|
c = 2.0, 3.0, 4.0, 5.0
|
|
validate 8.0, 18.0, 12.0, 10.0
|
|
|
|
aarch64 = fmsub
|
|
generate float64x1_t
|
|
aarch64 = fmls
|
|
generate float64x2_t
|
|
|
|
target = vfp4
|
|
arm = vfms
|
|
generate float*_t
|
|
|
|
/// Floating-point fused Multiply-subtract from accumulator (vector)
|
|
name = vfms
|
|
n-suffix
|
|
multi_fn = vfms-self-noext, a, b, {vdup-nselfvfp4-noext, c}
|
|
a = 50.0, 35.0, 60.0, 69.0
|
|
b = 6.0, 4.0, 7.0, 8.0
|
|
c = 8.0
|
|
validate 2.0, 3.0, 4.0, 5.0
|
|
|
|
aarch64 = fmsub
|
|
generate float64x1_t:float64x1_t:f64:float64x1_t
|
|
aarch64 = fmls
|
|
generate float64x2_t:float64x2_t:f64:float64x2_t
|
|
|
|
target = vfp4
|
|
arm = vfms
|
|
generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t
|
|
|
|
/// Floating-point fused multiply-subtract from accumulator
|
|
name = vfms
|
|
in2-lane-suffixes
|
|
constn = LANE
|
|
multi_fn = static_assert_imm-in2_exp_len-LANE
|
|
multi_fn = vfms-out-noext, a, b, {vdup-nout-noext, {simd_extract, c, LANE as u32}}
|
|
a = 14., 11., 18., 21.
|
|
b = 6., 4., 7., 8.
|
|
c = 2., 0., 0., 0.
|
|
n = 0
|
|
validate 2., 3., 4., 5.
|
|
|
|
aarch64 = fmls
|
|
generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t
|
|
aarch64 = fmsub
|
|
generate float64x1_t
|
|
aarch64 = fmls
|
|
generate float64x1_t:float64x1_t:float64x2_t:float64x1_t, float64x2_t:float64x2_t:float64x1_t:float64x2_t, float64x2_t
|
|
|
|
/// Floating-point fused multiply-subtract from accumulator
|
|
name = vfms
|
|
in2-lane-suffixes
|
|
constn = LANE
|
|
multi_fn = vfma-in2lane-::<LANE>, a, -b, c
|
|
a = 14.
|
|
b = 6.
|
|
c = 2., 0., 0., 0.
|
|
n = 0
|
|
validate 2.
|
|
|
|
aarch64 = fmls
|
|
generate f32:f32:float32x2_t:f32, f32:f32:float32x4_t:f32
|
|
aarch64 = fmsub
|
|
generate f64:f64:float64x1_t:f64
|
|
aarch64 = fmls
|
|
generate f64:f64:float64x2_t:f64
|
|
|
|
/// Divide
|
|
name = vdiv
|
|
fn = simd_div
|
|
a = 2.0, 6.0, 4.0, 10.0
|
|
b = 1.0, 2.0, 1.0, 2.0
|
|
validate 2.0, 3.0, 4.0, 5.0
|
|
|
|
aarch64 = fdiv
|
|
generate float*_t, float64x*_t
|
|
|
|
/// Subtract
|
|
name = vsub
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
|
|
validate 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
|
|
arm = vsub.
|
|
aarch64 = sub
|
|
fn = simd_sub
|
|
generate int*_t, uint*_t, int64x*_t, uint64x*_t
|
|
|
|
/// Subtract
|
|
name = vsub
|
|
fn = simd_sub
|
|
a = 1.0, 4.0, 3.0, 8.0
|
|
b = 1.0, 2.0, 3.0, 4.0
|
|
validate 0.0, 2.0, 0.0, 4.0
|
|
|
|
aarch64 = fsub
|
|
generate float64x*_t
|
|
|
|
arm = vsub.
|
|
generate float*_t
|
|
|
|
/// Signed Add Long across Vector
|
|
name = vaddlv
|
|
a = 1, 2, 3, 4
|
|
validate 10
|
|
|
|
aarch64 = saddlv
|
|
link-aarch64 = llvm.aarch64.neon.saddlv.i32._EXT_
|
|
generate int16x4_t:i32
|
|
|
|
/// Signed Add Long across Vector
|
|
name = vaddlv
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8
|
|
validate 36
|
|
|
|
aarch64 = saddlv
|
|
link-aarch64 = llvm.aarch64.neon.saddlv.i32._EXT_
|
|
generate int16x8_t:i32
|
|
|
|
/// Signed Add Long across Vector
|
|
name = vaddlv
|
|
a = 1, 2
|
|
validate 3
|
|
|
|
aarch64 = saddlp
|
|
link-aarch64 = llvm.aarch64.neon.saddlv.i64._EXT_
|
|
generate int32x2_t:i64
|
|
|
|
/// Signed Add Long across Vector
|
|
name = vaddlv
|
|
a = 1, 2, 3, 4
|
|
validate 10
|
|
|
|
aarch64 = saddlv
|
|
link-aarch64 = llvm.aarch64.neon.saddlv.i64._EXT_
|
|
generate int32x4_t:i64
|
|
|
|
/// Unsigned Add Long across Vector
|
|
name = vaddlv
|
|
a = 1, 2, 3, 4
|
|
validate 10
|
|
|
|
aarch64 = uaddlv
|
|
link-aarch64 = llvm.aarch64.neon.uaddlv.i32._EXT_
|
|
generate uint16x4_t:u32
|
|
|
|
/// Unsigned Add Long across Vector
|
|
name = vaddlv
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8
|
|
validate 36
|
|
|
|
aarch64 = uaddlv
|
|
link-aarch64 = llvm.aarch64.neon.uaddlv.i32._EXT_
|
|
generate uint16x8_t:u32
|
|
|
|
/// Unsigned Add Long across Vector
|
|
name = vaddlv
|
|
a = 1, 2
|
|
validate 3
|
|
|
|
aarch64 = uaddlp
|
|
link-aarch64 = llvm.aarch64.neon.uaddlv.i64._EXT_
|
|
generate uint32x2_t:u64
|
|
|
|
/// Unsigned Add Long across Vector
|
|
name = vaddlv
|
|
a = 1, 2, 3, 4
|
|
validate 10
|
|
|
|
aarch64 = uaddlv
|
|
link-aarch64 = llvm.aarch64.neon.uaddlv.i64._EXT_
|
|
generate uint32x4_t:u64
|
|
|
|
/// Subtract returning high narrow
|
|
name = vsubhn
|
|
no-q
|
|
multi_fn = fixed, c:in_t
|
|
multi_fn = simd_cast, {simd_shr, {simd_sub, a, b}, transmute(c)}
|
|
a = MAX, MIN, 1, 1, MAX, MIN, 1, 1
|
|
b = 1, 0, 0, 0, 1, 0, 0, 0
|
|
fixed = HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS
|
|
validate MAX, MIN, 0, 0, MAX, MIN, 0, 0
|
|
|
|
arm = vsubhn
|
|
aarch64 = subhn
|
|
generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t
|
|
generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t
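// Note: "returning high narrow" means the top half of each difference is kept:
//     result[i] = narrow((a[i] - b[i]) >> (element_bits / 2))
// the HFBITS placeholder in the fixed line is that half-width shift amount.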
|
|
|
|
/// Subtract returning high narrow
|
|
name = vsubhn_high
|
|
no-q
|
|
multi_fn = vsubhn-noqself-noext, d:in_t0, b, c
|
|
multi_fn = simd_shuffle-out_len-!, a, d, {asc-0-out_len}
|
|
a = MAX, 0, MAX, 0, MAX, 0, MAX, 0
|
|
b = MAX, 1, MAX, 1, MAX, 1, MAX, 1
|
|
c = 1, 0, 1, 0, 1, 0, 1, 0
|
|
validate MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0
|
|
|
|
arm = vsubhn
|
|
aarch64 = subhn2
|
|
generate int8x8_t:int16x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int64x2_t:int32x4_t
|
|
generate uint8x8_t:uint16x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint64x2_t:uint32x4_t
|
|
|
|
/// Halving subtract
|
|
name = vhsub
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
|
|
validate 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7
|
|
|
|
arm = vhsub.s
|
|
aarch64 = uhsub
|
|
link-arm = vhsubu._EXT_
|
|
link-aarch64 = uhsub._EXT_
|
|
generate uint*_t
|
|
|
|
arm = vhsub.s
|
|
aarch64 = shsub
|
|
link-arm = vhsubs._EXT_
|
|
link-aarch64 = shsub._EXT_
|
|
generate int*_t
|
|
|
|
/// Signed Subtract Wide
|
|
name = vsubw
|
|
no-q
|
|
multi_fn = simd_sub, a, {simd_cast, b}
|
|
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16
|
|
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16
|
|
validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
|
|
|
arm = vsubw
|
|
aarch64 = ssubw
|
|
generate int16x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int64x2_t
|
|
|
|
/// Unsigned Subtract Wide
|
|
name = vsubw
|
|
no-q
|
|
multi_fn = simd_sub, a, {simd_cast, b}
|
|
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16
|
|
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16
|
|
validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
|
|
|
arm = vsubw
|
|
aarch64 = usubw
|
|
generate uint16x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint64x2_t
|
|
|
|
/// Signed Subtract Wide
|
|
name = vsubw_high
|
|
no-q
|
|
multi_fn = simd_shuffle8!, c:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
|
|
multi_fn = simd_sub, a, {simd_cast, c}
|
|
a = 8, 9, 10, 12, 13, 14, 15, 16
|
|
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16
|
|
validate 0, 0, 0, 0, 0, 0, 0, 0
|
|
|
|
aarch64 = ssubw
|
|
generate int16x8_t:int8x16_t:int16x8_t
|
|
|
|
/// Signed Subtract Wide
|
|
name = vsubw_high
|
|
no-q
|
|
multi_fn = simd_shuffle4!, c:int16x4_t, b, b, [4, 5, 6, 7]
|
|
multi_fn = simd_sub, a, {simd_cast, c}
|
|
a = 8, 9, 10, 11
|
|
b = 0, 1, 2, 3, 8, 9, 10, 11
|
|
validate 0, 0, 0, 0
|
|
|
|
aarch64 = ssubw
|
|
generate int32x4_t:int16x8_t:int32x4_t
|
|
|
|
/// Signed Subtract Wide
|
|
name = vsubw_high
|
|
no-q
|
|
multi_fn = simd_shuffle2!, c:int32x2_t, b, b, [2, 3]
|
|
multi_fn = simd_sub, a, {simd_cast, c}
|
|
a = 8, 9
|
|
b = 6, 7, 8, 9
|
|
validate 0, 0
|
|
|
|
aarch64 = ssubw
|
|
generate int64x2_t:int32x4_t:int64x2_t
|
|
|
|
/// Unsigned Subtract Wide
|
|
name = vsubw_high
|
|
no-q
|
|
multi_fn = simd_shuffle8!, c:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
|
|
multi_fn = simd_sub, a, {simd_cast, c}
|
|
a = 8, 9, 10, 11, 12, 13, 14, 15
|
|
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
validate 0, 0, 0, 0, 0, 0, 0, 0
|
|
|
|
aarch64 = usubw
|
|
generate uint16x8_t:uint8x16_t:uint16x8_t
|
|
|
|
/// Unsigned Subtract Wide
|
|
name = vsubw_high
|
|
no-q
|
|
multi_fn = simd_shuffle4!, c:uint16x4_t, b, b, [4, 5, 6, 7]
|
|
multi_fn = simd_sub, a, {simd_cast, c}
|
|
a = 8, 9, 10, 11
|
|
b = 0, 1, 2, 3, 8, 9, 10, 11
|
|
validate 0, 0, 0, 0
|
|
|
|
aarch64 = usubw
|
|
generate uint32x4_t:uint16x8_t:uint32x4_t
|
|
|
|
/// Unsigned Subtract Wide
|
|
name = vsubw_high
|
|
no-q
|
|
multi_fn = simd_shuffle2!, c:uint32x2_t, b, b, [2, 3]
|
|
multi_fn = simd_sub, a, {simd_cast, c}
|
|
a = 8, 9
|
|
b = 6, 7, 8, 9
|
|
validate 0, 0
|
|
|
|
aarch64 = usubw
|
|
generate uint64x2_t:uint32x4_t:uint64x2_t
|
|
|
|
/// Signed Subtract Long
|
|
name = vsubl
|
|
no-q
|
|
multi_fn = simd_cast, c:out_t, a
|
|
multi_fn = simd_cast, d:out_t, b
|
|
multi_fn = simd_sub, c, d
|
|
|
|
a = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
b = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
|
|
|
arm = vsubl
|
|
aarch64 = ssubl
|
|
generate int8x8_t:int8x8_t:int16x8_t, int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t
|
|
|
|
/// Unsigned Subtract Long
|
|
name = vsubl
|
|
no-q
|
|
multi_fn = simd_cast, c:out_t, a
|
|
multi_fn = simd_cast, d:out_t, b
|
|
multi_fn = simd_sub, c, d
|
|
|
|
a = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
b = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
|
|
|
arm = vsubl
|
|
aarch64 = usubl
|
|
generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint32x2_t:uint32x2_t:uint64x2_t
|
|
|
|
/// Signed Subtract Long
|
|
name = vsubl_high
|
|
no-q
|
|
multi_fn = simd_shuffle8!, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
|
|
multi_fn = simd_cast, d:out_t, c
|
|
multi_fn = simd_shuffle8!, e:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
|
|
multi_fn = simd_cast, f:out_t, e
|
|
multi_fn = simd_sub, d, f
|
|
|
|
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
b = 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2
|
|
validate 6, 7, 8, 9, 10, 11, 12, 13
|
|
|
|
aarch64 = ssubl
|
|
generate int8x16_t:int8x16_t:int16x8_t
|
|
|
|
/// Signed Subtract Long
|
|
name = vsubl_high
|
|
no-q
|
|
multi_fn = simd_shuffle4!, c:int16x4_t, a, a, [4, 5, 6, 7]
|
|
multi_fn = simd_cast, d:out_t, c
|
|
multi_fn = simd_shuffle4!, e:int16x4_t, b, b, [4, 5, 6, 7]
|
|
multi_fn = simd_cast, f:out_t, e
|
|
multi_fn = simd_sub, d, f
|
|
|
|
a = 8, 9, 10, 11, 12, 13, 14, 15
|
|
b = 6, 6, 6, 6, 8, 8, 8, 8
|
|
validate 4, 5, 6, 7
|
|
|
|
aarch64 = ssubl
|
|
generate int16x8_t:int16x8_t:int32x4_t
|
|
|
|
/// Signed Subtract Long
|
|
name = vsubl_high
|
|
no-q
|
|
multi_fn = simd_shuffle2!, c:int32x2_t, a, a, [2, 3]
|
|
multi_fn = simd_cast, d:out_t, c
|
|
multi_fn = simd_shuffle2!, e:int32x2_t, b, b, [2, 3]
|
|
multi_fn = simd_cast, f:out_t, e
|
|
multi_fn = simd_sub, d, f
|
|
|
|
a = 12, 13, 14, 15
|
|
b = 6, 6, 8, 8
|
|
validate 6, 7
|
|
|
|
aarch64 = ssubl
|
|
generate int32x4_t:int32x4_t:int64x2_t
|
|
|
|
/// Unsigned Subtract Long
|
|
name = vsubl_high
|
|
no-q
|
|
multi_fn = simd_shuffle8!, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
|
|
multi_fn = simd_cast, d:out_t, c
|
|
multi_fn = simd_shuffle8!, e:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
|
|
multi_fn = simd_cast, f:out_t, e
|
|
multi_fn = simd_sub, d, f
|
|
|
|
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
b = 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2
|
|
validate 6, 7, 8, 9, 10, 11, 12, 13
|
|
|
|
aarch64 = usubl
|
|
generate uint8x16_t:uint8x16_t:uint16x8_t
|
|
|
|
/// Unsigned Subtract Long
|
|
name = vsubl_high
|
|
no-q
|
|
multi_fn = simd_shuffle4!, c:uint16x4_t, a, a, [4, 5, 6, 7]
|
|
multi_fn = simd_cast, d:out_t, c
|
|
multi_fn = simd_shuffle4!, e:uint16x4_t, b, b, [4, 5, 6, 7]
|
|
multi_fn = simd_cast, f:out_t, e
|
|
multi_fn = simd_sub, d, f
|
|
|
|
a = 8, 9, 10, 11, 12, 13, 14, 15
|
|
b = 6, 6, 6, 6, 8, 8, 8, 8
|
|
validate 4, 5, 6, 7
|
|
|
|
aarch64 = usubl
|
|
generate uint16x8_t:uint16x8_t:uint32x4_t
|
|
|
|
/// Unsigned Subtract Long
|
|
name = vsubl_high
|
|
no-q
|
|
multi_fn = simd_shuffle2!, c:uint32x2_t, a, a, [2, 3]
|
|
multi_fn = simd_cast, d:out_t, c
|
|
multi_fn = simd_shuffle2!, e:uint32x2_t, b, b, [2, 3]
|
|
multi_fn = simd_cast, f:out_t, e
|
|
multi_fn = simd_sub, d, f
|
|
|
|
a = 12, 13, 14, 15
|
|
b = 6, 6, 8, 8
|
|
validate 6, 7
|
|
|
|
aarch64 = usubl
|
|
generate uint32x4_t:uint32x4_t:uint64x2_t
|
|
|
|
/// Maximum (vector)
|
|
name = vmax
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
|
|
validate 16, 15, 14, 13, 12, 11, 10, 9, 9, 10, 11, 12, 13, 14, 15, 16
|
|
|
|
arm = vmax
|
|
aarch64 = smax
|
|
link-arm = vmaxs._EXT_
|
|
link-aarch64 = smax._EXT_
|
|
generate int*_t
|
|
|
|
arm = vmax
|
|
aarch64 = umax
|
|
link-arm = vmaxu._EXT_
|
|
link-aarch64 = umax._EXT_
|
|
generate uint*_t
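// Rough usage sketch of one generated function (assuming the usual type-suffix
// naming, e.g. vmax_s8 in core::arch::aarch64): a lane-wise maximum.
//
//     use core::arch::aarch64::*;
//     unsafe {
//         let a = vld1_s8([1i8, 2, 3, 4, 5, 6, 7, 8].as_ptr());
//         let b = vld1_s8([8i8, 7, 6, 5, 4, 3, 2, 1].as_ptr());
//         let m = vmax_s8(a, b); // lanes: 8, 7, 6, 5, 5, 6, 7, 8
//     }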
|
|
|
|
/// Maximum (vector)
|
|
name = vmax
|
|
a = 1.0, -2.0, 3.0, -4.0
|
|
b = 0.0, 3.0, 2.0, 8.0
|
|
validate 1.0, 3.0, 3.0, 8.0
|
|
|
|
aarch64 = fmax
|
|
link-aarch64 = fmax._EXT_
|
|
generate float64x*_t
|
|
|
|
arm = vmax
|
|
aarch64 = fmax
|
|
link-arm = vmaxs._EXT_
|
|
link-aarch64 = fmax._EXT_
|
|
generate float*_t
|
|
|
|
/// Floating-point Maximum Number (vector)
|
|
name = vmaxnm
|
|
a = 1.0, 2.0, 3.0, -4.0
|
|
b = 8.0, 16.0, -1.0, 6.0
|
|
validate 8.0, 16.0, 3.0, 6.0
|
|
|
|
aarch64 = fmaxnm
|
|
link-aarch64 = fmaxnm._EXT_
|
|
generate float64x*_t
|
|
|
|
target = fp-armv8
|
|
arm = vmaxnm
|
|
aarch64 = fmaxnm
|
|
link-arm = vmaxnm._EXT_
|
|
link-aarch64 = fmaxnm._EXT_
|
|
generate float*_t
|
|
|
|
/// Floating-point Maximum Number Pairwise (vector).
|
|
name = vpmaxnm
|
|
a = 1.0, 2.0
|
|
b = 6.0, -3.0
|
|
validate 2.0, 6.0
|
|
aarch64 = fmaxnmp
|
|
link-aarch64 = fmaxnmp._EXT_
|
|
generate float32x2_t:float32x2_t:float32x2_t, float64x2_t:float64x2_t:float64x2_t
|
|
|
|
/// Floating-point Maximum Number Pairwise (vector).
|
|
name = vpmaxnm
|
|
a = 1.0, 2.0, 3.0, -4.0
|
|
b = 8.0, 16.0, -1.0, 6.0
|
|
validate 2.0, 3.0, 16.0, 6.0
|
|
aarch64 = fmaxnmp
|
|
link-aarch64 = fmaxnmp._EXT_
|
|
generate float32x4_t:float32x4_t:float32x4_t
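// Note on the pairwise layout (illustration only): for the four-lane case the
// result is [max(a0, a1), max(a2, a3), max(b0, b1), max(b2, b3)], so the inputs
// above give [max(1, 2), max(3, -4), max(8, 16), max(-1, 6)] = [2, 3, 16, 6].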
|
|
|
|
/// Minimum (vector)
|
|
name = vmin
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
|
|
validate 1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1
|
|
|
|
arm = vmin
|
|
aarch64 = smin
|
|
link-arm = vmins._EXT_
|
|
link-aarch64 = smin._EXT_
|
|
generate int*_t
|
|
|
|
arm = vmin
|
|
aarch64 = umin
|
|
link-arm = vminu._EXT_
|
|
link-aarch64 = umin._EXT_
|
|
generate uint*_t
|
|
|
|
/// Minimum (vector)
|
|
name = vmin
|
|
a = 1.0, -2.0, 3.0, -4.0
|
|
b = 0.0, 3.0, 2.0, 8.0
|
|
validate 0.0, -2.0, 2.0, -4.0
|
|
|
|
aarch64 = fmin
|
|
link-aarch64 = fmin._EXT_
|
|
generate float64x*_t
|
|
|
|
arm = vmin
|
|
aarch64 = fmin
|
|
link-arm = vmins._EXT_
|
|
link-aarch64 = fmin._EXT_
|
|
generate float*_t
|
|
|
|
/// Floating-point Minimum Number (vector)
|
|
name = vminnm
|
|
a = 1.0, 2.0, 3.0, -4.0
|
|
b = 8.0, 16.0, -1.0, 6.0
|
|
validate 1.0, 2.0, -1.0, -4.0
|
|
|
|
aarch64 = fminnm
|
|
link-aarch64 = fminnm._EXT_
|
|
generate float64x*_t
|
|
|
|
target = fp-armv8
|
|
arm = vminnm
|
|
aarch64 = fminnm
|
|
link-arm = vminnm._EXT_
|
|
link-aarch64 = fminnm._EXT_
|
|
generate float*_t
|
|
|
|
/// Floating-point Minimum Number Pairwise (vector).
|
|
name = vpminnm
|
|
a = 1.0, 2.0
|
|
b = 6.0, -3.0
|
|
validate 1.0, -3.0
|
|
aarch64 = fminnmp
|
|
link-aarch64 = fminnmp._EXT_
|
|
generate float32x2_t:float32x2_t:float32x2_t, float64x2_t:float64x2_t:float64x2_t
|
|
|
|
/// Floating-point Minimum Number Pairwise (vector).
|
|
name = vpminnm
|
|
a = 1.0, 2.0, 3.0, -4.0
|
|
b = 8.0, 16.0, -1.0, 6.0
|
|
validate 1.0, -4.0, 8.0, -1.0
|
|
aarch64 = fminnmp
|
|
link-aarch64 = fminnmp._EXT_
|
|
generate float32x4_t:float32x4_t:float32x4_t
|
|
|
|
/// Signed saturating doubling multiply long
|
|
name = vqdmull
|
|
a = 0, 1, 2, 3, 4, 5, 6, 7
|
|
b = 1, 2, 3, 4, 5, 6, 7, 8
|
|
validate 0, 4, 12, 24, 40, 60, 84, 108
|
|
|
|
aarch64 = sqdmull
|
|
link-aarch64 = sqdmull._EXT2_
|
|
arm = vqdmull
|
|
link-arm = vqdmull._EXT2_
|
|
generate int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t
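// Worked example (illustration only): each lane is the saturating 2 * a[i] * b[i],
// widened to the double-width type. Only the first out_len inputs are used, so
// the int16x4_t case checks 2*0*1 = 0, 2*1*2 = 4, 2*2*3 = 12, 2*3*4 = 24.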
|
|
|
|
/// Signed saturating doubling multiply long
|
|
name = vqdmull
|
|
multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a
|
|
multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
|
|
multi_fn = simd_extract, {vqdmull-in_ntt-noext, a, b}, 0
|
|
a = 2
|
|
b = 3
|
|
validate 12
|
|
|
|
aarch64 = sqdmull
|
|
generate i16:i16:i32
|
|
|
|
/// Signed saturating doubling multiply long
|
|
name = vqdmull
|
|
a = 2
|
|
b = 3
|
|
validate 12
|
|
|
|
aarch64 = sqdmull
|
|
link-aarch64 = sqdmulls.scalar
|
|
generate i32:i32:i64
|
|
|
|
/// Vector saturating doubling long multiply with scalar
|
|
name = vqdmull_n
|
|
no-q
|
|
multi_fn = vqdmull-in_ntt-noext, a, {vdup_n-in_ntt-noext, b}
|
|
a = 2, 4, 6, 8
|
|
b = 2
|
|
validate 8, 16, 24, 32
|
|
|
|
aarch64 = sqdmull
|
|
arm = vqdmull
|
|
generate int16x4_t:i16:int32x4_t, int32x2_t:i32:int64x2_t
|
|
|
|
/// Signed saturating doubling multiply long
|
|
name = vqdmull_high
|
|
no-q
|
|
multi_fn = simd_shuffle-out_len-!, a:half, a, a, {asc-halflen-halflen}
|
|
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {asc-halflen-halflen}
|
|
multi_fn = vqdmull-noqself-noext, a, b
|
|
a = 0, 1, 4, 5, 4, 5, 6, 7
|
|
b = 1, 2, 5, 6, 5, 6, 7, 8
|
|
validate 40, 60, 84, 112
|
|
|
|
aarch64 = sqdmull2
|
|
generate int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t
|
|
|
|
/// Signed saturating doubling multiply long
|
|
name = vqdmull_high_n
|
|
no-q
|
|
multi_fn = simd_shuffle-out_len-!, a:in_ntt, a, a, {asc-out_len-out_len}
|
|
multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
|
|
multi_fn = vqdmull-in_ntt-noext, a, b
|
|
a = 0, 2, 8, 10, 8, 10, 12, 14
|
|
b = 2
|
|
validate 32, 40, 48, 56
|
|
|
|
aarch64 = sqdmull2
|
|
generate int16x8_t:i16:int32x4_t, int32x4_t:i32:int64x2_t
|
|
|
|
/// Vector saturating doubling long multiply by scalar
|
|
name = vqdmull_lane
|
|
constn = N
|
|
multi_fn = static_assert_imm-in_exp_len-N
|
|
multi_fn = simd_shuffle-out_len-!, b:in_t0, b, b, {dup-out_len-N as u32}
|
|
multi_fn = vqdmull-noqself-noext, a, b
|
|
a = 1, 2, 3, 4
|
|
b = 0, 2, 2, 0, 2, 0, 0, 0
|
|
n = HFLEN
|
|
validate 4, 8, 12, 16
|
|
|
|
aarch64 = sqdmull
|
|
generate int16x4_t:int16x8_t:int32x4_t, int32x2_t:int32x4_t:int64x2_t
|
|
|
|
arm = vqdmull
|
|
generate int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t
|
|
|
|
/// Signed saturating doubling multiply long
|
|
name = vqdmullh_lane
|
|
constn = N
|
|
multi_fn = static_assert_imm-in_exp_len-N
|
|
multi_fn = simd_extract, b:in_t0, b, N as u32
|
|
multi_fn = vqdmullh-noqself-noext, a, b
|
|
a = 2
|
|
b = 0, 2, 2, 0, 2, 0, 0, 0
|
|
n = HFLEN
|
|
validate 8
|
|
|
|
aarch64 = sqdmull
|
|
generate i16:int16x4_t:i32, i16:int16x8_t:i32
|
|
|
|
/// Signed saturating doubling multiply long
|
|
name = vqdmulls_lane
|
|
constn = N
|
|
multi_fn = static_assert_imm-in_exp_len-N
|
|
multi_fn = simd_extract, b:in_t0, b, N as u32
|
|
multi_fn = vqdmulls-noqself-noext, a, b
|
|
a = 2
|
|
b = 0, 2, 2, 0, 2, 0, 0, 0
|
|
n = HFLEN
|
|
validate 8
|
|
|
|
aarch64 = sqdmull
|
|
generate i32:int32x2_t:i64, i32:int32x4_t:i64
|
|
|
|
/// Signed saturating doubling multiply long
|
|
name = vqdmull_high_lane
|
|
constn = N
|
|
multi_fn = static_assert_imm-in_exp_len-N
|
|
multi_fn = simd_shuffle-out_len-!, a:in_t, a, a, {asc-out_len-out_len}
|
|
multi_fn = simd_shuffle-out_len-!, b:in_t, b, b, {dup-out_len-N as u32}
|
|
multi_fn = vqdmull-self-noext, a, b
|
|
a = 0, 1, 4, 5, 4, 5, 6, 7
|
|
b = 0, 2, 2, 0, 2, 0, 0, 0
|
|
n = HFLEN
|
|
validate 16, 20, 24, 28
|
|
|
|
aarch64 = sqdmull2
|
|
generate int16x8_t:int16x4_t:int32x4_t, int32x4_t:int32x2_t:int64x2_t
|
|
|
|
/// Signed saturating doubling multiply long
|
|
name = vqdmull_high_lane
|
|
constn = N
|
|
multi_fn = static_assert_imm-in_exp_len-N
|
|
multi_fn = simd_shuffle-out_len-!, a:half, a, a, {asc-out_len-out_len}
|
|
multi_fn = simd_shuffle-out_len-!, b:half, b, b, {dup-out_len-N as u32}
|
|
multi_fn = vqdmull-noqself-noext, a, b
|
|
a = 0, 1, 4, 5, 4, 5, 6, 7
|
|
b = 0, 2, 2, 0, 2, 0, 0, 0
|
|
n = HFLEN
|
|
validate 16, 20, 24, 28
|
|
|
|
aarch64 = sqdmull2
|
|
generate int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t
|
|
|
|
/// Signed saturating doubling multiply-add long
|
|
name = vqdmlal
|
|
multi_fn = vqadd-out-noext, a, {vqdmull-self-noext, b, c}
|
|
a = 1, 1, 1, 1
|
|
b = 1, 2, 3, 4
|
|
c = 2, 2, 2, 2
|
|
validate 5, 9, 13, 17
|
|
|
|
aarch64 = sqdmlal
|
|
arm = vqdmlal
|
|
generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t
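// Worked example (illustration only): the result is a[i] + sat(2 * b[i] * c[i]),
// e.g. 1 + 2*1*2 = 5 for the first lane and 1 + 2*4*2 = 17 for the last.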
|
|
|
|
/// Vector widening saturating doubling multiply accumulate with scalar
|
|
name = vqdmlal
|
|
n-suffix
|
|
multi_fn = vqadd-out-noext, a, {vqdmull_n-self-noext, b, c}
|
|
a = 1, 1, 1, 1
|
|
b = 1, 2, 3, 4
|
|
c = 2
|
|
validate 5, 9, 13, 17
|
|
|
|
aarch64 = sqdmlal
|
|
arm = vqdmlal
|
|
generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t
|
|
|
|
/// Signed saturating doubling multiply-add long
|
|
name = vqdmlal_high
|
|
no-q
|
|
multi_fn = vqadd-out-noext, a, {vqdmull_high-noqself-noext, b, c}
|
|
a = 1, 2, 3, 4
|
|
b = 0, 1, 4, 5, 4, 5, 6, 7
|
|
c = 1, 2, 5, 6, 5, 6, 7, 8
|
|
validate 41, 62, 87, 116
|
|
|
|
aarch64 = sqdmlal2
|
|
generate int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t
|
|
|
|
/// Signed saturating doubling multiply-add long
|
|
name = vqdmlal_high_n
|
|
no-q
|
|
multi_fn = vqadd-out-noext, a, {vqdmull_high_n-noqself-noext, b, c}
|
|
a = 1, 2, 3, 4
|
|
b = 0, 2, 8, 10, 8, 10, 12, 14
|
|
c = 2
|
|
validate 33, 42, 51, 60
|
|
|
|
aarch64 = sqdmlal2
|
|
generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t
|
|
|
|
/// Vector widening saturating doubling multiply accumulate with scalar
|
|
name = vqdmlal_lane
|
|
in2-suffix
|
|
constn = N
|
|
multi_fn = static_assert_imm-in2_exp_len-N
|
|
multi_fn = vqadd-out-noext, a, {vqdmull_lane-in2-::<N>, b, c}
|
|
a = 1, 2, 3, 4
|
|
b = 1, 2, 3, 4
|
|
c = 0, 2, 2, 0, 2, 0, 0, 0
|
|
n = HFLEN
|
|
validate 5, 10, 15, 20
|
|
|
|
aarch64 = sqdmlal
|
|
generate int32x4_t:int16x4_t:int16x8_t:int32x4_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t
|
|
|
|
arm = vqdmlal
|
|
generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t
|
|
|
|
/// Signed saturating doubling multiply-add long
|
|
name = vqdmlal_high_lane
|
|
in2-suffix
|
|
constn = N
|
|
multi_fn = static_assert_imm-in2_exp_len-N
|
|
multi_fn = vqadd-out-noext, a, {vqdmull_high_lane-in2-::<N>, b, c}
|
|
a = 1, 2, 3, 4
|
|
b = 0, 1, 4, 5, 4, 5, 6, 7
|
|
c = 0, 2, 0, 0, 0, 0, 0, 0
|
|
n = 1
|
|
validate 17, 22, 27, 32
|
|
|
|
aarch64 = sqdmlal2
|
|
generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t
|
|
|
|
/// Signed saturating doubling multiply-subtract long
|
|
name = vqdmlsl
|
|
multi_fn = vqsub-out-noext, a, {vqdmull-self-noext, b, c}
|
|
a = 3, 7, 11, 15
|
|
b = 1, 2, 3, 4
|
|
c = 2, 2, 2, 2
|
|
validate -1, -1, -1, -1
|
|
|
|
aarch64 = sqdmlsl
|
|
arm = vqdmlsl
|
|
generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t
|
|
|
|
/// Vector widening saturating doubling multiply subtract with scalar
|
|
name = vqdmlsl
|
|
n-suffix
|
|
multi_fn = vqsub-out-noext, a, {vqdmull_n-self-noext, b, c}
|
|
a = 3, 7, 11, 15
|
|
b = 1, 2, 3, 4
|
|
c = 2
|
|
validate -1, -1, -1, -1
|
|
|
|
aarch64 = sqdmlsl
|
|
arm = vqdmlsl
|
|
generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t
|
|
|
|
/// Signed saturating doubling multiply-subtract long
|
|
name = vqdmlsl_high
|
|
no-q
|
|
multi_fn = vqsub-out-noext, a, {vqdmull_high-noqself-noext, b, c}
|
|
a = 39, 58, 81, 108
|
|
b = 0, 1, 4, 5, 4, 5, 6, 7
|
|
c = 1, 2, 5, 6, 5, 6, 7, 8
|
|
validate -1, -2, -3, -4
|
|
|
|
aarch64 = sqdmlsl2
|
|
generate int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t
|
|
|
|
/// Signed saturating doubling multiply-subtract long
|
|
name = vqdmlsl_high_n
|
|
no-q
|
|
multi_fn = vqsub-out-noext, a, {vqdmull_high_n-noqself-noext, b, c}
|
|
a = 31, 38, 45, 52
|
|
b = 0, 2, 8, 10, 8, 10, 12, 14
|
|
c = 2
|
|
validate -1, -2, -3, -4
|
|
|
|
aarch64 = sqdmlsl2
|
|
generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t
|
|
|
|
/// Vector widening saturating doubling multiply subtract with scalar
|
|
name = vqdmlsl_lane
|
|
in2-suffix
|
|
constn = N
|
|
multi_fn = static_assert_imm-in2_exp_len-N
|
|
multi_fn = vqsub-out-noext, a, {vqdmull_lane-in2-::<N>, b, c}
|
|
a = 3, 6, 9, 12
|
|
b = 1, 2, 3, 4
|
|
c = 0, 2, 2, 0, 2, 0, 0, 0
|
|
n = HFLEN
|
|
validate -1, -2, -3, -4
|
|
|
|
aarch64 = sqdmlsl
|
|
generate int32x4_t:int16x4_t:int16x8_t:int32x4_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t
|
|
|
|
arm = vqdmlsl
|
|
generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t
|
|
|
|
/// Signed saturating doubling multiply-subtract long
|
|
name = vqdmlsl_high_lane
|
|
in2-suffix
|
|
constn = N
|
|
multi_fn = static_assert_imm-in2_exp_len-N
|
|
multi_fn = vqsub-out-noext, a, {vqdmull_high_lane-in2-::<N>, b, c}
|
|
a = 15, 18, 21, 24
|
|
b = 0, 1, 4, 5, 4, 5, 6, 7
|
|
c = 0, 2, 0, 0, 0, 0, 0, 0
|
|
n = 1
|
|
validate -1, -2, -3, -4
|
|
|
|
aarch64 = sqdmlsl2
|
|
generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t
|
|
|
|
/// Signed saturating doubling multiply returning high half
|
|
name = vqdmulh
|
|
a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
|
|
b = 2, 2, 2, 2, 2, 2, 2, 2
|
|
validate 1, 1, 1, 1, 1, 1, 1, 1
|
|
|
|
aarch64 = sqdmulh
|
|
link-aarch64 = sqdmulh._EXT_
|
|
arm = vqdmulh
|
|
link-arm = vqdmulh._EXT_
|
|
generate int16x4_t, int16x8_t, int32x2_t, int32x4_t
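// Worked example (illustration only): each lane keeps the high half of the
// doubled product, i.e. (2 * a[i] * b[i]) >> 16 for i16 lanes. With a[i] = 0x7fff
// and b[i] = 2 that is 0x1fffc >> 16 = 1, matching the validate line.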
|
|
|
|
/// Signed saturating doubling multiply returning high half
|
|
name = vqdmulh
|
|
multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a
|
|
multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
|
|
multi_fn = simd_extract, {vqdmulh-in_ntt-noext, a, b}, 0
|
|
a = 1
|
|
b = 2
|
|
validate 0
|
|
|
|
aarch64 = sqdmulh
|
|
generate i16, i32
|
|
|
|
/// Vector saturating doubling multiply high with scalar
|
|
name = vqdmulh_n
|
|
out-suffix
|
|
multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
|
|
multi_fn = vqdmulh-out-noext, a, b
|
|
a = MAX, MAX, MAX, MAX
|
|
b = 2
|
|
validate 1, 1, 1, 1
|
|
|
|
aarch64 = sqdmulh
|
|
arm = vqdmulh
|
|
generate int16x4_t:i16:int16x4_t, int32x2_t:i32:int32x2_t
|
|
|
|
/// Vector saturating doubling multiply high with scalar
|
|
name = vqdmulhq_n
|
|
no-q
|
|
multi_fn = vdupq_n-in_ntt-noext, b:out_t, b
|
|
multi_fn = vqdmulh-out-noext, a, b
|
|
a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
|
|
b = 2
|
|
validate 1, 1, 1, 1, 1, 1, 1, 1
|
|
|
|
aarch64 = sqdmulh
|
|
arm = vqdmulh
|
|
generate int16x8_t:i16:int16x8_t, int32x4_t:i32:int32x4_t
|
|
|
|
/// Signed saturating doubling multiply returning high half
|
|
name = vqdmulhh_lane
|
|
constn = N
|
|
multi_fn = static_assert_imm-in_exp_len-N
|
|
multi_fn = simd_extract, b:in_t0, b, N as u32
|
|
multi_fn = vqdmulhh-out_ntt-noext, a, b
|
|
a = 2
|
|
b = 0, 0, MAX, 0, 0, 0, 0, 0
|
|
n = 2
|
|
validate 1
|
|
|
|
aarch64 = sqdmulh
|
|
generate i16:int16x4_t:i16, i16:int16x8_t:i16
|
|
|
|
/// Signed saturating doubling multiply returning high half
|
|
name = vqdmulhs_lane
|
|
constn = N
|
|
multi_fn = static_assert_imm-in_exp_len-N
|
|
multi_fn = simd_extract, b:in_t0, b, N as u32
|
|
multi_fn = vqdmulhs-out_ntt-noext, a, b
|
|
a = 2
|
|
b = 0, MAX, 0, 0
|
|
n = 1
|
|
validate 1
|
|
|
|
aarch64 = sqdmulh
|
|
generate i32:int32x2_t:i32, i32:int32x4_t:i32
|
|
|
|
/// Signed saturating extract narrow
|
|
name = vqmovn
|
|
no-q
|
|
a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
|
|
validate MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
|
|
|
|
aarch64 = sqxtn
|
|
link-aarch64 = sqxtn._EXT2_
|
|
arm = vqmovn
|
|
link-arm = vqmovns._EXT2_
|
|
generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t
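// Rough usage sketch (assuming the usual suffix naming in core::arch::aarch64):
// values outside the narrow range are clamped to that range.
//
//     use core::arch::aarch64::*;
//     unsafe {
//         let wide = vdupq_n_s16(i16::MAX);
//         let narrow = vqmovn_s16(wide); // every lane saturates to i8::MAX (127)
//     }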
|
|
|
|
/// Unsigned saturating extract narrow
|
|
name = vqmovn
|
|
no-q
|
|
a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
|
|
validate MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
|
|
|
|
aarch64 = uqxtn
|
|
link-aarch64 = uqxtn._EXT2_
|
|
arm = vqmovn
|
|
link-arm = vqmovnu._EXT2_
|
|
generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t
|
|
|
|
/// Saturating extract narrow
|
|
name = vqmovn
|
|
multi_fn = simd_extract, {vqmovn-in_ntt-noext, {vdupq_n-in_ntt-noext, a}}, 0
|
|
a = 1
|
|
validate 1
|
|
|
|
aarch64 = sqxtn
|
|
generate i16:i8, i32:i16
|
|
aarch64 = uqxtn
|
|
generate u16:u8, u32:u16
|
|
|
|
/// Saturating extract narrow
|
|
name = vqmovn
|
|
a = 1
|
|
validate 1
|
|
|
|
aarch64 = sqxtn
|
|
link-aarch64 = scalar.sqxtn._EXT2_._EXT_
|
|
generate i64:i32
|
|
|
|
aarch64 = uqxtn
|
|
link-aarch64 = scalar.uqxtn._EXT2_._EXT_
|
|
generate u64:u32
|
|
|
|
/// Saturating extract narrow
|
|
name = vqmovn_high
|
|
no-q
|
|
multi_fn = simd_shuffle-out_len-!, a, {vqmovn-noqself-noext, b}, {asc-0-out_len}
|
|
a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
|
|
b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
|
|
validate MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
|
|
|
|
aarch64 = sqxtn2
|
|
generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t
|
|
aarch64 = uqxtn2
|
|
generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t
|
|
|
|
/// Signed saturating extract unsigned narrow
|
|
name = vqmovun
|
|
no-q
|
|
a = -1, -1, -1, -1, -1, -1, -1, -1
|
|
validate 0, 0, 0, 0, 0, 0, 0, 0
|
|
|
|
aarch64 = sqxtun
|
|
link-aarch64 = sqxtun._EXT2_
|
|
arm = vqmovun
|
|
link-arm = vqmovnsu._EXT2_
|
|
generate int16x8_t:uint8x8_t, int32x4_t:uint16x4_t, int64x2_t:uint32x2_t
|
|
|
|
/// Signed saturating extract unsigned narrow
|
|
name = vqmovun
|
|
multi_fn = simd_extract, {vqmovun-in_ntt-noext, {vdupq_n-in_ntt-noext, a}}, 0
|
|
a = 1
|
|
validate 1
|
|
|
|
aarch64 = sqxtun
|
|
generate i16:u8, i32:u16, i64:u32
|
|
|
|
/// Signed saturating extract unsigned narrow
|
|
name = vqmovun_high
|
|
no-q
|
|
multi_fn = simd_shuffle-out_len-!, a, {vqmovun-noqself-noext, b}, {asc-0-out_len}
|
|
a = 0, 0, 0, 0, 0, 0, 0, 0
|
|
b = -1, -1, -1, -1, -1, -1, -1, -1
|
|
validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
|
|
|
aarch64 = sqxtun2
|
|
generate uint8x8_t:int16x8_t:uint8x16_t, uint16x4_t:int32x4_t:uint16x8_t, uint32x2_t:int64x2_t:uint32x4_t
|
|
|
|
/// Signed saturating rounding doubling multiply returning high half
|
|
name = vqrdmulh
|
|
a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
|
|
b = 2, 2, 2, 2, 2, 2, 2, 2
|
|
validate 2, 2, 2, 2, 2, 2, 2, 2
|
|
|
|
aarch64 = sqrdmulh
|
|
link-aarch64 = sqrdmulh._EXT_
|
|
arm = vqrdmulh
|
|
link-arm = vqrdmulh._EXT_
|
|
generate int16x4_t, int16x8_t, int32x2_t, int32x4_t
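// Worked example (illustration only): like vqdmulh but with rounding, i.e.
// (2 * a[i] * b[i] + 0x8000) >> 16 for i16 lanes. With a[i] = 0x7fff and b[i] = 2
// that is (0x1fffc + 0x8000) >> 16 = 2, hence the validate line of 2s.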
|
|
|
|
/// Signed saturating rounding doubling multiply returning high half
|
|
name = vqrdmulh
|
|
multi_fn = simd_extract, {vqrdmulh-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
|
|
a = 1
|
|
b = 2
|
|
validate 0
|
|
|
|
aarch64 = sqrdmulh
|
|
generate i16, i32
|
|
|
|
/// Vector saturating rounding doubling multiply high with scalar
|
|
name = vqrdmulh
|
|
out-n-suffix
|
|
multi_fn = vqrdmulh-out-noext, a, {vdup-nout-noext, b}
|
|
a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
|
|
b = 2
|
|
validate 2, 2, 2, 2, 2, 2, 2, 2
|
|
|
|
aarch64 = sqrdmulh
|
|
arm = vqrdmulh
|
|
generate int16x4_t:i16:int16x4_t, int16x8_t:i16:int16x8_t, int32x2_t:i32:int32x2_t, int32x4_t:i32:int32x4_t
|
|
|
|
/// Vector rounding saturating doubling multiply high by scalar
|
|
name = vqrdmulh
|
|
lane-suffixes
|
|
constn = LANE
|
|
multi_fn = static_assert_imm-in_exp_len-LANE
|
|
multi_fn = simd_shuffle-out_len-!, b:out_t, b, b, {dup-out_len-LANE as u32}
|
|
multi_fn = vqrdmulh-out-noext, a, b
|
|
a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
|
|
b = 0, 2, 0, 0, 0, 0, 0, 0
|
|
n = 1
|
|
validate 2, 2, 2, 2, 2, 2, 2, 2
|
|
|
|
aarch64 = sqrdmulh
|
|
arm = vqrdmulh
|
|
generate int16x4_t, int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x4_t:int16x8_t, int16x8_t
|
|
generate int32x2_t, int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x2_t:int32x4_t, int32x4_t
|
|
|
|
/// Signed saturating rounding doubling multiply returning high half
|
|
name = vqrdmulh
|
|
lane-suffixes
|
|
constn = LANE
|
|
multi_fn = static_assert_imm-in_exp_len-LANE
|
|
multi_fn = vqrdmulh-out-noext, a, {simd_extract, b, LANE as u32}
|
|
a = 1
|
|
b = 0, 2, 0, 0, 0, 0, 0, 0
|
|
n = 1
|
|
validate 0
|
|
|
|
aarch64 = sqrdmulh
|
|
generate i16:int16x4_t:i16, i16:int16x8_t:i16, i32:int32x2_t:i32, i32:int32x4_t:i32
|
|
|
|
/// Signed saturating rounding doubling multiply accumulate returning high half
|
|
name = vqrdmlah
|
|
multi_fn = vqadd-out-noext, a, {vqrdmulh-out-noext, b, c}
|
|
a = 1, 1, 1, 1, 1, 1, 1, 1
|
|
b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
|
|
c = 2, 2, 2, 2, 2, 2, 2, 2
|
|
validate 3, 3, 3, 3, 3, 3, 3, 3
|
|
|
|
aarch64 = sqrdmulh
|
|
arm = vqrdmulh
|
|
generate int16x4_t, int16x8_t, int32x2_t, int32x4_t
|
|
|
|
/// Signed saturating rounding doubling multiply accumulate returning high half
|
|
name = vqrdmlah
|
|
multi_fn = vqadd-self-noext, a, {vqrdmulh-self-noext, b, c}
|
|
a = 1
|
|
b = 1
|
|
c = 2
|
|
validate 1
|
|
|
|
aarch64 = sqrdmulh
|
|
generate i16, i32
|
|
|
|
/// Signed saturating rounding doubling multiply accumulate returning high half
|
|
name = vqrdmlah
|
|
in2-lane-suffixes
|
|
constn = LANE
|
|
multi_fn = static_assert_imm-in2_exp_len-LANE
|
|
multi_fn = vqadd-self-noext, a, {vqrdmulh-in2lane-::<LANE>, b, c}
|
|
a = 1, 1, 1, 1, 1, 1, 1, 1
|
|
b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
|
|
c = 0, 2, 0, 0, 0, 0, 0, 0
|
|
n = 1
|
|
validate 3, 3, 3, 3, 3, 3, 3, 3
|
|
|
|
aarch64 = sqrdmulh
|
|
arm = vqrdmulh
|
|
generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t
|
|
generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t
|
|
|
|
/// Signed saturating rounding doubling multiply accumulate returning high half
|
|
name = vqrdmlah
|
|
in2-lane-suffixes
|
|
constn = LANE
|
|
multi_fn = static_assert_imm-in2_exp_len-LANE
|
|
multi_fn = vqadd-self-noext, a, {vqrdmulh-in2lane-::<LANE>, b, c}
|
|
a = 1
|
|
b = 1
|
|
c = 0, 2, 0, 0, 0, 0, 0, 0
|
|
n = 1
|
|
validate 1
|
|
|
|
aarch64 = sqrdmulh
|
|
generate i16:i16:int16x4_t:i16, i16:i16:int16x8_t:i16, i32:i32:int32x2_t:i32, i32:i32:int32x4_t:i32
|
|
|
|
/// Signed saturating rounding doubling multiply subtract returning high half
|
|
name = vqrdmlsh
|
|
multi_fn = vqsub-out-noext, a, {vqrdmulh-out-noext, b, c}
|
|
a = 1, 1, 1, 1, 1, 1, 1, 1
|
|
b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
|
|
c = 2, 2, 2, 2, 2, 2, 2, 2
|
|
validate -1, -1, -1, -1, -1, -1, -1, -1
|
|
|
|
aarch64 = sqrdmulh
|
|
arm = vqrdmulh
|
|
generate int16x4_t, int16x8_t, int32x2_t, int32x4_t
|
|
|
|
/// Signed saturating rounding doubling multiply subtract returning high half
|
|
name = vqrdmlsh
|
|
multi_fn = vqsub-self-noext, a, {vqrdmulh-self-noext, b, c}
|
|
a = 1
|
|
b = 1
|
|
c = 2
|
|
validate 1
|
|
|
|
aarch64 = sqrdmulh
|
|
generate i16, i32
|
|
|
|
/// Signed saturating rounding doubling multiply subtract returning high half
|
|
name = vqrdmlsh
|
|
in2-lane-suffixes
|
|
constn = LANE
|
|
multi_fn = static_assert_imm-in2_exp_len-LANE
|
|
multi_fn = vqsub-self-noext, a, {vqrdmulh-in2lane-::<LANE>, b, c}
|
|
a = 1, 1, 1, 1, 1, 1, 1, 1
|
|
b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
|
|
c = 0, 2, 0, 0, 0, 0, 0, 0
|
|
n = 1
|
|
validate -1, -1, -1, -1, -1, -1, -1, -1
|
|
|
|
aarch64 = sqrdmulh
|
|
arm = vqrdmulh
|
|
generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t
|
|
generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t
|
|
|
|
/// Signed saturating rounding doubling multiply subtract returning high half
|
|
name = vqrdmlsh
|
|
in2-lane-suffixes
|
|
constn = LANE
|
|
multi_fn = static_assert_imm-in2_exp_len-LANE
|
|
multi_fn = vqsub-self-noext, a, {vqrdmulh-in2lane-::<LANE>, b, c}
|
|
a = 1
|
|
b = 1
|
|
c = 0, 2, 0, 0, 0, 0, 0, 0
|
|
n = 1
|
|
validate 1
|
|
|
|
aarch64 = sqrdmulh
|
|
generate i16:i16:int16x4_t:i16, i16:i16:int16x8_t:i16, i32:i32:int32x2_t:i32, i32:i32:int32x4_t:i32
|
|
|
|
/// Signed saturating rounding shift left
|
|
name = vqrshl
|
|
a = 2, MIN, MAX, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
|
|
validate 8, MIN, MAX, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
|
|
|
|
aarch64 = sqrshl
|
|
link-aarch64 = sqrshl._EXT_
|
|
generate i32, i64
|
|
|
|
arm = vqrshl
|
|
link-arm = vqrshifts._EXT_
|
|
generate int*_t, int64x*_t
|
|
|
|
/// Signed saturating rounding shift left
|
|
name = vqrshl
|
|
multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a
|
|
multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
|
|
multi_fn = simd_extract, {vqrshl-in_ntt-noext, a, b}, 0
|
|
a = 1
|
|
b = 2
|
|
validate 4
|
|
|
|
aarch64 = sqrshl
|
|
generate i8, i16
|
|
|
|
/// Unsigned saturating rounding shift left
|
|
name = vqrshl
|
|
out-suffix
|
|
a = 2, MIN, MAX, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
|
|
validate 8, 0, MAX, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
|
|
|
|
aarch64 = uqrshl
|
|
link-aarch64 = uqrshl._EXT_
|
|
generate u32:i32:u32, u64:i64:u64
|
|
|
|
arm = vqrshl
|
|
link-arm = vqrshiftu._EXT_
|
|
generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t
|
|
generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t
|
|
|
|
/// Unsigned saturating rounding shift left
|
|
name = vqrshl
|
|
out-suffix
|
|
multi_fn = vdup_n-out_ntt-noext, a:out_ntt, a
|
|
multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
|
|
multi_fn = simd_extract, {vqrshl-out_ntt-noext, a, b}, 0
|
|
a = 1
|
|
b = 2
|
|
validate 4
|
|
|
|
aarch64 = uqrshl
|
|
generate u8:i8:u8, u16:i16:u16
|
|
|
|
/// Signed saturating rounded shift right narrow
|
|
name = vqrshrn
|
|
noq-n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-halfbits
|
|
a = MIN, 4, 8, 12, 16, 20, 24, 28
|
|
n = 2
|
|
validate MIN, 1, 2, 3, 4, 5, 6, 7
|
|
|
|
aarch64 = sqrshrn
|
|
link-aarch64 = sqrshrn._EXT2_
|
|
const-aarch64 = N
|
|
|
|
arm = vqrshrn
|
|
link-arm = vqrshiftns._EXT2_
|
|
const-arm = -N as ttn
|
|
arm-aarch64-separate
|
|
generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t
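// Worked example (illustration only): each lane is the saturating narrow of
// (a[i] + (1 << (N - 1))) >> N. With N = 2: 4 -> (4 + 2) >> 2 = 1, 8 -> 2,
// 12 -> 3, and MIN saturates to the MIN of the narrow output type.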
|
|
|
|
/// Signed saturating rounded shift right narrow
|
|
name = vqrshrn
|
|
noq-n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-halfbits
|
|
multi_fn = vdupq_n-in_ntt-noext, a:in_long_ntt, a
|
|
multi_fn = simd_extract, {vqrshrn_n-in_ntt-::<N>, a}, 0
|
|
a = 4
|
|
n = 2
|
|
validate 1
|
|
|
|
aarch64 = sqrshrn
|
|
generate i16:i8, i32:i16, i64:i32
|
|
|
|
/// Signed saturating rounded shift right narrow
|
|
name = vqrshrn_high
|
|
noq-n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-halfbits
|
|
multi_fn = simd_shuffle-out_len-!, a, {vqrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
|
|
a = 0, 1, 2, 3, 2, 3, 6, 7
|
|
b = 8, 12, 24, 28, 48, 52, 56, 60
|
|
n = 2
|
|
validate 0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 12, 13, 14, 15
|
|
|
|
aarch64 = sqrshrn2
|
|
generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t
|
|
|
|
/// Unsigned saturating rounded shift right narrow
|
|
name = vqrshrn
|
|
noq-n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-halfbits
|
|
a = MIN, 4, 8, 12, 16, 20, 24, 28
|
|
n = 2
|
|
validate 0, 1, 2, 3, 4, 5, 6, 7
|
|
|
|
aarch64 = uqrshrn
|
|
link-aarch64 = uqrshrn._EXT2_
|
|
const-aarch64 = N
|
|
|
|
arm = vqrshrn
|
|
link-arm = vqrshiftnu._EXT2_
|
|
const-arm = -N as ttn
|
|
arm-aarch64-separate
|
|
generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t
|
|
|
|
/// Unsigned saturating rounded shift right narrow
|
|
name = vqrshrn
|
|
noq-n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-halfbits
|
|
multi_fn = vdupq_n-in_ntt-noext, a:in_long_ntt, a
|
|
multi_fn = simd_extract, {vqrshrn_n-in_ntt-::<N>, a}, 0
|
|
a = 4
|
|
n = 2
|
|
validate 1
|
|
|
|
aarch64 = uqrshrn
|
|
generate u16:u8, u32:u16, u64:u32
|
|
|
|
/// Unsigned saturating rounded shift right narrow
|
|
name = vqrshrn_high
|
|
noq-n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-halfbits
|
|
multi_fn = simd_shuffle-out_len-!, a, {vqrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
|
|
a = 0, 1, 2, 3, 2, 3, 6, 7
|
|
b = 8, 12, 24, 28, 48, 52, 56, 60
|
|
n = 2
|
|
validate 0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 12, 13, 14, 15
|
|
|
|
aarch64 = uqrshrn2
|
|
generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t
|
|
|
|
/// Signed saturating rounded shift right unsigned narrow
|
|
name = vqrshrun
|
|
noq-n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-halfbits
|
|
a = 0, 4, 8, 12, 16, 20, 24, 28
|
|
n = 2
|
|
validate 0, 1, 2, 3, 4, 5, 6, 7
|
|
|
|
aarch64 = sqrshrun
|
|
link-aarch64 = sqrshrun._EXT2_
|
|
const-aarch64 = N
|
|
|
|
arm = vqrshrun
|
|
link-arm = vqrshiftnsu._EXT2_
|
|
const-arm = -N as ttn
|
|
arm-aarch64-separate
|
|
generate int16x8_t:uint8x8_t, int32x4_t:uint16x4_t, int64x2_t:uint32x2_t
|
|
|
|
/// Signed saturating rounded shift right unsigned narrow
|
|
name = vqrshrun
|
|
noq-n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-halfbits
|
|
multi_fn = vdupq_n-in_ntt-noext, a:in_long_ntt, a
|
|
multi_fn = simd_extract, {vqrshrun_n-in_ntt-::<N>, a}, 0
|
|
a = 4
|
|
n = 2
|
|
validate 1
|
|
|
|
aarch64 = sqrshrun
|
|
generate i16:u8, i32:u16, i64:u32
|
|
|
|
/// Signed saturating rounded shift right unsigned narrow
|
|
name = vqrshrun_high
|
|
noq-n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-halfbits
|
|
multi_fn = simd_shuffle-out_len-!, a, {vqrshrun_n-noqself-::<N>, b}, {asc-0-out_len}
|
|
a = 0, 1, 2, 3, 2, 3, 6, 7
|
|
b = 8, 12, 24, 28, 48, 52, 56, 60
|
|
n = 2
|
|
validate 0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 12, 13, 14, 15
|
|
|
|
aarch64 = sqrshrun2
|
|
generate uint8x8_t:int16x8_t:uint8x16_t, uint16x4_t:int32x4_t:uint16x8_t, uint32x2_t:int64x2_t:uint32x4_t
|
|
|
|
/// Signed saturating shift left
|
|
name = vqshl
|
|
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
|
|
validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
|
|
|
|
aarch64 = sqshl
|
|
link-aarch64 = sqshl._EXT_
|
|
generate i64
|
|
|
|
arm = vqshl
|
|
link-arm = vqshifts._EXT_
|
|
generate int*_t, int64x*_t
|
|
|
|
/// Signed saturating shift left
|
|
name = vqshl
|
|
multi_fn = vqshl-in_ntt-noext, c:in_ntt, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}
|
|
multi_fn = simd_extract, c, 0
|
|
a = 1
|
|
b = 2
|
|
validate 4
|
|
|
|
aarch64 = sqshl
|
|
generate i8, i16, i32
|
|
|
|
/// Unsigned saturating shift left
|
|
name = vqshl
|
|
out-suffix
|
|
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
|
|
validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
|
|
|
|
aarch64 = uqshl
|
|
link-aarch64 = uqshl._EXT_
|
|
generate u64:i64:u64
|
|
|
|
arm = vqshl
|
|
link-arm = vqshiftu._EXT_
|
|
generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t
|
|
generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t
|
|
|
|
/// Unsigned saturating shift left
|
|
name = vqshl
|
|
out-suffix
|
|
multi_fn = vqshl-out_ntt-noext, c:out_ntt, {vdup_n-out_ntt-noext, a}, {vdup_n-in_ntt-noext, b}
|
|
multi_fn = simd_extract, c, 0
|
|
a = 1
|
|
b = 2
|
|
validate 4
|
|
|
|
aarch64 = uqshl
|
|
generate u8:i8:u8, u16:i16:u16, u32:i32:u32
|
|
|
|
/// Signed saturating shift left
|
|
name = vqshl
|
|
n-suffix
|
|
constn = N
|
|
multi_fn = static_assert_imm-out_bits_exp_len-N
|
|
multi_fn = vqshl-self-noext, a, {vdup-nself-noext, N.try_into().unwrap()}
|
|
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
n = 2
|
|
validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
|
|
|
|
aarch64 = sqshl
|
|
arm = vqshl
|
|
generate int*_t, int64x*_t
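// Note (illustration only): as the multi_fn above shows, the immediate form is
// expressed by splatting N and reusing the by-vector form, so each lane becomes
// the saturating a[i] << N; with N = 2 the inputs 0..15 map to 0, 4, 8, ..., 60.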
|
|
|
|
/// Signed saturating shift left
|
|
name = vqshl
|
|
n-suffix
|
|
constn = N
|
|
multi_fn = static_assert_imm-out_bits_exp_len-N
|
|
multi_fn = simd_extract, {vqshl_n-in_ntt-::<N>, {vdup_n-in_ntt-noext, a}}, 0
|
|
a = 1
|
|
n = 2
|
|
validate 4
|
|
|
|
aarch64 = sqshl
|
|
generate i8, i16, i32, i64
|
|
|
|
/// Unsigned saturating shift left
|
|
name = vqshl
|
|
n-suffix
|
|
constn = N
|
|
multi_fn = static_assert_imm-out_bits_exp_len-N
|
|
multi_fn = vqshl-self-noext, a, {vdup-nsigned-noext, N.try_into().unwrap()}
|
|
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
n = 2
|
|
validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
|
|
|
|
aarch64 = uqshl
|
|
arm = vqshl
|
|
generate uint*_t, uint64x*_t
|
|
|
|
/// Unsigned saturating shift left
|
|
name = vqshl
|
|
n-suffix
|
|
constn = N
|
|
multi_fn = static_assert_imm-out_bits_exp_len-N
|
|
multi_fn = simd_extract, {vqshl_n-in_ntt-::<N>, {vdup_n-in_ntt-noext, a}}, 0
|
|
a = 1
|
|
n = 2
|
|
validate 4
|
|
|
|
aarch64 = uqshl
|
|
generate u8, u16, u32, u64
|
|
|
|
/// Signed saturating shift right narrow
|
|
name = vqshrn
|
|
noq-n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-halfbits
|
|
a = 0, 4, 8, 12, 16, 20, 24, 28
|
|
n = 2
|
|
validate 0, 1, 2, 3, 4, 5, 6, 7
|
|
arm-aarch64-separate
|
|
|
|
aarch64 = sqshrn
|
|
link-aarch64 = sqshrn._EXT2_
|
|
const-aarch64 = N
|
|
generate i64:i32
|
|
|
|
arm = vqshrn
|
|
link-arm = vqshiftns._EXT2_
|
|
const-arm = -N as ttn
|
|
generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t
|
|
|
|
/// Signed saturating shift right narrow
|
|
name = vqshrn
|
|
noq-n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-halfbits
|
|
multi_fn = simd_extract, {vqshrn_n-in_ntt-::<N>, {vdupq_n-in_ntt-noext, a}}, 0
|
|
a = 4
|
|
n = 2
|
|
validate 1
|
|
|
|
aarch64 = sqshrn
|
|
generate i16:i8, i32:i16
|
|
|
|
/// Signed saturating shift right narrow
|
|
name = vqshrn_high
|
|
noq-n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-halfbits
|
|
multi_fn = simd_shuffle-out_len-!, a, {vqshrn_n-noqself-::<N>, b}, {asc-0-out_len}
|
|
a = 0, 1, 8, 9, 8, 9, 10, 11
|
|
b = 32, 36, 40, 44, 48, 52, 56, 60
|
|
n = 2
|
|
validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15
|
|
|
|
aarch64 = sqshrn2
|
|
generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t
|
|
|
|
/// Unsigned saturating shift right narrow
|
|
name = vqshrn
|
|
noq-n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-halfbits
|
|
a = 0, 4, 8, 12, 16, 20, 24, 28
|
|
n = 2
|
|
validate 0, 1, 2, 3, 4, 5, 6, 7
|
|
arm-aarch64-separate
|
|
|
|
aarch64 = uqshrn
|
|
link-aarch64 = uqshrn._EXT2_
|
|
const-aarch64 = N
|
|
generate u64:u32
|
|
|
|
arm = vqshrn
|
|
link-arm = vqshiftnu._EXT2_
|
|
const-arm = -N as ttn
|
|
generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t
|
|
|
|
/// Unsigned saturating shift right narrow
|
|
name = vqshrn
|
|
noq-n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-halfbits
|
|
multi_fn = simd_extract, {vqshrn_n-in_ntt-::<N>, {vdupq_n-in_ntt-noext, a}}, 0
|
|
a = 4
|
|
n = 2
|
|
validate 1
|
|
|
|
aarch64 = uqshrn
|
|
generate u16:u8, u32:u16
|
|
|
|
/// Unsigned saturating shift right narrow
|
|
name = vqshrn_high
|
|
noq-n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-halfbits
|
|
multi_fn = simd_shuffle-out_len-!, a, {vqshrn_n-noqself-::<N>, b}, {asc-0-out_len}
|
|
a = 0, 1, 8, 9, 8, 9, 10, 11
|
|
b = 32, 36, 40, 44, 48, 52, 56, 60
|
|
n = 2
|
|
validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15
|
|
|
|
aarch64 = uqshrn2
|
|
generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t
|
|
|
|
/// Signed saturating shift right unsigned narrow
|
|
name = vqshrun
|
|
noq-n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-halfbits
|
|
a = 0, 4, 8, 12, 16, 20, 24, 28
|
|
n = 2
|
|
validate 0, 1, 2, 3, 4, 5, 6, 7
|
|
arm-aarch64-separate
|
|
|
|
aarch64 = sqshrun
|
|
link-aarch64 = sqshrun._EXT2_
|
|
const-aarch64 = N
|
|
|
|
arm = vqshrun
|
|
link-arm = vqshiftnsu._EXT2_
|
|
const-arm = -N as ttn
|
|
generate int16x8_t:uint8x8_t, int32x4_t:uint16x4_t, int64x2_t:uint32x2_t
|
|
|
|
/// Signed saturating shift right unsigned narrow
|
|
name = vqshrun
|
|
noq-n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-halfbits
|
|
multi_fn = simd_extract, {vqshrun_n-in_ntt-::<N>, {vdupq_n-in_ntt-noext, a}}, 0
|
|
a = 4
|
|
n = 2
|
|
validate 1
|
|
|
|
aarch64 = sqshrun
|
|
generate i16:u8, i32:u16, i64:u32
|
|
|
|
/// Signed saturating shift right unsigned narrow
|
|
name = vqshrun_high
|
|
noq-n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-halfbits
|
|
multi_fn = simd_shuffle-out_len-!, a, {vqshrun_n-noqself-::<N>, b}, {asc-0-out_len}
|
|
a = 0, 1, 8, 9, 8, 9, 10, 11
|
|
b = 32, 36, 40, 44, 48, 52, 56, 60
|
|
n = 2
|
|
validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15
|
|
|
|
aarch64 = sqshrun2
|
|
generate uint8x8_t:int16x8_t:uint8x16_t, uint16x4_t:int32x4_t:uint16x8_t, uint32x2_t:int64x2_t:uint32x4_t
|
|
|
|
/// Calculates the square root of each lane.
|
|
name = vsqrt
|
|
fn = simd_fsqrt
|
|
a = 4.0, 9.0, 16.0, 25.0
|
|
validate 2.0, 3.0, 4.0, 5.0
|
|
|
|
aarch64 = fsqrt
|
|
generate float*_t, float64x*_t
|
|
|
|
/// Reciprocal square-root estimate.
|
|
name = vrsqrte
|
|
a = 1.0, 2.0, 3.0, 4.0
|
|
validate 0.998046875, 0.705078125, 0.576171875, 0.4990234375
|
|
|
|
aarch64 = frsqrte
|
|
link-aarch64 = frsqrte._EXT_
|
|
generate float64x*_t
|
|
|
|
arm = vrsqrte
|
|
link-arm = vrsqrte._EXT_
|
|
generate float*_t
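// Note (illustration only): the validate values are the low-precision estimates
// the instruction produces, e.g. 0.705078125 for 1/sqrt(2) (exact: 0.7071...)
// and 0.4990234375 for 1/sqrt(4) (exact: 0.5).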
|
|
|
|
/// Reciprocal estimate.
|
|
name = vrecpe
|
|
a = 4.0, 3.0, 2.0, 1.0
|
|
validate 0.24951171875, 0.3330078125, 0.4990234375, 0.998046875
|
|
|
|
aarch64 = frecpe
|
|
link-aarch64 = frecpe._EXT_
|
|
generate float64x*_t
|
|
|
|
arm = vrecpe
|
|
link-arm = vrecpe._EXT_
|
|
generate float*_t
|
|
|
|
/// Vector reinterpret cast operation
|
|
name = vreinterpret
|
|
double-suffixes
|
|
fn = transmute
|
|
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
|
|
|
aarch64 = nop
|
|
generate poly64x1_t:int64x1_t, poly64x1_t:uint64x1_t, int64x1_t:poly64x1_t, uint64x1_t:poly64x1_t
|
|
generate poly64x2_t:int64x2_t, poly64x2_t:uint64x2_t, int64x2_t:poly64x2_t, uint64x2_t:poly64x2_t
|
|
|
|
arm = nop
|
|
generate uint8x8_t:int8x8_t, poly8x8_t:int8x8_t, poly16x4_t:int16x4_t, uint16x4_t:int16x4_t, uint32x2_t:int32x2_t, uint64x1_t:int64x1_t
|
|
generate uint8x16_t:int8x16_t, poly8x16_t:int8x16_t, poly16x8_t:int16x8_t, uint16x8_t:int16x8_t, uint32x4_t:int32x4_t, uint64x2_t:int64x2_t
|
|
generate poly8x8_t:uint8x8_t, int8x8_t:uint8x8_t, poly16x4_t:uint16x4_t, int16x4_t:uint16x4_t, int32x2_t:uint32x2_t, int64x1_t:uint64x1_t
|
|
generate poly8x16_t:uint8x16_t, int8x16_t:uint8x16_t, poly16x8_t:uint16x8_t, int16x8_t:uint16x8_t, int32x4_t:uint32x4_t, int64x2_t:uint64x2_t
|
|
generate int8x8_t:poly8x8_t, uint8x8_t:poly8x8_t, int16x4_t:poly16x4_t, uint16x4_t:poly16x4_t
|
|
generate int8x16_t:poly8x16_t, uint8x16_t:poly8x16_t, int16x8_t:poly16x8_t, uint16x8_t:poly16x8_t
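// Rough usage sketch (assuming the usual double-suffix naming, e.g.
// vreinterpret_u8_s8 in core::arch::aarch64): the bits are left untouched and
// only the element type changes.
//
//     use core::arch::aarch64::*;
//     unsafe {
//         let s = vdup_n_s8(-1);
//         let u = vreinterpret_u8_s8(s); // every lane now reads as 255u8
//     }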
|
|
|
|
/// Vector reinterpret cast operation
|
|
name = vreinterpret
|
|
double-suffixes
|
|
fn = transmute
|
|
a = 0, 1, 2, 3, 4, 5, 6, 7
|
|
validate 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0
|
|
|
|
aarch64 = nop
|
|
generate poly64x1_t:int32x2_t, poly64x1_t:uint32x2_t
|
|
generate poly64x2_t:int32x4_t, poly64x2_t:uint32x4_t
|
|
|
|
arm = nop
|
|
generate int16x4_t:int8x8_t, uint16x4_t:int8x8_t, poly16x4_t:int8x8_t, int32x2_t:int16x4_t, uint32x2_t:int16x4_t, int64x1_t:int32x2_t, uint64x1_t:int32x2_t
|
|
generate int16x8_t:int8x16_t, uint16x8_t:int8x16_t, poly16x8_t:int8x16_t, int32x4_t:int16x8_t, uint32x4_t:int16x8_t, int64x2_t:int32x4_t, uint64x2_t:int32x4_t
|
|
generate poly16x4_t:uint8x8_t, int16x4_t:uint8x8_t, uint16x4_t:uint8x8_t, int32x2_t:uint16x4_t, uint32x2_t:uint16x4_t, int64x1_t:uint32x2_t, uint64x1_t:uint32x2_t
|
|
generate poly16x8_t:uint8x16_t, int16x8_t:uint8x16_t, uint16x8_t:uint8x16_t, int32x4_t:uint16x8_t, uint32x4_t:uint16x8_t, int64x2_t:uint32x4_t, uint64x2_t:uint32x4_t
|
|
generate poly16x4_t:poly8x8_t, int16x4_t:poly8x8_t, uint16x4_t:poly8x8_t, int32x2_t:poly16x4_t, uint32x2_t:poly16x4_t
|
|
generate poly16x8_t:poly8x16_t, int16x8_t:poly8x16_t, uint16x8_t:poly8x16_t, int32x4_t:poly16x8_t, uint32x4_t:poly16x8_t
|
|
|
|
/// Vector reinterpret cast operation
|
|
name = vreinterpret
|
|
double-suffixes
|
|
fn = transmute
|
|
a = 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0
|
|
validate 0, 1, 2, 3, 4, 5, 6, 7
|
|
|
|
aarch64 = nop
|
|
generate int32x2_t:poly64x1_t, uint32x2_t:poly64x1_t
|
|
generate int32x4_t:poly64x2_t, uint32x4_t:poly64x2_t
|
|
|
|
arm = nop
|
|
generate poly8x8_t:int16x4_t, int8x8_t:int16x4_t, uint8x8_t:int16x4_t, poly16x4_t:int32x2_t, int16x4_t:int32x2_t, uint16x4_t:int32x2_t, int32x2_t:int64x1_t, uint32x2_t:int64x1_t
|
|
generate poly8x16_t:int16x8_t, int8x16_t:int16x8_t, uint8x16_t:int16x8_t, poly16x8_t:int32x4_t, int16x8_t:int32x4_t, uint16x8_t:int32x4_t, int32x4_t:int64x2_t, uint32x4_t:int64x2_t
|
|
generate poly8x8_t:uint16x4_t, int8x8_t:uint16x4_t, uint8x8_t:uint16x4_t, poly16x4_t:uint32x2_t, int16x4_t:uint32x2_t, uint16x4_t:uint32x2_t, int32x2_t:uint64x1_t, uint32x2_t:uint64x1_t
|
|
generate poly8x16_t:uint16x8_t, int8x16_t:uint16x8_t, uint8x16_t:uint16x8_t, poly16x8_t:uint32x4_t, int16x8_t:uint32x4_t, uint16x8_t:uint32x4_t, int32x4_t:uint64x2_t, uint32x4_t:uint64x2_t
|
|
generate poly8x8_t:poly16x4_t, int8x8_t:poly16x4_t, uint8x8_t:poly16x4_t
|
|
generate poly8x16_t:poly16x8_t, int8x16_t:poly16x8_t, uint8x16_t:poly16x8_t
|
|
|
|
/// Vector reinterpret cast operation
|
|
name = vreinterpret
|
|
double-suffixes
|
|
fn = transmute
|
|
a = 0, 1, 2, 3
|
|
validate 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
|
|
|
|
aarch64 = nop
|
|
generate poly64x1_t:int16x4_t, poly64x1_t:uint16x4_t, poly64x1_t:poly16x4_t
|
|
generate poly64x2_t:int16x8_t, poly64x2_t:uint16x8_t, poly64x2_t:poly16x8_t
|
|
|
|
arm = nop
|
|
generate int32x2_t:int8x8_t, uint32x2_t:int8x8_t, int64x1_t:int16x4_t, uint64x1_t:int16x4_t
|
|
generate int32x4_t:int8x16_t, uint32x4_t:int8x16_t, int64x2_t:int16x8_t, uint64x2_t:int16x8_t
|
|
generate int32x2_t:uint8x8_t, uint32x2_t:uint8x8_t, int64x1_t:uint16x4_t, uint64x1_t:uint16x4_t
|
|
generate int32x4_t:uint8x16_t, uint32x4_t:uint8x16_t, int64x2_t:uint16x8_t, uint64x2_t:uint16x8_t
|
|
generate int32x2_t:poly8x8_t, uint32x2_t:poly8x8_t, int64x1_t:poly16x4_t, uint64x1_t:poly16x4_t
|
|
generate int32x4_t:poly8x16_t, uint32x4_t:poly8x16_t, int64x2_t:poly16x8_t, uint64x2_t:poly16x8_t
|
|
|
|
/// Vector reinterpret cast operation
|
|
name = vreinterpret
|
|
double-suffixes
|
|
fn = transmute
|
|
a = 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
|
|
validate 0, 1, 2, 3
|
|
|
|
aarch64 = nop
|
|
generate poly16x4_t:poly64x1_t, int16x4_t:poly64x1_t, uint16x4_t:poly64x1_t
|
|
generate poly16x8_t:poly64x2_t, int16x8_t:poly64x2_t, uint16x8_t:poly64x2_t
|
|
|
|
arm = nop
|
|
generate poly8x8_t:int32x2_t, int8x8_t:int32x2_t, uint8x8_t:int32x2_t, poly16x4_t:int64x1_t, int16x4_t:int64x1_t, uint16x4_t:int64x1_t
|
|
generate poly8x16_t:int32x4_t, int8x16_t:int32x4_t, uint8x16_t:int32x4_t, poly16x8_t:int64x2_t, int16x8_t:int64x2_t, uint16x8_t:int64x2_t
|
|
generate poly8x8_t:uint32x2_t, int8x8_t:uint32x2_t, uint8x8_t:uint32x2_t, poly16x4_t:uint64x1_t, int16x4_t:uint64x1_t, uint16x4_t:uint64x1_t
|
|
generate poly8x16_t:uint32x4_t, int8x16_t:uint32x4_t, uint8x16_t:uint32x4_t, poly16x8_t:uint64x2_t, int16x8_t:uint64x2_t, uint16x8_t:uint64x2_t
|
|
|
|
/// Vector reinterpret cast operation
|
|
name = vreinterpret
|
|
double-suffixes
|
|
fn = transmute
|
|
a = 0, 1
|
|
validate 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
|
|
|
|
aarch64 = nop
|
|
generate poly64x1_t:int8x8_t, poly64x1_t:uint8x8_t, poly64x1_t:poly8x8_t
|
|
generate poly64x2_t:int8x16_t, poly64x2_t:uint8x16_t, poly64x2_t:poly8x16_t
|
|
|
|
arm = nop
|
|
generate int64x1_t:int8x8_t, uint64x1_t:int8x8_t, int64x1_t:uint8x8_t, uint64x1_t:uint8x8_t, int64x1_t:poly8x8_t, uint64x1_t:poly8x8_t
|
|
generate int64x2_t:int8x16_t, uint64x2_t:int8x16_t, int64x2_t:uint8x16_t, uint64x2_t:uint8x16_t, int64x2_t:poly8x16_t, uint64x2_t:poly8x16_t
|
|
|
|
/// Vector reinterpret cast operation
|
|
name = vreinterpret
|
|
double-suffixes
|
|
fn = transmute
|
|
a = 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
|
|
validate 0, 1
|
|
|
|
aarch64 = nop
|
|
generate poly8x8_t:poly64x1_t, int8x8_t:poly64x1_t, uint8x8_t:poly64x1_t
|
|
generate poly8x16_t:poly64x2_t, int8x16_t:poly64x2_t, uint8x16_t:poly64x2_t
|
|
|
|
arm = nop
|
|
generate poly8x8_t:int64x1_t, int8x8_t:int64x1_t, uint8x8_t:int64x1_t, poly8x8_t:uint64x1_t, int8x8_t:uint64x1_t, uint8x8_t:uint64x1_t
|
|
generate poly8x16_t:int64x2_t, int8x16_t:int64x2_t, uint8x16_t:int64x2_t, poly8x16_t:uint64x2_t, int8x16_t:uint64x2_t, uint8x16_t:uint64x2_t
|
|
|
|
/// Vector reinterpret cast operation
|
|
name = vreinterpret
|
|
double-suffixes
|
|
fn = transmute
|
|
a = 0., 0., 0., 0., 0., 0., 0., 0.
|
|
validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
|
|
|
aarch64 = nop
|
|
generate float64x1_t:int8x8_t, float64x1_t:int16x4_t, float64x1_t:int32x2_t, float64x1_t:int64x1_t
|
|
generate float64x2_t:int8x16_t, float64x2_t:int16x8_t, float64x2_t:int32x4_t, float64x2_t:int64x2_t
|
|
generate float64x1_t:uint8x8_t, float64x1_t:uint16x4_t, float64x1_t:uint32x2_t, float64x1_t:uint64x1_t
|
|
generate float64x2_t:uint8x16_t, float64x2_t:uint16x8_t, float64x2_t:uint32x4_t, float64x2_t:uint64x2_t
|
|
generate float64x1_t:poly8x8_t, float64x1_t:poly16x4_t, float32x2_t:poly64x1_t, float64x1_t:poly64x1_t
|
|
generate float64x2_t:poly8x16_t, float64x2_t:poly16x8_t, float32x4_t:poly64x2_t, float64x2_t:poly64x2_t
|
|
|
|
arm = nop
|
|
generate float32x2_t:int8x8_t, float32x2_t:int16x4_t, float32x2_t:int32x2_t, float32x2_t:int64x1_t
|
|
generate float32x4_t:int8x16_t, float32x4_t:int16x8_t, float32x4_t:int32x4_t, float32x4_t:int64x2_t
|
|
generate float32x2_t:uint8x8_t, float32x2_t:uint16x4_t, float32x2_t:uint32x2_t, float32x2_t:uint64x1_t
|
|
generate float32x4_t:uint8x16_t, float32x4_t:uint16x8_t, float32x4_t:uint32x4_t, float32x4_t:uint64x2_t
|
|
generate float32x2_t:poly8x8_t, float32x2_t:poly16x4_t
|
|
generate float32x4_t:poly8x16_t, float32x4_t:poly16x8_t
|
|
|
|
/// Vector reinterpret cast operation
|
|
name = vreinterpret
|
|
double-suffixes
|
|
fn = transmute
|
|
a = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
|
validate 0., 0., 0., 0., 0., 0., 0., 0.
|
|
|
|
aarch64 = nop
|
|
generate int8x8_t:float64x1_t, int16x4_t:float64x1_t, int32x2_t:float64x1_t, int64x1_t:float64x1_t
|
|
generate int8x16_t:float64x2_t, int16x8_t:float64x2_t, int32x4_t:float64x2_t, int64x2_t:float64x2_t
|
|
generate poly8x8_t:float64x1_t, uint16x4_t:float64x1_t, uint32x2_t:float64x1_t, uint64x1_t:float64x1_t
|
|
generate poly8x16_t:float64x2_t, uint16x8_t:float64x2_t, uint32x4_t:float64x2_t, uint64x2_t:float64x2_t
|
|
generate uint8x8_t:float64x1_t, poly16x4_t:float64x1_t, poly64x1_t:float64x1_t, poly64x1_t:float32x2_t
|
|
generate uint8x16_t:float64x2_t, poly16x8_t:float64x2_t, poly64x2_t:float64x2_t, poly64x2_t:float32x4_t
|
|
|
|
arm = nop
|
|
generate int8x8_t:float32x2_t, int16x4_t:float32x2_t, int32x2_t:float32x2_t, int64x1_t:float32x2_t
|
|
generate int8x16_t:float32x4_t, int16x8_t:float32x4_t, int32x4_t:float32x4_t, int64x2_t:float32x4_t
|
|
generate uint8x8_t:float32x2_t, uint16x4_t:float32x2_t, uint32x2_t:float32x2_t, uint64x1_t:float32x2_t
|
|
generate uint8x16_t:float32x4_t, uint16x8_t:float32x4_t, uint32x4_t:float32x4_t, uint64x2_t:float32x4_t
|
|
generate poly8x8_t:float32x2_t, poly16x4_t:float32x2_t
|
|
generate poly8x16_t:float32x4_t, poly16x8_t:float32x4_t
|
|
|
|
/// Vector reinterpret cast operation
|
|
name = vreinterpret
|
|
double-suffixes
|
|
fn = transmute
|
|
a = 0., 0., 0., 0., 0., 0., 0., 0.
|
|
validate 0., 0., 0., 0., 0., 0., 0., 0.
|
|
|
|
aarch64 = nop
|
|
generate float32x2_t:float64x1_t, float64x1_t:float32x2_t
|
|
generate float32x4_t:float64x2_t, float64x2_t:float32x4_t
|
|
|
|
/// Signed rounding shift left
|
|
name = vrshl
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
|
|
validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
|
|
|
|
aarch64 = srshl
|
|
link-aarch64 = srshl._EXT_
|
|
generate i64
|
|
|
|
arm = vrshl
|
|
link-arm = vrshifts._EXT_
|
|
generate int*_t, int64x*_t
|
|
|
|
/// Unsigned rounding shift left
|
|
name = vrshl
|
|
out-suffix
|
|
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
|
|
validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
|
|
|
|
aarch64 = urshl
|
|
link-aarch64 = urshl._EXT_
|
|
generate u64:i64:u64
|
|
|
|
arm = vrshl
|
|
link-arm = vrshiftu._EXT_
|
|
generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t
|
|
generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t
|
|
|
|
/// Signed rounding shift right
|
|
name = vrshr
|
|
n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-bits
|
|
multi_fn = vrshl-self-noext, a, {vdup-nself-noext, (-N).try_into().unwrap()}
|
|
a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
|
|
n = 2
|
|
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
|
|
aarch64 = srshr
|
|
arm = vrshr
|
|
generate int*_t, int64x*_t
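// Worked example (illustration only): per the multi_fn above, a rounding shift
// right by N is a vrshl by -N, i.e. (a[i] + (1 << (N - 1))) >> N. With N = 2:
// 4 -> (4 + 2) >> 2 = 1, 8 -> 2, ..., 64 -> 16, matching the validate line.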
|
|
|
|
/// Signed rounding shift right
|
|
name = vrshr
|
|
n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-bits
|
|
multi_fn = vrshl-self-noext, a, -N as i64
|
|
a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
|
|
n = 2
|
|
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
|
|
aarch64 = srshr
|
|
generate i64
|
|
|
|
/// Unsigned rounding shift right
|
|
name = vrshr
|
|
n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-bits
|
|
multi_fn = vrshl-self-noext, a, {vdup-nsigned-noext, (-N).try_into().unwrap()}
|
|
a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
|
|
n = 2
|
|
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
|
|
aarch64 = urshr
|
|
arm = vrshr
|
|
generate uint*_t, uint64x*_t
|
|
|
|
/// Unsigned rounding shift right
|
|
name = vrshr
|
|
n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-bits
|
|
multi_fn = vrshl-self-noext, a, -N as i64
|
|
a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
|
|
n = 2
|
|
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
|
|
aarch64 = urshr
|
|
generate u64
|
|
|
|
/// Rounding shift right narrow
|
|
name = vrshrn
|
|
noq-n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-halfbits
|
|
a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
|
|
n = 2
|
|
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
arm-aarch64-separate
|
|
|
|
aarch64 = rshrn
|
|
link-aarch64 = rshrn._EXT2_
|
|
const-aarch64 = N
|
|
|
|
arm = vrshrn
|
|
link-arm = vrshiftn._EXT2_
|
|
const-arm = -N as ttn
|
|
generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t
|
|
|
|
/// Rounding shift right narrow
|
|
name = vrshrn
|
|
noq-n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-halfbits
|
|
multi_fn = transmute, {vrshrn_n-noqsigned-::<N>, transmute(a)}
|
|
a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
|
|
n = 2
|
|
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
|
|
aarch64 = rshrn
|
|
arm = vrshrn
|
|
generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t
|
|
|
|
/// Rounding shift right narrow
|
|
name = vrshrn_high
|
|
noq-n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-halfbits
|
|
multi_fn = simd_shuffle-out_len-!, a, {vrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
|
|
a = 0, 1, 8, 9, 8, 9, 10, 11
|
|
b = 32, 36, 40, 44, 48, 52, 56, 60
|
|
n = 2
|
|
validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15
|
|
|
|
aarch64 = rshrn2
|
|
generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t
|
|
generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t
|
|
|
|
/// Signed rounding shift right and accumulate
|
|
name = vrsra
|
|
n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-bits
|
|
multi_fn = simd_add, a, {vrshr-nself-::<N>, b}
|
|
a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
|
|
b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
|
|
n = 2
|
|
validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
|
|
|
|
aarch64 = srsra
|
|
arm = vrsra
|
|
generate int*_t, int64x*_t
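// Worked example (illustration only): each lane is a[i] plus the rounding shift
// of b[i], i.e. a[i] + ((b[i] + (1 << (N - 1))) >> N); with N = 2 the first lane
// is 1 + ((4 + 2) >> 2) = 2, as validated above.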
|
|
|
|
/// Unsigned rounding shift right and accumulate
|
|
name = vrsra
|
|
n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-bits
|
|
multi_fn = simd_add, a, {vrshr-nself-::<N>, b}
|
|
a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
|
|
b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
|
|
n = 2
|
|
validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
|
|
|
|
aarch64 = ursra
|
|
arm = vrsra
|
|
generate uint*_t, uint64x*_t
|
|
|
|
/// Signed rounding shift right and accumulate.
|
|
name = vrsra
|
|
n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-bits
|
|
multi_fn = vrshr-nself-::<N>, b:in_t, b
|
|
multi_fn = a + b
|
|
a = 1
|
|
b = 4
|
|
n = 2
|
|
validate 2
|
|
|
|
aarch64 = srsra
|
|
generate i64
|
|
|
|
/// Unsigned rounding shift right and accumulate.
|
|
name = vrsra
|
|
n-suffix
|
|
constn = N
|
|
multi_fn = static_assert-N-1-bits
|
|
multi_fn = vrshr-nself-::<N>, b:in_t, b
|
|
multi_fn = a + b
|
|
a = 1
|
|
b = 4
|
|
n = 2
|
|
validate 2
|
|
|
|
aarch64 = ursra
|
|
generate u64
|
|
|
|
/// Insert vector element from another vector element
|
|
name = vset_lane
|
|
constn = LANE
|
|
multi_fn = static_assert_imm-in_exp_len-LANE
|
|
multi_fn = simd_insert, b, LANE as u32, a
|
|
a = 1
|
|
b = 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
n = 0
|
|
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
|
|
|
aarch64 = nop
|
|
arm = nop
|
|
generate i8:int8x8_t:int8x8_t, i16:int16x4_t:int16x4_t
|
|
generate i32:int32x2_t:int32x2_t, i64:int64x1_t:int64x1_t
|
|
generate u8:uint8x8_t:uint8x8_t, u16:uint16x4_t:uint16x4_t
|
|
generate u32:uint32x2_t:uint32x2_t, u64:uint64x1_t:uint64x1_t
|
|
generate p8:poly8x8_t:poly8x8_t, p16:poly16x4_t:poly16x4_t
|
|
|
|
target = aes
|
|
generate p64:poly64x1_t:poly64x1_t
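// Rough usage sketch (assuming the usual suffix naming, e.g. vset_lane_s8 in
// core::arch::aarch64); LANE is the const generic checked by static_assert_imm:
//
//     use core::arch::aarch64::*;
//     unsafe {
//         let b = vdup_n_s8(7);
//         let r = vset_lane_s8::<0>(1i8, b); // lane 0 becomes 1, the rest stay 7
//     }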
|
|
|
|
/// Insert vector element from another vector element
name = vsetq_lane
no-q
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = simd_insert, b, LANE as u32, a
a = 1
b = 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
n = 0
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16

aarch64 = nop
arm = nop
generate i8:int8x16_t:int8x16_t, i16:int16x8_t:int16x8_t
generate i32:int32x4_t:int32x4_t, i64:int64x2_t:int64x2_t
generate u8:uint8x16_t:uint8x16_t, u16:uint16x8_t:uint16x8_t
generate u32:uint32x4_t:uint32x4_t, u64:uint64x2_t:uint64x2_t
generate p8:poly8x16_t:poly8x16_t, p16:poly16x8_t:poly16x8_t

target = aes
generate p64:poly64x2_t:poly64x2_t

/// Insert vector element from another vector element
name = vset_lane
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = simd_insert, b, LANE as u32, a
a = 1.
b = 0., 2., 3., 4.
n = 0
validate 1., 2., 3., 4.

aarch64 = nop
generate f64:float64x1_t:float64x1_t

arm = nop
generate f32:float32x2_t:float32x2_t

/// Insert vector element from another vector element
name = vsetq_lane
no-q
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = simd_insert, b, LANE as u32, a
a = 1.
b = 0., 2., 3., 4.
n = 0
validate 1., 2., 3., 4.

aarch64 = nop
generate f64:float64x2_t:float64x2_t

arm = nop
generate f32:float32x4_t:float32x4_t

/// Signed Shift left
name = vshl
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64

aarch64 = sshl
link-aarch64 = sshl._EXT_
arm = vshl
link-arm = vshifts._EXT_
generate int*_t, int64x*_t

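// A hedged usage sketch (not part of the spec): the generated vshl_s8 takes a
// per-lane (possibly negative) shift count in the second operand; exposed via
// core::arch::aarch64. `demo` and the values are illustrative only.
//
// #[cfg(target_arch = "aarch64")]
// #[target_feature(enable = "neon")]
// unsafe fn demo() {
//     use core::arch::aarch64::*;
//     let x = vdup_n_s8(1);
//     let shift = vdup_n_s8(2);   // per-lane shift amounts
//     let r = vshl_s8(x, shift);  // 1 << 2 = 4 in every lane
//     assert_eq!(vget_lane_s8::<0>(r), 4);
// }
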
/// Signed Shift left
name = vshl
multi_fn = transmute, {vshl-in_ntt-noext, transmute(a), transmute(b)}
a = 1
b = 2
validate 4

aarch64 = sshl
generate i64

/// Unsigned Shift left
name = vshl
out-suffix
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64

aarch64 = ushl
link-aarch64 = ushl._EXT_
arm = vshl
link-arm = vshiftu._EXT_
generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t
generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t

/// Unsigned Shift left
out-suffix
name = vshl
multi_fn = transmute, {vshl-out_ntt-noext, transmute(a), transmute(b)}
a = 1
b = 2
validate 4

aarch64 = ushl
generate u64:i64:u64

/// Shift left
name = vshl
n-suffix
constn = N
multi_fn = static_assert_imm-out_bits_exp_len-N
multi_fn = simd_shl, a, {vdup-nself-noext, N.try_into().unwrap()}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
n = 2
validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64

arm = vshl
aarch64 = shl
generate int*_t, uint*_t, int64x*_t, uint64x*_t

/// Signed shift left long
name = vshll
n-suffix
constn = N
multi_fn = static_assert-N-0-bits
multi_fn = simd_shl, {simd_cast, a}, {vdup-nout-noext, N.try_into().unwrap()}
a = 1, 2, 3, 4, 5, 6, 7, 8
n = 2
validate 4, 8, 12, 16, 20, 24, 28, 32

arm = vshll.s
aarch64 = sshll
generate int8x8_t:int16x8_t, int16x4_t:int32x4_t, int32x2_t:int64x2_t
aarch64 = ushll
generate uint8x8_t:uint16x8_t, uint16x4_t:uint32x4_t, uint32x2_t:uint64x2_t

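// A hedged usage sketch (not part of the spec): the generated vshll_n_s8 widens
// each i8 lane to i16 and then shifts left by the constant N; exposed via
// core::arch::aarch64. `demo` and the values are illustrative only.
//
// #[cfg(target_arch = "aarch64")]
// #[target_feature(enable = "neon")]
// unsafe fn demo() {
//     use core::arch::aarch64::*;
//     let x = vdup_n_s8(3);
//     let r: int16x8_t = vshll_n_s8::<2>(x);  // 3 widened to i16, then << 2 = 12
//     assert_eq!(vgetq_lane_s16::<0>(r), 12);
// }
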
/// Signed shift left long
name = vshll_high_n
no-q
constn = N
multi_fn = static_assert-N-0-bits
multi_fn = simd_shuffle-out_len-!, b:half, a, a, {asc-halflen-halflen}
multi_fn = vshll_n-noqself-::<N>, b
a = 0, 0, 1, 2, 1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 7, 8
n = 2
validate 4, 8, 12, 16, 20, 24, 28, 32

aarch64 = sshll2
generate int8x16_t:int16x8_t, int16x8_t:int32x4_t, int32x4_t:int64x2_t
aarch64 = ushll2
generate uint8x16_t:uint16x8_t, uint16x8_t:uint32x4_t, uint32x4_t:uint64x2_t

/// Shift right
name = vshr
n-suffix
constn = N
multi_fn = static_assert-N-1-bits
multi_fn = simd_shr, a, {vdup-nself-noext, N.try_into().unwrap()}
a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
n = 2
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16

arm = vshr.s
aarch64 = sshr
generate int*_t, int64x*_t
aarch64 = ushr
generate uint*_t, uint64x*_t

/// Shift right narrow
name = vshrn_n
no-q
constn = N
multi_fn = static_assert-N-1-halfbits
multi_fn = simd_cast, {simd_shr, a, {vdup-nself-noext, N.try_into().unwrap()}}
a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
n = 2
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16

arm = vshrn.
aarch64 = shrn
generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t
generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t

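// A hedged usage sketch (not part of the spec): the generated vshrn_n_s16 shifts
// each i16 lane right by the constant N and truncates the result to i8; exposed
// via core::arch::aarch64. `demo` and the values are illustrative only.
//
// #[cfg(target_arch = "aarch64")]
// #[target_feature(enable = "neon")]
// unsafe fn demo() {
//     use core::arch::aarch64::*;
//     let x = vdupq_n_s16(260);
//     let r: int8x8_t = vshrn_n_s16::<2>(x);  // 260 >> 2 = 65, then narrowed to i8
//     assert_eq!(vget_lane_s8::<0>(r), 65);
// }
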
/// Shift right narrow
name = vshrn_high_n
no-q
constn = N
multi_fn = static_assert-N-1-halfbits
multi_fn = simd_shuffle-out_len-!, a, {vshrn_n-noqself-::<N>, b}, {asc-0-out_len}
a = 1, 2, 5, 6, 5, 6, 7, 8
b = 20, 24, 28, 32, 52, 56, 60, 64
n = 2
validate 1, 2, 5, 6, 5, 6, 7, 8, 5, 6, 7, 8, 13, 14, 15, 16

aarch64 = shrn2
generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t
generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t

/// Signed shift right and accumulate
name = vsra
n-suffix
constn = N
multi_fn = static_assert-N-1-bits
multi_fn = simd_add, a, {vshr-nself-::<N>, b}
a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
n = 2
validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17

aarch64 = ssra
arm = vsra
generate int*_t, int64x*_t

/// Unsigned shift right and accumulate
name = vsra
n-suffix
constn = N
multi_fn = static_assert-N-1-bits
multi_fn = simd_add, a, {vshr-nself-::<N>, b}
a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
n = 2
validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17

aarch64 = usra
arm = vsra
generate uint*_t, uint64x*_t

/// Transpose vectors
name = vtrn1
multi_fn = simd_shuffle-in_len-!, a, b, {transpose-1-in_len}
a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
validate 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29

aarch64 = trn1
generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t

aarch64 = zip1
generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t

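// A hedged usage sketch (not part of the spec): the generated vtrn1_s8 keeps the
// even-indexed lanes of both inputs, interleaved; exposed via core::arch::aarch64.
// `demo` and the values are illustrative only.
//
// #[cfg(target_arch = "aarch64")]
// #[target_feature(enable = "neon")]
// unsafe fn demo() {
//     use core::arch::aarch64::*;
//     let a = vld1_s8([0i8, 2, 4, 6, 8, 10, 12, 14].as_ptr());
//     let b = vld1_s8([1i8, 3, 5, 7, 9, 11, 13, 15].as_ptr());
//     let r = vtrn1_s8(a, b);  // lanes: 0, 1, 4, 5, 8, 9, 12, 13
//     assert_eq!(vget_lane_s8::<1>(r), 1);
// }
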
/// Transpose vectors
name = vtrn1
multi_fn = simd_shuffle-in_len-!, a, b, {transpose-1-in_len}
a = 0., 2., 4., 6., 8., 10., 12., 14.
b = 1., 3., 5., 7., 9., 11., 13., 15.
validate 0., 1., 4., 5., 8., 9., 12., 13.

aarch64 = trn1
generate float32x4_t

aarch64 = zip1
generate float32x2_t, float64x2_t

/// Transpose vectors
name = vtrn2
multi_fn = simd_shuffle-in_len-!, a, b, {transpose-2-in_len}
a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
validate 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31

aarch64 = trn2
generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t

aarch64 = zip2
generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t

/// Transpose vectors
name = vtrn2
multi_fn = simd_shuffle-in_len-!, a, b, {transpose-2-in_len}
a = 0., 2., 4., 6., 8., 10., 12., 14.
b = 1., 3., 5., 7., 9., 11., 13., 15.
validate 2., 3., 6., 7., 10., 11., 14., 15.

aarch64 = trn2
generate float32x4_t

aarch64 = zip2
generate float32x2_t, float64x2_t

/// Zip vectors
name = vzip1
multi_fn = simd_shuffle-in_len-!, a, b, {zip-1-in_len}
a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

aarch64 = zip1
generate int*_t, int64x2_t, uint*_t, uint64x2_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t, poly64x2_t

/// Zip vectors
name = vzip1
multi_fn = simd_shuffle-in_len-!, a, b, {zip-1-in_len}
a = 0., 2., 4., 6., 8., 10., 12., 14.
b = 1., 3., 5., 7., 9., 11., 13., 15.
validate 0., 1., 2., 3., 4., 5., 6., 7.

aarch64 = zip1
generate float32x2_t, float32x4_t, float64x2_t

/// Zip vectors
name = vzip2
multi_fn = simd_shuffle-in_len-!, a, b, {zip-2-in_len}
a = 0, 16, 16, 18, 16, 18, 20, 22, 16, 18, 20, 22, 24, 26, 28, 30
b = 1, 17, 17, 19, 17, 19, 21, 23, 17, 19, 21, 23, 25, 27, 29, 31
validate 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31

aarch64 = zip2
generate int*_t, int64x2_t, uint*_t, uint64x2_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t, poly64x2_t

/// Zip vectors
name = vzip2
multi_fn = simd_shuffle-in_len-!, a, b, {zip-2-in_len}
a = 0., 8., 8., 10., 8., 10., 12., 14.
b = 1., 9., 9., 11., 9., 11., 13., 15.
validate 8., 9., 10., 11., 12., 13., 14., 15.

aarch64 = zip2
generate float32x2_t, float32x4_t, float64x2_t

/// Unzip vectors
name = vuzp1
multi_fn = simd_shuffle-in_len-!, a, b, {unzip-1-in_len}
a = 1, 0, 2, 0, 2, 0, 3, 0, 2, 0, 3, 0, 7, 0, 8, 0
b = 2, 0, 3, 0, 7, 0, 8, 0, 13, 0, 14, 0, 15, 0, 16, 0
validate 1, 2, 2, 3, 2, 3, 7, 8, 2, 3, 7, 8, 13, 14, 15, 16

aarch64 = uzp1
generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t

aarch64 = zip1
generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t

/// Unzip vectors
name = vuzp1
multi_fn = simd_shuffle-in_len-!, a, b, {unzip-1-in_len}
a = 0., 8., 1., 9., 4., 12., 5., 13.
b = 1., 10., 3., 11., 6., 14., 7., 15.
validate 0., 1., 1., 3., 4., 5., 6., 7.

aarch64 = uzp1
generate float32x4_t

aarch64 = zip1
generate float32x2_t, float64x2_t

/// Unzip vectors
name = vuzp2
multi_fn = simd_shuffle-in_len-!, a, b, {unzip-2-in_len}
a = 0, 17, 0, 18, 0, 18, 0, 19, 0, 18, 0, 19, 0, 23, 0, 24
b = 0, 18, 0, 19, 0, 23, 0, 24, 0, 29, 0, 30, 0, 31, 0, 32
validate 17, 18, 18, 19, 18, 19, 23, 24, 18, 19, 23, 24, 29, 30, 31, 32

aarch64 = uzp2
generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t

aarch64 = zip2
generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t

/// Unzip vectors
name = vuzp2
multi_fn = simd_shuffle-in_len-!, a, b, {unzip-2-in_len}
a = 0., 8., 1., 9., 4., 12., 5., 13.
b = 2., 9., 3., 11., 6., 14., 7., 15.
validate 8., 9., 9., 11., 12., 13., 14., 15.

aarch64 = uzp2
generate float32x4_t

aarch64 = zip2
generate float32x2_t, float64x2_t

////////////////////
// Unsigned Absolute difference and Accumulate Long
////////////////////

/// Unsigned Absolute difference and Accumulate Long
name = vabal
multi_fn = vabd-unsigned-noext, b, c, d:in_t
multi_fn = simd_add, a, {simd_cast, d}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12
validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20

arm = vabal.s
aarch64 = uabal
generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t

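// A hedged usage sketch (not part of the spec): the generated vabal_u8 widens
// the absolute differences |b - c| to u16 and adds them to the accumulator;
// exposed via core::arch::aarch64. `demo` and the values are illustrative only.
//
// #[cfg(target_arch = "aarch64")]
// #[target_feature(enable = "neon")]
// unsafe fn demo() {
//     use core::arch::aarch64::*;
//     let acc = vdupq_n_u16(1);
//     let b = vdup_n_u8(3);
//     let c = vdup_n_u8(10);
//     let r = vabal_u8(acc, b, c);  // 1 + |3 - 10| = 8 in every u16 lane
//     assert_eq!(vgetq_lane_u16::<0>(r), 8);
// }
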
/// Unsigned Absolute difference and Accumulate Long
name = vabal_high
no-q
multi_fn = simd_shuffle8!, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, e:uint8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = vabd_u8, d, e, f:uint8x8_t
multi_fn = simd_add, a, {simd_cast, f}
a = 9, 10, 11, 12, 13, 14, 15, 16
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12
validate 20, 20, 20, 20, 20, 20, 20, 20

aarch64 = uabal
generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t

/// Unsigned Absolute difference and Accumulate Long
name = vabal_high
no-q
multi_fn = simd_shuffle4!, d:uint16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, e:uint16x4_t, c, c, [4, 5, 6, 7]
multi_fn = vabd_u16, d, e, f:uint16x4_t
multi_fn = simd_add, a, {simd_cast, f}
a = 9, 10, 11, 12
b = 1, 2, 3, 4, 9, 10, 11, 12
c = 10, 10, 10, 10, 20, 0, 2, 4
validate 20, 20, 20, 20

aarch64 = uabal
generate uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t

/// Unsigned Absolute difference and Accumulate Long
name = vabal_high
no-q
multi_fn = simd_shuffle2!, d:uint32x2_t, b, b, [2, 3]
multi_fn = simd_shuffle2!, e:uint32x2_t, c, c, [2, 3]
multi_fn = vabd_u32, d, e, f:uint32x2_t
multi_fn = simd_add, a, {simd_cast, f}
a = 15, 16
b = 1, 2, 15, 16
c = 10, 10, 10, 12
validate 20, 20

aarch64 = uabal
generate uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t

////////////////////
// Signed Absolute difference and Accumulate Long
////////////////////

/// Signed Absolute difference and Accumulate Long
name = vabal
multi_fn = vabd-signed-noext, b, c, d:int8x8_t
multi_fn = simd_cast, e:uint8x8_t, d
multi_fn = simd_add, a, {simd_cast, e}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12
validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20

arm = vabal.s
aarch64 = sabal
generate int16x8_t:int8x8_t:int8x8_t:int16x8_t

/// Signed Absolute difference and Accumulate Long
name = vabal
multi_fn = vabd-signed-noext, b, c, d:int16x4_t
multi_fn = simd_cast, e:uint16x4_t, d
multi_fn = simd_add, a, {simd_cast, e}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12
validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20

arm = vabal.s
aarch64 = sabal
generate int32x4_t:int16x4_t:int16x4_t:int32x4_t

/// Signed Absolute difference and Accumulate Long
name = vabal
multi_fn = vabd-signed-noext, b, c, d:int32x2_t
multi_fn = simd_cast, e:uint32x2_t, d
multi_fn = simd_add, a, {simd_cast, e}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12
validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20

arm = vabal.s
aarch64 = sabal
generate int64x2_t:int32x2_t:int32x2_t:int64x2_t

/// Signed Absolute difference and Accumulate Long
name = vabal_high
no-q
multi_fn = simd_shuffle8!, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle8!, e:int8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = vabd_s8, d, e, f:int8x8_t
multi_fn = simd_cast, f:uint8x8_t, f
multi_fn = simd_add, a, {simd_cast, f}
a = 9, 10, 11, 12, 13, 14, 15, 16
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12
validate 20, 20, 20, 20, 20, 20, 20, 20

aarch64 = sabal
generate int16x8_t:int8x16_t:int8x16_t:int16x8_t

/// Signed Absolute difference and Accumulate Long
name = vabal_high
no-q
multi_fn = simd_shuffle4!, d:int16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_shuffle4!, e:int16x4_t, c, c, [4, 5, 6, 7]
multi_fn = vabd_s16, d, e, f:int16x4_t
multi_fn = simd_cast, f:uint16x4_t, f
multi_fn = simd_add, a, {simd_cast, f}
a = 9, 10, 11, 12
b = 1, 2, 3, 4, 9, 10, 11, 12
c = 10, 10, 10, 10, 20, 0, 2, 4
validate 20, 20, 20, 20

aarch64 = sabal
generate int32x4_t:int16x8_t:int16x8_t:int32x4_t

/// Signed Absolute difference and Accumulate Long
name = vabal_high
no-q
multi_fn = simd_shuffle2!, d:int32x2_t, b, b, [2, 3]
multi_fn = simd_shuffle2!, e:int32x2_t, c, c, [2, 3]
multi_fn = vabd_s32, d, e, f:int32x2_t
multi_fn = simd_cast, f:uint32x2_t, f
multi_fn = simd_add, a, {simd_cast, f}
a = 15, 16
b = 1, 2, 15, 16
c = 10, 10, 10, 12
validate 20, 20

aarch64 = sabal
generate int64x2_t:int32x4_t:int32x4_t:int64x2_t

////////////////////
// Signed saturating Absolute value
////////////////////

/// Signed saturating Absolute value
name = vqabs
a = MIN, MAX, -6, -5, -4, -3, -2, -1, 0, -127, 127, 1, 2, 3, 4, 5
validate MAX, MAX, 6, 5, 4, 3, 2, 1, 0, 127, 127, 1, 2, 3, 4, 5

arm = vqabs.s
aarch64 = sqabs
link-arm = vqabs._EXT_
link-aarch64 = sqabs._EXT_
generate int*_t

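// A hedged usage sketch (not part of the spec): the generated vqabs_s8 computes a
// saturating absolute value, so i8::MIN maps to i8::MAX instead of wrapping;
// exposed via core::arch::aarch64. `demo` is illustrative only.
//
// #[cfg(target_arch = "aarch64")]
// #[target_feature(enable = "neon")]
// unsafe fn demo() {
//     use core::arch::aarch64::*;
//     let x = vdup_n_s8(i8::MIN);
//     let r = vqabs_s8(x);  // |-128| saturates to 127
//     assert_eq!(vget_lane_s8::<0>(r), 127);
// }
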
/// Signed saturating Absolute value
name = vqabs
a = MIN, -7
validate MAX, 7

aarch64 = sqabs
link-aarch64 = sqabs._EXT_
generate int64x*_t