// ARM Neon intrinsic specification.
//
// This file contains the specification for a number of
// intrinsics that allow us to generate them along with
// their test cases.
//
// As for the syntax of the file - it's not very intelligently parsed!
//
// # Comments
// Comments start with AT LEAST two slashes, or four or more,
// so // is a comment and /////// is too.
//
// # Sections
// Sections start with EXACTLY three slashes followed
// by AT LEAST one space. Sections are used for two things:
//
// 1) they serve as the doc comment for the given intrinsics.
// 2) they reset all variables (name, fn, etc.)
//
// # Variables
//
// name - The prefix of the function; suffixes are auto
// generated by the type they get passed.
//
// fn - The function to call in rust-land.
//
// aarch64 - The intrinsic to check on the aarch64 architecture.
// If this is given but no arm intrinsic is provided,
// the function will exclusively be generated for
// aarch64.
// This is used to generate both aarch64-specific and
// shared intrinsics by first specifying only the aarch64
// variant and then the arm variant.
//
// arm - The arm v7 intrinsic used to check arm code
// generation. All neon functions available on arm are
// also available on aarch64. If no aarch64 intrinsic was
// set they are assumed to be the same.
// Intrinsics ending with a `.` will have a size suffix
// added (such as `i8` or `i64`) that is not sign specific.
// Intrinsics ending with a `.s` will have a size suffix
// added (such as `s8` or `u64`) that is sign specific.
//
// a - First input for tests; it gets scaled to the size of
// the type.
//
// b - Second input for tests; it gets scaled to the size of
// the type.
//
// # special values
//
// TRUE - 'true', all bits are set to 1
// FALSE - 'false', all bits are set to 0
// FF - same as 'true'
// MIN - minimal value (either 0 or the lowest negative number)
// MAX - maximal value (the highest positive number), prone to overflow
//
// # validate
// Validates the result computed from a and b against the expected
// result of the test. The special values 'TRUE' and 'FALSE' can be
// used to represent the correct NEON representation of true or
// false values. They too get scaled to the type.
//
// Validate needs to be called before generate as it sets
// up the rules for validation that get generated for each
// type.
//
// # generate
// The generate command generates the intrinsics; it uses the
// variables set above and can be called multiple times while
// overwriting some of the variables.
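//
// For orientation (this block is only a comment, not part of the parsed
// spec): an entry such as the `vand` one below is expanded by the generator
// into one Rust function per matching type suffix, roughly along the lines
// of the sketch here. The exact attribute set and the internal `simd_and`
// helper are assumptions about the shape of the generated code, not a
// verbatim copy of it:
//
//     #[inline]
//     #[target_feature(enable = "neon")]
//     #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))]
//     #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(and))]
//     pub unsafe fn vand_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
//         simd_and(a, b)
//     }
//
// together with a test that builds vectors from the `a` and `b` lines, calls
// the intrinsic, and compares the result against the `validate` line.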
/// Vector bitwise and
name = vand
fn = simd_and
arm = vand
aarch64 = and
a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00
b = 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F
validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00
b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
validate 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
generate int*_t, uint*_t, int64x*_t, uint64x*_t

/// Vector bitwise or (vector, inclusive)
name = vorr
fn = simd_or
arm = vorr
aarch64 = orr
a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
generate int*_t, uint*_t, int64x*_t, uint64x*_t

/// Vector bitwise exclusive or (vector)
name = veor
fn = simd_xor
arm = veor
aarch64 = eor
a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
generate int*_t, uint*_t, int64x*_t, uint64x*_t

////////////////////
// Absolute difference between the arguments
////////////////////

/// Absolute difference between the arguments
name = vabd
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
validate 15, 13, 11, 9, 7, 5, 3, 1, 1, 3, 5, 7, 9, 11, 13, 15

arm = vabd.s
aarch64 = sabd
link-arm = vabds._EXT_
link-aarch64 = sabd._EXT_
generate int*_t

arm = vabd.s
aarch64 = uabd
link-arm = vabdu._EXT_
link-aarch64 = uabd._EXT_
generate uint*_t

/// Floating-point absolute difference between the arguments
name = vabd
a = 1.0, 2.0, 5.0, -4.0
b = 9.0, 3.0, 2.0, 8.0
validate 8.0, 1.0, 3.0, 12.0

aarch64 = fabd
link-aarch64 = fabd._EXT_
generate float64x*_t

arm = vabd.s
aarch64 = fabd
link-arm = vabds._EXT_
link-aarch64 = fabd._EXT_
generate float*_t

////////////////////
// Absolute difference Long
////////////////////

/// Unsigned Absolute difference Long
name = vabdl
multi_fn = simd_cast, {vabd-unsigned-noext, a, b}
a = 1, 2, 3, 4, 4, 3, 2, 1
b = 10, 10, 10, 10, 10, 10, 10, 10
validate 9, 8, 7, 6, 6, 7, 8, 9

arm = vabdl.s
aarch64 = uabdl
generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint32x2_t:uint32x2_t:uint64x2_t

/// Signed Absolute difference Long
name = vabdl
multi_fn = simd_cast, c:uint8x8_t, {vabd-signed-noext, a, b}
multi_fn = simd_cast, c
a = 1, 2, 3, 4, 4, 3, 2, 1
b = 10, 10, 10, 10, 10, 10, 10, 10
validate 9, 8, 7, 6, 6, 7, 8, 9

arm = vabdl.s
aarch64 = sabdl
generate int8x8_t:int8x8_t:int16x8_t

/// Signed Absolute difference Long
name = vabdl
multi_fn = simd_cast, c:uint16x4_t, {vabd-signed-noext, a, b}
multi_fn = simd_cast, c
a = 1, 2, 11, 12
b = 10, 10, 10, 10
validate 9, 8, 1, 2

arm = vabdl.s
aarch64 = sabdl
generate int16x4_t:int16x4_t:int32x4_t

/// Signed Absolute difference Long
name = vabdl
multi_fn = simd_cast, c:uint32x2_t, {vabd-signed-noext, a, b}
multi_fn = simd_cast, c
a = 1, 11
b = 10, 10
validate 9, 1

arm = vabdl.s
aarch64 = sabdl
generate int32x2_t:int32x2_t:int64x2_t

/// Unsigned Absolute
difference Long name = vabdl_high no-q multi_fn = simd_shuffle8!, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15] multi_fn = simd_shuffle8!, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] multi_fn = simd_cast, {vabd_u8, c, d} a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10 validate 1, 0, 1, 2, 3, 4, 5, 6 aarch64 = uabdl generate uint8x16_t:uint8x16_t:uint16x8_t /// Unsigned Absolute difference Long name = vabdl_high no-q multi_fn = simd_shuffle4!, c:uint16x4_t, a, a, [4, 5, 6, 7] multi_fn = simd_shuffle4!, d:uint16x4_t, b, b, [4, 5, 6, 7] multi_fn = simd_cast, {vabd_u16, c, d} a = 1, 2, 3, 4, 8, 9, 11, 12 b = 10, 10, 10, 10, 10, 10, 10, 10 validate 2, 1, 1, 2 aarch64 = uabdl generate uint16x8_t:uint16x8_t:uint32x4_t /// Unsigned Absolute difference Long name = vabdl_high no-q multi_fn = simd_shuffle2!, c:uint32x2_t, a, a, [2, 3] multi_fn = simd_shuffle2!, d:uint32x2_t, b, b, [2, 3] multi_fn = simd_cast, {vabd_u32, c, d} a = 1, 2, 3, 4 b = 10, 10, 10, 10 validate 7, 6 aarch64 = uabdl generate uint32x4_t:uint32x4_t:uint64x2_t /// Signed Absolute difference Long name = vabdl_high no-q multi_fn = simd_shuffle8!, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15] multi_fn = simd_shuffle8!, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] multi_fn = simd_cast, e:uint8x8_t, {vabd_s8, c, d} multi_fn = simd_cast, e a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10 validate 1, 0, 1, 2, 3, 4, 5, 6 aarch64 = sabdl generate int8x16_t:int8x16_t:int16x8_t /// Signed Absolute difference Long name = vabdl_high no-q multi_fn = simd_shuffle4!, c:int16x4_t, a, a, [4, 5, 6, 7] multi_fn = simd_shuffle4!, d:int16x4_t, b, b, [4, 5, 6, 7] multi_fn = simd_cast, e:uint16x4_t, {vabd_s16, c, d} multi_fn = simd_cast, e a = 1, 2, 3, 4, 9, 10, 11, 12 b = 10, 10, 10, 10, 10, 10, 10, 10 validate 1, 0, 1, 2 aarch64 = sabdl generate int16x8_t:int16x8_t:int32x4_t /// Signed Absolute difference Long name = vabdl_high no-q multi_fn = simd_shuffle2!, c:int32x2_t, a, a, [2, 3] multi_fn = simd_shuffle2!, d:int32x2_t, b, b, [2, 3] multi_fn = simd_cast, e:uint32x2_t, {vabd_s32, c, d} multi_fn = simd_cast, e a = 1, 2, 3, 4 b = 10, 10, 10, 10 validate 7, 6 aarch64 = sabdl generate int32x4_t:int32x4_t:int64x2_t //////////////////// // equality //////////////////// /// Compare bitwise Equal (vector) name = vceq fn = simd_eq a = MIN, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, MAX b = MIN, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, MAX validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE a = MIN, MIN, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, MAX b = MIN, MAX, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, MIN validate TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE aarch64 = cmeq generate uint64x*_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t arm = vceq. 
generate uint*_t, int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, poly8x8_t:uint8x8_t, poly8x16_t:uint8x16_t /// Floating-point compare equal name = vceq fn = simd_eq a = 1.2, 3.4, 5.6, 7.8 b = 1.2, 3.4, 5.6, 7.8 validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE aarch64 = fcmeq generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t arm = vceq. // we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t /// Signed compare bitwise equal to zero name = vceqz fn = simd_eq a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 validate FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE aarch64 = cmeq generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly8x8_t:uint8x8_t, poly8x16_t:uint8x16_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t /// Unsigned compare bitwise equal to zero name = vceqz fn = simd_eq a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE aarch64 = cmeq generate uint*_t, uint64x*_t /// Floating-point compare bitwise equal to zero name = vceqz fn = simd_eq a = 0.0, 1.2, 3.4, 5.6 fixed = 0.0, 0.0, 0.0, 0.0 validate TRUE, FALSE, FALSE, FALSE aarch64 = fcmeq generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t /// Signed compare bitwise Test bits nonzero name = vtst multi_fn = simd_and, c:in_t, a, b multi_fn = fixed, d:in_t multi_fn = simd_ne, c, transmute(d) a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX b = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 validate TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE aarch64 = cmtst generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t arm = vtst generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, poly8x8_t:uint8x8_t, poly8x16_t:uint8x16_t /// Unsigned compare bitwise Test bits nonzero name = vtst multi_fn = simd_and, c:in_t, a, b multi_fn = fixed, d:in_t multi_fn = simd_ne, c, transmute(d) a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX b = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 validate FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE aarch64 = cmtst generate uint64x*_t arm = vtst generate uint*_t //////////////////// // Floating-point absolute value //////////////////// /// Floating-point absolute value name = vabs fn = simd_fabs a = -0.1, -2.2, -3.3, -6.6 validate 0.1, 2.2, 3.3, 6.6 aarch64 = fabs generate float64x1_t:float64x1_t, float64x2_t:float64x2_t arm = vabs generate float32x2_t:float32x2_t, float32x4_t:float32x4_t 
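//
// For entries that use `multi_fn`, the listed operations are chained in order
// to build the body of the generated function. As a rough, hedged sketch of
// what the `vtst` entries above turn into (the `simd_and`/`simd_ne` helpers
// and the exact shape are assumptions, not a verbatim copy of the generated
// code):
//
//     pub unsafe fn vtst_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t {
//         let c: int8x8_t = simd_and(a, b);                 // multi_fn = simd_and, c:in_t, a, b
//         let d: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0);  // multi_fn = fixed, d:in_t
//         simd_ne(c, transmute(d))                          // multi_fn = simd_ne, c, transmute(d)
//     }
//
// i.e. a lane compares as all-ones (TRUE) exactly when `a & b` has any bit set
// in that lane, which is what the a/b/validate vectors above exercise.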
////////////////////
// greater than
////////////////////

/// Compare signed greater than
name = vcgt
fn = simd_gt
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
aarch64 = cmgt
generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

arm = vcgt.s
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t

/// Compare unsigned higher
name = vcgt
fn = simd_gt
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
aarch64 = cmhi
generate uint64x*_t

arm = vcgt.s
generate uint*_t

/// Floating-point compare greater than
name = vcgt
fn = simd_gt
a = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9
b = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
aarch64 = fcmgt
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

arm = vcgt.s
// we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

////////////////////
// less than
////////////////////

/// Compare signed less than
name = vclt
fn = simd_lt
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
aarch64 = cmgt
generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

arm = vcgt.s
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t

/// Compare unsigned less than
name = vclt
fn = simd_lt
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
aarch64 = cmhi
generate uint64x*_t

arm = vcgt.s
generate uint*_t

/// Floating-point compare less than
name = vclt
fn = simd_lt
a = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8
b = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
aarch64 = fcmgt
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

arm = vcgt.s
// we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

////////////////////
// less than or equal
////////////////////

/// Compare signed less than or equal
name = vcle
fn = simd_le
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
aarch64 = cmge
generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

arm = vcge.s
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t

/// Compare unsigned less than or equal
name = vcle
fn = simd_le
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
aarch64 = cmhs
generate uint64x*_t

arm = vcge.s
generate uint*_t

/// Floating-point compare less than or equal
name = vcle
fn = simd_le
a = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8
b = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
aarch64 = fcmge
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
// we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t
arm = vcge.s
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

////////////////////
// greater than or equal
////////////////////

/// Compare signed greater than or equal
name = vcge
fn = simd_ge
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
aarch64 = cmge
generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

arm = vcge.s
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t

/// Compare unsigned greater than or equal
name = vcge
fn = simd_ge
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
aarch64 = cmhs
generate uint64x*_t

arm = vcge.s
generate uint*_t

/// Floating-point compare greater than or equal
name = vcge
fn = simd_ge
a = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9
b = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
aarch64 = fcmge
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

arm = vcge.s
// we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Compare signed greater than or equal to zero
name = vcgez
fn = simd_ge
a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
aarch64 = cmge
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

/// Floating-point compare greater than or equal to zero
name = vcgez
fn = simd_ge
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
validate FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
aarch64 = fcmge
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

/// Compare signed greater than zero
name = vcgtz
fn = simd_gt
a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
aarch64 = cmgt
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

/// Floating-point compare greater than zero
name = vcgtz
fn = simd_gt
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
validate FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
aarch64 = fcmgt
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

/// Compare signed less than or equal to zero
name = vclez
fn = simd_le
a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
aarch64 = cmgt
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

/// Floating-point compare less than or equal to zero
name = vclez
fn = simd_le
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
aarch64 = fcmle
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

/// Compare signed less than zero
name = vcltz
fn = simd_lt
a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
aarch64 = sshr
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

/// Floating-point compare less than zero
name = vcltz
fn = simd_lt
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
validate TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
aarch64 = fcmlt
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

/// Count leading sign bits
name = vcls
a = MIN, -1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, MAX
validate 0, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, 0
arm = vcls.s
aarch64 = cls
link-arm = vcls._EXT_
link-aarch64 = cls._EXT_
generate int*_t

/// Signed count leading zero bits
name = vclz
multi_fn = self-signed-ext, a
a = MIN, -1, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, MAX
validate 0, 0, BITS, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, 1
arm = vclz.
aarch64 = clz
generate int*_t

/// Unsigned count leading zero bits
name = vclz
multi_fn = transmute, {self-signed-ext, transmute(a)}
a = MIN, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, MAX
validate BITS, BITS, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, 0
arm = vclz.
aarch64 = clz generate uint*_t /// Floating-point absolute compare greater than name = vcagt a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7 b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8 validate TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE aarch64 = facgt link-aarch64 = facgt._EXT2_._EXT_ generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t arm = vacgt.s link-arm = vacgt._EXT2_._EXT_ generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t /// Floating-point absolute compare greater than or equal name = vcage a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7 b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8 validate TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE aarch64 = facge link-aarch64 = facge._EXT2_._EXT_ generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t arm = vacge.s link-arm = vacge._EXT2_._EXT_ generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t /// Floating-point absolute compare less than name = vcalt multi_fn = vcagt-self-noext, b, a a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7 b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8 validate FALSE, FALSE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE aarch64 = facgt generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t arm = vacgt.s generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t /// Floating-point absolute compare less than or equal name = vcale multi_fn = vcage-self-noext , b, a a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7 b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8 validate FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE aarch64 = facge generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t arm = vacge.s generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t /// Insert vector element from another vector element name = vcopy lane-suffixes constn = LANE1:LANE2 multi_fn = static_assert_imm-in0_exp_len-LANE1 multi_fn = static_assert_imm-in_exp_len-LANE2 multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2} a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 n = 0:1 validate MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 aarch64 = mov generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x2_t, int32x4_t, int64x2_t generate uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x2_t, uint32x4_t, uint64x2_t generate poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t, poly64x2_t /// Insert vector element from another vector element name = vcopy lane-suffixes constn = LANE1:LANE2 multi_fn = static_assert_imm-in0_exp_len-LANE1 multi_fn = static_assert_imm-in_exp_len-LANE2 multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2} a = 1., 2., 3., 4. b = 0., 0.5, 0., 0. n = 0:1 validate 0.5, 2., 3., 4. 
aarch64 = mov generate float32x2_t, float32x4_t, float64x2_t /// Insert vector element from another vector element name = vcopy lane-suffixes constn = LANE1:LANE2 multi_fn = static_assert_imm-in0_exp_len-LANE1 multi_fn = static_assert_imm-in_exp_len-LANE2 multi_fn = simd_shuffle-in_len-!, a:in_t, a, a, {asc-0-in_len} multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in_len-LANE2} a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 n = 0:1 validate MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 aarch64 = mov generate int8x8_t:int8x16_t:int8x8_t, int16x4_t:int16x8_t:int16x4_t, int32x2_t:int32x4_t:int32x2_t generate uint8x8_t:uint8x16_t:uint8x8_t, uint16x4_t:uint16x8_t:uint16x4_t, uint32x2_t:uint32x4_t:uint32x2_t generate poly8x8_t:poly8x16_t:poly8x8_t, poly16x4_t:poly16x8_t:poly16x4_t /// Insert vector element from another vector element name = vcopy lane-suffixes constn = LANE1:LANE2 multi_fn = static_assert_imm-in0_exp_len-LANE1 multi_fn = static_assert_imm-in_exp_len-LANE2 multi_fn = simd_shuffle-in_len-!, a:in_t, a, a, {asc-0-in_len} multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in_len-LANE2} a = 1., 2., 3., 4. b = 0., 0.5, 0., 0. n = 0:1 validate 0.5, 2., 3., 4. aarch64 = mov generate float32x2_t:float32x4_t:float32x2_t /// Insert vector element from another vector element name = vcopy lane-suffixes constn = LANE1:LANE2 multi_fn = static_assert_imm-in0_exp_len-LANE1 multi_fn = static_assert_imm-in_exp_len-LANE2 multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len} multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2} a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 n = 0:1 validate MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 aarch64 = mov generate int8x16_t:int8x8_t:int8x16_t, int16x8_t:int16x4_t:int16x8_t, int32x4_t:int32x2_t:int32x4_t generate uint8x16_t:uint8x8_t:uint8x16_t, uint16x8_t:uint16x4_t:uint16x8_t, uint32x4_t:uint32x2_t:uint32x4_t generate poly8x16_t:poly8x8_t:poly8x16_t, poly16x8_t:poly16x4_t:poly16x8_t /// Insert vector element from another vector element name = vcopy lane-suffixes constn = LANE1:LANE2 multi_fn = static_assert_imm-in0_exp_len-LANE1 multi_fn = static_assert_imm-in_exp_len-LANE2 multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len} multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2} a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 n = 1:0 validate 1, MAX aarch64 = zip1 generate int64x2_t:int64x1_t:int64x2_t, uint64x2_t:uint64x1_t:uint64x2_t, poly64x2_t:poly64x1_t:poly64x2_t /// Insert vector element from another vector element name = vcopy lane-suffixes constn = LANE1:LANE2 multi_fn = static_assert_imm-in0_exp_len-LANE1 multi_fn = static_assert_imm-in_exp_len-LANE2 multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len} multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2} a = 1., 2., 3., 4. b = 0.5, 0., 0., 0. n = 1:0 validate 1., 0.5, 3., 4. 
aarch64 = mov
generate float32x4_t:float32x2_t:float32x4_t
aarch64 = zip1
generate float64x2_t:float64x1_t:float64x2_t

/// Create a new vector from a 64-bit bit pattern
name = vcreate
out-suffix
multi_fn = transmute, a
a = 1
validate 1, 0, 0, 0, 0, 0, 0, 0

aarch64 = nop
arm = nop
generate u64:int8x8_t, u64:int16x4_t, u64:int32x2_t, u64:int64x1_t
generate u64:uint8x8_t, u64:uint16x4_t, u64:uint32x2_t, u64:uint64x1_t
generate u64:poly8x8_t, u64:poly16x4_t
target = aes
generate u64:poly64x1_t

/// Create a new vector from a 64-bit bit pattern
name = vcreate
out-suffix
multi_fn = transmute, a
a = 0
validate 0., 0.

aarch64 = nop
generate u64:float64x1_t
arm = nop
generate u64:float32x2_t

/// Fixed-point convert to floating-point
name = vcvt
double-suffixes
fn = simd_cast
a = 1, 2, 3, 4
validate 1., 2., 3., 4.

aarch64 = scvtf
generate int64x1_t:float64x1_t, int64x2_t:float64x2_t
aarch64 = ucvtf
generate uint64x1_t:float64x1_t, uint64x2_t:float64x2_t

arm = vcvt
aarch64 = scvtf
generate int32x2_t:float32x2_t, int32x4_t:float32x4_t
aarch64 = ucvtf
generate uint32x2_t:float32x2_t, uint32x4_t:float32x4_t

/// Floating-point convert to higher precision long
name = vcvt
double-suffixes
fn = simd_cast
a = -1.2, 1.2
validate -1.2f32 as f64, 1.2f32 as f64

aarch64 = fcvtl
generate float32x2_t:float64x2_t

/// Floating-point convert to higher precision long
name = vcvt_high
noq-double-suffixes
multi_fn = simd_shuffle2!, b:float32x2_t, a, a, [2, 3]
multi_fn = simd_cast, b
a = -1.2, 1.2, 2.3, 3.4
validate 2.3f32 as f64, 3.4f32 as f64

aarch64 = fcvtl
generate float32x4_t:float64x2_t

/// Floating-point convert to lower precision narrow
name = vcvt
double-suffixes
fn = simd_cast
a = -1.2, 1.2
validate -1.2f64 as f32, 1.2f64 as f32

aarch64 = fcvtn
generate float64x2_t:float32x2_t

/// Floating-point convert to lower precision narrow
name = vcvt_high
noq-double-suffixes
multi_fn = simd_shuffle4!, a, {simd_cast, b}, [0, 1, 2, 3]
a = -1.2, 1.2
b = -2.3, 3.4
validate -1.2, 1.2, -2.3f64 as f32, 3.4f64 as f32

aarch64 = fcvtn
generate float32x2_t:float64x2_t:float32x4_t

/// Floating-point convert to lower precision narrow, rounding to odd
name = vcvtx
double-suffixes
a = -1.0, 2.0
validate -1.0, 2.0

aarch64 = fcvtxn
link-aarch64 = fcvtxn._EXT2_._EXT_
generate float64x2_t:float32x2_t

/// Floating-point convert to lower precision narrow, rounding to odd
name = vcvtx_high
noq-double-suffixes
multi_fn = simd_shuffle4!, a, {vcvtx-noq_doubleself-noext, b}, [0, 1, 2, 3]
a = -1.0, 2.0
b = -3.0, 4.0
validate -1.0, 2.0, -3.0, 4.0

aarch64 = fcvtxn
generate float32x2_t:float64x2_t:float32x4_t

/// Fixed-point convert to floating-point
name = vcvt
double-n-suffixes
constn = N
multi_fn = static_assert-N-1-bits
a = 1, 2, 3, 4
n = 2
validate 0.25, 0.5, 0.75, 1.
arm-aarch64-separate

aarch64 = scvtf
link-aarch64 = vcvtfxs2fp._EXT2_._EXT_
const-aarch64 = N
generate int64x1_t:float64x1_t, int64x2_t:float64x2_t, i32:f32, i64:f64

aarch64 = ucvtf
link-aarch64 = vcvtfxu2fp._EXT2_._EXT_
const-aarch64 = N
generate uint64x1_t:float64x1_t, uint64x2_t:float64x2_t, u32:f32, u64:f64

aarch64 = scvtf
link-aarch64 = vcvtfxs2fp._EXT2_._EXT_
arm = vcvt
link-arm = vcvtfxs2fp._EXT2_._EXT_
const-arm = N:i32
generate int32x2_t:float32x2_t, int32x4_t:float32x4_t

aarch64 = ucvtf
link-aarch64 = vcvtfxu2fp._EXT2_._EXT_
arm = vcvt
link-arm = vcvtfxu2fp._EXT2_._EXT_
const-arm = N:i32
generate uint32x2_t:float32x2_t, uint32x4_t:float32x4_t

/// Floating-point convert to fixed-point, rounding toward zero
name = vcvt
double-n-suffixes
constn = N
multi_fn = static_assert-N-1-bits
a = 0.25, 0.5, 0.75, 1.
n = 2
validate 1, 2, 3, 4
arm-aarch64-separate

aarch64 = fcvtzs
link-aarch64 = vcvtfp2fxs._EXT2_._EXT_
const-aarch64 = N
generate float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64

aarch64 = fcvtzu
link-aarch64 = vcvtfp2fxu._EXT2_._EXT_
const-aarch64 = N
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64

aarch64 = fcvtzs
link-aarch64 = vcvtfp2fxs._EXT2_._EXT_
arm = vcvt
link-arm = vcvtfp2fxs._EXT2_._EXT_
const-arm = N:i32
generate float32x2_t:int32x2_t, float32x4_t:int32x4_t

aarch64 = fcvtzu
link-aarch64 = vcvtfp2fxu._EXT2_._EXT_
arm = vcvt
link-arm = vcvtfp2fxu._EXT2_._EXT_
const-arm = N:i32
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Fixed-point convert to floating-point
name = vcvt
double-suffixes
multi_fn = a as out_t
a = 1
validate 1.

aarch64 = scvtf
generate i32:f32, i64:f64
aarch64 = ucvtf
generate u32:f32, u64:f64

/// Floating-point convert to fixed-point, rounding toward zero
name = vcvt
double-suffixes
multi_fn = a as out_t
a = 1.
validate 1 aarch64 = fcvtzs generate f32:i32, f64:i64 aarch64 = fcvtzu generate f32:u32, f64:u64 /// Floating-point convert to signed fixed-point, rounding toward zero name = vcvt double-suffixes link-aarch64 = llvm.fptosi.sat._EXT2_._EXT_ a = -1.1, 2.1, -2.9, 3.9 validate -1, 2, -2, 3 aarch64 = fcvtzs generate float64x1_t:int64x1_t, float64x2_t:int64x2_t link-arm = llvm.fptosi.sat._EXT2_._EXT_ arm = vcvt generate float32x2_t:int32x2_t, float32x4_t:int32x4_t /// Floating-point convert to unsigned fixed-point, rounding toward zero name = vcvt double-suffixes link-aarch64 = llvm.fptoui.sat._EXT2_._EXT_ a = 1.1, 2.1, 2.9, 3.9 validate 1, 2, 2, 3 aarch64 = fcvtzu generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t link-arm = llvm.fptoui.sat._EXT2_._EXT_ arm = vcvt generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t /// Floating-point convert to signed integer, rounding to nearest with ties to away name = vcvta double-suffixes a = -1.1, 2.1, -2.9, 3.9 validate -1, 2, -3, 4 aarch64 = fcvtas link-aarch64 = fcvtas._EXT2_._EXT_ generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t /// Floating-point convert to integer, rounding to nearest with ties to away name = vcvta double-suffixes a = 2.9 validate 3 aarch64 = fcvtas link-aarch64 = fcvtas._EXT2_._EXT_ generate f32:i32, f64:i64 aarch64 = fcvtau link-aarch64 = fcvtau._EXT2_._EXT_ generate f32:u32, f64:u64 /// Floating-point convert to signed integer, rounding to nearest with ties to even name = vcvtn double-suffixes a = -1.5, 2.1, -2.9, 3.9 validate -2, 2, -3, 4 aarch64 = fcvtns link-aarch64 = fcvtns._EXT2_._EXT_ generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64 /// Floating-point convert to signed integer, rounding toward minus infinity name = vcvtm double-suffixes a = -1.1, 2.1, -2.9, 3.9 validate -2, 2, -3, 3 aarch64 = fcvtms link-aarch64 = fcvtms._EXT2_._EXT_ generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64 /// Floating-point convert to signed integer, rounding toward plus infinity name = vcvtp double-suffixes a = -1.1, 2.1, -2.9, 3.9 validate -1, 3, -2, 4 aarch64 = fcvtps link-aarch64 = fcvtps._EXT2_._EXT_ generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64 /// Floating-point convert to unsigned integer, rounding to nearest with ties to away name = vcvta double-suffixes a = 1.1, 2.1, 2.9, 3.9 validate 1, 2, 3, 4 aarch64 = fcvtau link-aarch64 = fcvtau._EXT2_._EXT_ generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t /// Floating-point convert to unsigned integer, rounding to nearest with ties to even name = vcvtn double-suffixes a = 1.5, 2.1, 2.9, 3.9 validate 2, 2, 3, 4 aarch64 = fcvtnu link-aarch64 = fcvtnu._EXT2_._EXT_ generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64 /// Floating-point convert to unsigned integer, rounding toward minus infinity name = vcvtm double-suffixes a = 1.1, 2.1, 2.9, 3.9 validate 1, 2, 2, 3 aarch64 = fcvtmu link-aarch64 = fcvtmu._EXT2_._EXT_ generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64 /// Floating-point convert to unsigned integer, rounding toward plus infinity name = vcvtp double-suffixes a = 1.1, 2.1, 2.9, 3.9 validate 2, 3, 3, 4 aarch64 = fcvtpu link-aarch64 = 
fcvtpu._EXT2_._EXT_ generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64 /// Set all vector lanes to the same value name = vdup lane-suffixes constn = N multi_fn = static_assert_imm-in_exp_len-N multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32} a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16 n = HFLEN validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 aarch64 = dup generate poly64x2_t, poly64x1_t:poly64x2_t arm = vdup.l generate int*_t generate int8x16_t:int8x8_t, int16x8_t:int16x4_t, int32x4_t:int32x2_t generate int8x8_t:int8x16_t, int16x4_t:int16x8_t, int32x2_t:int32x4_t generate uint*_t generate uint8x16_t:uint8x8_t, uint16x8_t:uint16x4_t, uint32x4_t:uint32x2_t generate uint8x8_t:uint8x16_t, uint16x4_t:uint16x8_t, uint32x2_t:uint32x4_t generate poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t generate poly8x16_t:poly8x8_t, poly16x8_t:poly16x4_t generate poly8x8_t:poly8x16_t, poly16x4_t:poly16x8_t /// Set all vector lanes to the same value name = vdup lane-suffixes constn = N multi_fn = static_assert_imm-in_exp_len-N multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32} a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16 n = HFLEN validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 aarch64 = dup arm = vmov generate int64x2_t, int64x1_t:int64x2_t, uint64x2_t, uint64x1_t:uint64x2_t /// Set all vector lanes to the same value name = vdup lane-suffixes constn = N multi_fn = static_assert_imm-in_exp_len-N multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32} a = 1., 1., 1., 4. n = HFLEN validate 1., 1., 1., 1. aarch64 = dup generate float64x2_t, float64x1_t:float64x2_t arm = vdup.l generate float*_t, float32x4_t:float32x2_t, float32x2_t:float32x4_t /// Set all vector lanes to the same value name = vdup lane-suffixes constn = N multi_fn = static_assert_imm-in_exp_len-N multi_fn = a a = 0 n = HFLEN validate 0 aarch64 = nop generate poly64x1_t arm = nop generate int64x1_t, uint64x1_t /// Set all vector lanes to the same value name = vdup lane-suffixes constn = N multi_fn = static_assert_imm-in_exp_len-N multi_fn = a a = 0. n = HFLEN validate 0. aarch64 = nop generate float64x1_t /// Set all vector lanes to the same value name = vdup lane-suffixes constn = N multi_fn = static_assert_imm-in_exp_len-N multi_fn = transmute--, {simd_extract, a, N as u32} a = 0, 1 n = HFLEN validate 1 aarch64 = nop generate poly64x2_t:poly64x1_t arm = vmov generate int64x2_t:int64x1_t, uint64x2_t:uint64x1_t /// Set all vector lanes to the same value name = vdup lane-suffixes constn = N multi_fn = static_assert_imm-in_exp_len-N multi_fn = transmute--, {simd_extract, a, N as u32} a = 0., 1. n = HFLEN validate 1. 
aarch64 = nop generate float64x2_t:float64x1_t /// Set all vector lanes to the same value name = vdup lane-suffixes constn = N multi_fn = static_assert_imm-in_exp_len-N multi_fn = simd_extract, a, N as u32 a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16 n = HFLEN validate 1 aarch64 = nop generate int8x8_t:i8, int8x16_t:i8, int16x4_t:i16, int16x8_t:i16, int32x2_t:i32, int32x4_t:i32, int64x1_t:i64, int64x2_t:i64 generate uint8x8_t:u8, uint8x16_t:u8, uint16x4_t:u16, uint16x8_t:u16, uint32x2_t:u32, uint32x4_t:u32, uint64x1_t:u64, uint64x2_t:u64 generate poly8x8_t:p8, poly8x16_t:p8, poly16x4_t:p16, poly16x8_t:p16 /// Set all vector lanes to the same value name = vdup lane-suffixes constn = N multi_fn = static_assert_imm-in_exp_len-N multi_fn = simd_extract, a, N as u32 a = 1., 1., 1., 4. n = HFLEN validate 1. aarch64 = nop generate float32x2_t:f32, float32x4_t:f32, float64x1_t:f64, float64x2_t:f64 /// Extract vector from pair of vectors name = vext constn = N multi_fn = static_assert_imm-out_exp_len-N multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len} a = 0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15 b = 9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11 n = HFLEN validate 8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 18, 19 arm = "vext.8" aarch64 = ext generate int*_t, uint*_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t /// Extract vector from pair of vectors name = vext constn = N multi_fn = static_assert_imm-out_exp_len-N multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len} a = 0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15 b = 9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11 n = HFLEN validate 8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 18, 19 aarch64 = ext generate poly64x2_t arm = vmov generate int64x2_t, uint64x2_t /// Extract vector from pair of vectors name = vext constn = N multi_fn = static_assert_imm-out_exp_len-N multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len} a = 0., 2., 2., 3. b = 3., 4., 5., 6., n = HFLEN validate 2., 3., 3., 4. aarch64 = ext generate float64x2_t arm = "vext.8" generate float*_t /// Multiply-add to accumulator name = vmla multi_fn = simd_add, a, {simd_mul, b, c} a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 arm = vmla. aarch64 = mla generate int*_t, uint*_t /// Floating-point multiply-add to accumulator name = vmla multi_fn = simd_add, a, {simd_mul, b, c} a = 0., 1., 2., 3. b = 2., 2., 2., 2. c = 3., 3., 3., 3. validate 6., 7., 8., 9. aarch64 = fmul generate float64x*_t arm = vmla. generate float*_t /// Vector multiply accumulate with scalar name = vmla n-suffix multi_fn = vmla-self-noext, a, b, {vdup-nself-noext, c} a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 c = 3 validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 aarch64 = mla arm = vmla. 
generate int16x4_t:int16x4_t:i16:int16x4_t, int16x8_t:int16x8_t:i16:int16x8_t, int32x2_t:int32x2_t:i32:int32x2_t, int32x4_t:int32x4_t:i32:int32x4_t generate uint16x4_t:uint16x4_t:u16:uint16x4_t, uint16x8_t:uint16x8_t:u16:uint16x8_t, uint32x2_t:uint32x2_t:u32:uint32x2_t, uint32x4_t:uint32x4_t:u32:uint32x4_t /// Vector multiply accumulate with scalar name = vmla n-suffix multi_fn = vmla-self-noext, a, b, {vdup-nself-noext, c} a = 0., 1., 2., 3. b = 2., 2., 2., 2. c = 3. validate 6., 7., 8., 9. aarch64 = fmul arm = vmla. generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t /// Vector multiply accumulate with scalar name = vmla in2-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 n = 1 validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 aarch64 = mla arm = vmla. generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t generate uint16x4_t, uint16x4_t:uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t generate uint32x2_t, uint32x2_t:uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t /// Vector multiply accumulate with scalar name = vmla in2-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} a = 0., 1., 2., 3. b = 2., 2., 2., 2. c = 0., 3., 0., 0. n = 1 validate 6., 7., 8., 9. aarch64 = fmul arm = vmla. 
generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t /// Signed multiply-add long name = vmlal multi_fn = simd_add, a, {vmull-self-noext, b, c} a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 arm = vmlal.s aarch64 = smlal generate int16x8_t:int8x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t /// Unsigned multiply-add long name = vmlal multi_fn = simd_add, a, {vmull-self-noext, b, c} a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 arm = vmlal.s aarch64 = umlal generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t /// Vector widening multiply accumulate with scalar name = vmlal n-suffix multi_fn = vmlal-self-noext, a, b, {vdup-nself-noext, c} a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 c = 3 validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 arm = vmlal.s aarch64 = smlal generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t aarch64 = umlal generate uint32x4_t:uint16x4_t:u16:uint32x4_t, uint64x2_t:uint32x2_t:u32:uint64x2_t /// Vector widening multiply accumulate with scalar name = vmlal_lane in2-suffix constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = vmlal-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 n = 1 validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 arm = vmlal.s aarch64 = smlal generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int32x4_t:int16x4_t:int16x8_t:int32x4_t generate int64x2_t:int32x2_t:int32x2_t:int64x2_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t aarch64 = umlal generate uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x4_t:uint16x8_t:uint32x4_t generate uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x2_t:uint32x4_t:uint64x2_t /// Signed multiply-add long name = vmlal_high no-q multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right} multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right} multi_fn = vmlal-noqself-noext, a, b, c a = 8, 7, 6, 5, 4, 3, 2, 1 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 validate 8, 9, 10, 11, 12, 13, 14, 15 aarch64 = smlal2 generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t /// Unsigned multiply-add long name = vmlal_high no-q multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right} multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right} multi_fn = vmlal-noqself-noext, a, b, c a = 8, 7, 6, 5, 4, 3, 2, 1 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 validate 8, 9, 10, 11, 
12, 13, 14, 15 aarch64 = umlal2 generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t /// Multiply-add long name = vmlal_high_n no-q multi_fn = vmlal_high-noqself-noext, a, b, {vdupq_n-noqself-noext, c} a = 8, 7, 6, 5, 4, 3, 2, 1 b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 c = 2 validate 8, 9, 10, 11, 12, 13, 14, 15 aarch64 = smlal2 generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t aarch64 = umlal2 generate uint32x4_t:uint16x8_t:u16:uint32x4_t, uint64x2_t:uint32x4_t:u32:uint64x2_t /// Multiply-add long name = vmlal_high_lane in2-suffix constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = vmlal_high-noqself-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} a = 8, 7, 6, 5, 4, 3, 2, 1 b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 c = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 n = 1 validate 8, 9, 10, 11, 12, 13, 14, 15 aarch64 = smlal2 generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t generate int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t aarch64 = umlal2 generate uint32x4_t:uint16x8_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t generate uint64x2_t:uint32x4_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t /// Multiply-subtract from accumulator name = vmls multi_fn = simd_sub, a, {simd_mul, b, c} a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 arm = vmls. aarch64 = mls generate int*_t, uint*_t /// Floating-point multiply-subtract from accumulator name = vmls multi_fn = simd_sub, a, {simd_mul, b, c} a = 6., 7., 8., 9. b = 2., 2., 2., 2. c = 3., 3., 3., 3. validate 0., 1., 2., 3. aarch64 = fmul generate float64x*_t arm = vmls. generate float*_t /// Vector multiply subtract with scalar name = vmls n-suffix multi_fn = vmls-self-noext, a, b, {vdup-nself-noext, c} a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 c = 3 validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 aarch64 = mls arm = vmls. generate int16x4_t:int16x4_t:i16:int16x4_t, int16x8_t:int16x8_t:i16:int16x8_t, int32x2_t:int32x2_t:i32:int32x2_t, int32x4_t:int32x4_t:i32:int32x4_t generate uint16x4_t:uint16x4_t:u16:uint16x4_t, uint16x8_t:uint16x8_t:u16:uint16x8_t, uint32x2_t:uint32x2_t:u32:uint32x2_t, uint32x4_t:uint32x4_t:u32:uint32x4_t /// Vector multiply subtract with scalar name = vmls n-suffix multi_fn = vmls-self-noext, a, b, {vdup-nself-noext, c} a = 6., 7., 8., 9. b = 2., 2., 2., 2. c = 3. validate 0., 1., 2., 3. aarch64 = fmul arm = vmls. generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t /// Vector multiply subtract with scalar name = vmls in2-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 n = 1 validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 aarch64 = mls arm = vmls. 
generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t generate uint16x4_t, uint16x4_t:uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t generate uint32x2_t, uint32x2_t:uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t /// Vector multiply subtract with scalar name = vmls in2-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} a = 6., 7., 8., 9. b = 2., 2., 2., 2. c = 0., 3., 0., 0. n = 1 validate 0., 1., 2., 3. aarch64 = fmul arm = vmls. generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t /// Signed multiply-subtract long name = vmlsl multi_fn = simd_sub, a, {vmull-self-noext, b, c} a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 arm = vmlsl.s aarch64 = smlsl generate int16x8_t:int8x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t /// Unsigned multiply-subtract long name = vmlsl multi_fn = simd_sub, a, {vmull-self-noext, b, c} a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 arm = vmlsl.s aarch64 = umlsl generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t /// Vector widening multiply subtract with scalar name = vmlsl n-suffix multi_fn = vmlsl-self-noext, a, b, {vdup-nself-noext, c} a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 c = 3 validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 arm = vmlsl.s aarch64 = smlsl generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t aarch64 = umlsl generate uint32x4_t:uint16x4_t:u16:uint32x4_t, uint64x2_t:uint32x2_t:u32:uint64x2_t /// Vector widening multiply subtract with scalar name = vmlsl_lane in2-suffix constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = vmlsl-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 n = 1 validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 arm = vmlsl.s aarch64 = smlsl generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int32x4_t:int16x4_t:int16x8_t:int32x4_t generate int64x2_t:int32x2_t:int32x2_t:int64x2_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t aarch64 = umlsl generate uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x4_t:uint16x8_t:uint32x4_t generate uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x2_t:uint32x4_t:uint64x2_t /// Signed multiply-subtract long name = vmlsl_high no-q multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right} multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right} multi_fn = vmlsl-noqself-noext, a, b, c a = 
14, 15, 16, 17, 18, 19, 20, 21 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 validate 14, 13, 12, 11, 10, 9, 8, 7 aarch64 = smlsl2 generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t /// Unsigned multiply-subtract long name = vmlsl_high no-q multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right} multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right} multi_fn = vmlsl-noqself-noext, a, b, c a = 14, 15, 16, 17, 18, 19, 20, 21 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 validate 14, 13, 12, 11, 10, 9, 8, 7 aarch64 = umlsl2 generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t /// Multiply-subtract long name = vmlsl_high_n no-q multi_fn = vmlsl_high-noqself-noext, a, b, {vdupq_n-noqself-noext, c} a = 14, 15, 16, 17, 18, 19, 20, 21 b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 c = 2 validate 14, 13, 12, 11, 10, 9, 8, 7 aarch64 = smlsl2 generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t aarch64 = umlsl2 generate uint32x4_t:uint16x8_t:u16:uint32x4_t, uint64x2_t:uint32x4_t:u32:uint64x2_t /// Multiply-subtract long name = vmlsl_high_lane in2-suffix constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = vmlsl_high-noqself-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} a = 14, 15, 16, 17, 18, 19, 20, 21 b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 c = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 n = 1 validate 14, 13, 12, 11, 10, 9, 8, 7 aarch64 = smlsl2 generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t generate int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t aarch64 = umlsl2 generate uint32x4_t:uint16x8_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t generate uint64x2_t:uint32x4_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t /// Extract narrow name = vmovn_high no-q multi_fn = simd_cast, c:in_t0, b multi_fn = simd_shuffle-out_len-!, a, c, {asc-0-out_len} a = 0, 1, 2, 3, 2, 3, 4, 5 b = 2, 3, 4, 5, 12, 13, 14, 15 validate 0, 1, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 12, 13, 14, 15 aarch64 = xtn2 generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t /// Negate name = vneg fn = simd_neg a = 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 8 validate 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7, -8 aarch64 = neg generate int64x*_t arm = vneg.s generate int*_t /// Negate name = vneg fn = simd_neg a = 0., 1., -1., 2., -2., 3., -3., 4. validate 0., -1., 1., -2., 2., -3., 3., -4. 
aarch64 = fneg generate float64x*_t arm = vneg.s generate float*_t /// Signed saturating negate name = vqneg a = MIN, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7 validate MAX, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7 link-arm = vqneg._EXT_ link-aarch64 = sqneg._EXT_ aarch64 = sqneg generate int64x*_t arm = vqneg.s generate int*_t /// Saturating subtract name = vqsub a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 validate 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26 arm = vqsub.s aarch64 = uqsub link-arm = llvm.usub.sat._EXT_ link-aarch64 = uqsub._EXT_ generate uint*_t, uint64x*_t arm = vqsub.s aarch64 = sqsub link-arm = llvm.ssub.sat._EXT_ link-aarch64 = sqsub._EXT_ generate int*_t, int64x*_t /// Saturating subtract name = vqsub multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b multi_fn = simd_extract, {vqsub-in_ntt-noext, a, b}, 0 a = 42 b = 1 validate 41 aarch64 = sqsub generate i8, i16 aarch64 = uqsub generate u8, u16 /// Saturating subtract name = vqsub a = 42 b = 1 validate 41 aarch64 = uqsub link-aarch64 = uqsub._EXT_ generate u32, u64 aarch64 = sqsub link-aarch64 = sqsub._EXT_ generate i32, i64 /// Halving add name = vhadd a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 validate 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29 arm = vhadd.s aarch64 = uhadd link-aarch64 = uhadd._EXT_ link-arm = vhaddu._EXT_ generate uint*_t arm = vhadd.s aarch64 = shadd link-aarch64 = shadd._EXT_ link-arm = vhadds._EXT_ generate int*_t /// Reverse bit order name = vrbit a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 validate 0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120 aarch64 = rbit link-aarch64 = rbit._EXT_ generate int8x8_t, int8x16_t /// Reverse bit order name = vrbit multi_fn = transmute, {vrbit-signed-noext, transmute(a)} a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 validate 0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120 aarch64 = rbit generate uint8x8_t, uint8x16_t, poly8x8_t, poly8x16_t /// Rounding halving add name = vrhadd a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 validate 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29 arm = vrhadd.s aarch64 = urhadd link-arm = vrhaddu._EXT_ link-aarch64 = urhadd._EXT_ generate uint*_t arm = vrhadd.s aarch64 = srhadd link-arm = vrhadds._EXT_ link-aarch64 = srhadd._EXT_ generate int*_t /// Floating-point round to integral exact, using current rounding mode name = vrndx a = -1.5, 0.5, 1.5, 2.5 validate -2.0, 0.0, 2.0, 2.0 aarch64 = frintx link-aarch64 = llvm.rint._EXT_ generate float*_t, float64x*_t /// Floating-point round to integral, to nearest with ties to away name = vrnda a = -1.5, 0.5, 1.5, 2.5 validate -2.0, 1.0, 2.0, 3.0 aarch64 = frinta link-aarch64 = llvm.round._EXT_ generate float*_t, float64x*_t /// Floating-point round to integral, to nearest with ties to even name = vrndn a = -1.5, 0.5, 1.5, 2.5 validate -2.0, 0.0, 2.0, 2.0 link-aarch64 = frintn._EXT_ aarch64 = frintn generate float64x*_t target = fp-armv8 arm = vrintn link-arm = vrintn._EXT_ generate float*_t /// Floating-point round to integral, toward minus infinity name = vrndm a = -1.5, 0.5, 1.5, 2.5 validate -2.0, 0.0, 1.0, 2.0 aarch64 = frintm link-aarch64 = llvm.floor._EXT_ generate 
float*_t, float64x*_t /// Floating-point round to integral, toward plus infinity name = vrndp a = -1.5, 0.5, 1.5, 2.5 validate -1.0, 1.0, 2.0, 3.0 aarch64 = frintp link-aarch64 = llvm.ceil._EXT_ generate float*_t, float64x*_t /// Floating-point round to integral, toward zero name = vrnd a = -1.5, 0.5, 1.5, 2.5 validate -1.0, 0.0, 1.0, 2.0 aarch64 = frintz link-aarch64 = llvm.trunc._EXT_ generate float*_t, float64x*_t /// Floating-point round to integral, using current rounding mode name = vrndi a = -1.5, 0.5, 1.5, 2.5 validate -2.0, 0.0, 2.0, 2.0 aarch64 = frinti link-aarch64 = llvm.nearbyint._EXT_ generate float*_t, float64x*_t /// Saturating add name = vqadd a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 validate 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58 arm = vqadd.s aarch64 = uqadd link-arm = llvm.uadd.sat._EXT_ link-aarch64 = uqadd._EXT_ generate uint*_t, uint64x*_t arm = vqadd.s aarch64 = sqadd link-arm = llvm.sadd.sat._EXT_ link-aarch64 = sqadd._EXT_ generate int*_t, int64x*_t /// Saturating add name = vqadd multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b multi_fn = simd_extract, {vqadd-in_ntt-noext, a, b}, 0 a = 42 b = 1 validate 43 aarch64 = sqadd generate i8, i16 aarch64 = uqadd generate u8, u16 /// Saturating add name = vqadd a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 validate 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58 aarch64 = uqadd link-aarch64 = uqadd._EXT_ generate u32, u64 aarch64 = sqadd link-aarch64 = sqadd._EXT_ generate i32, i64 /// Load multiple single-element structures to one, two, three, or four registers name = vld1 out-suffix a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 load_fn aarch64 = ld1 link-aarch64 = ld1x2._EXT2_ arm = vld1 link-arm = vld1x2._EXT2_ generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t, *const i64:int64x1x2_t generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t, *const i64:int64x2x2_t link-aarch64 = ld1x3._EXT2_ link-arm = vld1x3._EXT2_ generate *const i8:int8x8x3_t, *const i16:int16x4x3_t, *const i32:int32x2x3_t, *const i64:int64x1x3_t generate *const i8:int8x16x3_t, *const i16:int16x8x3_t, *const i32:int32x4x3_t, *const i64:int64x2x3_t link-aarch64 = ld1x4._EXT2_ link-arm = vld1x4._EXT2_ generate *const i8:int8x8x4_t, *const i16:int16x4x4_t, *const i32:int32x2x4_t, *const i64:int64x1x4_t generate *const i8:int8x16x4_t, *const i16:int16x8x4_t, *const i32:int32x4x4_t, *const i64:int64x2x4_t /// Load multiple single-element structures to one, two, three, or four registers name = vld1 out-suffix multi_fn = transmute, {vld1-outsigned-noext, transmute(a)} a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 validate 1, 2, 3, 4, 5, 6, 7, 
8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 load_fn aarch64 = ld1 arm = vld1 generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t, *const u64:uint64x1x2_t generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t, *const u64:uint64x2x2_t generate *const u8:uint8x8x3_t, *const u16:uint16x4x3_t, *const u32:uint32x2x3_t, *const u64:uint64x1x3_t generate *const u8:uint8x16x3_t, *const u16:uint16x8x3_t, *const u32:uint32x4x3_t, *const u64:uint64x2x3_t generate *const u8:uint8x8x4_t, *const u16:uint16x4x4_t, *const u32:uint32x2x4_t, *const u64:uint64x1x4_t generate *const u8:uint8x16x4_t, *const u16:uint16x8x4_t, *const u32:uint32x4x4_t, *const u64:uint64x2x4_t generate *const p8:poly8x8x2_t, *const p8:poly8x8x3_t, *const p8:poly8x8x4_t generate *const p8:poly8x16x2_t, *const p8:poly8x16x3_t, *const p8:poly8x16x4_t generate *const p16:poly16x4x2_t, *const p16:poly16x4x3_t, *const p16:poly16x4x4_t generate *const p16:poly16x8x2_t, *const p16:poly16x8x3_t, *const p16:poly16x8x4_t target = aes generate *const p64:poly64x1x2_t arm = ldr generate *const p64:poly64x1x3_t, *const p64:poly64x1x4_t generate *const p64:poly64x2x2_t, *const p64:poly64x2x3_t, *const p64:poly64x2x4_t /// Load multiple single-element structures to one, two, three, or four registers name = vld1 out-suffix a = 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16. validate 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16. load_fn aarch64 = ld1 link-aarch64 = ld1x2._EXT2_ generate *const f64:float64x1x2_t, *const f64:float64x2x2_t link-aarch64 = ld1x3._EXT2_ generate *const f64:float64x1x3_t, *const f64:float64x2x3_t link-aarch64 = ld1x4._EXT2_ generate *const f64:float64x1x4_t, *const f64:float64x2x4_t arm = vld1 link-aarch64 = ld1x2._EXT2_ link-arm = vld1x2._EXT2_ generate *const f32:float32x2x2_t, *const f32:float32x4x2_t link-aarch64 = ld1x3._EXT2_ link-arm = vld1x3._EXT2_ generate *const f32:float32x2x3_t, *const f32:float32x4x3_t link-aarch64 = ld1x4._EXT2_ link-arm = vld1x4._EXT2_ generate *const f32:float32x2x4_t, *const f32:float32x4x4_t /// Load multiple 2-element structures to two registers name = vld2 out-nox a = 0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 validate 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 load_fn aarch64 = ld2 link-aarch64 = ld2._EXTv2_ arm = vld2 link-arm = vld2._EXTpi82_ //generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t, *const i64:int64x1x2_t //generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t, *const i64:int64x2x2_t /// Load multiple 2-element structures to two registers name = vld2 out-nox multi_fn = transmute, {vld2-outsignednox-noext, transmute(a)} a = 0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 validate 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 load_fn aarch64 = ld2 arm = vld2 //generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t, *const u64:uint64x1x2_t //generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t, *const u64:uint64x2x2_t //generate *const p8:poly8x8x2_t, *const 
p16:poly16x4x2_t, *const p8:poly8x16x2_t, *const p16:poly16x8x2_t /// Load multiple 2-element structures to two registers name = vld2 out-nox a = 0., 1., 2., 2., 3., 2., 4., 3., 5., 2., 6., 3., 7., 4., 8., 5., 9. validate 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5., 6., 7., 8., 9. load_fn aarch64 = ld2 link-aarch64 = ld2._EXTv2_ //generate *const f64:float64x1x2_t, *const f64:float64x2x2_t arm = vld2 link-arm = vld2._EXTpi82_ //generate *const f32:float32x2x2_t, *const f32:float32x4x2_t /// Load single 2-element structure and replicate to all lanes of two registers name = vld2 out-dup-nox a = 0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 load_fn arm = vld2dup link-arm = vld2dup._EXTpi82_ aarch64 = ld2r link-aarch64 = ld2r._EXT2_ //generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t, *const i64:int64x1x2_t //generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t, *const i64:int64x2x2_t /// Load single 2-element structure and replicate to all lanes of two registers name = vld2 out-dup-nox multi_fn = transmute, {vld2-outsigneddupnox-noext, transmute(a)} a = 0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 load_fn arm = vld2dup aarch64 = ld2r //generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t, *const u64:uint64x1x2_t //generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t, *const u64:uint64x2x2_t //generate *const p8:poly8x8x2_t, *const p16:poly16x4x2_t, *const p8:poly8x16x2_t, *const p16:poly16x8x2_t /// Load single 2-element structure and replicate to all lanes of two registers name = vld2 out-dup-nox a = 0., 1., 1., 2., 3., 1., 4., 3., 5. validate 1., 1., 1., 1., 1., 1., 1., 1. 
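// The dup (ld2r) form loads a single two-element structure and broadcasts it: structure
// element 0 fills every lane of the first result vector and element 1 fills every lane of
// the second, which is why every expected lane above is 1. A semantic sketch in plain Rust
// (illustration only, not the generated intrinsic):
//
//     fn vld2_dup_sketch(p: &[f32; 2]) -> ([f32; 4], [f32; 4]) {
//         ([p[0]; 4], [p[1]; 4]) // one {x, y} pair replicated across both registers
//     }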
load_fn aarch64 = ld2r link-aarch64 = ld2r._EXT2_ //generate *const f64:float64x1x2_t, *const f64:float64x2x2_t arm = vld2dup link-arm = vld2dup._EXTpi82_ //generate *const f32:float32x2x2_t, *const f32:float32x4x2_t /// Load multiple 2-element structures to two registers name = vld2 out-lane-nox multi_fn = static_assert_imm-in_exp_len-LANE constn = LANE a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 b = 0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 n = 0 validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 load_fn arm-aarch64-separate aarch64 = ld2lane const-aarch64 = LANE link-aarch64 = ld2lane._EXTpi82_ //generate *const i64:int64x1x2_t:int64x1x2_t, *const i64:int64x2x2_t:int64x2x2_t arm = vld2lane const-arm = LANE link-arm = vld2lane._EXTpi82_ //generate *const i8:int8x8x2_t:int8x8x2_t, *const i16:int16x4x2_t:int16x4x2_t, *const i32:int32x2x2_t:int32x2x2_t //generate *const i8:int8x16x2_t:int8x16x2_t, *const i16:int16x8x2_t:int16x8x2_t, *const i32:int32x4x2_t:int32x4x2_t /// Load multiple 2-element structures to two registers name = vld2 out-lane-nox multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = transmute, {vld2-outsignedlanenox-::, transmute(a), transmute(b)} constn = LANE a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 b = 0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 n = 0 validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 load_fn arm-aarch64-separate aarch64 = ld2lane const-aarch64 = LANE target = aes //generate *const p64:poly64x1x2_t:poly64x1x2_t, *const p64:poly64x2x2_t:poly64x2x2_t target = default //generate *const u64:uint64x1x2_t:uint64x1x2_t, *const u64:uint64x2x2_t:uint64x2x2_t arm = vld2lane const-arm = LANE //generate *const u8:uint8x8x2_t:uint8x8x2_t, *const u16:uint16x4x2_t:uint16x4x2_t, *const u32:uint32x2x2_t:uint32x2x2_t //generate *const u8:uint8x16x2_t:uint8x16x2_t, *const u16:uint16x8x2_t:uint16x8x2_t, *const u32:uint32x4x2_t:uint32x4x2_t //generate *const p8:poly8x8x2_t:poly8x8x2_t, *const p16:poly16x4x2_t:poly16x4x2_t //generate *const p8:poly8x16x2_t:poly8x16x2_t, *const p16:poly16x8x2_t:poly16x8x2_t /// Load multiple 2-element structures to two registers name = vld2 out-lane-nox multi_fn = static_assert_imm-in_exp_len-LANE constn = LANE a = 0., 1., 2., 3., 4., 5., 6., 7., 8. b = 0., 2., 2., 14., 2., 16., 17., 18. n = 0 validate 1., 2., 2., 14., 2., 16., 17., 18. 
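// The lane form only replaces lane LANE of each destination register with the two loaded
// elements; every other lane keeps the value passed in through `b`, which is what the
// expected values above encode. Roughly, per register pair (illustration only):
//
//     fn vld2_lane_sketch(p: &[f32; 2], mut regs: ([f32; 2], [f32; 2]), lane: usize) -> ([f32; 2], [f32; 2]) {
//         regs.0[lane] = p[0]; // structure element 0 -> register 0, lane LANE
//         regs.1[lane] = p[1]; // structure element 1 -> register 1, lane LANE
//         regs
//     }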
load_fn arm-aarch64-separate aarch64 = ld2lane const-aarch64 = LANE link-aarch64 = ld2lane._EXTpi82_ //generate *const f64:float64x1x2_t:float64x1x2_t, *const f64:float64x2x2_t:float64x2x2_t arm = vld2lane const-arm = LANE link-arm = vld2lane._EXTpi82_ //generate *const f32:float32x2x2_t:float32x2x2_t, *const f32:float32x4x2_t:float32x4x2_t /// Store multiple single-element structures from one, two, three, or four registers name = vst1 a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 store_fn arm-aarch64-separate aarch64 = st1 link-aarch64 = st1x2._EXT3_ arm = vst1 link-arm = vst1x2._EXTr3_ generate *mut i8:int8x8x2_t:void, *mut i16:int16x4x2_t:void, *mut i32:int32x2x2_t:void, *mut i64:int64x1x2_t:void generate *mut i8:int8x16x2_t:void, *mut i16:int16x8x2_t:void, *mut i32:int32x4x2_t:void, *mut i64:int64x2x2_t:void link-aarch64 = st1x3._EXT3_ link-arm = vst1x3._EXTr3_ generate *mut i8:int8x8x3_t:void, *mut i16:int16x4x3_t:void, *mut i32:int32x2x3_t:void, *mut i64:int64x1x3_t:void generate *mut i8:int8x16x3_t:void, *mut i16:int16x8x3_t:void, *mut i32:int32x4x3_t:void, *mut i64:int64x2x3_t:void link-aarch64 = st1x4._EXT3_ link-arm = vst1x4._EXTr3_ generate *mut i8:int8x8x4_t:void, *mut i16:int16x4x4_t:void, *mut i32:int32x2x4_t:void, *mut i64:int64x1x4_t:void generate *mut i8:int8x16x4_t:void, *mut i16:int16x8x4_t:void, *mut i32:int32x4x4_t:void, *mut i64:int64x2x4_t:void /// Store multiple single-element structures to one, two, three, or four registers name = vst1 multi_fn = vst1-signed-noext, transmute(a), transmute(b) a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 store_fn aarch64 = st1 arm = vst1 generate *mut u8:uint8x8x2_t:void, *mut u16:uint16x4x2_t:void, *mut u32:uint32x2x2_t:void, *mut u64:uint64x1x2_t:void generate *mut u8:uint8x16x2_t:void, *mut u16:uint16x8x2_t:void, *mut u32:uint32x4x2_t:void, *mut u64:uint64x2x2_t:void generate *mut u8:uint8x8x3_t:void, *mut u16:uint16x4x3_t:void, *mut u32:uint32x2x3_t:void, *mut u64:uint64x1x3_t:void generate *mut u8:uint8x16x3_t:void, *mut u16:uint16x8x3_t:void, *mut u32:uint32x4x3_t:void, *mut u64:uint64x2x3_t:void generate *mut u8:uint8x8x4_t:void, *mut u16:uint16x4x4_t:void, *mut u32:uint32x2x4_t:void, *mut u64:uint64x1x4_t:void generate *mut u8:uint8x16x4_t:void, *mut u16:uint16x8x4_t:void, *mut u32:uint32x4x4_t:void, *mut u64:uint64x2x4_t:void generate *mut p8:poly8x8x2_t:void, *mut p8:poly8x8x3_t:void, *mut p8:poly8x8x4_t:void generate *mut p8:poly8x16x2_t:void, *mut p8:poly8x16x3_t:void, *mut p8:poly8x16x4_t:void generate *mut p16:poly16x4x2_t:void, *mut p16:poly16x4x3_t:void, *mut p16:poly16x4x4_t:void generate *mut p16:poly16x8x2_t:void, *mut p16:poly16x8x3_t:void, 
*mut p16:poly16x8x4_t:void /// Store multiple single-element structures to one, two, three, or four registers name = vst1 a = 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16. validate 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16. store_fn arm-aarch64-separate aarch64 = st1 link-aarch64 = st1x2._EXT3_ generate *mut f64:float64x1x2_t:void, *mut f64:float64x2x2_t:void link-aarch64 = st1x3._EXT3_ generate *mut f64:float64x1x3_t:void, *mut f64:float64x2x3_t:void link-aarch64 = st1x4._EXT3_ generate *mut f64:float64x1x4_t:void, *mut f64:float64x2x4_t:void arm = vst1 link-aarch64 = st1x2._EXT3_ link-arm = vst1x2._EXTr3_ generate *mut f32:float32x2x2_t:void, *mut f32:float32x4x2_t:void link-aarch64 = st1x3._EXT3_ link-arm = vst1x3._EXTr3_ generate *mut f32:float32x2x3_t:void, *mut f32:float32x4x3_t:void link-aarch64 = st1x4._EXT3_ link-arm = vst1x4._EXTr3_ generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void /// Multiply name = vmul a = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 validate 1, 4, 3, 8, 5, 12, 7, 16, 9, 20, 11, 24, 13, 28, 15, 32 arm = vmul. aarch64 = mul fn = simd_mul generate int*_t, uint*_t /// Polynomial multiply name = vmul a = 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3 b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 validate 1, 6, 3, 12, 5, 10, 7, 24, 9, 30, 11, 20, 13, 18, 15, 48 aarch64 = pmul link-aarch64 = pmul._EXT_ arm = vmul link-arm = vmulp._EXT_ generate poly8x8_t, poly8x16_t /// Multiply name = vmul fn = simd_mul a = 1.0, 2.0, 1.0, 2.0 b = 2.0, 3.0, 4.0, 5.0 validate 2.0, 6.0, 4.0, 10.0 aarch64 = fmul generate float64x*_t arm = vmul. generate float*_t /// Vector multiply by scalar name = vmul out-n-suffix multi_fn = simd_mul, a, {vdup-nout-noext, b} a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 2 validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32 arm = vmul aarch64 = mul generate int16x4_t:i16:int16x4_t, int16x8_t:i16:int16x8_t, int32x2_t:i32:int32x2_t, int32x4_t:i32:int32x4_t generate uint16x4_t:u16:uint16x4_t, uint16x8_t:u16:uint16x8_t, uint32x2_t:u32:uint32x2_t, uint32x4_t:u32:uint32x4_t /// Vector multiply by scalar name = vmul out-n-suffix multi_fn = simd_mul, a, {vdup-nout-noext, b} a = 1., 2., 3., 4. b = 2. validate 2., 4., 6., 8. aarch64 = fmul generate float64x1_t:f64:float64x1_t, float64x2_t:f64:float64x2_t arm = vmul generate float32x2_t:f32:float32x2_t, float32x4_t:f32:float32x4_t /// Multiply name = vmul lane-suffixes constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = simd_mul, a, {simd_shuffle-out_len-!, b, b, {dup-out_len-LANE as u32}} a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 n = 1 validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32 aarch64 = mul arm = vmul generate int16x4_t, int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x4_t:int16x8_t, int16x8_t generate int32x2_t, int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x2_t:int32x4_t, int32x4_t generate uint16x4_t, uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t generate uint32x2_t, uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t /// Floating-point multiply name = vmul lane-suffixes constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = simd_mul, a, {transmute--, {simd_extract, b, LANE as u32}} a = 1., 2., 3., 4. b = 2., 0., 0., 0. n = 0 validate 2., 4., 6., 8. 
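// Both the shuffle-based and the extract-based expansions above compute the same thing:
// every lane of `a` is multiplied by the single lane LANE of `b`. In scalar terms
// (illustration only):
//
//     fn vmul_lane_sketch(a: [f32; 4], b: [f32; 4], lane: usize) -> [f32; 4] {
//         a.map(|x| x * b[lane]) // broadcast b[lane], then an ordinary element-wise multiply
//     }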
aarch64 = fmul generate float64x1_t, float64x1_t:float64x2_t:float64x1_t /// Floating-point multiply name = vmul lane-suffixes constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = simd_mul, a, {simd_shuffle-out_len-!, b, b, {dup-out_len-LANE as u32}} a = 1., 2., 3., 4. b = 2., 0., 0., 0. n = 0 validate 2., 4., 6., 8. aarch64 = fmul generate float64x2_t:float64x1_t:float64x2_t, float64x2_t arm = vmul generate float32x2_t, float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x2_t:float32x4_t, float32x4_t /// Floating-point multiply name = vmuls_lane constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = simd_extract, b:f32, b, LANE as u32 multi_fn = a * b a = 1. b = 2., 0., 0., 0. n = 0 validate 2. aarch64 = fmul generate f32:float32x2_t:f32, f32:float32x4_t:f32 /// Floating-point multiply name = vmuld_lane constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = simd_extract, b:f64, b, LANE as u32 multi_fn = a * b a = 1. b = 2., 0. n = 0 validate 2. aarch64 = fmul generate f64:float64x1_t:f64, f64:float64x2_t:f64 /// Signed multiply long name = vmull a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 validate 1, 4, 3, 8, 5, 12, 7, 16, 9, 20, 11, 24, 13, 28, 15, 32 arm = vmull.s aarch64 = smull link-arm = vmulls._EXT_ link-aarch64 = smull._EXT_ generate int8x8_t:int8x8_t:int16x8_t, int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t /// Signed multiply long name = vmull_high no-q multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right} multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right} multi_fn = vmull-noqself-noext, a, b a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 validate 9, 20, 11, 24, 13, 28, 15, 32 aarch64 = smull2 generate int8x16_t:int8x16_t:int16x8_t, int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t /// Unsigned multiply long name = vmull a = 1, 2, 3, 4, 5, 6, 7, 8 b = 1, 2, 1, 2, 1, 2, 1, 2 validate 1, 4, 3, 8, 5, 12, 7, 16 arm = vmull.s aarch64 = umull link-arm = vmullu._EXT_ link-aarch64 = umull._EXT_ generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint32x2_t:uint32x2_t:uint64x2_t /// Unsigned multiply long name = vmull_high no-q multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right} multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right} multi_fn = vmull-noqself-noext, a, b a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 validate 9, 20, 11, 24, 13, 28, 15, 32 aarch64 = umull2 generate uint8x16_t:uint8x16_t:uint16x8_t, uint16x8_t:uint16x8_t:uint32x4_t, uint32x4_t:uint32x4_t:uint64x2_t /// Polynomial multiply long name = vmull a = 1, 2, 3, 4, 5, 6, 7, 8 b = 1, 3, 1, 3, 1, 3, 1, 3 validate 1, 6, 3, 12, 5, 10, 7, 24 arm = vmull.s aarch64 = pmull link-arm = vmullp._EXT_ link-aarch64 = pmull._EXT_ generate poly8x8_t:poly8x8_t:poly16x8_t /// Polynomial multiply long name = vmull no-q a = 15 b = 3 validate 17 target = aes aarch64 = pmull link-aarch64 = pmull64:p64:p64:p64:int8x16_t // Because of the support status of llvm, vmull_p64 is currently only available on aarch64 // arm = vmull // link-arm = vmullp.v2i64:int64x1_t:int64x1_t:int64x1_t:int64x2_t generate p64:p64:p128 /// Polynomial multiply long name = vmull_high no-q multi_fn 
= simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right} multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right} multi_fn = vmull-noqself-noext, a, b a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 b = 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3 fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 validate 9, 30, 11, 20, 13, 18, 15, 48 aarch64 = pmull generate poly8x16_t:poly8x16_t:poly16x8_t /// Polynomial multiply long name = vmull_high no-q multi_fn = vmull-noqself-noext, {simd_extract, a, 1}, {simd_extract, b, 1} a = 1, 15 b = 1, 3 validate 17 target = aes aarch64 = pmull generate poly64x2_t:poly64x2_t:p128 /// Vector long multiply with scalar name = vmull_n no-q multi_fn = vmull-in0-noext, a, {vdup-nin0-noext, b} a = 1, 2, 3, 4, 5, 6, 7, 8 b = 2 validate 2, 4, 6, 8, 10, 12, 14, 16 arm = vmull aarch64 = smull generate int16x4_t:i16:int32x4_t, int32x2_t:i32:int64x2_t aarch64 = umull generate uint16x4_t:u16:uint32x4_t, uint32x2_t:u32:uint64x2_t /// Vector long multiply by scalar name = vmull_lane constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = vmull-in0-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}} a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 n = 1 validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32 arm = vmull aarch64 = smull generate int16x4_t:int16x4_t:int32x4_t, int16x4_t:int16x8_t:int32x4_t generate int32x2_t:int32x2_t:int64x2_t, int32x2_t:int32x4_t:int64x2_t aarch64 = umull generate uint16x4_t:uint16x4_t:uint32x4_t, uint16x4_t:uint16x8_t:uint32x4_t generate uint32x2_t:uint32x2_t:uint64x2_t, uint32x2_t:uint32x4_t:uint64x2_t /// Multiply long name = vmull_high_n no-q multi_fn = vmull_high-noqself-noext, a, {vdup-nin0-noext, b} a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 b = 2 validate 18, 20, 22, 24, 26, 28, 30, 32 aarch64 = smull2 generate int16x8_t:i16:int32x4_t, int32x4_t:i32:int64x2_t aarch64 = umull2 generate uint16x8_t:u16:uint32x4_t, uint32x4_t:u32:uint64x2_t /// Multiply long name = vmull_high_lane constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = vmull_high-noqself-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}} a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 n = 1 validate 18, 20, 22, 24, 26, 28, 30, 32 aarch64 = smull2 generate int16x8_t:int16x4_t:int32x4_t, int16x8_t:int16x8_t:int32x4_t generate int32x4_t:int32x2_t:int64x2_t, int32x4_t:int32x4_t:int64x2_t aarch64 = umull2 generate uint16x8_t:uint16x4_t:uint32x4_t, uint16x8_t:uint16x8_t:uint32x4_t generate uint32x4_t:uint32x2_t:uint64x2_t, uint32x4_t:uint32x4_t:uint64x2_t /// Floating-point multiply extended name = vmulx a = 1., 2., 3., 4. b = 2., 2., 2., 2. validate 2., 4., 6., 8. aarch64 = fmulx link-aarch64 = fmulx._EXT_ generate float*_t, float64x*_t /// Floating-point multiply extended name = vmulx lane-suffixes constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = vmulx-in0-noext, a, {transmute--, {simd_extract, b, LANE as u32}} a = 1. b = 2., 0. n = 0 validate 2. aarch64 = fmulx generate float64x1_t, float64x1_t:float64x2_t:float64x1_t /// Floating-point multiply extended name = vmulx lane-suffixes constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = vmulx-in0-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}} a = 1., 2., 3., 4. b = 2., 0., 0., 0. n = 0 validate 2., 4., 6., 8. 
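// For the values used in these tests FMULX is indistinguishable from an ordinary multiply;
// it only differs in the 0.0 * infinity case, which it defines as +/-2.0 instead of NaN,
// with the sign taken as the XOR of the operand signs. A scalar sketch of that rule
// (illustration only):
//
//     fn fmulx_sketch(x: f32, y: f32) -> f32 {
//         if (x == 0.0 && y.is_infinite()) || (x.is_infinite() && y == 0.0) {
//             return if x.is_sign_negative() != y.is_sign_negative() { -2.0 } else { 2.0 };
//         }
//         x * y
//     }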
aarch64 = fmulx generate float32x2_t, float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x2_t:float32x4_t, float32x4_t generate float64x2_t:float64x1_t:float64x2_t, float64x2_t /// Floating-point multiply extended name = vmulx a = 2. b = 3. validate 6. aarch64 = fmulx link-aarch64 = fmulx._EXT_ generate f32, f64 /// Floating-point multiply extended name = vmulx lane-suffixes constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = vmulx-out-noext, a, {simd_extract, b, LANE as u32} a = 2. b = 3., 0., 0., 0. n = 0 validate 6. aarch64 = fmulx generate f32:float32x2_t:f32, f32:float32x4_t:f32, f64:float64x1_t:f64, f64:float64x2_t:f64 /// Floating-point fused Multiply-Add to accumulator(vector) name = vfma multi_fn = vfma-self-_, b, c, a a = 8.0, 18.0, 12.0, 10.0 b = 6.0, 4.0, 7.0, 8.0 c = 2.0, 3.0, 4.0, 5.0 validate 20.0, 30.0, 40.0, 50.0 link-aarch64 = llvm.fma._EXT_ aarch64 = fmadd generate float64x1_t aarch64 = fmla generate float64x2_t target = vfp4 arm = vfma link-arm = llvm.fma._EXT_ generate float*_t /// Floating-point fused Multiply-Add to accumulator(vector) name = vfma n-suffix multi_fn = vfma-self-noext, a, b, {vdup-nselfvfp4-noext, c} a = 2.0, 3.0, 4.0, 5.0 b = 6.0, 4.0, 7.0, 8.0 c = 8.0 validate 50.0, 35.0, 60.0, 69.0 aarch64 = fmadd generate float64x1_t:float64x1_t:f64:float64x1_t aarch64 = fmla generate float64x2_t:float64x2_t:f64:float64x2_t target = vfp4 arm = vfma generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t /// Floating-point fused multiply-add to accumulator name = vfma in2-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = vfma-out-noext, a, b, {vdup-nout-noext, {simd_extract, c, LANE as u32}} a = 2., 3., 4., 5. b = 6., 4., 7., 8. c = 2., 0., 0., 0. n = 0 validate 14., 11., 18., 21. aarch64 = fmla generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t aarch64 = fmadd generate float64x1_t aarch64 = fmla generate float64x1_t:float64x1_t:float64x2_t:float64x1_t, float64x2_t:float64x2_t:float64x1_t:float64x2_t, float64x2_t /// Floating-point fused multiply-add to accumulator name = vfma in2-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = simd_extract, c:out_t, c, LANE as u32 multi_fn = vfma-in2lane-_, b, c, a a = 2. b = 6. c = 3., 0., 0., 0. n = 0 validate 20. 
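// These scalar forms lower to llvm.fma, i.e. b * c + a evaluated with a single rounding,
// which is what Rust's mul_add computes. With the test inputs above (a = 2, b = 6,
// c[LANE] = 3) that gives 6 * 3 + 2 = 20 (illustration only):
//
//     fn vfma_scalar_sketch(a: f32, b: f32, c: f32) -> f32 {
//         b.mul_add(c, a) // fused multiply-add: (b * c) + a, rounded once
//     }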
aarch64 = fmla link-aarch64 = llvm.fma._EXT_:f32:f32:f32:f32 generate f32:f32:float32x2_t:f32, f32:f32:float32x4_t:f32 link-aarch64 = llvm.fma._EXT_:f64:f64:f64:f64 aarch64 = fmadd generate f64:f64:float64x1_t:f64 aarch64 = fmla generate f64:f64:float64x2_t:f64 /// Floating-point fused multiply-subtract from accumulator name = vfms multi_fn = simd_neg, b:in_t, b multi_fn = vfma-self-noext, a, b, c a = 20.0, 30.0, 40.0, 50.0 b = 6.0, 4.0, 7.0, 8.0 c = 2.0, 3.0, 4.0, 5.0 validate 8.0, 18.0, 12.0, 10.0 aarch64 = fmsub generate float64x1_t aarch64 = fmls generate float64x2_t target = vfp4 arm = vfms generate float*_t /// Floating-point fused Multiply-subtract to accumulator(vector) name = vfms n-suffix multi_fn = vfms-self-noext, a, b, {vdup-nselfvfp4-noext, c} a = 50.0, 35.0, 60.0, 69.0 b = 6.0, 4.0, 7.0, 8.0 c = 8.0 validate 2.0, 3.0, 4.0, 5.0 aarch64 = fmsub generate float64x1_t:float64x1_t:f64:float64x1_t aarch64 = fmls generate float64x2_t:float64x2_t:f64:float64x2_t target = vfp4 arm = vfms generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t /// Floating-point fused multiply-subtract to accumulator name = vfms in2-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = vfms-out-noext, a, b, {vdup-nout-noext, {simd_extract, c, LANE as u32}} a = 14., 11., 18., 21. b = 6., 4., 7., 8. c = 2., 0., 0., 0. n = 0 validate 2., 3., 4., 5. aarch64 = fmls generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t aarch64 = fmsub generate float64x1_t aarch64 = fmls generate float64x1_t:float64x1_t:float64x2_t:float64x1_t, float64x2_t:float64x2_t:float64x1_t:float64x2_t, float64x2_t /// Floating-point fused multiply-subtract to accumulator name = vfms in2-lane-suffixes constn = LANE multi_fn = vfma-in2lane-::, a, -b, c a = 14. b = 6. c = 2., 0., 0., 0. n = 0 validate 2. aarch64 = fmls generate f32:f32:float32x2_t:f32, f32:f32:float32x4_t:f32 aarch64 = fmsub generate f64:f64:float64x1_t:f64 aarch64 = fmls generate f64:f64:float64x2_t:f64 /// Divide name = vdiv fn = simd_div a = 2.0, 6.0, 4.0, 10.0 b = 1.0, 2.0, 1.0, 2.0 validate 2.0, 3.0, 4.0, 5.0 aarch64 = fdiv generate float*_t, float64x*_t /// Subtract name = vsub a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 validate 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 arm = vsub. aarch64 = sub fn = simd_sub generate int*_t, uint*_t, int64x*_t, uint64x*_t /// Subtract name = vsub fn = simd_sub a = 1.0, 4.0, 3.0, 8.0 b = 1.0, 2.0, 3.0, 4.0 validate 0.0, 2.0, 0.0, 4.0 aarch64 = fsub generate float64x*_t arm = vsub. 
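// Each generate line expands the current variables once per listed type; as a rough sketch
// (attribute spelling simplified, not the literal emitted code), the f32 variant of this
// subtraction comes out as something like:
//
//     #[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub"))]
//     #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fsub))]
//     pub unsafe fn vsub_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
//         simd_sub(a, b)
//     }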
generate float*_t /// Signed Add Long across Vector name = vaddlv a = 1, 2, 3, 4 validate 10 aarch64 = saddlv link-aarch64 = llvm.aarch64.neon.saddlv.i32._EXT_ generate int16x4_t:i32 /// Signed Add Long across Vector name = vaddlv a = 1, 2, 3, 4, 5, 6, 7, 8 validate 36 aarch64 = saddlv link-aarch64 = llvm.aarch64.neon.saddlv.i32._EXT_ generate int16x8_t:i32 /// Signed Add Long across Vector name = vaddlv a = 1, 2 validate 3 aarch64 = saddlp link-aarch64 = llvm.aarch64.neon.saddlv.i64._EXT_ generate int32x2_t:i64 /// Signed Add Long across Vector name = vaddlv a = 1, 2, 3, 4 validate 10 aarch64 = saddlv link-aarch64 = llvm.aarch64.neon.saddlv.i64._EXT_ generate int32x4_t:i64 /// Unsigned Add Long across Vector name = vaddlv a = 1, 2, 3, 4 validate 10 aarch64 = uaddlv link-aarch64 = llvm.aarch64.neon.uaddlv.i32._EXT_ generate uint16x4_t:u32 /// Unsigned Add Long across Vector name = vaddlv a = 1, 2, 3, 4, 5, 6, 7, 8 validate 36 aarch64 = uaddlv link-aarch64 = llvm.aarch64.neon.uaddlv.i32._EXT_ generate uint16x8_t:u32 /// Unsigned Add Long across Vector name = vaddlv a = 1, 2 validate 3 aarch64 = uaddlp link-aarch64 = llvm.aarch64.neon.uaddlv.i64._EXT_ generate uint32x2_t:u64 /// Unsigned Add Long across Vector name = vaddlv a = 1, 2, 3, 4 validate 10 aarch64 = uaddlv link-aarch64 = llvm.aarch64.neon.uaddlv.i64._EXT_ generate uint32x4_t:u64 /// Subtract returning high narrow name = vsubhn no-q multi_fn = fixed, c:in_t multi_fn = simd_cast, {simd_shr, {simd_sub, a, b}, transmute(c)} a = MAX, MIN, 1, 1, MAX, MIN, 1, 1 b = 1, 0, 0, 0, 1, 0, 0, 0 fixed = HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS validate MAX, MIN, 0, 0, MAX, MIN, 0, 0 arm = vsubhn aarch64 = subhn generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t /// Subtract returning high narrow name = vsubhn_high no-q multi_fn = vsubhn-noqself-noext, d:in_t0, b, c multi_fn = simd_shuffle-out_len-!, a, d, {asc-0-out_len} a = MAX, 0, MAX, 0, MAX, 0, MAX, 0 b = MAX, 1, MAX, 1, MAX, 1, MAX, 1 c = 1, 0, 1, 0, 1, 0, 1, 0 validate MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0 arm = vsubhn aarch64 = subhn2 generate int8x8_t:int16x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int64x2_t:int32x4_t generate uint8x8_t:uint16x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint64x2_t:uint32x4_t /// Signed halving subtract name = vhsub a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 validate 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 arm = vhsub.s aarch64 = uhsub link-arm = vhsubu._EXT_ link-aarch64 = uhsub._EXT_ generate uint*_t arm = vhsub.s aarch64 = shsub link-arm = vhsubs._EXT_ link-aarch64 = shsub._EXT_ generate int*_t /// Signed Subtract Wide name = vsubw no-q multi_fn = simd_sub, a, {simd_cast, b} a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16 b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16 validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 arm = vsubw aarch64 = ssubw generate int16x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int64x2_t /// Unsigned Subtract Wide name = vsubw no-q multi_fn = simd_sub, a, {simd_cast, b} a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16 b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16 validate 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 arm = vsubw aarch64 = usubw generate uint16x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint64x2_t /// Signed Subtract Wide name = vsubw_high no-q multi_fn = simd_shuffle8!, c:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] multi_fn = simd_sub, a, {simd_cast, c} a = 8, 9, 10, 12, 13, 14, 15, 16 b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16 validate 0, 0, 0, 0, 0, 0, 0, 0 aarch64 = ssubw generate int16x8_t:int8x16_t:int16x8_t /// Signed Subtract Wide name = vsubw_high no-q multi_fn = simd_shuffle4!, c:int16x4_t, b, b, [4, 5, 6, 7] multi_fn = simd_sub, a, {simd_cast, c} a = 8, 9, 10, 11 b = 0, 1, 2, 3, 8, 9, 10, 11 validate 0, 0, 0, 0 aarch64 = ssubw generate int32x4_t:int16x8_t:int32x4_t /// Signed Subtract Wide name = vsubw_high no-q multi_fn = simd_shuffle2!, c:int32x2_t, b, b, [2, 3] multi_fn = simd_sub, a, {simd_cast, c} a = 8, 9 b = 6, 7, 8, 9 validate 0, 0 aarch64 = ssubw generate int64x2_t:int32x4_t:int64x2_t /// Unsigned Subtract Wide name = vsubw_high no-q multi_fn = simd_shuffle8!, c:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] multi_fn = simd_sub, a, {simd_cast, c} a = 8, 9, 10, 11, 12, 13, 14, 15 b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 validate 0, 0, 0, 0, 0, 0, 0, 0 aarch64 = usubw generate uint16x8_t:uint8x16_t:uint16x8_t /// Unsigned Subtract Wide name = vsubw_high no-q multi_fn = simd_shuffle4!, c:uint16x4_t, b, b, [4, 5, 6, 7] multi_fn = simd_sub, a, {simd_cast, c} a = 8, 9, 10, 11 b = 0, 1, 2, 3, 8, 9, 10, 11 validate 0, 0, 0, 0 aarch64 = usubw generate uint32x4_t:uint16x8_t:uint32x4_t /// Unsigned Subtract Wide name = vsubw_high no-q multi_fn = simd_shuffle2!, c:uint32x2_t, b, b, [2, 3] multi_fn = simd_sub, a, {simd_cast, c} a = 8, 9 b = 6, 7, 8, 9 validate 0, 0 aarch64 = usubw generate uint64x2_t:uint32x4_t:uint64x2_t /// Signed Subtract Long name = vsubl no-q multi_fn = simd_cast, c:out_t, a multi_fn = simd_cast, d:out_t, b multi_fn = simd_sub, c, d a = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 b = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 arm = vsubl aarch64 = ssubl generate int8x8_t:int8x8_t:int16x8_t, int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t /// Unsigned Subtract Long name = vsubl no-q multi_fn = simd_cast, c:out_t, a multi_fn = simd_cast, d:out_t, b multi_fn = simd_sub, c, d a = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 b = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 arm = vsubl aarch64 = usubl generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint32x2_t:uint32x2_t:uint64x2_t /// Signed Subtract Long name = vsubl_high no-q multi_fn = simd_shuffle8!, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15] multi_fn = simd_cast, d:out_t, c multi_fn = simd_shuffle8!, e:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] multi_fn = simd_cast, f:out_t, e multi_fn = simd_sub, d, f a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 b = 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2 validate 6, 7, 8, 9, 10, 11, 12, 13 aarch64 = ssubl generate int8x16_t:int8x16_t:int16x8_t /// Signed Subtract Long name = vsubl_high no-q multi_fn = simd_shuffle4!, c:int16x4_t, a, a, [4, 5, 6, 7] multi_fn = simd_cast, d:out_t, c multi_fn = simd_shuffle4!, e:int16x4_t, b, b, [4, 5, 6, 7] multi_fn = simd_cast, f:out_t, e multi_fn = simd_sub, d, f a = 8, 9, 10, 11, 12, 13, 14, 15 b = 6, 6, 6, 
6, 8, 8, 8, 8 validate 4, 5, 6, 7 aarch64 = ssubl generate int16x8_t:int16x8_t:int32x4_t /// Signed Subtract Long name = vsubl_high no-q multi_fn = simd_shuffle2!, c:int32x2_t, a, a, [2, 3] multi_fn = simd_cast, d:out_t, c multi_fn = simd_shuffle2!, e:int32x2_t, b, b, [2, 3] multi_fn = simd_cast, f:out_t, e multi_fn = simd_sub, d, f a = 12, 13, 14, 15 b = 6, 6, 8, 8 validate 6, 7 aarch64 = ssubl generate int32x4_t:int32x4_t:int64x2_t /// Unsigned Subtract Long name = vsubl_high no-q multi_fn = simd_shuffle8!, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15] multi_fn = simd_cast, d:out_t, c multi_fn = simd_shuffle8!, e:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] multi_fn = simd_cast, f:out_t, e multi_fn = simd_sub, d, f a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 b = 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2 validate 6, 7, 8, 9, 10, 11, 12, 13 aarch64 = usubl generate uint8x16_t:uint8x16_t:uint16x8_t /// Unsigned Subtract Long name = vsubl_high no-q multi_fn = simd_shuffle4!, c:uint16x4_t, a, a, [4, 5, 6, 7] multi_fn = simd_cast, d:out_t, c multi_fn = simd_shuffle4!, e:uint16x4_t, b, b, [4, 5, 6, 7] multi_fn = simd_cast, f:out_t, e multi_fn = simd_sub, d, f a = 8, 9, 10, 11, 12, 13, 14, 15 b = 6, 6, 6, 6, 8, 8, 8, 8 validate 4, 5, 6, 7 aarch64 = usubl generate uint16x8_t:uint16x8_t:uint32x4_t /// Unsigned Subtract Long name = vsubl_high no-q multi_fn = simd_shuffle2!, c:uint32x2_t, a, a, [2, 3] multi_fn = simd_cast, d:out_t, c multi_fn = simd_shuffle2!, e:uint32x2_t, b, b, [2, 3] multi_fn = simd_cast, f:out_t, e multi_fn = simd_sub, d, f a = 12, 13, 14, 15 b = 6, 6, 8, 8 validate 6, 7 aarch64 = usubl generate uint32x4_t:uint32x4_t:uint64x2_t /// Maximum (vector) name = vmax a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 validate 16, 15, 14, 13, 12, 11, 10, 9, 9, 10, 11, 12, 13, 14, 15, 16 arm = vmax aarch64 = smax link-arm = vmaxs._EXT_ link-aarch64 = smax._EXT_ generate int*_t arm = vmax aarch64 = umax link-arm = vmaxu._EXT_ link-aarch64 = umax._EXT_ generate uint*_t /// Maximum (vector) name = vmax a = 1.0, -2.0, 3.0, -4.0 b = 0.0, 3.0, 2.0, 8.0 validate 1.0, 3.0, 3.0, 8.0 aarch64 = fmax link-aarch64 = fmax._EXT_ generate float64x*_t arm = vmax aarch64 = fmax link-arm = vmaxs._EXT_ link-aarch64 = fmax._EXT_ generate float*_t /// Floating-point Maximum Number (vector) name = vmaxnm a = 1.0, 2.0, 3.0, -4.0 b = 8.0, 16.0, -1.0, 6.0 validate 8.0, 16.0, 3.0, 6.0 aarch64 = fmaxnm link-aarch64 = fmaxnm._EXT_ generate float64x*_t target = fp-armv8 arm = vmaxnm aarch64 = fmaxnm link-arm = vmaxnm._EXT_ link-aarch64 = fmaxnm._EXT_ generate float*_t /// Floating-point Maximum Number Pairwise (vector). name = vpmaxnm a = 1.0, 2.0 b = 6.0, -3.0 validate 2.0, 6.0 aarch64 = fmaxnmp link-aarch64 = fmaxnmp._EXT_ generate float32x2_t:float32x2_t:float32x2_t, float64x2_t:float64x2_t:float64x2_t /// Floating-point Maximum Number Pairwise (vector).
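// Pairwise here means adjacent lanes are reduced within each source vector and the two
// partial results are concatenated; the NM ("maximum number") variants also prefer the
// numeric operand when one input is NaN. For the four-lane case below the output is
// [max(a0,a1), max(a2,a3), max(b0,b1), max(b2,b3)], which is how the expected values of
// the next entry are derived. A sketch (illustration only):
//
//     fn vpmaxnm_sketch(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
//         [a[0].max(a[1]), a[2].max(a[3]), b[0].max(b[1]), b[2].max(b[3])]
//     }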
name = vpmaxnm a = 1.0, 2.0, 3.0, -4.0 b = 8.0, 16.0, -1.0, 6.0 validate 2.0, 3.0, 16.0, 6.0 aarch64 = fmaxnmp link-aarch64 = fmaxnmp._EXT_ generate float32x4_t:float32x4_t:float32x4_t /// Minimum (vector) name = vmin a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 validate 1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1 arm = vmin aarch64 = smin link-arm = vmins._EXT_ link-aarch64 = smin._EXT_ generate int*_t arm = vmin aarch64 = umin link-arm = vminu._EXT_ link-aarch64 = umin._EXT_ generate uint*_t /// Minimum (vector) name = vmin a = 1.0, -2.0, 3.0, -4.0 b = 0.0, 3.0, 2.0, 8.0 validate 0.0, -2.0, 2.0, -4.0 aarch64 = fmin link-aarch64 = fmin._EXT_ generate float64x*_t arm = vmin aarch64 = fmin link-arm = vmins._EXT_ link-aarch64 = fmin._EXT_ generate float*_t /// Floating-point Minimum Number (vector) name = vminnm a = 1.0, 2.0, 3.0, -4.0 b = 8.0, 16.0, -1.0, 6.0 validate 1.0, 2.0, -1.0, -4.0 aarch64 = fminnm link-aarch64 = fminnm._EXT_ generate float64x*_t target = fp-armv8 arm = vminnm aarch64 = fminnm link-arm = vminnm._EXT_ link-aarch64 = fminnm._EXT_ generate float*_t /// Floating-point Minimum Number Pairwise (vector). name = vpminnm a = 1.0, 2.0 b = 6.0, -3.0 validate 1.0, -3.0 aarch64 = fminnmp link-aarch64 = fminnmp._EXT_ generate float32x2_t:float32x2_t:float32x2_t, float64x2_t:float64x2_t:float64x2_t /// Floating-point Minimum Number Pairwise (vector). name = vpminnm a = 1.0, 2.0, 3.0, -4.0 b = 8.0, 16.0, -1.0, 6.0 validate 1.0, -4.0, 8.0, -1.0 aarch64 = fminnmp link-aarch64 = fminnmp._EXT_ generate float32x4_t:float32x4_t:float32x4_t /// Signed saturating doubling multiply long name = vqdmull a = 0, 1, 2, 3, 4, 5, 6, 7 b = 1, 2, 3, 4, 5, 6, 7, 8 validate 0, 4, 12, 24, 40, 60, 84, 108 aarch64 = sqdmull link-aarch64 = sqdmull._EXT2_ arm = vqdmull link-arm = vqdmull._EXT2_ generate int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t /// Signed saturating doubling multiply long name = vqdmull multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b multi_fn = simd_extract, {vqdmull-in_ntt-noext, a, b}, 0 a = 2 b = 3 validate 12 aarch64 = sqdmull generate i16:i16:i32 /// Signed saturating doubling multiply long name = vqdmull a = 2 b = 3 validate 12 aarch64 = sqdmull link-aarch64 = sqdmulls.scalar generate i32:i32:i64 /// Vector saturating doubling long multiply with scalar name = vqdmull_n no-q multi_fn = vqdmull-in_ntt-noext, a, {vdup_n-in_ntt-noext, b} a = 2, 4, 6, 8 b = 2 validate 8, 16, 24, 32 aarch64 = sqdmull arm = vqdmull generate int16x4_t:i16:int32x4_t, int32x2_t:i32:int64x2_t /// Signed saturating doubling multiply long name = vqdmull_high no-q multi_fn = simd_shuffle-out_len-!, a:half, a, a, {asc-halflen-halflen} multi_fn = simd_shuffle-out_len-!, b:half, b, b, {asc-halflen-halflen} multi_fn = vqdmull-noqself-noext, a, b a = 0, 1, 4, 5, 4, 5, 6, 7 b = 1, 2, 5, 6, 5, 6, 7, 8 validate 40, 60, 84, 112 aarch64 = sqdmull2 generate int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t /// Signed saturating doubling multiply long name = vqdmull_high_n no-q multi_fn = simd_shuffle-out_len-!, a:in_ntt, a, a, {asc-out_len-out_len} multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b multi_fn = vqdmull-in_ntt-noext, a, b a = 0, 2, 8, 10, 8, 10, 12, 14 b = 2 validate 32, 40, 48, 56 aarch64 = sqdmull2 generate int16x8_t:i16:int32x4_t, int32x4_t:i32:int64x2_t /// Vector saturating doubling long multiply by scalar name = vqdmull_lane constn = N multi_fn =
static_assert_imm-in_exp_len-N multi_fn = simd_shuffle-out_len-!, b:in_t0, b, b, {dup-out_len-N as u32} multi_fn = vqdmull-noqself-noext, a, b a = 1, 2, 3, 4 b = 0, 2, 2, 0, 2, 0, 0, 0 n = HFLEN validate 4, 8, 12, 16 aarch64 = sqdmull generate int16x4_t:int16x8_t:int32x4_t, int32x2_t:int32x4_t:int64x2_t arm = vqdmull generate int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t /// Signed saturating doubling multiply long name = vqdmullh_lane constn = N multi_fn = static_assert_imm-in_exp_len-N multi_fn = simd_extract, b:in_t0, b, N as u32 multi_fn = vqdmullh-noqself-noext, a, b a = 2 b = 0, 2, 2, 0, 2, 0, 0, 0 n = HFLEN validate 8 aarch64 = sqdmull generate i16:int16x4_t:i32, i16:int16x8_t:i32 /// Signed saturating doubling multiply long name = vqdmulls_lane constn = N multi_fn = static_assert_imm-in_exp_len-N multi_fn = simd_extract, b:in_t0, b, N as u32 multi_fn = vqdmulls-noqself-noext, a, b a = 2 b = 0, 2, 2, 0, 2, 0, 0, 0 n = HFLEN validate 8 aarch64 = sqdmull generate i32:int32x2_t:i64, i32:int32x4_t:i64 /// Signed saturating doubling multiply long name = vqdmull_high_lane constn = N multi_fn = static_assert_imm-in_exp_len-N multi_fn = simd_shuffle-out_len-!, a:in_t, a, a, {asc-out_len-out_len} multi_fn = simd_shuffle-out_len-!, b:in_t, b, b, {dup-out_len-N as u32} multi_fn = vqdmull-self-noext, a, b a = 0, 1, 4, 5, 4, 5, 6, 7 b = 0, 2, 2, 0, 2, 0, 0, 0 n = HFLEN validate 16, 20, 24, 28 aarch64 = sqdmull2 generate int16x8_t:int16x4_t:int32x4_t, int32x4_t:int32x2_t:int64x2_t /// Signed saturating doubling multiply long name = vqdmull_high_lane constn = N multi_fn = static_assert_imm-in_exp_len-N multi_fn = simd_shuffle-out_len-!, a:half, a, a, {asc-out_len-out_len} multi_fn = simd_shuffle-out_len-!, b:half, b, b, {dup-out_len-N as u32} multi_fn = vqdmull-noqself-noext, a, b a = 0, 1, 4, 5, 4, 5, 6, 7 b = 0, 2, 2, 0, 2, 0, 0, 0 n = HFLEN validate 16, 20, 24, 28 aarch64 = sqdmull2 generate int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t /// Signed saturating doubling multiply-add long name = vqdmlal multi_fn = vqadd-out-noext, a, {vqdmull-self-noext, b, c} a = 1, 1, 1, 1 b = 1, 2, 3, 4 c = 2, 2, 2, 2 validate 5, 9, 13, 17 aarch64 = sqdmlal arm = vqdmlal generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t /// Vector widening saturating doubling multiply accumulate with scalar name = vqdmlal n-suffix multi_fn = vqadd-out-noext, a, {vqdmull_n-self-noext, b, c} a = 1, 1, 1, 1 b = 1, 2, 3, 4 c = 2 validate 5, 9, 13, 17 aarch64 = sqdmlal arm = vqdmlal generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t /// Signed saturating doubling multiply-add long name = vqdmlal_high no-q multi_fn = vqadd-out-noext, a, {vqdmull_high-noqself-noext, b, c} a = 1, 2, 3, 4 b = 0, 1, 4, 5, 4, 5, 6, 7 c = 1, 2, 5, 6, 5, 6, 7, 8 validate 41, 62, 87, 116 aarch64 = sqdmlal2 generate int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t /// Signed saturating doubling multiply-add long name = vqdmlal_high_n no-q multi_fn = vqadd-out-noext, a, {vqdmull_high_n-noqself-noext, b, c} a = 1, 2, 3, 4 b = 0, 2, 8, 10, 8, 10, 12, 14 c = 2 validate 33, 42, 51, 60 aarch64 = sqdmlal2 generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t /// Vector widening saturating doubling multiply accumulate with scalar name = vqdmlal_lane in2-suffix constn = N multi_fn = static_assert_imm-in2_exp_len-N multi_fn = vqadd-out-noext, a, {vqdmull_lane-in2-::, b, c} a = 1, 2, 3, 4 b = 1, 2, 3, 4 c = 0, 2, 2, 0, 
2, 0, 0, 0 n = HFLEN validate 5, 10, 15, 20 aarch64 = sqdmlal generate int32x4_t:int16x4_t:int16x8_t:int32x4_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t arm = vqdmlal generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t /// Signed saturating doubling multiply-add long name = vqdmlal_high_lane in2-suffix constn = N multi_fn = static_assert_imm-in2_exp_len-N multi_fn = vqadd-out-noext, a, {vqdmull_high_lane-in2-::, b, c} a = 1, 2, 3, 4 b = 0, 1, 4, 5, 4, 5, 6, 7 c = 0, 2, 0, 0, 0, 0, 0, 0 n = 1 validate 17, 22, 27, 32 aarch64 = sqdmlal2 generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t: int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t /// Signed saturating doubling multiply-subtract long name = vqdmlsl multi_fn = vqsub-out-noext, a, {vqdmull-self-noext, b, c} a = 3, 7, 11, 15 b = 1, 2, 3, 4 c = 2, 2, 2, 2 validate -1, -1, -1, -1 aarch64 = sqdmlsl arm = vqdmlsl generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t /// Vector widening saturating doubling multiply subtract with scalar name = vqdmlsl n-suffix multi_fn = vqsub-out-noext, a, {vqdmull_n-self-noext, b, c} a = 3, 7, 11, 15 b = 1, 2, 3, 4 c = 2 validate -1, -1, -1, -1 aarch64 = sqdmlsl arm = vqdmlsl generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t /// Signed saturating doubling multiply-subtract long name = vqdmlsl_high no-q multi_fn = vqsub-out-noext, a, {vqdmull_high-noqself-noext, b, c} a = 39, 58, 81, 108 b = 0, 1, 4, 5, 4, 5, 6, 7 c = 1, 2, 5, 6, 5, 6, 7, 8 validate -1, -2, -3, -4 aarch64 = sqdmlsl2 generate int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t /// Signed saturating doubling multiply-subtract long name = vqdmlsl_high_n no-q multi_fn = vqsub-out-noext, a, {vqdmull_high_n-noqself-noext, b, c} a = 31, 38, 45, 52 b = 0, 2, 8, 10, 8, 10, 12, 14 c = 2 validate -1, -2, -3, -4 aarch64 = sqdmlsl2 generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t /// Vector widening saturating doubling multiply subtract with scalar name = vqdmlsl_lane in2-suffix constn = N multi_fn = static_assert_imm-in2_exp_len-N multi_fn = vqsub-out-noext, a, {vqdmull_lane-in2-::, b, c} a = 3, 6, 9, 12 b = 1, 2, 3, 4 c = 0, 2, 2, 0, 2, 0, 0, 0 n = HFLEN validate -1, -2, -3, -4 aarch64 = sqdmlsl generate int32x4_t:int16x4_t:int16x8_t:int32x4_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t arm = vqdmlsl generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t /// Signed saturating doubling multiply-subtract long name = vqdmlsl_high_lane in2-suffix constn = N multi_fn = static_assert_imm-in2_exp_len-N multi_fn = vqsub-out-noext, a, {vqdmull_high_lane-in2-::, b, c} a = 15, 18, 21, 24 b = 0, 1, 4, 5, 4, 5, 6, 7 c = 0, 2, 0, 0, 0, 0, 0, 0 n = 1 validate -1, -2, -3, -4 aarch64 = sqdmlsl2 generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t: int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t /// Signed saturating doubling multiply returning high half name = vqdmulh a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX b = 2, 2, 2, 2, 2, 2, 2, 2 validate 1, 1, 1, 1, 1, 1, 1, 1 aarch64 = sqdmulh link-aarch64 = sqdmulh._EXT_ arm = vqdmulh link-arm = vqdmulh._EXT_ generate int16x4_t, int16x8_t, int32x2_t, int32x4_t /// Signed saturating doubling multiply returning high half name = vqdmulh multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a multi_fn = 
vdup_n-in_ntt-noext, b:in_ntt, b multi_fn = simd_extract, {vqdmulh-in_ntt-noext, a, b}, 0 a = 1 b = 2 validate 0 aarch64 = sqdmulh generate i16, i32 /// Vector saturating doubling multiply high with scalar name = vqdmulh_n out-suffix multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b multi_fn = vqdmulh-out-noext, a, b a = MAX, MAX, MAX, MAX b = 2 validate 1, 1, 1, 1 aarch64 = sqdmulh arm = vqdmulh generate int16x4_t:i16:int16x4_t, int32x2_t:i32:int32x2_t /// Vector saturating doubling multiply high with scalar name = vqdmulhq_n no-q multi_fn = vdupq_n-in_ntt-noext, b:out_t, b multi_fn = vqdmulh-out-noext, a, b a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX b = 2 validate 1, 1, 1, 1, 1, 1, 1, 1 aarch64 = sqdmulh arm = vqdmulh generate int16x8_t:i16:int16x8_t, int32x4_t:i32:int32x4_t /// Signed saturating doubling multiply returning high half name = vqdmulhh_lane constn = N multi_fn = static_assert_imm-in_exp_len-N multi_fn = simd_extract, b:in_t0, b, N as u32 multi_fn = vqdmulhh-out_ntt-noext, a, b a = 2 b = 0, 0, MAX, 0, 0, 0, 0, 0 n = 2 validate 1 aarch64 = sqdmulh generate i16:int16x4_t:i16, i16:int16x8_t:i16 /// Signed saturating doubling multiply returning high half name = vqdmulhs_lane constn = N multi_fn = static_assert_imm-in_exp_len-N multi_fn = simd_extract, b:in_t0, b, N as u32 multi_fn = vqdmulhs-out_ntt-noext, a, b a = 2 b = 0, MAX, 0, 0 n = 1 validate 1 aarch64 = sqdmulh generate i32:int32x2_t:i32, i32:int32x4_t:i32 /// Signed saturating extract narrow name = vqmovn no-q a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX validate MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX aarch64 = sqxtn link-aarch64 = sqxtn._EXT2_ arm = vqmovn link-arm = vqmovns._EXT2_ generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t /// Unsigned saturating extract narrow name = vqmovn no-q a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX validate MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX aarch64 = uqxtn link-aarch64 = uqxtn._EXT2_ arm = vqmovn link-arm = vqmovnu._EXT2_ generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t /// Saturating extract narrow name = vqmovn multi_fn = simd_extract, {vqmovn-in_ntt-noext, {vdupq_n-in_ntt-noext, a}}, 0 a = 1 validate 1 aarch64 = sqxtn generate i16:i8, i32:i16 aarch64 = uqxtn generate u16:u8, u32:u16 /// Saturating extract narrow name = vqmovn a = 1 validate 1 aarch64 = sqxtn link-aarch64 = scalar.sqxtn._EXT2_._EXT_ generate i64:i32 aarch64 = uqxtn link-aarch64 = scalar.uqxtn._EXT2_._EXT_ generate u64:u32 /// Signed saturating extract narrow name = vqmovn_high no-q multi_fn = simd_shuffle-out_len-!, a, {vqmovn-noqself-noext, b}, {asc-0-out_len} a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX validate MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX aarch64 = sqxtn2 generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t aarch64 = uqxtn2 generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t /// Signed saturating extract unsigned narrow name = vqmovun no-q a = -1, -1, -1, -1, -1, -1, -1, -1 validate 0, 0, 0, 0, 0, 0, 0, 0 aarch64 = sqxtun link-aarch64 = sqxtun._EXT2_ arm = vqmovun link-arm = vqmovnsu._EXT2_ generate int16x8_t:uint8x8_t, int32x4_t:uint16x4_t, int64x2_t:uint32x2_t /// Signed saturating extract unsigned narrow name = vqmovun multi_fn = simd_extract, {vqmovun-in_ntt-noext, {vdupq_n-in_ntt-noext, a}}, 0 a = 1 validate 1 aarch64 = sqxtun generate i16:u8, i32:u16, i64:u32 /// Signed 
saturating extract unsigned narrow name = vqmovun_high no-q multi_fn = simd_shuffle-out_len-!, a, {vqmovun-noqself-noext, b}, {asc-0-out_len} a = 0, 0, 0, 0, 0, 0, 0, 0 b = -1, -1, -1, -1, -1, -1, -1, -1 validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 aarch64 = sqxtun2 generate uint8x8_t:int16x8_t:uint8x16_t, uint16x4_t:int32x4_t:uint16x8_t, uint32x2_t:int64x2_t:uint32x4_t /// Signed saturating rounding doubling multiply returning high half name = vqrdmulh a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX b = 2, 2, 2, 2, 2, 2, 2, 2 validate 2, 2, 2, 2, 2, 2, 2, 2 aarch64 = sqrdmulh link-aarch64 = sqrdmulh._EXT_ arm = vqrdmulh link-arm = vqrdmulh._EXT_ generate int16x4_t, int16x8_t, int32x2_t, int32x4_t /// Signed saturating rounding doubling multiply returning high half name = vqrdmulh multi_fn = simd_extract, {vqrdmulh-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 a = 1 b = 2 validate 0 aarch64 = sqrdmulh generate i16, i32 /// Vector saturating rounding doubling multiply high with scalar name = vqrdmulh out-n-suffix multi_fn = vqrdmulh-out-noext, a, {vdup-nout-noext, b} a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX b = 2 validate 2, 2, 2, 2, 2, 2, 2, 2 aarch64 = sqrdmulh arm = vqrdmulh generate int16x4_t:i16:int16x4_t, int16x8_t:i16:int16x8_t, int32x2_t:i32:int32x2_t, int32x4_t:i32:int32x4_t /// Vector rounding saturating doubling multiply high by scalar name = vqrdmulh lane-suffixes constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = simd_shuffle-out_len-!, b:out_t, b, b, {dup-out_len-LANE as u32} multi_fn = vqrdmulh-out-noext, a, b a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX b = 0, 2, 0, 0, 0, 0, 0, 0, n = 1 validate 2, 2, 2, 2, 2, 2, 2, 2 aarch64 = sqrdmulh arm = vqrdmulh generate int16x4_t, int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x4_t:int16x8_t, int16x8_t generate int32x2_t, int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x2_t:int32x4_t, int32x4_t /// Signed saturating rounding doubling multiply returning high half name = vqrdmulh lane-suffixes constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = vqrdmulh-out-noext, a, {simd_extract, b, LANE as u32} a = 1 b = 0, 2, 0, 0, 0, 0, 0, 0, n = 1 validate 0 aarch64 = sqrdmulh generate i16:int16x4_t:i16, i16:int16x8_t:i16, i32:int32x2_t:i32, i32:int32x4_t:i32 /// Signed saturating rounding doubling multiply accumulate returning high half name = vqrdmlah multi_fn = vqadd-out-noext, a, {vqrdmulh-out-noext, b, c} a = 1, 1, 1, 1, 1, 1, 1, 1 b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX c = 2, 2, 2, 2, 2, 2, 2, 2 validate 3, 3, 3, 3, 3, 3, 3, 3 aarch64 = sqrdmulh arm = vqrdmulh generate int16x4_t, int16x8_t, int32x2_t, int32x4_t /// Signed saturating rounding doubling multiply accumulate returning high half name = vqrdmlah multi_fn = vqadd-self-noext, a, {vqrdmulh-self-noext, b, c} a = 1 b = 1 c = 2 validate 1 aarch64 = sqrdmulh generate i16, i32 /// Signed saturating rounding doubling multiply accumulate returning high half name = vqrdmlah in2-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = vqadd-self-noext, a, {vqrdmulh-in2lane-::, b, c} a = 1, 1, 1, 1, 1, 1, 1, 1 b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX c = 0, 2, 0, 0, 0, 0, 0, 0 n = 1 validate 3, 3, 3, 3, 3, 3, 3, 3 aarch64 = sqrdmulh arm = vqrdmulh generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t /// Signed saturating rounding 
doubling multiply accumulate returning high half name = vqrdmlah in2-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = vqadd-self-noext, a, {vqrdmulh-in2lane-::, b, c} a = 1 b = 1 c = 0, 2, 0, 0, 0, 0, 0, 0 n = 1 validate 1 aarch64 = sqrdmulh generate i16:i16:int16x4_t:i16, i16:i16:int16x8_t:i16, i32:i32:int32x2_t:i32, i32:i32:int32x4_t:i32 /// Signed saturating rounding doubling multiply subtract returning high half name = vqrdmlsh multi_fn = vqsub-out-noext, a, {vqrdmulh-out-noext, b, c} a = 1, 1, 1, 1, 1, 1, 1, 1 b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX c = 2, 2, 2, 2, 2, 2, 2, 2 validate -1, -1, -1, -1, -1, -1, -1, -1 aarch64 = sqrdmulh arm = vqrdmulh generate int16x4_t, int16x8_t, int32x2_t, int32x4_t /// Signed saturating rounding doubling multiply subtract returning high half name = vqrdmlsh multi_fn = vqsub-self-noext, a, {vqrdmulh-self-noext, b, c} a = 1 b = 1 c = 2 validate 1 aarch64 = sqrdmulh generate i16, i32 /// Signed saturating rounding doubling multiply subtract returning high half name = vqrdmlsh in2-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = vqsub-self-noext, a, {vqrdmulh-in2lane-::, b, c} a = 1, 1, 1, 1, 1, 1, 1, 1 b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX c = 0, 2, 0, 0, 0, 0, 0, 0 n = 1 validate -1, -1, -1, -1, -1, -1, -1, -1 aarch64 = sqrdmulh arm = vqrdmulh generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t /// Signed saturating rounding doubling multiply subtract returning high half name = vqrdmlsh in2-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE multi_fn = vqsub-self-noext, a, {vqrdmulh-in2lane-::, b, c} a = 1 b = 1 c = 0, 2, 0, 0, 0, 0, 0, 0 n = 1 validate 1 aarch64 = sqrdmulh generate i16:i16:int16x4_t:i16, i16:i16:int16x8_t:i16, i32:i32:int32x2_t:i32, i32:i32:int32x4_t:i32 /// Signed saturating rounding shift left name = vqrshl a = 2, MIN, MAX, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 validate 8, MIN, MAX, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 aarch64 = sqrshl link-aarch64 = sqrshl._EXT_ generate i32, i64 arm = vqrshl link-arm = vqrshifts._EXT_ generate int*_t, int64x*_t /// Signed saturating rounding shift left name = vqrshl multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b multi_fn = simd_extract, {vqrshl-in_ntt-noext, a, b}, 0 a = 1 b = 2 validate 4 aarch64 = sqrshl generate i8, i16 /// Unsigned saturating rounding shift left name = vqrshl out-suffix a = 2, MIN, MAX, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 validate 8, 0, MAX, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 aarch64 = uqrshl link-aarch64 = uqrshl._EXT_ generate u32:i32:u32, u64:i64:u64 arm = vqrshl link-arm = vqrshiftu._EXT_ generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t /// Unsigned saturating rounding shift left name = vqrshl out-suffix multi_fn = vdup_n-out_ntt-noext, a:out_ntt, a multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b multi_fn = simd_extract, {vqrshl-out_ntt-noext, a, b}, 0 a = 1 b = 2 validate 4 aarch64 =
uqrshl generate u8:i8:u8, u16:i16:u16 /// Signed saturating rounded shift right narrow name = vqrshrn noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits a = MIN, 4, 8, 12, 16, 20, 24, 28 n = 2 validate MIN, 1, 2, 3, 4, 5, 6, 7 aarch64 = sqrshrn link-aarch64 = sqrshrn._EXT2_ const-aarch64 = N arm = vqrshrn link-arm = vqrshiftns._EXT2_ const-arm = -N as ttn arm-aarch64-separate generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t /// Signed saturating rounded shift right narrow name = vqrshrn noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = vdupq_n-in_ntt-noext, a:in_long_ntt, a multi_fn = simd_extract, {vqrshrn_n-in_ntt-::, a}, 0 a = 4 n = 2 validate 1 aarch64 = sqrshrn generate i16:i8, i32:i16, i64:i32 /// Signed saturating rounded shift right narrow name = vqrshrn_high noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = simd_shuffle-out_len-!, a, {vqrshrn_n-noqself-::, b}, {asc-0-out_len} a = 0, 1, 2, 3, 2, 3, 6, 7 b = 8, 12, 24, 28, 48, 52, 56, 60 n = 2 validate 0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 12, 13, 14, 15 aarch64 = sqrshrn2 generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t /// Unsigned saturating rounded shift right narrow name = vqrshrn noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits a = MIN, 4, 8, 12, 16, 20, 24, 28 n = 2 validate 0, 1, 2, 3, 4, 5, 6, 7 aarch64 = uqrshrn link-aarch64 = uqrshrn._EXT2_ const-aarch64 = N arm = vqrshrn link-arm = vqrshiftnu._EXT2_ const-arm = -N as ttn arm-aarch64-separate generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t /// Unsigned saturating rounded shift right narrow name = vqrshrn noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = vdupq_n-in_ntt-noext, a:in_long_ntt, a multi_fn = simd_extract, {vqrshrn_n-in_ntt-::, a}, 0 a = 4 n = 2 validate 1 aarch64 = uqrshrn generate u16:u8, u32:u16, u64:u32 /// Unsigned saturating rounded shift right narrow name = vqrshrn_high noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = simd_shuffle-out_len-!, a, {vqrshrn_n-noqself-::, b}, {asc-0-out_len} a = 0, 1, 2, 3, 2, 3, 6, 7 b = 8, 12, 24, 28, 48, 52, 56, 60 n = 2 validate 0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 12, 13, 14, 15 aarch64 = uqrshrn2 generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t /// Signed saturating rounded shift right unsigned narrow name = vqrshrun noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits a = 0, 4, 8, 12, 16, 20, 24, 28 n = 2 validate 0, 1, 2, 3, 4, 5, 6, 7 aarch64 = sqrshrun link-aarch64 = sqrshrun._EXT2_ const-aarch64 = N arm = vqrshrun link-arm = vqrshiftnsu._EXT2_ const-arm = -N as ttn arm-aarch64-separate generate int16x8_t:uint8x8_t, int32x4_t:uint16x4_t, int64x2_t:uint32x2_t /// Signed saturating rounded shift right unsigned narrow name = vqrshrun noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = vdupq_n-in_ntt-noext, a:in_long_ntt, a multi_fn = simd_extract, {vqrshrun_n-in_ntt-::, a}, 0 a = 4 n = 2 validate 1 aarch64 = sqrshrun generate i16:u8, i32:u16, i64:u32 /// Signed saturating rounded shift right unsigned narrow name = vqrshrun_high noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = simd_shuffle-out_len-!, a, {vqrshrun_n-noqself-::, b}, {asc-0-out_len} a = 0, 1, 2, 3, 2, 3, 6, 7 b = 8, 12, 24, 28, 48, 52, 56, 60 n = 2 validate 0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 12, 13, 14, 15 aarch64 = sqrshrun2
generate uint8x8_t:int16x8_t:uint8x16_t, uint16x4_t:int32x4_t:uint16x8_t, uint32x2_t:int64x2_t:uint32x4_t /// Signed saturating shift left name = vqshl a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 aarch64 = sqshl link-aarch64 = sqshl._EXT_ generate i64 arm = vqshl link-arm = vqshifts._EXT_ generate int*_t, int64x*_t /// Signed saturating shift left name = vqshl multi_fn = vqshl-in_ntt-noext, c:in_ntt, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b} multi_fn = simd_extract, c, 0 a = 1 b = 2 validate 4 aarch64 = sqshl generate i8, i16, i32 /// Unsigned saturating shift left name = vqshl out-suffix a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 aarch64 = uqshl link-aarch64 = uqshl._EXT_ generate u64:i64:u64 arm = vqshl link-arm = vqshiftu._EXT_ generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t /// Unsigned saturating shift left name = vqshl out-suffix multi_fn = vqshl-out_ntt-noext, c:out_ntt, {vdup_n-out_ntt-noext, a}, {vdup_n-in_ntt-noext, b} multi_fn = simd_extract, c, 0 a = 1 b = 2 validate 4 aarch64 = uqshl generate u8:i8:u8, u16:i16:u16, u32:i32:u32 /// Signed saturating shift left name = vqshl n-suffix constn = N multi_fn = static_assert_imm-out_bits_exp_len-N multi_fn = vqshl-self-noext, a, {vdup-nself-noext, N.try_into().unwrap()} a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 n = 2 validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 aarch64 = sqshl arm = vqshl generate int*_t, int64x*_t /// Signed saturating shift left name = vqshl n-suffix constn = N multi_fn = static_assert_imm-out_bits_exp_len-N multi_fn = simd_extract, {vqshl_n-in_ntt-::, {vdup_n-in_ntt-noext, a}}, 0 a = 1 n = 2 validate 4 aarch64 = sqshl generate i8, i16, i32, i64 /// Unsigned saturating shift left name = vqshl n-suffix constn = N multi_fn = static_assert_imm-out_bits_exp_len-N multi_fn = vqshl-self-noext, a, {vdup-nsigned-noext, N.try_into().unwrap()} a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 n = 2 validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 aarch64 = uqshl arm = vqshl generate uint*_t, uint64x*_t /// Unsigned saturating shift left name = vqshl n-suffix constn = N multi_fn = static_assert_imm-out_bits_exp_len-N multi_fn = simd_extract, {vqshl_n-in_ntt-::, {vdup_n-in_ntt-noext, a}}, 0 a = 1 n = 2 validate 4 aarch64 = uqshl generate u8, u16, u32, u64 /// Signed saturating shift right narrow name = vqshrn noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits a = 0, 4, 8, 12, 16, 20, 24, 28 n = 2 validate 0, 1, 2, 3, 4, 5, 6, 7 arm-aarch64-separate aarch64 = sqshrn link-aarch64 = sqshrn._EXT2_ const-aarch64 = N generate i64:i32 arm = vqshrn link-arm = vqshiftns._EXT2_ const-arm = -N as ttn generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t /// Signed saturating shift right narrow name = vqshrn noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = simd_extract, {vqshrn_n-in_ntt-::, {vdupq_n-in_ntt-noext, a}}, 0 a = 4 n = 2 validate 1 aarch64 = sqshrn generate i16:i8, i32:i16 /// Signed saturating shift right narrow name = 
vqshrn_high noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = simd_shuffle-out_len-!, a, {vqshrn_n-noqself-::, b}, {asc-0-out_len} a = 0, 1, 8, 9, 8, 9, 10, 11 b = 32, 36, 40, 44, 48, 52, 56, 60 n = 2 validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15 aarch64 = sqshrn2 generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t /// Unsigned saturating shift right narrow name = vqshrn noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits a = 0, 4, 8, 12, 16, 20, 24, 28 n = 2 validate 0, 1, 2, 3, 4, 5, 6, 7 arm-aarch64-separate aarch64 = uqshrn link-aarch64 = uqshrn._EXT2_ const-aarch64 = N generate u64:u32 arm = vqshrn link-arm = vqshiftnu._EXT2_ const-arm = -N as ttn generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t /// Unsigned saturating shift right narrow name = vqshrn noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = simd_extract, {vqshrn_n-in_ntt-::, {vdupq_n-in_ntt-noext, a}}, 0 a = 4 n = 2 validate 1 aarch64 = uqshrn generate u16:u8, u32:u16 /// Unsigned saturating shift right narrow name = vqshrn_high noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = simd_shuffle-out_len-!, a, {vqshrn_n-noqself-::, b}, {asc-0-out_len} a = 0, 1, 8, 9, 8, 9, 10, 11 b = 32, 36, 40, 44, 48, 52, 56, 60 n = 2 validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15 aarch64 = uqshrn2 generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t /// Signed saturating shift right unsigned narrow name = vqshrun noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits a = 0, 4, 8, 12, 16, 20, 24, 28 n = 2 validate 0, 1, 2, 3, 4, 5, 6, 7 arm-aarch64-separate aarch64 = sqshrun link-aarch64 = sqshrun._EXT2_ const-aarch64 = N arm = vqshrun link-arm = vqshiftnsu._EXT2_ const-arm = -N as ttn generate int16x8_t:uint8x8_t, int32x4_t:uint16x4_t, int64x2_t:uint32x2_t /// Signed saturating shift right unsigned narrow name = vqshrun noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = simd_extract, {vqshrun_n-in_ntt-::, {vdupq_n-in_ntt-noext, a}}, 0 a = 4 n = 2 validate 1 aarch64 = sqshrun generate i16:u8, i32:u16, i64:u32 /// Signed saturating shift right unsigned narrow name = vqshrun_high noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = simd_shuffle-out_len-!, a, {vqshrun_n-noqself-::, b}, {asc-0-out_len} a = 0, 1, 8, 9, 8, 9, 10, 11 b = 32, 36, 40, 44, 48, 52, 56, 60 n = 2 validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15 aarch64 = sqshrun2 generate uint8x8_t:int16x8_t:uint8x16_t, uint16x4_t:int32x4_t:uint16x8_t, uint32x2_t:int64x2_t:uint32x4_t /// Calculates the square root of each lane. name = vsqrt fn = simd_fsqrt a = 4.0, 9.0, 16.0, 25.0 validate 2.0, 3.0, 4.0, 5.0 aarch64 = fsqrt generate float*_t, float64x*_t /// Reciprocal square-root estimate. name = vrsqrte a = 1.0, 2.0, 3.0, 4.0 validate 0.998046875, 0.705078125, 0.576171875, 0.4990234375 aarch64 = frsqrte link-aarch64 = frsqrte._EXT_ generate float64x*_t arm = vrsqrte link-arm = vrsqrte._EXT_ generate float*_t /// Reciprocal estimate. 
name = vrecpe a = 4.0, 3.0, 2.0, 1.0 validate 0.24951171875, 0.3330078125, 0.4990234375, 0.998046875 aarch64 = frecpe link-aarch64 = frecpe._EXT_ generate float64x*_t arm = vrecpe link-arm = vrecpe._EXT_ generate float*_t /// Vector reinterpret cast operation name = vreinterpret double-suffixes fn = transmute a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 aarch64 = nop generate poly64x1_t:int64x1_t, poly64x1_t:uint64x1_t, int64x1_t:poly64x1_t, uint64x1_t:poly64x1_t generate poly64x2_t:int64x2_t, poly64x2_t:uint64x2_t, int64x2_t:poly64x2_t, uint64x2_t:poly64x2_t arm = nop generate uint8x8_t:int8x8_t, poly8x8_t:int8x8_t, poly16x4_t:int16x4_t, uint16x4_t:int16x4_t, uint32x2_t:int32x2_t, uint64x1_t:int64x1_t generate uint8x16_t:int8x16_t, poly8x16_t:int8x16_t, poly16x8_t:int16x8_t, uint16x8_t:int16x8_t, uint32x4_t:int32x4_t, uint64x2_t:int64x2_t generate poly8x8_t:uint8x8_t, int8x8_t:uint8x8_t, poly16x4_t:uint16x4_t, int16x4_t:uint16x4_t, int32x2_t:uint32x2_t, int64x1_t:uint64x1_t generate poly8x16_t:uint8x16_t, int8x16_t:uint8x16_t, poly16x8_t:uint16x8_t, int16x8_t:uint16x8_t, int32x4_t:uint32x4_t, int64x2_t:uint64x2_t generate int8x8_t:poly8x8_t, uint8x8_t:poly8x8_t, int16x4_t:poly16x4_t, uint16x4_t:poly16x4_t generate int8x16_t:poly8x16_t, uint8x16_t:poly8x16_t, int16x8_t:poly16x8_t, uint16x8_t:poly16x8_t /// Vector reinterpret cast operation name = vreinterpret double-suffixes fn = transmute a = 0, 1, 2, 3, 4, 5, 6, 7 validate 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 aarch64 = nop generate poly64x1_t:int32x2_t, poly64x1_t:uint32x2_t generate poly64x2_t:int32x4_t, poly64x2_t:uint32x4_t arm = nop generate int16x4_t:int8x8_t, uint16x4_t:int8x8_t, poly16x4_t:int8x8_t, int32x2_t:int16x4_t, uint32x2_t:int16x4_t, int64x1_t:int32x2_t, uint64x1_t:int32x2_t generate int16x8_t:int8x16_t, uint16x8_t:int8x16_t, poly16x8_t:int8x16_t, int32x4_t:int16x8_t, uint32x4_t:int16x8_t, int64x2_t:int32x4_t, uint64x2_t:int32x4_t generate poly16x4_t:uint8x8_t, int16x4_t:uint8x8_t, uint16x4_t:uint8x8_t, int32x2_t:uint16x4_t, uint32x2_t:uint16x4_t, int64x1_t:uint32x2_t, uint64x1_t:uint32x2_t generate poly16x8_t:uint8x16_t, int16x8_t:uint8x16_t, uint16x8_t:uint8x16_t, int32x4_t:uint16x8_t, uint32x4_t:uint16x8_t, int64x2_t:uint32x4_t, uint64x2_t:uint32x4_t generate poly16x4_t:poly8x8_t, int16x4_t:poly8x8_t, uint16x4_t:poly8x8_t, int32x2_t:poly16x4_t, uint32x2_t:poly16x4_t generate poly16x8_t:poly8x16_t, int16x8_t:poly8x16_t, uint16x8_t:poly8x16_t, int32x4_t:poly16x8_t, uint32x4_t:poly16x8_t /// Vector reinterpret cast operation name = vreinterpret double-suffixes fn = transmute a = 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 validate 0, 1, 2, 3, 4, 5, 6, 7 aarch64 = nop generate int32x2_t:poly64x1_t, uint32x2_t:poly64x1_t generate int32x4_t:poly64x2_t, uint32x4_t:poly64x2_t arm = nop generate poly8x8_t:int16x4_t, int8x8_t:int16x4_t, uint8x8_t:int16x4_t, poly16x4_t:int32x2_t, int16x4_t:int32x2_t, uint16x4_t:int32x2_t, int32x2_t:int64x1_t, uint32x2_t:int64x1_t generate poly8x16_t:int16x8_t, int8x16_t:int16x8_t, uint8x16_t:int16x8_t, poly16x8_t:int32x4_t, int16x8_t:int32x4_t, uint16x8_t:int32x4_t, int32x4_t:int64x2_t, uint32x4_t:int64x2_t generate poly8x8_t:uint16x4_t, int8x8_t:uint16x4_t, uint8x8_t:uint16x4_t, poly16x4_t:uint32x2_t, int16x4_t:uint32x2_t, uint16x4_t:uint32x2_t, int32x2_t:uint64x1_t, uint32x2_t:uint64x1_t generate poly8x16_t:uint16x8_t, int8x16_t:uint16x8_t, uint8x16_t:uint16x8_t, poly16x8_t:uint32x4_t, int16x8_t:uint32x4_t, 
uint16x8_t:uint32x4_t, int32x4_t:uint64x2_t, uint32x4_t:uint64x2_t generate poly8x8_t:poly16x4_t, int8x8_t:poly16x4_t, uint8x8_t:poly16x4_t generate poly8x16_t:poly16x8_t, int8x16_t:poly16x8_t, uint8x16_t:poly16x8_t /// Vector reinterpret cast operation name = vreinterpret double-suffixes fn = transmute a = 0, 1, 2, 3 validate 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 aarch64 = nop generate poly64x1_t:int16x4_t, poly64x1_t:uint16x4_t, poly64x1_t:poly16x4_t generate poly64x2_t:int16x8_t, poly64x2_t:uint16x8_t, poly64x2_t:poly16x8_t arm = nop generate int32x2_t:int8x8_t, uint32x2_t:int8x8_t, int64x1_t:int16x4_t, uint64x1_t:int16x4_t generate int32x4_t:int8x16_t, uint32x4_t:int8x16_t, int64x2_t:int16x8_t, uint64x2_t:int16x8_t generate int32x2_t:uint8x8_t, uint32x2_t:uint8x8_t, int64x1_t:uint16x4_t, uint64x1_t:uint16x4_t generate int32x4_t:uint8x16_t, uint32x4_t:uint8x16_t, int64x2_t:uint16x8_t, uint64x2_t:uint16x8_t generate int32x2_t:poly8x8_t, uint32x2_t:poly8x8_t, int64x1_t:poly16x4_t, uint64x1_t:poly16x4_t generate int32x4_t:poly8x16_t, uint32x4_t:poly8x16_t, int64x2_t:poly16x8_t, uint64x2_t:poly16x8_t /// Vector reinterpret cast operation name = vreinterpret double-suffixes fn = transmute a = 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 validate 0, 1, 2, 3 aarch64 = nop generate poly16x4_t:poly64x1_t, int16x4_t:poly64x1_t, uint16x4_t:poly64x1_t generate poly16x8_t:poly64x2_t, int16x8_t:poly64x2_t, uint16x8_t:poly64x2_t arm = nop generate poly8x8_t:int32x2_t, int8x8_t:int32x2_t, uint8x8_t:int32x2_t, poly16x4_t:int64x1_t, int16x4_t:int64x1_t, uint16x4_t:int64x1_t generate poly8x16_t:int32x4_t, int8x16_t:int32x4_t, uint8x16_t:int32x4_t, poly16x8_t:int64x2_t, int16x8_t:int64x2_t, uint16x8_t:int64x2_t generate poly8x8_t:uint32x2_t, int8x8_t:uint32x2_t, uint8x8_t:uint32x2_t, poly16x4_t:uint64x1_t, int16x4_t:uint64x1_t, uint16x4_t:uint64x1_t generate poly8x16_t:uint32x4_t, int8x16_t:uint32x4_t, uint8x16_t:uint32x4_t, poly16x8_t:uint64x2_t, int16x8_t:uint64x2_t, uint16x8_t:uint64x2_t /// Vector reinterpret cast operation name = vreinterpret double-suffixes fn = transmute a = 0, 1 validate 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 aarch64 = nop generate poly64x1_t:int8x8_t, poly64x1_t:uint8x8_t, poly64x1_t:poly8x8_t generate poly64x2_t:int8x16_t, poly64x2_t:uint8x16_t, poly64x2_t:poly8x16_t arm = nop generate int64x1_t:int8x8_t, uint64x1_t:int8x8_t, int64x1_t:uint8x8_t, uint64x1_t:uint8x8_t, int64x1_t:poly8x8_t, uint64x1_t:poly8x8_t generate int64x2_t:int8x16_t, uint64x2_t:int8x16_t, int64x2_t:uint8x16_t, uint64x2_t:uint8x16_t, int64x2_t:poly8x16_t, uint64x2_t:poly8x16_t /// Vector reinterpret cast operation name = vreinterpret double-suffixes fn = transmute a = 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 validate 0, 1 aarch64 = nop generate poly8x8_t:poly64x1_t, int8x8_t:poly64x1_t, uint8x8_t:poly64x1_t generate poly8x16_t:poly64x2_t, int8x16_t:poly64x2_t, uint8x16_t:poly64x2_t arm = nop generate poly8x8_t:int64x1_t, int8x8_t:int64x1_t, uint8x8_t:int64x1_t, poly8x8_t:uint64x1_t, int8x8_t:uint64x1_t, uint8x8_t:uint64x1_t generate poly8x16_t:int64x2_t, int8x16_t:int64x2_t, uint8x16_t:int64x2_t, poly8x16_t:uint64x2_t, int8x16_t:uint64x2_t, uint8x16_t:uint64x2_t /// Vector reinterpret cast operation name = vreinterpret double-suffixes fn = transmute a = 0., 0., 0., 0., 0., 0., 0., 0. 
validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 aarch64 = nop generate float64x1_t:int8x8_t, float64x1_t:int16x4_t, float64x1_t:int32x2_t, float64x1_t:int64x1_t generate float64x2_t:int8x16_t, float64x2_t:int16x8_t, float64x2_t:int32x4_t, float64x2_t:int64x2_t generate float64x1_t:uint8x8_t, float64x1_t:uint16x4_t, float64x1_t:uint32x2_t, float64x1_t:uint64x1_t generate float64x2_t:uint8x16_t, float64x2_t:uint16x8_t, float64x2_t:uint32x4_t, float64x2_t:uint64x2_t generate float64x1_t:poly8x8_t, float64x1_t:poly16x4_t, float32x2_t:poly64x1_t, float64x1_t:poly64x1_t generate float64x2_t:poly8x16_t, float64x2_t:poly16x8_t, float32x4_t:poly64x2_t, float64x2_t:poly64x2_t arm = nop generate float32x2_t:int8x8_t, float32x2_t:int16x4_t, float32x2_t:int32x2_t, float32x2_t:int64x1_t generate float32x4_t:int8x16_t, float32x4_t:int16x8_t, float32x4_t:int32x4_t, float32x4_t:int64x2_t generate float32x2_t:uint8x8_t, float32x2_t:uint16x4_t, float32x2_t:uint32x2_t, float32x2_t:uint64x1_t generate float32x4_t:uint8x16_t, float32x4_t:uint16x8_t, float32x4_t:uint32x4_t, float32x4_t:uint64x2_t generate float32x2_t:poly8x8_t, float32x2_t:poly16x4_t generate float32x4_t:poly8x16_t, float32x4_t:poly16x8_t /// Vector reinterpret cast operation name = vreinterpret double-suffixes fn = transmute a = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 validate 0., 0., 0., 0., 0., 0., 0., 0. aarch64 = nop generate int8x8_t:float64x1_t, int16x4_t:float64x1_t, int32x2_t:float64x1_t, int64x1_t:float64x1_t generate int8x16_t:float64x2_t, int16x8_t:float64x2_t, int32x4_t:float64x2_t, int64x2_t:float64x2_t generate poly8x8_t:float64x1_t, uint16x4_t:float64x1_t, uint32x2_t:float64x1_t, uint64x1_t:float64x1_t generate poly8x16_t:float64x2_t, uint16x8_t:float64x2_t, uint32x4_t:float64x2_t, uint64x2_t:float64x2_t generate uint8x8_t:float64x1_t, poly16x4_t:float64x1_t, poly64x1_t:float64x1_t, poly64x1_t:float32x2_t generate uint8x16_t:float64x2_t, poly16x8_t:float64x2_t, poly64x2_t:float64x2_t, poly64x2_t:float32x4_t arm = nop generate int8x8_t:float32x2_t, int16x4_t:float32x2_t, int32x2_t:float32x2_t, int64x1_t:float32x2_t generate int8x16_t:float32x4_t, int16x8_t:float32x4_t, int32x4_t:float32x4_t, int64x2_t:float32x4_t generate uint8x8_t:float32x2_t, uint16x4_t:float32x2_t, uint32x2_t:float32x2_t, uint64x1_t:float32x2_t generate uint8x16_t:float32x4_t, uint16x8_t:float32x4_t, uint32x4_t:float32x4_t, uint64x2_t:float32x4_t generate poly8x8_t:float32x2_t, poly16x4_t:float32x2_t generate poly8x16_t:float32x4_t, poly16x8_t:float32x4_t /// Vector reinterpret cast operation name = vreinterpret double-suffixes fn = transmute a = 0., 0., 0., 0., 0., 0., 0., 0. validate 0., 0., 0., 0., 0., 0., 0., 0. 
aarch64 = nop generate float32x2_t:float64x1_t, float64x1_t:float32x2_t generate float32x4_t:float64x2_t, float64x2_t:float32x4_t /// Signed rounding shift left name = vrshl a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 aarch64 = srshl link-aarch64 = srshl._EXT_ generate i64 arm = vrshl link-arm = vrshifts._EXT_ generate int*_t, int64x*_t /// Unsigned rounding shift left name = vrshl out-suffix a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 aarch64 = urshl link-aarch64 = urshl._EXT_ generate u64:i64:u64 arm = vrshl link-arm = vrshiftu._EXT_ generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t /// Signed rounding shift right name = vrshr n-suffix constn = N multi_fn = static_assert-N-1-bits multi_fn = vrshl-self-noext, a, {vdup-nself-noext, (-N).try_into().unwrap()} a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 n = 2 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 aarch64 = srshr arm = vrshr generate int*_t, int64x*_t /// Signed rounding shift right name = vrshr n-suffix constn = N multi_fn = static_assert-N-1-bits multi_fn = vrshl-self-noext, a, -N as i64 a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 n = 2 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 aarch64 = srshr generate i64 /// Unsigned rounding shift right name = vrshr n-suffix constn = N multi_fn = static_assert-N-1-bits multi_fn = vrshl-self-noext, a, {vdup-nsigned-noext, (-N).try_into().unwrap()} a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 n = 2 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 aarch64 = urshr arm = vrshr generate uint*_t, uint64x*_t /// Unsigned rounding shift right name = vrshr n-suffix constn = N multi_fn = static_assert-N-1-bits multi_fn = vrshl-self-noext, a, -N as i64 a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 n = 2 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 aarch64 = urshr generate u64 /// Rounding shift right narrow name = vrshrn noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 n = 2 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 arm-aarch64-separate aarch64 = rshrn link-aarch64 = rshrn._EXT2_ const-aarch64 = N arm = vrshrn link-arm = vrshiftn._EXT2_ const-arm = -N as ttn generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t /// Rounding shift right narrow name = vrshrn noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = transmute, {vrshrn_n-noqsigned-::, transmute(a)} a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 n = 2 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 aarch64 = rshrn arm = vrshrn generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t /// Rounding shift right narrow name = vrshrn_high noq-n-suffix constn = N multi_fn = static_assert-N-1-halfbits multi_fn = simd_shuffle-out_len-!, a, {vrshrn_n-noqself-::, b}, {asc-0-out_len} a = 0, 1, 8, 9, 8, 9, 10, 11 b = 32, 36, 40, 44, 48, 52, 56, 
60 n = 2 validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15 aarch64 = rshrn2 generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t /// Signed rounding shift right and accumulate name = vrsra n-suffix constn = N multi_fn = static_assert-N-1-bits multi_fn = simd_add, a, {vrshr-nself-::, b} a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 n = 2 validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 aarch64 = srsra arm = vrsra generate int*_t, int64x*_t /// Unsigned rounding shift right and accumulate name = vrsra n-suffix constn = N multi_fn = static_assert-N-1-bits multi_fn = simd_add, a, {vrshr-nself-::, b} a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 n = 2 validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 aarch64 = ursra arm = vrsra generate uint*_t, uint64x*_t /// Signed rounding shift right and accumulate. name = vrsra n-suffix constn = N multi_fn = static_assert-N-1-bits multi_fn = vrshr-nself-::, b:in_t, b multi_fn = a + b a = 1 b = 4 n = 2 validate 2 aarch64 = srsra generate i64 /// Unsigned rounding shift right and accumulate. name = vrsra n-suffix constn = N multi_fn = static_assert-N-1-bits multi_fn = vrshr-nself-::, b:in_t, b multi_fn = a + b a = 1 b = 4 n = 2 validate 2 aarch64 = ursra generate u64 /// Insert vector element from another vector element name = vset_lane constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = simd_insert, b, LANE as u32, a a = 1 b = 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 n = 0 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 aarch64 = nop arm = nop generate i8:int8x8_t:int8x8_t, i16:int16x4_t:int16x4_t generate i32:int32x2_t:int32x2_t, i64:int64x1_t:int64x1_t generate u8:uint8x8_t:uint8x8_t, u16:uint16x4_t:uint16x4_t generate u32:uint32x2_t:uint32x2_t, u64:uint64x1_t:uint64x1_t generate p8:poly8x8_t:poly8x8_t, p16:poly16x4_t:poly16x4_t target = aes generate p64:poly64x1_t:poly64x1_t /// Insert vector element from another vector element name = vsetq_lane no-q constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = simd_insert, b, LANE as u32, a a = 1 b = 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 n = 0 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 aarch64 = nop arm = nop generate i8:int8x16_t:int8x16_t, i16:int16x8_t:int16x8_t generate i32:int32x4_t:int32x4_t, i64:int64x2_t:int64x2_t generate u8:uint8x16_t:uint8x16_t, u16:uint16x8_t:uint16x8_t generate u32:uint32x4_t:uint32x4_t, u64:uint64x2_t:uint64x2_t generate p8:poly8x16_t:poly8x16_t, p16:poly16x8_t:poly16x8_t target = aes generate p64:poly64x2_t:poly64x2_t /// Insert vector element from another vector element name = vset_lane constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = simd_insert, b, LANE as u32, a a = 1. b = 0., 2., 3., 4. n = 0 validate 1., 2., 3., 4. aarch64 = nop generate f64:float64x1_t:float64x1_t arm = nop generate f32:float32x2_t:float32x2_t /// Insert vector element from another vector element name = vsetq_lane no-q constn = LANE multi_fn = static_assert_imm-in_exp_len-LANE multi_fn = simd_insert, b, LANE as u32, a a = 1. b = 0., 2., 3., 4. n = 0 validate 1., 2., 3., 4.
aarch64 = nop generate f64:float64x2_t:float64x2_t arm = nop generate f32:float32x4_t:float32x4_t /// Signed Shift left name = vshl a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 aarch64 = sshl link-aarch64 = sshl._EXT_ arm = vshl link-arm = vshifts._EXT_ generate int*_t, int64x*_t /// Signed Shift left name = vshl multi_fn = transmute, {vshl-in_ntt-noext, transmute(a), transmute(b)} a = 1 b = 2 validate 4 aarch64 = sshl generate i64 /// Unsigned Shift left name = vshl out-suffix a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 aarch64 = ushl link-aarch64 = ushl._EXT_ arm = vshl link-arm = vshiftu._EXT_ generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t /// Unsigned Shift left out-suffix name = vshl multi_fn = transmute, {vshl-out_ntt-noext, transmute(a), transmute(b)} a = 1 b = 2 validate 4 aarch64 = ushl generate u64:i64:u64 /// Shift left name = vshl n-suffix constn = N multi_fn = static_assert_imm-out_bits_exp_len-N multi_fn = simd_shl, a, {vdup-nself-noext, N.try_into().unwrap()} a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 n = 2 validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 arm = vshl aarch64 = shl generate int*_t, uint*_t, int64x*_t, uint64x*_t /// Signed shift left long name = vshll n-suffix constn = N multi_fn = static_assert-N-0-bits multi_fn = simd_shl, {simd_cast, a}, {vdup-nout-noext, N.try_into().unwrap()} a = 1, 2, 3, 4, 5, 6, 7, 8 n = 2 validate 4, 8, 12, 16, 20, 24, 28, 32 arm = vshll.s aarch64 = sshll generate int8x8_t:int16x8_t, int16x4_t:int32x4_t, int32x2_t:int64x2_t aarch64 = ushll generate uint8x8_t:uint16x8_t, uint16x4_t:uint32x4_t, uint32x2_t:uint64x2_t /// Signed shift left long name = vshll_high_n no-q constn = N multi_fn = static_assert-N-0-bits multi_fn = simd_shuffle-out_len-!, b:half, a, a, {asc-halflen-halflen} multi_fn = vshll_n-noqself-::, b a = 0, 0, 1, 2, 1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 7, 8 n = 2 validate 4, 8, 12, 16, 20, 24, 28, 32 aarch64 = sshll2 generate int8x16_t:int16x8_t, int16x8_t:int32x4_t, int32x4_t:int64x2_t aarch64 = ushll2 generate uint8x16_t:uint16x8_t, uint16x8_t:uint32x4_t, uint32x4_t:uint64x2_t /// Shift right name = vshr n-suffix constn = N multi_fn = static_assert-N-1-bits multi_fn = simd_shr, a, {vdup-nself-noext, N.try_into().unwrap()} a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 n = 2 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 arm = vshr.s aarch64 = sshr generate int*_t, int64x*_t aarch64 = ushr generate uint*_t, uint64x*_t /// Shift right narrow name = vshrn_n no-q constn = N multi_fn = static_assert-N-1-halfbits multi_fn = simd_cast, {simd_shr, a, {vdup-nself-noext, N.try_into().unwrap()}} a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 n = 2 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 arm = vshrn. 
aarch64 = shrn generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t /// Shift right narrow name = vshrn_high_n no-q constn = N multi_fn = static_assert-N-1-halfbits multi_fn = simd_shuffle-out_len-!, a, {vshrn_n-noqself-::, b}, {asc-0-out_len} a = 1, 2, 5, 6, 5, 6, 7, 8 b = 20, 24, 28, 32, 52, 56, 60, 64 n = 2 validate 1, 2, 5, 6, 5, 6, 7, 8, 5, 6, 7, 8, 13, 14, 15, 16 aarch64 = shrn2 generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t /// Signed shift right and accumulate name = vsra n-suffix constn = N multi_fn = static_assert-N-1-bits multi_fn = simd_add, a, {vshr-nself-::, b} a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 n = 2 validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 aarch64 = ssra arm = vsra generate int*_t, int64x*_t /// Unsigned shift right and accumulate name = vsra n-suffix constn = N multi_fn = static_assert-N-1-bits multi_fn = simd_add, a, {vshr-nself-::, b} a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 n = 2 validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 aarch64 = usra arm = vsra generate uint*_t, uint64x*_t /// Transpose vectors name = vtrn1 multi_fn = simd_shuffle-in_len-!, a, b, {transpose-1-in_len} a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 validate 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 aarch64 = trn1 generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t aarch64 = zip1 generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t /// Transpose vectors name = vtrn1 multi_fn = simd_shuffle-in_len-!, a, b, {transpose-1-in_len} a = 0., 2., 4., 6., 8., 10., 12., 14. b = 1., 3., 5., 7., 9., 11., 13., 15. validate 0., 1., 4., 5., 8., 9., 12., 13. aarch64 = trn1 generate float32x4_t aarch64 = zip1 generate float32x2_t, float64x2_t /// Transpose vectors name = vtrn2 multi_fn = simd_shuffle-in_len-!, a, b, {transpose-2-in_len} a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 validate 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 aarch64 = trn2 generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t aarch64 = zip2 generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t /// Transpose vectors name = vtrn2 multi_fn = simd_shuffle-in_len-!, a, b, {transpose-2-in_len} a = 0., 2., 4., 6., 8., 10., 12., 14. b = 1., 3., 5., 7., 9., 11., 13., 15. validate 2., 3., 6., 7., 10., 11., 14., 15. 
aarch64 = trn2 generate float32x4_t aarch64 = zip2 generate float32x2_t, float64x2_t /// Zip vectors name = vzip1 multi_fn = simd_shuffle-in_len-!, a, b, {zip-1-in_len} a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 aarch64 = zip1 generate int*_t, int64x2_t, uint*_t, uint64x2_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t, poly64x2_t /// Zip vectors name = vzip1 multi_fn = simd_shuffle-in_len-!, a, b, {zip-1-in_len} a = 0., 2., 4., 6., 8., 10., 12., 14. b = 1., 3., 5., 7., 9., 11., 13., 15. validate 0., 1., 2., 3., 4., 5., 6., 7. aarch64 = zip1 generate float32x2_t, float32x4_t, float64x2_t /// Zip vectors name = vzip2 multi_fn = simd_shuffle-in_len-!, a, b, {zip-2-in_len} a = 0, 16, 16, 18, 16, 18, 20, 22, 16, 18, 20, 22, 24, 26, 28, 30 b = 1, 17, 17, 19, 17, 19, 21, 23, 17, 19, 21, 23, 25, 27, 29, 31 validate 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 aarch64 = zip2 generate int*_t, int64x2_t, uint*_t, uint64x2_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t, poly64x2_t /// Zip vectors name = vzip2 multi_fn = simd_shuffle-in_len-!, a, b, {zip-2-in_len} a = 0., 8., 8., 10., 8., 10., 12., 14. b = 1., 9., 9., 11., 9., 11., 13., 15. validate 8., 9., 10., 11., 12., 13., 14., 15. aarch64 = zip2 generate float32x2_t, float32x4_t, float64x2_t /// Unzip vectors name = vuzp1 multi_fn = simd_shuffle-in_len-!, a, b, {unzip-1-in_len} a = 1, 0, 2, 0, 2, 0, 3, 0, 2, 0, 3, 0, 7, 0, 8, 0 b = 2, 0, 3, 0, 7, 0, 8, 0, 13, 0, 14, 0, 15, 0, 16, 0 validate 1, 2, 2, 3, 2, 3, 7, 8, 2, 3, 7, 8, 13, 14, 15, 16 aarch64 = uzp1 generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t aarch64 = zip1 generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t /// Unzip vectors name = vuzp1 multi_fn = simd_shuffle-in_len-!, a, b, {unzip-1-in_len} a = 0., 8., 1., 9., 4., 12., 5., 13. b = 1., 10., 3., 11., 6., 14., 7., 15. validate 0., 1., 1., 3., 4., 5., 6., 7. aarch64 = uzp1 generate float32x4_t aarch64 = zip1 generate float32x2_t, float64x2_t /// Unzip vectors name = vuzp2 multi_fn = simd_shuffle-in_len-!, a, b, {unzip-2-in_len} a = 0, 17, 0, 18, 0, 18, 0, 19, 0, 18, 0, 19, 0, 23, 0, 24 b = 0, 18, 0, 19, 0, 23, 0, 24, 0, 29, 0, 30, 0, 31, 0, 32 validate 17, 18, 18, 19, 18, 19, 23, 24, 18, 19, 23, 24, 29, 30, 31, 32 aarch64 = uzp2 generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t aarch64 = zip2 generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t /// Unzip vectors name = vuzp2 multi_fn = simd_shuffle-in_len-!, a, b, {unzip-2-in_len} a = 0., 8., 1., 9., 4., 12., 5., 13. b = 2., 9., 3., 11., 6., 14., 7., 15. validate 8., 9., 9., 11., 12., 13., 14., 15. 
aarch64 = uzp2 generate float32x4_t aarch64 = zip2 generate float32x2_t, float64x2_t //////////////////// // Unsigned Absolute difference and Accumulate Long //////////////////// /// Unsigned Absolute difference and Accumulate Long name = vabal multi_fn = vabd-unsigned-noext, b, c, d:in_t multi_fn = simd_add, a, {simd_cast, d} a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12 validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20 arm = vabal.s aarch64 = uabal generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t /// Unsigned Absolute difference and Accumulate Long name = vabal_high no-q multi_fn = simd_shuffle8!, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] multi_fn = simd_shuffle8!, e:uint8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15] multi_fn = vabd_u8, d, e, f:uint8x8_t multi_fn = simd_add, a, {simd_cast, f} a = 9, 10, 11, 12, 13, 14, 15, 16 b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12 validate 20, 20, 20, 20, 20, 20, 20, 20 aarch64 = uabal generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t /// Unsigned Absolute difference and Accumulate Long name = vabal_high no-q multi_fn = simd_shuffle4!, d:uint16x4_t, b, b, [4, 5, 6, 7] multi_fn = simd_shuffle4!, e:uint16x4_t, c, c, [4, 5, 6, 7] multi_fn = vabd_u16, d, e, f:uint16x4_t multi_fn = simd_add, a, {simd_cast, f} a = 9, 10, 11, 12 b = 1, 2, 3, 4, 9, 10, 11, 12 c = 10, 10, 10, 10, 20, 0, 2, 4 validate 20, 20, 20, 20 aarch64 = uabal generate uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t /// Unsigned Absolute difference and Accumulate Long name = vabal_high no-q multi_fn = simd_shuffle2!, d:uint32x2_t, b, b, [2, 3] multi_fn = simd_shuffle2!, e:uint32x2_t, c, c, [2, 3] multi_fn = vabd_u32, d, e, f:uint32x2_t multi_fn = simd_add, a, {simd_cast, f} a = 15, 16 b = 1, 2, 15, 16 c = 10, 10, 10, 12 validate 20, 20 aarch64 = uabal generate uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t //////////////////// // Signed Absolute difference and Accumulate Long //////////////////// /// Signed Absolute difference and Accumulate Long name = vabal multi_fn = vabd-signed-noext, b, c, d:int8x8_t multi_fn = simd_cast, e:uint8x8_t, d multi_fn = simd_add, a, {simd_cast, e} a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12 validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20 arm = vabal.s aarch64 = sabal generate int16x8_t:int8x8_t:int8x8_t:int16x8_t /// Signed Absolute difference and Accumulate Long name = vabal multi_fn = vabd-signed-noext, b, c, d:int16x4_t multi_fn = simd_cast, e:uint16x4_t, d multi_fn = simd_add, a, {simd_cast, e} a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12 validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20 arm = vabal.s aarch64 = sabal generate int32x4_t:int16x4_t:int16x4_t:int32x4_t /// Signed Absolute difference and Accumulate Long name = vabal multi_fn = vabd-signed-noext, b, c, d:int32x2_t multi_fn = simd_cast, e:uint32x2_t, d multi_fn = simd_add, a, {simd_cast, e} a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 b = 1, 2, 3, 4, 5, 
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12 validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20 arm = vabal.s aarch64 = sabal generate int64x2_t:int32x2_t:int32x2_t:int64x2_t /// Signed Absolute difference and Accumulate Long name = vabal_high no-q multi_fn = simd_shuffle8!, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] multi_fn = simd_shuffle8!, e:int8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15] multi_fn = vabd_s8, d, e, f:int8x8_t multi_fn = simd_cast, f:uint8x8_t, f multi_fn = simd_add, a, {simd_cast, f} a = 9, 10, 11, 12, 13, 14, 15, 16 b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12 validate 20, 20, 20, 20, 20, 20, 20, 20 aarch64 = sabal generate int16x8_t:int8x16_t:int8x16_t:int16x8_t /// Signed Absolute difference and Accumulate Long name = vabal_high no-q multi_fn = simd_shuffle4!, d:int16x4_t, b, b, [4, 5, 6, 7] multi_fn = simd_shuffle4!, e:int16x4_t, c, c, [4, 5, 6, 7] multi_fn = vabd_s16, d, e, f:int16x4_t multi_fn = simd_cast, f:uint16x4_t, f multi_fn = simd_add, a, {simd_cast, f} a = 9, 10, 11, 12 b = 1, 2, 3, 4, 9, 10, 11, 12 c = 10, 10, 10, 10, 20, 0, 2, 4 validate 20, 20, 20, 20 aarch64 = sabal generate int32x4_t:int16x8_t:int16x8_t:int32x4_t /// Signed Absolute difference and Accumulate Long name = vabal_high no-q multi_fn = simd_shuffle2!, d:int32x2_t, b, b, [2, 3] multi_fn = simd_shuffle2!, e:int32x2_t, c, c, [2, 3] multi_fn = vabd_s32, d, e, f:int32x2_t multi_fn = simd_cast, f:uint32x2_t, f multi_fn = simd_add, a, {simd_cast, f} a = 15, 16 b = 1, 2, 15, 16 c = 10, 10, 10, 12 validate 20, 20 aarch64 = sabal generate int64x2_t:int32x4_t:int32x4_t:int64x2_t //////////////////// // Signed saturating Absolute value //////////////////// /// Signed saturating Absolute value name = vqabs a = MIN, MAX, -6, -5, -4, -3, -2, -1, 0, -127, 127, 1, 2, 3, 4, 5 validate MAX, MAX, 6, 5, 4, 3, 2, 1, 0, 127, 127, 1, 2, 3, 4, 5 arm = vqabs.s aarch64 = sqabs link-arm = vqabs._EXT_ link-aarch64 = sqabs._EXT_ generate int*_t /// Signed saturating Absolute value name = vqabs a = MIN, -7 validate MAX, 7 aarch64 = sqabs link-aarch64 = sqabs._EXT_ generate int64x*_t
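// Note: the saturating fixed-point entries above can be hard to read from the
// test vectors alone. The following is a rough, non-normative sketch in plain
// Rust (i16 lanes chosen for illustration; the helper names simply mirror the
// aarch64 instruction names used above and are not part of this spec or of the
// generated API) of the per-lane semantics that the validate lines exercise:
//
//     fn sqdmulh(a: i16, b: i16) -> i16 {
//         // doubling multiply, keep the high half, saturate (MIN * MIN overflows)
//         let wide = 2 * (a as i64) * (b as i64);
//         (wide >> 16).clamp(i16::MIN as i64, i16::MAX as i64) as i16
//     }
//
//     fn sqrdmulh(a: i16, b: i16) -> i16 {
//         // as sqdmulh, but rounded before the high half is taken
//         let wide = 2 * (a as i64) * (b as i64) + (1 << 15);
//         (wide >> 16).clamp(i16::MIN as i64, i16::MAX as i64) as i16
//     }
//
//     fn srshr(a: i16, n: u32) -> i16 {
//         // rounding shift right: add half of the discarded range, then shift
//         (((a as i32) + (1 << (n - 1))) >> n) as i16
//     }
//
//     fn sqabs(a: i16) -> i16 {
//         // saturating absolute value: |MIN| does not fit, so it saturates to MAX
//         if a == i16::MIN { i16::MAX } else { a.abs() }
//     }
//
// These agree with the test vectors used above, e.g. sqdmulh(MAX, 2) == 1,
// sqrdmulh(MAX, 2) == 2, srshr(4, 2) == 1 and sqabs(MIN) == MAX.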