Intel made a late change to the AVX10 specification that removes support for a 256-bit maximum vector length and enumeration of the maximum vector length. AVX10 will imply a maximum vector length of 512 bits. I.e. there won't be any such thing as AVX10/256 or AVX10/512; there will just be AVX10, and it will essentially just consolidate AVX512 features.

As a result of this new development, my strategy of providing both *_avx10_256 and *_avx10_512 functions didn't turn out to be that useful. The only remaining motivation for the 256-bit AVX512 / AVX10 functions is to avoid downclocking on older Intel CPUs. But in the case of AES-XTS and AES-CTR, I already wrote *_avx2 code too (primarily to support CPUs without AVX512), which performs almost as well as *_avx10_256. So we should just use that.

Therefore, remove the *_avx10_256 AES-XTS and AES-CTR functions and algorithms, and rename the *_avx10_512 AES-XTS and AES-CTR functions and algorithms to *_avx512. Make Ice Lake and Tiger Lake use *_avx2 instead of *_avx10_256 which they previously used.

I've left AES-GCM unchanged for now. There is no VAES+AVX2 optimized AES-GCM in the kernel yet, so the path forward for that is not as clear. However, I did write a VAES+AVX2 optimized AES-GCM for BoringSSL. So one option is to port that to the kernel and then do the same cleanup.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// Copyright 2025 Google LLC
//
// Author: Eric Biggers <ebiggers@google.com>
//
// This file is dual-licensed, meaning that you can use it under your choice of
// either of the following two licenses:
//
// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy
// of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// or
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
//------------------------------------------------------------------------------
//
// This file contains x86_64 assembly implementations of AES-CTR and AES-XCTR
// using the following sets of CPU features:
//    - AES-NI && AVX
//    - VAES && AVX2
//    - VAES && AVX512BW && AVX512VL && BMI2
//
// See the function definitions at the bottom of the file for more information.

#include <linux/linkage.h>
#include <linux/cfi_types.h>

.section .rodata
.p2align 4

.Lbswap_mask:
        .octa 0x000102030405060708090a0b0c0d0e0f

.Lctr_pattern:
        .quad 0, 0
.Lone:
        .quad 1, 0
.Ltwo:
        .quad 2, 0
        .quad 3, 0

.Lfour:
        .quad 4, 0

.text

// Move a vector between memory and a register.
.macro _vmovdqu src, dst
.if VL < 64
        vmovdqu \src, \dst
.else
        vmovdqu8 \src, \dst
.endif
.endm

// Move a vector between registers.
.macro _vmovdqa src, dst
.if VL < 64
        vmovdqa \src, \dst
.else
        vmovdqa64 \src, \dst
.endif
.endm

// Broadcast a 128-bit value from memory to all 128-bit lanes of a vector
// register.
.macro _vbroadcast128 src, dst
.if VL == 16
        vmovdqu \src, \dst
.elseif VL == 32
        vbroadcasti128 \src, \dst
.else
        vbroadcasti32x4 \src, \dst
.endif
.endm

// XOR two vectors together.
.macro _vpxor src1, src2, dst
.if VL < 64
        vpxor \src1, \src2, \dst
.else
        vpxord \src1, \src2, \dst
.endif
.endm

// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst
// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}.
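// For example, with LEN == 11 the first vmovq below loads bytes 0-7, the
// second (overlapping) 8-byte load reads bytes 3-10, and the shift by
// 64 - 8*(LEN - 8) = 40 bits discards the five overlapping bytes, leaving
// bytes 8-10 zero-extended for the vpinsrq.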
.macro _load_partial_block src, dst, tmp64, tmp32
        sub $8, %ecx // LEN - 8
        jle .Lle8\@

        // Load 9 <= LEN <= 15 bytes.
        vmovq (\src), \dst // Load first 8 bytes
        mov (\src, %rcx), %rax // Load last 8 bytes
        neg %ecx
        shl $3, %ecx
        shr %cl, %rax // Discard overlapping bytes
        vpinsrq $1, %rax, \dst, \dst
        jmp .Ldone\@

.Lle8\@:
        add $4, %ecx // LEN - 4
        jl .Llt4\@

        // Load 4 <= LEN <= 8 bytes.
        mov (\src), %eax // Load first 4 bytes
        mov (\src, %rcx), \tmp32 // Load last 4 bytes
        jmp .Lcombine\@

.Llt4\@:
        // Load 1 <= LEN <= 3 bytes.
        add $2, %ecx // LEN - 2
        movzbl (\src), %eax // Load first byte
        jl .Lmovq\@
        movzwl (\src, %rcx), \tmp32 // Load last 2 bytes
.Lcombine\@:
        shl $3, %ecx
        shl %cl, \tmp64
        or \tmp64, %rax // Combine the two parts
.Lmovq\@:
        vmovq %rax, \dst
.Ldone\@:
.endm

// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst.
// Clobbers %rax, %rcx, and \tmp{64,32}.
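// Note on the 4 <= LEN <= 15 cases below: the unaligned store of the last
// bytes also writes garbage into part of the first 8 (or 4) bytes of \dst;
// the subsequent store of the first 8 (or 4) bytes overwrites that overlap
// with the correct data, so the two stores must stay in this order.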
.macro _store_partial_block src, dst, tmp64, tmp32
        sub $8, %ecx // LEN - 8
        jl .Llt8\@

        // Store 8 <= LEN <= 15 bytes.
        vpextrq $1, \src, %rax
        mov %ecx, \tmp32
        shl $3, %ecx
        ror %cl, %rax
        mov %rax, (\dst, \tmp64) // Store last LEN - 8 bytes
        vmovq \src, (\dst) // Store first 8 bytes
        jmp .Ldone\@

.Llt8\@:
        add $4, %ecx // LEN - 4
        jl .Llt4\@

        // Store 4 <= LEN <= 7 bytes.
        vpextrd $1, \src, %eax
        mov %ecx, \tmp32
        shl $3, %ecx
        ror %cl, %eax
        mov %eax, (\dst, \tmp64) // Store last LEN - 4 bytes
        vmovd \src, (\dst) // Store first 4 bytes
        jmp .Ldone\@

.Llt4\@:
        // Store 1 <= LEN <= 3 bytes.
        vpextrb $0, \src, 0(\dst)
        cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2?
        jl .Ldone\@
        vpextrb $1, \src, 1(\dst)
        je .Ldone\@
        vpextrb $2, \src, 2(\dst)
.Ldone\@:
.endm

// Prepare the next two vectors of AES inputs in AESDATA\i0 and AESDATA\i1, and
// XOR each with the zero-th round key. Also update LE_CTR if !\final.
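// (In the USE_AVX512 paths below, vpternlogd with immediate 0x96 computes the
// bitwise XOR of its three operands, folding the two-instruction vpxor
// sequence into a single instruction.)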
.macro _prepare_2_ctr_vecs is_xctr, i0, i1, final=0
.if \is_xctr
.if USE_AVX512
        vmovdqa64 LE_CTR, AESDATA\i0
        vpternlogd $0x96, XCTR_IV, RNDKEY0, AESDATA\i0
.else
        vpxor XCTR_IV, LE_CTR, AESDATA\i0
        vpxor RNDKEY0, AESDATA\i0, AESDATA\i0
.endif
        vpaddq LE_CTR_INC1, LE_CTR, AESDATA\i1

.if USE_AVX512
        vpternlogd $0x96, XCTR_IV, RNDKEY0, AESDATA\i1
.else
        vpxor XCTR_IV, AESDATA\i1, AESDATA\i1
        vpxor RNDKEY0, AESDATA\i1, AESDATA\i1
.endif
.else
        vpshufb BSWAP_MASK, LE_CTR, AESDATA\i0
        _vpxor RNDKEY0, AESDATA\i0, AESDATA\i0
        vpaddq LE_CTR_INC1, LE_CTR, AESDATA\i1
        vpshufb BSWAP_MASK, AESDATA\i1, AESDATA\i1
        _vpxor RNDKEY0, AESDATA\i1, AESDATA\i1
.endif
.if !\final
        vpaddq LE_CTR_INC2, LE_CTR, LE_CTR
.endif
.endm

// Do all AES rounds on the data in the given AESDATA vectors, excluding the
// zero-th and last rounds.
.macro _aesenc_loop vecs:vararg
        mov KEY, %rax
1:
        _vbroadcast128 (%rax), RNDKEY
.irp i, \vecs
        vaesenc RNDKEY, AESDATA\i, AESDATA\i
.endr
        add $16, %rax
        cmp %rax, RNDKEYLAST_PTR
        jne 1b
.endm

// Finalize the keystream blocks in the given AESDATA vectors by doing the last
// AES round, then XOR those keystream blocks with the corresponding data.
// Reduce latency by doing the XOR before the vaesenclast, utilizing the
// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
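// (This property holds because vaesenclast computes
// ShiftRows(SubBytes(a)) ^ key, so anything XORed into the round key is
// simply XORed into the result.)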
.macro _aesenclast_and_xor vecs:vararg
.irp i, \vecs
        _vpxor \i*VL(SRC), RNDKEYLAST, RNDKEY
        vaesenclast RNDKEY, AESDATA\i, AESDATA\i
.endr
.irp i, \vecs
        _vmovdqu AESDATA\i, \i*VL(DST)
.endr
.endm

// XOR the keystream blocks in the specified AESDATA vectors with the
// corresponding data.
.macro _xor_data vecs:vararg
.irp i, \vecs
        _vpxor \i*VL(SRC), AESDATA\i, AESDATA\i
.endr
.irp i, \vecs
        _vmovdqu AESDATA\i, \i*VL(DST)
.endr
.endm

.macro _aes_ctr_crypt is_xctr

        // Define register aliases V0-V15 that map to the xmm, ymm, or zmm
        // registers according to the selected Vector Length (VL).
.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.if VL == 16
        .set V\i, %xmm\i
.elseif VL == 32
        .set V\i, %ymm\i
.elseif VL == 64
        .set V\i, %zmm\i
.else
        .error "Unsupported Vector Length (VL)"
.endif
.endr

        // Function arguments
        .set KEY, %rdi          // Initially points to the start of the
                                // crypto_aes_ctx, then is advanced to
                                // point to the index 1 round key
        .set KEY32, %edi        // Available as temp register after all
                                // keystream blocks have been generated
        .set SRC, %rsi          // Pointer to next source data
        .set DST, %rdx          // Pointer to next destination data
        .set LEN, %ecx          // Remaining length in bytes.
                                // Note: _load_partial_block relies on
                                // this being in %ecx.
        .set LEN64, %rcx        // Zero-extend LEN before using!
        .set LEN8, %cl
.if \is_xctr
        .set XCTR_IV_PTR, %r8   // const u8 iv[AES_BLOCK_SIZE];
        .set XCTR_CTR, %r9      // u64 ctr;
.else
        .set LE_CTR_PTR, %r8    // const u64 le_ctr[2];
.endif

        // Additional local variables
        .set RNDKEYLAST_PTR, %r10
        .set AESDATA0, V0
        .set AESDATA0_XMM, %xmm0
        .set AESDATA1, V1
        .set AESDATA1_XMM, %xmm1
        .set AESDATA2, V2
        .set AESDATA3, V3
        .set AESDATA4, V4
        .set AESDATA5, V5
        .set AESDATA6, V6
        .set AESDATA7, V7
.if \is_xctr
        .set XCTR_IV, V8
.else
        .set BSWAP_MASK, V8
.endif
        .set LE_CTR, V9
        .set LE_CTR_XMM, %xmm9
        .set LE_CTR_INC1, V10
        .set LE_CTR_INC2, V11
        .set RNDKEY0, V12
        .set RNDKEYLAST, V13
        .set RNDKEY, V14

        // Create the first vector of counters.
.if \is_xctr
.if VL == 16
        vmovq XCTR_CTR, LE_CTR
.elseif VL == 32
        vmovq XCTR_CTR, LE_CTR_XMM
        inc XCTR_CTR
        vmovq XCTR_CTR, AESDATA0_XMM
        vinserti128 $1, AESDATA0_XMM, LE_CTR, LE_CTR
.else
        vpbroadcastq XCTR_CTR, LE_CTR
        vpsrldq $8, LE_CTR, LE_CTR
        vpaddq .Lctr_pattern(%rip), LE_CTR, LE_CTR
.endif
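        // Each 128-bit lane of LE_CTR now holds this lane's 64-bit block
        // counter (ctr, ctr+1, ...) in its low half and zero in its high half.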
        _vbroadcast128 (XCTR_IV_PTR), XCTR_IV
.else
        _vbroadcast128 (LE_CTR_PTR), LE_CTR
.if VL > 16
        vpaddq .Lctr_pattern(%rip), LE_CTR, LE_CTR
.endif
        _vbroadcast128 .Lbswap_mask(%rip), BSWAP_MASK
.endif

.if VL == 16
        _vbroadcast128 .Lone(%rip), LE_CTR_INC1
.elseif VL == 32
        _vbroadcast128 .Ltwo(%rip), LE_CTR_INC1
.else
        _vbroadcast128 .Lfour(%rip), LE_CTR_INC1
.endif
        vpsllq $1, LE_CTR_INC1, LE_CTR_INC2

        // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
        movl 480(KEY), %eax

        // Compute the pointer to the last round key.
        lea 6*16(KEY, %rax, 4), RNDKEYLAST_PTR
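        // (Each round key is 16 bytes and AES-{128,192,256} uses
        // key_length/4 + 6 rounds, so the last round key sits at byte offset
        // 4*key_length + 6*16 from the start of the key schedule.)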

        // Load the zero-th and last round keys.
        _vbroadcast128 (KEY), RNDKEY0
        _vbroadcast128 (RNDKEYLAST_PTR), RNDKEYLAST

        // Make KEY point to the first round key.
        add $16, KEY

        // This is the main loop, which encrypts 8 vectors of data at a time.
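        // ("add $-8*VL" and "sub $-8*VL" are used below instead of
        // "sub $8*VL" and "add $8*VL", presumably because for VL == 16 the
        // constant -128 fits in a sign-extended 8-bit immediate while +128
        // does not, giving shorter instruction encodings.)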
        add $-8*VL, LEN
        jl .Lloop_8x_done\@
.Lloop_8x\@:
        _prepare_2_ctr_vecs \is_xctr, 0, 1
        _prepare_2_ctr_vecs \is_xctr, 2, 3
        _prepare_2_ctr_vecs \is_xctr, 4, 5
        _prepare_2_ctr_vecs \is_xctr, 6, 7
        _aesenc_loop 0,1,2,3,4,5,6,7
        _aesenclast_and_xor 0,1,2,3,4,5,6,7
        sub $-8*VL, SRC
        sub $-8*VL, DST
        add $-8*VL, LEN
        jge .Lloop_8x\@
.Lloop_8x_done\@:
        sub $-8*VL, LEN
        jz .Ldone\@

        // 1 <= LEN < 8*VL. Generate 2, 4, or 8 more vectors of keystream
        // blocks, depending on the remaining LEN.

        _prepare_2_ctr_vecs \is_xctr, 0, 1
        _prepare_2_ctr_vecs \is_xctr, 2, 3
        cmp $4*VL, LEN
        jle .Lenc_tail_atmost4vecs\@

        // 4*VL < LEN < 8*VL. Generate 8 vectors of keystream blocks. Use the
        // first 4 to XOR 4 full vectors of data. Then XOR the remaining data.
        _prepare_2_ctr_vecs \is_xctr, 4, 5
        _prepare_2_ctr_vecs \is_xctr, 6, 7, final=1
        _aesenc_loop 0,1,2,3,4,5,6,7
        _aesenclast_and_xor 0,1,2,3
        vaesenclast RNDKEYLAST, AESDATA4, AESDATA0
        vaesenclast RNDKEYLAST, AESDATA5, AESDATA1
        vaesenclast RNDKEYLAST, AESDATA6, AESDATA2
        vaesenclast RNDKEYLAST, AESDATA7, AESDATA3
        sub $-4*VL, SRC
        sub $-4*VL, DST
        add $-4*VL, LEN
        cmp $1*VL-1, LEN
        jle .Lxor_tail_partial_vec_0\@
        _xor_data 0
        cmp $2*VL-1, LEN
        jle .Lxor_tail_partial_vec_1\@
        _xor_data 1
        cmp $3*VL-1, LEN
        jle .Lxor_tail_partial_vec_2\@
        _xor_data 2
        cmp $4*VL-1, LEN
        jle .Lxor_tail_partial_vec_3\@
        _xor_data 3
        jmp .Ldone\@

.Lenc_tail_atmost4vecs\@:
        cmp $2*VL, LEN
        jle .Lenc_tail_atmost2vecs\@

        // 2*VL < LEN <= 4*VL. Generate 4 vectors of keystream blocks. Use the
        // first 2 to XOR 2 full vectors of data. Then XOR the remaining data.
        _aesenc_loop 0,1,2,3
        _aesenclast_and_xor 0,1
        vaesenclast RNDKEYLAST, AESDATA2, AESDATA0
        vaesenclast RNDKEYLAST, AESDATA3, AESDATA1
        sub $-2*VL, SRC
        sub $-2*VL, DST
        add $-2*VL, LEN
        jmp .Lxor_tail_upto2vecs\@

.Lenc_tail_atmost2vecs\@:
        // 1 <= LEN <= 2*VL. Generate 2 vectors of keystream blocks. Then XOR
        // the remaining data.
        _aesenc_loop 0,1
        vaesenclast RNDKEYLAST, AESDATA0, AESDATA0
        vaesenclast RNDKEYLAST, AESDATA1, AESDATA1

.Lxor_tail_upto2vecs\@:
        cmp $1*VL-1, LEN
        jle .Lxor_tail_partial_vec_0\@
        _xor_data 0
        cmp $2*VL-1, LEN
        jle .Lxor_tail_partial_vec_1\@
        _xor_data 1
        jmp .Ldone\@

.Lxor_tail_partial_vec_1\@:
        add $-1*VL, LEN
        jz .Ldone\@
        sub $-1*VL, SRC
        sub $-1*VL, DST
        _vmovdqa AESDATA1, AESDATA0
        jmp .Lxor_tail_partial_vec_0\@

.Lxor_tail_partial_vec_2\@:
        add $-2*VL, LEN
        jz .Ldone\@
        sub $-2*VL, SRC
        sub $-2*VL, DST
        _vmovdqa AESDATA2, AESDATA0
        jmp .Lxor_tail_partial_vec_0\@

.Lxor_tail_partial_vec_3\@:
        add $-3*VL, LEN
        jz .Ldone\@
        sub $-3*VL, SRC
        sub $-3*VL, DST
        _vmovdqa AESDATA3, AESDATA0

.Lxor_tail_partial_vec_0\@:
        // XOR the remaining 1 <= LEN < VL bytes. It's easy if masked
        // loads/stores are available; otherwise it's a bit harder...
.if USE_AVX512
        mov $-1, %rax
        bzhi LEN64, %rax, %rax
        kmovq %rax, %k1
        vmovdqu8 (SRC), AESDATA1{%k1}{z}
        vpxord AESDATA1, AESDATA0, AESDATA0
        vmovdqu8 AESDATA0, (DST){%k1}
.else
.if VL == 32
        cmp $16, LEN
        jl 1f
        vpxor (SRC), AESDATA0_XMM, AESDATA1_XMM
        vmovdqu AESDATA1_XMM, (DST)
        add $16, SRC
        add $16, DST
        sub $16, LEN
        jz .Ldone\@
        vextracti128 $1, AESDATA0, AESDATA0_XMM
1:
.endif
        mov LEN, %r10d
        _load_partial_block SRC, AESDATA1_XMM, KEY, KEY32
        vpxor AESDATA1_XMM, AESDATA0_XMM, AESDATA0_XMM
        mov %r10d, %ecx
        _store_partial_block AESDATA0_XMM, DST, KEY, KEY32
.endif

.Ldone\@:
.if VL > 16
        vzeroupper
.endif
        RET
.endm

// Below are the definitions of the functions generated by the above macro.
// They have the following prototypes:
//
//    void aes_ctr64_crypt_##suffix(const struct crypto_aes_ctx *key,
//                                  const u8 *src, u8 *dst, int len,
//                                  const u64 le_ctr[2]);
//
//    void aes_xctr_crypt_##suffix(const struct crypto_aes_ctx *key,
//                                 const u8 *src, u8 *dst, int len,
//                                 const u8 iv[AES_BLOCK_SIZE], u64 ctr);
//
// Both functions generate |len| bytes of keystream, XOR it with the data from
// |src|, and write the result to |dst|. On non-final calls, |len| must be a
// multiple of 16. On the final call, |len| can be any value.
//
// aes_ctr64_crypt_* implement "regular" CTR, where the keystream is generated
// from a 128-bit big endian counter that increments by 1 for each AES block.
// HOWEVER, to keep the assembly code simple, some of the counter management is
// left to the caller. aes_ctr64_crypt_* take the counter in little endian
// form, only increment the low 64 bits internally, do the conversion to big
// endian internally, and don't write the updated counter back to memory. The
// caller is responsible for converting the starting IV to the little endian
// le_ctr, detecting the (very rare) case of a carry out of the low 64 bits
// being needed and splitting at that point with a carry done in between, and
// updating le_ctr after each part if the message is multi-part.
//
// aes_xctr_crypt_* implement XCTR as specified in "Length-preserving encryption
// with HCTR2" (https://eprint.iacr.org/2021/1441.pdf). XCTR is an
// easier-to-implement variant of CTR that uses little endian byte order and
// eliminates carries. |ctr| is the per-message block counter starting at 1.
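//
// (Concretely, the XCTR keystream block for counter value c is
// AES_K(iv ^ le128(c)), with c encoded little endian in the low 64 bits, so
// XCTR never needs any carry handling.)
//
// As a rough illustration only (this is not the actual kernel glue code, and
// the variable names below are made up), a caller of aes_ctr64_crypt_* might
// handle the rare carry out of the low 64 bits like this:
//
//    u64 le_ctr[2] = { get_unaligned_be64(iv + 8),   // low 64 bits
//                      get_unaligned_be64(iv) };     // high 64 bits
//    u64 nblocks = DIV_ROUND_UP(len, 16);
//    u64 blocks_before_wrap = 0 - le_ctr[0];         // 0 means 2^64
//    if (blocks_before_wrap && nblocks > blocks_before_wrap) {
//            int part = blocks_before_wrap * 16;
//            aes_ctr64_crypt_vaes_avx512(key, src, dst, part, le_ctr);
//            le_ctr[0] = 0;
//            le_ctr[1]++;        // propagate the carry into the high 64 bits
//            src += part; dst += part; len -= part;
//    }
//    aes_ctr64_crypt_vaes_avx512(key, src, dst, len, le_ctr);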

.set VL, 16
.set USE_AVX512, 0
SYM_TYPED_FUNC_START(aes_ctr64_crypt_aesni_avx)
        _aes_ctr_crypt 0
SYM_FUNC_END(aes_ctr64_crypt_aesni_avx)
SYM_TYPED_FUNC_START(aes_xctr_crypt_aesni_avx)
        _aes_ctr_crypt 1
SYM_FUNC_END(aes_xctr_crypt_aesni_avx)

#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
.set VL, 32
.set USE_AVX512, 0
SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx2)
        _aes_ctr_crypt 0
SYM_FUNC_END(aes_ctr64_crypt_vaes_avx2)
SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx2)
        _aes_ctr_crypt 1
SYM_FUNC_END(aes_xctr_crypt_vaes_avx2)

.set VL, 64
.set USE_AVX512, 1
SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx512)
        _aes_ctr_crypt 0
SYM_FUNC_END(aes_ctr64_crypt_vaes_avx512)
SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx512)
        _aes_ctr_crypt 1
SYM_FUNC_END(aes_xctr_crypt_vaes_avx512)
#endif // CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ