* configure.ac (HAVE_GCC_INLINE_ASM_VAES_VPCLMUL): New.
* src/g10lib.h (HWF_INTEL_VAES_VPCLMUL): New.
* src/hwf-x86.c (detect_x86_gnuc): Check for VAES and VPCLMUL.
* src/hwfeatures.c (hwflist): Add "intel-vaes-vpclmul".
--
Detect support for the VAES and VPCLMUL instruction sets, which allow
use of the AES and PCLMUL instructions with 256-bit and 512-bit vector
registers.

Signed-off-by: Jussi Kivilinna <[hidden email]>
---
 configure.ac     | 32 ++++++++++++++++++++++++++++++++
 src/g10lib.h     |  1 +
 src/hwf-x86.c    | 11 +++++++++--
 src/hwfeatures.c |  1 +
 4 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/configure.ac b/configure.ac
index 61553ff8..564d361b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1609,6 +1609,31 @@ if test "$gcry_cv_gcc_inline_asm_avx2" = "yes" ; then
 fi
 
 
+#
+# Check whether GCC inline assembler supports VAES and VPCLMUL instructions
+#
+AC_CACHE_CHECK([whether GCC inline assembler supports VAES and VPCLMUL instructions],
+       [gcry_cv_gcc_inline_asm_vaes_vpclmul],
+       [if test "$mpi_cpu_arch" != "x86" ||
+           test "$try_asm_modules" != "yes" ; then
+          gcry_cv_gcc_inline_asm_vaes_vpclmul="n/a"
+        else
+          gcry_cv_gcc_inline_asm_vaes_vpclmul=no
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
+          [[void a(void) {
+              __asm__("vaesenclast %%ymm7,%%ymm7,%%ymm1\n\t":::"cc");/*256-bit*/
+              __asm__("vaesenclast %%zmm7,%%zmm7,%%zmm1\n\t":::"cc");/*512-bit*/
+              __asm__("vpclmulqdq \$0,%%ymm7,%%ymm7,%%ymm1\n\t":::"cc");/*256-bit*/
+              __asm__("vpclmulqdq \$0,%%zmm7,%%zmm7,%%zmm1\n\t":::"cc");/*512-bit*/
+            }]], [ a(); ] )],
+          [gcry_cv_gcc_inline_asm_vaes_vpclmul=yes])
+        fi])
+if test "$gcry_cv_gcc_inline_asm_vaes_vpclmul" = "yes" ; then
+  AC_DEFINE(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL,1,
+     [Defined if inline assembler supports VAES and VPCLMUL instructions])
+fi
+
+
 #
 # Check whether GCC inline assembler supports BMI2 instructions
 #
@@ -2541,6 +2566,10 @@ if test "$found" = "1" ; then
          # Build with the SSSE3 implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ssse3-amd64.lo"
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ssse3-amd64-asm.lo"
+
+         # Build with the VAES/AVX2 implementation
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-vaes.lo"
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-vaes-avx2-amd64.lo"
       ;;
       arm*-*-*)
          # Build with the assembly implementation
@@ -2679,6 +2708,9 @@ if test "$found" = "1" ; then
         if test x"$aesnisupport" = xyes ; then
           # Build with the AES-NI/AVX2 implementation
           GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia-aesni-avx2-amd64.lo"
+
+          # Build with the VAES/AVX2 implementation
+          GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia-vaes-avx2-amd64.lo"
         fi
       fi
     fi
diff --git a/src/g10lib.h b/src/g10lib.h
index 243997eb..b0b73852 100644
--- a/src/g10lib.h
+++ b/src/g10lib.h
@@ -237,6 +237,7 @@ char **_gcry_strtokenize (const char *string, const char *delim);
 #define HWF_INTEL_FAST_VPGATHER (1 << 14)
 #define HWF_INTEL_RDTSC         (1 << 15)
 #define HWF_INTEL_SHAEXT        (1 << 16)
+#define HWF_INTEL_VAES_VPCLMUL  (1 << 17)
 
 #elif defined(HAVE_CPU_ARCH_ARM)
 
diff --git a/src/hwf-x86.c b/src/hwf-x86.c
index 9a9ed6d3..91e4c411 100644
--- a/src/hwf-x86.c
+++ b/src/hwf-x86.c
@@ -372,7 +372,7 @@ detect_x86_gnuc (void)
   if (max_cpuid_level >= 7 && (features & 0x00000001))
     {
       /* Get CPUID:7 contains further Intel feature flags. */
-      get_cpuid(7, NULL, &features, NULL, NULL);
+      get_cpuid(7, NULL, &features, &features2, NULL);
 
       /* Test bit 8 for BMI2. */
       if (features & 0x00000100)
@@ -390,7 +390,14 @@
 
       /* Test bit 29 for SHA Extensions. */
       if (features & (1 << 29))
-        result |= HWF_INTEL_SHAEXT;
+        result |= HWF_INTEL_SHAEXT;
+
+#if defined(ENABLE_AVX2_SUPPORT) && defined(ENABLE_AESNI_SUPPORT) && \
+    defined(ENABLE_PCLMUL_SUPPORT)
+      /* Test bit 9 for VAES and bit 10 for VPCLMULQDQ. */
+      if ((features2 & 0x00000200) && (features2 & 0x00000400))
+        result |= HWF_INTEL_VAES_VPCLMUL;
+#endif
     }
 
   return result;
diff --git a/src/hwfeatures.c b/src/hwfeatures.c
index db58d2a3..b47429bb 100644
--- a/src/hwfeatures.c
+++ b/src/hwfeatures.c
@@ -60,6 +60,7 @@ static struct
     { HWF_INTEL_FAST_VPGATHER, "intel-fast-vpgather" },
     { HWF_INTEL_RDTSC,         "intel-rdtsc" },
     { HWF_INTEL_SHAEXT,        "intel-shaext" },
+    { HWF_INTEL_VAES_VPCLMUL,  "intel-vaes-vpclmul" },
 #elif defined(HAVE_CPU_ARCH_ARM)
     { HWF_ARM_NEON,            "arm-neon" },
     { HWF_ARM_AES,             "arm-aes" },
-- 
2.27.0
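For reference, the CPUID bits tested in detect_x86_gnuc() above are leaf 7, sub-leaf 0, ECX bit 9 (VAES) and ECX bit 10 (VPCLMULQDQ). The following is a minimal standalone sketch of the same check using GCC/clang's <cpuid.h>; it is shown only as an illustration and is not part of the patch, nor does it use libgcrypt's internal get_cpuid() helper.

/* Illustrative sketch only: probe the CPUID bits the patch tests. */
#include <cpuid.h>
#include <stdio.h>

int
main (void)
{
  unsigned int eax, ebx, ecx, edx;

  /* CPUID leaf 7, sub-leaf 0: ECX bit 9 = VAES, ECX bit 10 = VPCLMULQDQ. */
  if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
    return 1;

  if ((ecx & (1 << 9)) && (ecx & (1 << 10)))
    puts ("VAES and VPCLMULQDQ available");
  else
    puts ("VAES/VPCLMULQDQ not available");

  /* A complete detector would also verify OS support for the wide vector
     state (OSXSAVE/XGETBV), as libgcrypt's AVX/AVX2 checks already do.  */
  return 0;
}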
* cipher/Makefile.am: Add 'camellia-aesni-avx2-amd64.h' and
'camellia-vaes-avx2-amd64.S'.
* cipher/camellia-aesni-avx2-amd64.S: New, old content moved to...
* cipher/camellia-aesni-avx2-amd64.h: ...here.
(IF_AESNI, IF_VAES, FUNC_NAME): New.
* cipher/camellia-vaes-avx2-amd64.S: New.
* cipher/camellia-glue.c (USE_VAES_AVX2): New.
(CAMELLIA_context): New member 'use_vaes_avx2'.
(_gcry_camellia_vaes_avx2_ctr_enc, _gcry_camellia_vaes_avx2_cbc_dec)
(_gcry_camellia_vaes_avx2_cfb_dec, _gcry_camellia_vaes_avx2_ocb_enc)
(_gcry_camellia_vaes_avx2_ocb_dec)
(_gcry_camellia_vaes_avx2_ocb_auth): New.
(camellia_setkey): Check for HWF_INTEL_VAES_VPCLMUL.
(_gcry_camellia_ctr_enc, _gcry_camellia_cbc_dec)
(_gcry_camellia_cfb_dec, _gcry_camellia_ocb_crypt)
(_gcry_camellia_ocb_auth): Add USE_VAES_AVX2 code.
* configure.ac: Add 'camellia-vaes-avx2-amd64.lo'.
--
The Camellia AES-NI/AVX2 implementation had to split each 256-bit
vector into 128-bit halves for AES processing; with VAES those 256-bit
registers can now be used directly.

Benchmarks on AMD Ryzen 5800X:

Before (AES-NI/AVX2):
 CAMELLIA128    |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        CBC dec |     0.539 ns/B      1769 MiB/s      2.62 c/B      4852
        CFB dec |     0.528 ns/B      1806 MiB/s      2.56 c/B    4852±1
        CTR enc |     0.552 ns/B      1728 MiB/s      2.68 c/B      4850
        OCB enc |     0.550 ns/B      1734 MiB/s      2.65 c/B      4825
        OCB dec |     0.577 ns/B      1653 MiB/s      2.78 c/B      4825
       OCB auth |     0.546 ns/B      1747 MiB/s      2.63 c/B      4825

After (VAES/AVX2; CBC dec ~13% faster, CFB dec/CTR/OCB ~20% faster):
 CAMELLIA128    |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        CBC dec |     0.477 ns/B      1999 MiB/s      2.31 c/B      4850
        CFB dec |     0.433 ns/B      2201 MiB/s      2.10 c/B      4850
        CTR enc |     0.438 ns/B      2176 MiB/s      2.13 c/B      4851
        OCB enc |     0.449 ns/B      2122 MiB/s      2.18 c/B      4850
        OCB dec |     0.468 ns/B      2038 MiB/s      2.27 c/B      4850
       OCB auth |     0.447 ns/B      2131 MiB/s      2.17 c/B      4850

Signed-off-by: Jussi Kivilinna <[hidden email]>
---
 cipher/Makefile.am                 |    1 +
 cipher/camellia-aesni-avx2-amd64.S | 1762 +--------------------------
 cipher/camellia-aesni-avx2-amd64.h | 1794 ++++++++++++++++++++++++++++
 cipher/camellia-glue.c             |  114 +-
 cipher/camellia-vaes-avx2-amd64.S  |   35 +
 5 files changed, 1943 insertions(+), 1763 deletions(-)
 create mode 100644 cipher/camellia-aesni-avx2-amd64.h
 create mode 100644 cipher/camellia-vaes-avx2-amd64.S

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index d6440056..75680fcd 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -133,6 +133,7 @@ EXTRA_libcipher_la_SOURCES = \
 	twofish-avx2-amd64.S \
 	rfc2268.c \
 	camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \
+	camellia-aesni-avx2-amd64.h camellia-vaes-avx2-amd64.S \
 	camellia-aesni-avx2-amd64.S camellia-arm.S camellia-aarch64.S \
 	blake2.c \
 	blake2b-amd64-avx2.S blake2s-amd64-avx.S
diff --git a/cipher/camellia-aesni-avx2-amd64.S b/cipher/camellia-aesni-avx2-amd64.S
index f620f040..5102d191 100644
--- a/cipher/camellia-aesni-avx2-amd64.S
+++ b/cipher/camellia-aesni-avx2-amd64.S
@@ -1,6 +1,6 @@
-/* camellia-avx2-aesni-amd64.S - AES-NI/AVX2 implementation of Camellia cipher
+/* camellia-aesni-avx2-amd64.S - AES-NI/AVX2 implementation of Camellia cipher
  *
- * Copyright (C) 2013-2015,2020 Jussi Kivilinna <[hidden email]>
+ * Copyright (C) 2021 Jussi Kivilinna <[hidden email]>
  *
  * This file is part of Libgcrypt.
* @@ -25,1758 +25,10 @@ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) -#include "asm-common-amd64.h" +#undef CAMELLIA_VAES_BUILD +#define FUNC_NAME(func) _gcry_camellia_aesni_avx2_ ## func -#define CAMELLIA_TABLE_BYTE_LEN 272 +#include "camellia-aesni-avx2-amd64.h" -/* struct CAMELLIA_context: */ -#define key_table 0 -#define key_bitlength CAMELLIA_TABLE_BYTE_LEN - -/* register macros */ -#define CTX %rdi -#define RIO %r8 - -/********************************************************************** - helper macros - **********************************************************************/ -#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ - vpand x, mask4bit, tmp0; \ - vpandn x, mask4bit, x; \ - vpsrld $4, x, x; \ - \ - vpshufb tmp0, lo_t, tmp0; \ - vpshufb x, hi_t, x; \ - vpxor tmp0, x, x; - -#define ymm0_x xmm0 -#define ymm1_x xmm1 -#define ymm2_x xmm2 -#define ymm3_x xmm3 -#define ymm4_x xmm4 -#define ymm5_x xmm5 -#define ymm6_x xmm6 -#define ymm7_x xmm7 -#define ymm8_x xmm8 -#define ymm9_x xmm9 -#define ymm10_x xmm10 -#define ymm11_x xmm11 -#define ymm12_x xmm12 -#define ymm13_x xmm13 -#define ymm14_x xmm14 -#define ymm15_x xmm15 - -/********************************************************************** - 32-way camellia - **********************************************************************/ - -/* - * IN: - * x0..x7: byte-sliced AB state - * mem_cd: register pointer storing CD state - * key: index for key material - * OUT: - * x0..x7: new byte-sliced CD state - */ -#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \ - t7, mem_cd, key) \ - /* \ - * S-function with AES subbytes \ - */ \ - vbroadcasti128 .Linv_shift_row rRIP, t4; \ - vpbroadcastd .L0f0f0f0f rRIP, t7; \ - vbroadcasti128 .Lpre_tf_lo_s1 rRIP, t5; \ - vbroadcasti128 .Lpre_tf_hi_s1 rRIP, t6; \ - vbroadcasti128 .Lpre_tf_lo_s4 rRIP, t2; \ - vbroadcasti128 .Lpre_tf_hi_s4 rRIP, t3; \ - \ - /* AES inverse shift rows */ \ - vpshufb t4, x0, x0; \ - vpshufb t4, x7, x7; \ - vpshufb t4, x3, x3; \ - vpshufb t4, x6, x6; \ - vpshufb t4, x2, x2; \ - vpshufb t4, x5, x5; \ - vpshufb t4, x1, x1; \ - vpshufb t4, x4, x4; \ - \ - /* prefilter sboxes 1, 2 and 3 */ \ - /* prefilter sbox 4 */ \ - filter_8bit(x0, t5, t6, t7, t4); \ - filter_8bit(x7, t5, t6, t7, t4); \ - vextracti128 $1, x0, t0##_x; \ - vextracti128 $1, x7, t1##_x; \ - filter_8bit(x3, t2, t3, t7, t4); \ - filter_8bit(x6, t2, t3, t7, t4); \ - vextracti128 $1, x3, t3##_x; \ - vextracti128 $1, x6, t2##_x; \ - filter_8bit(x2, t5, t6, t7, t4); \ - filter_8bit(x5, t5, t6, t7, t4); \ - filter_8bit(x1, t5, t6, t7, t4); \ - filter_8bit(x4, t5, t6, t7, t4); \ - \ - vpxor t4##_x, t4##_x, t4##_x; \ - \ - /* AES subbytes + AES shift rows */ \ - vextracti128 $1, x2, t6##_x; \ - vextracti128 $1, x5, t5##_x; \ - vaesenclast t4##_x, x0##_x, x0##_x; \ - vaesenclast t4##_x, t0##_x, t0##_x; \ - vaesenclast t4##_x, x7##_x, x7##_x; \ - vaesenclast t4##_x, t1##_x, t1##_x; \ - vaesenclast t4##_x, x3##_x, x3##_x; \ - vaesenclast t4##_x, t3##_x, t3##_x; \ - vaesenclast t4##_x, x6##_x, x6##_x; \ - vaesenclast t4##_x, t2##_x, t2##_x; \ - vinserti128 $1, t0##_x, x0, x0; \ - vinserti128 $1, t1##_x, x7, x7; \ - vinserti128 $1, t3##_x, x3, x3; \ - vinserti128 $1, t2##_x, x6, x6; \ - vextracti128 $1, x1, t3##_x; \ - vextracti128 $1, x4, t2##_x; \ - vbroadcasti128 .Lpost_tf_lo_s1 rRIP, t0; \ - vbroadcasti128 .Lpost_tf_hi_s1 rRIP, t1; \ - vaesenclast t4##_x, x2##_x, x2##_x; \ - vaesenclast t4##_x, t6##_x, t6##_x; \ - 
vaesenclast t4##_x, x5##_x, x5##_x; \ - vaesenclast t4##_x, t5##_x, t5##_x; \ - vaesenclast t4##_x, x1##_x, x1##_x; \ - vaesenclast t4##_x, t3##_x, t3##_x; \ - vaesenclast t4##_x, x4##_x, x4##_x; \ - vaesenclast t4##_x, t2##_x, t2##_x; \ - vinserti128 $1, t6##_x, x2, x2; \ - vinserti128 $1, t5##_x, x5, x5; \ - vinserti128 $1, t3##_x, x1, x1; \ - vinserti128 $1, t2##_x, x4, x4; \ - \ - /* postfilter sboxes 1 and 4 */ \ - vbroadcasti128 .Lpost_tf_lo_s3 rRIP, t2; \ - vbroadcasti128 .Lpost_tf_hi_s3 rRIP, t3; \ - filter_8bit(x0, t0, t1, t7, t4); \ - filter_8bit(x7, t0, t1, t7, t4); \ - filter_8bit(x3, t0, t1, t7, t6); \ - filter_8bit(x6, t0, t1, t7, t6); \ - \ - /* postfilter sbox 3 */ \ - vbroadcasti128 .Lpost_tf_lo_s2 rRIP, t4; \ - vbroadcasti128 .Lpost_tf_hi_s2 rRIP, t5; \ - filter_8bit(x2, t2, t3, t7, t6); \ - filter_8bit(x5, t2, t3, t7, t6); \ - \ - vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \ - \ - /* postfilter sbox 2 */ \ - filter_8bit(x1, t4, t5, t7, t2); \ - filter_8bit(x4, t4, t5, t7, t2); \ - vpxor t7, t7, t7; \ - \ - vpsrldq $1, t0, t1; \ - vpsrldq $2, t0, t2; \ - vpshufb t7, t1, t1; \ - vpsrldq $3, t0, t3; \ - \ - /* P-function */ \ - vpxor x5, x0, x0; \ - vpxor x6, x1, x1; \ - vpxor x7, x2, x2; \ - vpxor x4, x3, x3; \ - \ - vpshufb t7, t2, t2; \ - vpsrldq $4, t0, t4; \ - vpshufb t7, t3, t3; \ - vpsrldq $5, t0, t5; \ - vpshufb t7, t4, t4; \ - \ - vpxor x2, x4, x4; \ - vpxor x3, x5, x5; \ - vpxor x0, x6, x6; \ - vpxor x1, x7, x7; \ - \ - vpsrldq $6, t0, t6; \ - vpshufb t7, t5, t5; \ - vpshufb t7, t6, t6; \ - \ - vpxor x7, x0, x0; \ - vpxor x4, x1, x1; \ - vpxor x5, x2, x2; \ - vpxor x6, x3, x3; \ - \ - vpxor x3, x4, x4; \ - vpxor x0, x5, x5; \ - vpxor x1, x6, x6; \ - vpxor x2, x7, x7; /* note: high and low parts swapped */ \ - \ - /* Add key material and result to CD (x becomes new CD) */ \ - \ - vpxor t6, x1, x1; \ - vpxor 5 * 32(mem_cd), x1, x1; \ - \ - vpsrldq $7, t0, t6; \ - vpshufb t7, t0, t0; \ - vpshufb t7, t6, t7; \ - \ - vpxor t7, x0, x0; \ - vpxor 4 * 32(mem_cd), x0, x0; \ - \ - vpxor t5, x2, x2; \ - vpxor 6 * 32(mem_cd), x2, x2; \ - \ - vpxor t4, x3, x3; \ - vpxor 7 * 32(mem_cd), x3, x3; \ - \ - vpxor t3, x4, x4; \ - vpxor 0 * 32(mem_cd), x4, x4; \ - \ - vpxor t2, x5, x5; \ - vpxor 1 * 32(mem_cd), x5, x5; \ - \ - vpxor t1, x6, x6; \ - vpxor 2 * 32(mem_cd), x6, x6; \ - \ - vpxor t0, x7, x7; \ - vpxor 3 * 32(mem_cd), x7, x7; - -/* - * IN/OUT: - * x0..x7: byte-sliced AB state preloaded - * mem_ab: byte-sliced AB state in memory - * mem_cb: byte-sliced CD state in memory - */ -#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ - y6, y7, mem_ab, mem_cd, i, dir, store_ab) \ - roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ - y6, y7, mem_cd, (key_table + (i) * 8)(CTX)); \ - \ - vmovdqu x0, 4 * 32(mem_cd); \ - vmovdqu x1, 5 * 32(mem_cd); \ - vmovdqu x2, 6 * 32(mem_cd); \ - vmovdqu x3, 7 * 32(mem_cd); \ - vmovdqu x4, 0 * 32(mem_cd); \ - vmovdqu x5, 1 * 32(mem_cd); \ - vmovdqu x6, 2 * 32(mem_cd); \ - vmovdqu x7, 3 * 32(mem_cd); \ - \ - roundsm32(x4, x5, x6, x7, x0, x1, x2, x3, y0, y1, y2, y3, y4, y5, \ - y6, y7, mem_ab, (key_table + ((i) + (dir)) * 8)(CTX)); \ - \ - store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab); - -#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */ - -#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \ - /* Store new AB state */ \ - vmovdqu x4, 4 * 32(mem_ab); \ - vmovdqu x5, 5 * 32(mem_ab); \ - vmovdqu x6, 6 * 32(mem_ab); \ - vmovdqu x7, 7 * 32(mem_ab); \ - 
vmovdqu x0, 0 * 32(mem_ab); \ - vmovdqu x1, 1 * 32(mem_ab); \ - vmovdqu x2, 2 * 32(mem_ab); \ - vmovdqu x3, 3 * 32(mem_ab); - -#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ - y6, y7, mem_ab, mem_cd, i) \ - two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ - y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \ - two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ - y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \ - two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ - y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store); - -#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ - y6, y7, mem_ab, mem_cd, i) \ - two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ - y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \ - two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ - y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \ - two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ - y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store); - -/* - * IN: - * v0..3: byte-sliced 32-bit integers - * OUT: - * v0..3: (IN <<< 1) - */ -#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \ - vpcmpgtb v0, zero, t0; \ - vpaddb v0, v0, v0; \ - vpabsb t0, t0; \ - \ - vpcmpgtb v1, zero, t1; \ - vpaddb v1, v1, v1; \ - vpabsb t1, t1; \ - \ - vpcmpgtb v2, zero, t2; \ - vpaddb v2, v2, v2; \ - vpabsb t2, t2; \ - \ - vpor t0, v1, v1; \ - \ - vpcmpgtb v3, zero, t0; \ - vpaddb v3, v3, v3; \ - vpabsb t0, t0; \ - \ - vpor t1, v2, v2; \ - vpor t2, v3, v3; \ - vpor t0, v0, v0; - -/* - * IN: - * r: byte-sliced AB state in memory - * l: byte-sliced CD state in memory - * OUT: - * x0..x7: new byte-sliced CD state - */ -#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \ - tt1, tt2, tt3, kll, klr, krl, krr) \ - /* \ - * t0 = kll; \ - * t0 &= ll; \ - * lr ^= rol32(t0, 1); \ - */ \ - vpbroadcastd kll, t0; /* only lowest 32-bit used */ \ - vpxor tt0, tt0, tt0; \ - vpshufb tt0, t0, t3; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t2; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t1; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t0; \ - \ - vpand l0, t0, t0; \ - vpand l1, t1, t1; \ - vpand l2, t2, t2; \ - vpand l3, t3, t3; \ - \ - rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ - \ - vpxor l4, t0, l4; \ - vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ - vmovdqu l4, 4 * 32(l); \ - vpxor l5, t1, l5; \ - vmovdqu l5, 5 * 32(l); \ - vpxor l6, t2, l6; \ - vmovdqu l6, 6 * 32(l); \ - vpxor l7, t3, l7; \ - vmovdqu l7, 7 * 32(l); \ - \ - /* \ - * t2 = krr; \ - * t2 |= rr; \ - * rl ^= t2; \ - */ \ - \ - vpshufb tt0, t0, t3; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t2; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t1; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t0; \ - \ - vpor 4 * 32(r), t0, t0; \ - vpor 5 * 32(r), t1, t1; \ - vpor 6 * 32(r), t2, t2; \ - vpor 7 * 32(r), t3, t3; \ - \ - vpxor 0 * 32(r), t0, t0; \ - vpxor 1 * 32(r), t1, t1; \ - vpxor 2 * 32(r), t2, t2; \ - vpxor 3 * 32(r), t3, t3; \ - vmovdqu t0, 0 * 32(r); \ - vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ - vmovdqu t1, 1 * 32(r); \ - vmovdqu t2, 2 * 32(r); \ - vmovdqu t3, 3 * 32(r); \ - \ - /* \ - * t2 = krl; \ - * t2 &= rl; \ - * rr ^= rol32(t2, 1); \ - */ \ - vpshufb tt0, t0, t3; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t2; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t1; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t0; \ - \ - vpand 0 * 32(r), t0, 
t0; \ - vpand 1 * 32(r), t1, t1; \ - vpand 2 * 32(r), t2, t2; \ - vpand 3 * 32(r), t3, t3; \ - \ - rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ - \ - vpxor 4 * 32(r), t0, t0; \ - vpxor 5 * 32(r), t1, t1; \ - vpxor 6 * 32(r), t2, t2; \ - vpxor 7 * 32(r), t3, t3; \ - vmovdqu t0, 4 * 32(r); \ - vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ - vmovdqu t1, 5 * 32(r); \ - vmovdqu t2, 6 * 32(r); \ - vmovdqu t3, 7 * 32(r); \ - \ - /* \ - * t0 = klr; \ - * t0 |= lr; \ - * ll ^= t0; \ - */ \ - \ - vpshufb tt0, t0, t3; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t2; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t1; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t0; \ - \ - vpor l4, t0, t0; \ - vpor l5, t1, t1; \ - vpor l6, t2, t2; \ - vpor l7, t3, t3; \ - \ - vpxor l0, t0, l0; \ - vmovdqu l0, 0 * 32(l); \ - vpxor l1, t1, l1; \ - vmovdqu l1, 1 * 32(l); \ - vpxor l2, t2, l2; \ - vmovdqu l2, 2 * 32(l); \ - vpxor l3, t3, l3; \ - vmovdqu l3, 3 * 32(l); - -#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ - vpunpckhdq x1, x0, t2; \ - vpunpckldq x1, x0, x0; \ - \ - vpunpckldq x3, x2, t1; \ - vpunpckhdq x3, x2, x2; \ - \ - vpunpckhqdq t1, x0, x1; \ - vpunpcklqdq t1, x0, x0; \ - \ - vpunpckhqdq x2, t2, x3; \ - vpunpcklqdq x2, t2, x2; - -#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \ - a3, b3, c3, d3, st0, st1) \ - vmovdqu d2, st0; \ - vmovdqu d3, st1; \ - transpose_4x4(a0, a1, a2, a3, d2, d3); \ - transpose_4x4(b0, b1, b2, b3, d2, d3); \ - vmovdqu st0, d2; \ - vmovdqu st1, d3; \ - \ - vmovdqu a0, st0; \ - vmovdqu a1, st1; \ - transpose_4x4(c0, c1, c2, c3, a0, a1); \ - transpose_4x4(d0, d1, d2, d3, a0, a1); \ - \ - vbroadcasti128 .Lshufb_16x16b rRIP, a0; \ - vmovdqu st1, a1; \ - vpshufb a0, a2, a2; \ - vpshufb a0, a3, a3; \ - vpshufb a0, b0, b0; \ - vpshufb a0, b1, b1; \ - vpshufb a0, b2, b2; \ - vpshufb a0, b3, b3; \ - vpshufb a0, a1, a1; \ - vpshufb a0, c0, c0; \ - vpshufb a0, c1, c1; \ - vpshufb a0, c2, c2; \ - vpshufb a0, c3, c3; \ - vpshufb a0, d0, d0; \ - vpshufb a0, d1, d1; \ - vpshufb a0, d2, d2; \ - vpshufb a0, d3, d3; \ - vmovdqu d3, st1; \ - vmovdqu st0, d3; \ - vpshufb a0, d3, a0; \ - vmovdqu d2, st0; \ - \ - transpose_4x4(a0, b0, c0, d0, d2, d3); \ - transpose_4x4(a1, b1, c1, d1, d2, d3); \ - vmovdqu st0, d2; \ - vmovdqu st1, d3; \ - \ - vmovdqu b0, st0; \ - vmovdqu b1, st1; \ - transpose_4x4(a2, b2, c2, d2, b0, b1); \ - transpose_4x4(a3, b3, c3, d3, b0, b1); \ - vmovdqu st0, b0; \ - vmovdqu st1, b1; \ - /* does not adjust output bytes inside vectors */ - -/* load blocks to registers and apply pre-whitening */ -#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ - y6, y7, rio, key) \ - vpbroadcastq key, x0; \ - vpshufb .Lpack_bswap rRIP, x0, x0; \ - \ - vpxor 0 * 32(rio), x0, y7; \ - vpxor 1 * 32(rio), x0, y6; \ - vpxor 2 * 32(rio), x0, y5; \ - vpxor 3 * 32(rio), x0, y4; \ - vpxor 4 * 32(rio), x0, y3; \ - vpxor 5 * 32(rio), x0, y2; \ - vpxor 6 * 32(rio), x0, y1; \ - vpxor 7 * 32(rio), x0, y0; \ - vpxor 8 * 32(rio), x0, x7; \ - vpxor 9 * 32(rio), x0, x6; \ - vpxor 10 * 32(rio), x0, x5; \ - vpxor 11 * 32(rio), x0, x4; \ - vpxor 12 * 32(rio), x0, x3; \ - vpxor 13 * 32(rio), x0, x2; \ - vpxor 14 * 32(rio), x0, x1; \ - vpxor 15 * 32(rio), x0, x0; - -/* byteslice pre-whitened blocks and store to temporary memory */ -#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ - y6, y7, mem_ab, mem_cd) \ - byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \ - y4, y5, y6, y7, (mem_ab), (mem_cd)); \ - 
\ - vmovdqu x0, 0 * 32(mem_ab); \ - vmovdqu x1, 1 * 32(mem_ab); \ - vmovdqu x2, 2 * 32(mem_ab); \ - vmovdqu x3, 3 * 32(mem_ab); \ - vmovdqu x4, 4 * 32(mem_ab); \ - vmovdqu x5, 5 * 32(mem_ab); \ - vmovdqu x6, 6 * 32(mem_ab); \ - vmovdqu x7, 7 * 32(mem_ab); \ - vmovdqu y0, 0 * 32(mem_cd); \ - vmovdqu y1, 1 * 32(mem_cd); \ - vmovdqu y2, 2 * 32(mem_cd); \ - vmovdqu y3, 3 * 32(mem_cd); \ - vmovdqu y4, 4 * 32(mem_cd); \ - vmovdqu y5, 5 * 32(mem_cd); \ - vmovdqu y6, 6 * 32(mem_cd); \ - vmovdqu y7, 7 * 32(mem_cd); - -/* de-byteslice, apply post-whitening and store blocks */ -#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \ - y5, y6, y7, key, stack_tmp0, stack_tmp1) \ - byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \ - y3, y7, x3, x7, stack_tmp0, stack_tmp1); \ - \ - vmovdqu x0, stack_tmp0; \ - \ - vpbroadcastq key, x0; \ - vpshufb .Lpack_bswap rRIP, x0, x0; \ - \ - vpxor x0, y7, y7; \ - vpxor x0, y6, y6; \ - vpxor x0, y5, y5; \ - vpxor x0, y4, y4; \ - vpxor x0, y3, y3; \ - vpxor x0, y2, y2; \ - vpxor x0, y1, y1; \ - vpxor x0, y0, y0; \ - vpxor x0, x7, x7; \ - vpxor x0, x6, x6; \ - vpxor x0, x5, x5; \ - vpxor x0, x4, x4; \ - vpxor x0, x3, x3; \ - vpxor x0, x2, x2; \ - vpxor x0, x1, x1; \ - vpxor stack_tmp0, x0, x0; - -#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ - y6, y7, rio) \ - vmovdqu x0, 0 * 32(rio); \ - vmovdqu x1, 1 * 32(rio); \ - vmovdqu x2, 2 * 32(rio); \ - vmovdqu x3, 3 * 32(rio); \ - vmovdqu x4, 4 * 32(rio); \ - vmovdqu x5, 5 * 32(rio); \ - vmovdqu x6, 6 * 32(rio); \ - vmovdqu x7, 7 * 32(rio); \ - vmovdqu y0, 8 * 32(rio); \ - vmovdqu y1, 9 * 32(rio); \ - vmovdqu y2, 10 * 32(rio); \ - vmovdqu y3, 11 * 32(rio); \ - vmovdqu y4, 12 * 32(rio); \ - vmovdqu y5, 13 * 32(rio); \ - vmovdqu y6, 14 * 32(rio); \ - vmovdqu y7, 15 * 32(rio); - -.text -.align 32 - -#define SHUFB_BYTES(idx) \ - 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) - -.Lshufb_16x16b: - .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) - .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) - -.Lpack_bswap: - .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 - .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 - -/* For CTR-mode IV byteswap */ -.Lbswap128_mask: - .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 - -/* - * pre-SubByte transform - * - * pre-lookup for sbox1, sbox2, sbox3: - * swap_bitendianness( - * isom_map_camellia_to_aes( - * camellia_f( - * swap_bitendianess(in) - * ) - * ) - * ) - * - * (note: '⊕ 0xc5' inside camellia_f()) - */ -.Lpre_tf_lo_s1: - .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86 - .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88 -.Lpre_tf_hi_s1: - .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a - .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23 - -/* - * pre-SubByte transform - * - * pre-lookup for sbox4: - * swap_bitendianness( - * isom_map_camellia_to_aes( - * camellia_f( - * swap_bitendianess(in <<< 1) - * ) - * ) - * ) - * - * (note: '⊕ 0xc5' inside camellia_f()) - */ -.Lpre_tf_lo_s4: - .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25 - .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74 -.Lpre_tf_hi_s4: - .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72 - .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf - -/* - * post-SubByte transform - * - * post-lookup for sbox1, sbox4: - * swap_bitendianness( - * camellia_h( - * isom_map_aes_to_camellia( - * swap_bitendianness( - * aes_inverse_affine_transform(in) - * ) - * ) - * ) - * ) - * - * 
(note: '⊕ 0x6e' inside camellia_h()) - */ -.Lpost_tf_lo_s1: - .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31 - .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1 -.Lpost_tf_hi_s1: - .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8 - .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c - -/* - * post-SubByte transform - * - * post-lookup for sbox2: - * swap_bitendianness( - * camellia_h( - * isom_map_aes_to_camellia( - * swap_bitendianness( - * aes_inverse_affine_transform(in) - * ) - * ) - * ) - * ) <<< 1 - * - * (note: '⊕ 0x6e' inside camellia_h()) - */ -.Lpost_tf_lo_s2: - .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62 - .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3 -.Lpost_tf_hi_s2: - .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51 - .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18 - -/* - * post-SubByte transform - * - * post-lookup for sbox3: - * swap_bitendianness( - * camellia_h( - * isom_map_aes_to_camellia( - * swap_bitendianness( - * aes_inverse_affine_transform(in) - * ) - * ) - * ) - * ) >>> 1 - * - * (note: '⊕ 0x6e' inside camellia_h()) - */ -.Lpost_tf_lo_s3: - .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98 - .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8 -.Lpost_tf_hi_s3: - .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54 - .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06 - -/* For isolating SubBytes from AESENCLAST, inverse shift row */ -.Linv_shift_row: - .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b - .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 - -.align 4 -/* 4-bit mask */ -.L0f0f0f0f: - .long 0x0f0f0f0f - - -.align 8 -ELF(.type __camellia_enc_blk32,@function;) - -__camellia_enc_blk32: - /* input: - * %rdi: ctx, CTX - * %rax: temporary storage, 512 bytes - * %r8d: 24 for 16 byte key, 32 for larger - * %ymm0..%ymm15: 32 plaintext blocks - * output: - * %ymm0..%ymm15: 32 encrypted blocks, order swapped: - * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 - */ - CFI_STARTPROC(); - - leaq 8 * 32(%rax), %rcx; - - leaq (-8 * 8)(CTX, %r8, 8), %r8; - - inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, - %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, - %ymm15, %rax, %rcx); - -.align 8 -.Lenc_loop: - enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, - %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, - %ymm15, %rax, %rcx, 0); - - cmpq %r8, CTX; - je .Lenc_done; - leaq (8 * 8)(CTX), CTX; - - fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, - %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, - %ymm15, - ((key_table) + 0)(CTX), - ((key_table) + 4)(CTX), - ((key_table) + 8)(CTX), - ((key_table) + 12)(CTX)); - jmp .Lenc_loop; - -.align 8 -.Lenc_done: - /* load CD for output */ - vmovdqu 0 * 32(%rcx), %ymm8; - vmovdqu 1 * 32(%rcx), %ymm9; - vmovdqu 2 * 32(%rcx), %ymm10; - vmovdqu 3 * 32(%rcx), %ymm11; - vmovdqu 4 * 32(%rcx), %ymm12; - vmovdqu 5 * 32(%rcx), %ymm13; - vmovdqu 6 * 32(%rcx), %ymm14; - vmovdqu 7 * 32(%rcx), %ymm15; - - outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, - %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, - %ymm15, ((key_table) + 8 * 8)(%r8), (%rax), 1 * 32(%rax)); - - ret; - CFI_ENDPROC(); -ELF(.size __camellia_enc_blk32,.-__camellia_enc_blk32;) - -.align 8 -ELF(.type __camellia_dec_blk32,@function;) - -__camellia_dec_blk32: - /* input: - * %rdi: ctx, CTX - * %rax: temporary storage, 512 bytes - * %r8d: 24 for 16 byte key, 32 for larger - * %ymm0..%ymm15: 16 encrypted blocks - * output: 
- * %ymm0..%ymm15: 16 plaintext blocks, order swapped: - * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 - */ - CFI_STARTPROC(); - - movq %r8, %rcx; - movq CTX, %r8 - leaq (-8 * 8)(CTX, %rcx, 8), CTX; - - leaq 8 * 32(%rax), %rcx; - - inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, - %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, - %ymm15, %rax, %rcx); - -.align 8 -.Ldec_loop: - dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, - %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, - %ymm15, %rax, %rcx, 0); - - cmpq %r8, CTX; - je .Ldec_done; - - fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, - %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, - %ymm15, - ((key_table) + 8)(CTX), - ((key_table) + 12)(CTX), - ((key_table) + 0)(CTX), - ((key_table) + 4)(CTX)); - - leaq (-8 * 8)(CTX), CTX; - jmp .Ldec_loop; - -.align 8 -.Ldec_done: - /* load CD for output */ - vmovdqu 0 * 32(%rcx), %ymm8; - vmovdqu 1 * 32(%rcx), %ymm9; - vmovdqu 2 * 32(%rcx), %ymm10; - vmovdqu 3 * 32(%rcx), %ymm11; - vmovdqu 4 * 32(%rcx), %ymm12; - vmovdqu 5 * 32(%rcx), %ymm13; - vmovdqu 6 * 32(%rcx), %ymm14; - vmovdqu 7 * 32(%rcx), %ymm15; - - outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, - %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, - %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax)); - - ret; - CFI_ENDPROC(); -ELF(.size __camellia_dec_blk32,.-__camellia_dec_blk32;) - -#define inc_le128(x, minus_one, tmp) \ - vpcmpeqq minus_one, x, tmp; \ - vpsubq minus_one, x, x; \ - vpslldq $8, tmp, tmp; \ - vpsubq tmp, x, x; - -.align 8 -.globl _gcry_camellia_aesni_avx2_ctr_enc -ELF(.type _gcry_camellia_aesni_avx2_ctr_enc,@function;) - -_gcry_camellia_aesni_avx2_ctr_enc: - /* input: - * %rdi: ctx, CTX - * %rsi: dst (32 blocks) - * %rdx: src (32 blocks) - * %rcx: iv (big endian, 128bit) - */ - CFI_STARTPROC(); - - pushq %rbp; - CFI_PUSH(%rbp); - movq %rsp, %rbp; - CFI_DEF_CFA_REGISTER(%rbp); - - movq 8(%rcx), %r11; - bswapq %r11; - - vzeroupper; - - cmpl $128, key_bitlength(CTX); - movl $32, %r8d; - movl $24, %eax; - cmovel %eax, %r8d; /* max */ - - subq $(16 * 32), %rsp; - andq $~63, %rsp; - movq %rsp, %rax; - - vpcmpeqd %ymm15, %ymm15, %ymm15; - vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */ - - /* load IV and byteswap */ - vmovdqu (%rcx), %xmm0; - vpshufb .Lbswap128_mask rRIP, %xmm0, %xmm0; - vmovdqa %xmm0, %xmm1; - inc_le128(%xmm0, %xmm15, %xmm14); - vbroadcasti128 .Lbswap128_mask rRIP, %ymm14; - vinserti128 $1, %xmm0, %ymm1, %ymm0; - vpshufb %ymm14, %ymm0, %ymm13; - vmovdqu %ymm13, 15 * 32(%rax); - - /* check need for handling 64-bit overflow and carry */ - cmpq $(0xffffffffffffffff - 32), %r11; - ja .Lload_ctr_carry; - - /* construct IVs */ - vpaddq %ymm15, %ymm15, %ymm15; /* ab: -2:0 ; cd: -2:0 */ - vpsubq %ymm15, %ymm0, %ymm0; - vpshufb %ymm14, %ymm0, %ymm13; - vmovdqu %ymm13, 14 * 32(%rax); - vpsubq %ymm15, %ymm0, %ymm0; - vpshufb %ymm14, %ymm0, %ymm13; - vmovdqu %ymm13, 13 * 32(%rax); - vpsubq %ymm15, %ymm0, %ymm0; - vpshufb %ymm14, %ymm0, %ymm12; - vpsubq %ymm15, %ymm0, %ymm0; - vpshufb %ymm14, %ymm0, %ymm11; - vpsubq %ymm15, %ymm0, %ymm0; - vpshufb %ymm14, %ymm0, %ymm10; - vpsubq %ymm15, %ymm0, %ymm0; - vpshufb %ymm14, %ymm0, %ymm9; - vpsubq %ymm15, %ymm0, %ymm0; - vpshufb %ymm14, %ymm0, %ymm8; - vpsubq %ymm15, %ymm0, %ymm0; - vpshufb %ymm14, %ymm0, %ymm7; - vpsubq %ymm15, %ymm0, %ymm0; - vpshufb %ymm14, %ymm0, %ymm6; - vpsubq %ymm15, %ymm0, %ymm0; - vpshufb %ymm14, %ymm0, %ymm5; - vpsubq %ymm15, %ymm0, 
%ymm0; - vpshufb %ymm14, %ymm0, %ymm4; - vpsubq %ymm15, %ymm0, %ymm0; - vpshufb %ymm14, %ymm0, %ymm3; - vpsubq %ymm15, %ymm0, %ymm0; - vpshufb %ymm14, %ymm0, %ymm2; - vpsubq %ymm15, %ymm0, %ymm0; - vpshufb %ymm14, %ymm0, %ymm1; - vpsubq %ymm15, %ymm0, %ymm0; /* +30 ; +31 */ - vpsubq %xmm15, %xmm0, %xmm13; /* +32 */ - vpshufb %ymm14, %ymm0, %ymm0; - vpshufb %xmm14, %xmm13, %xmm13; - vmovdqu %xmm13, (%rcx); - - jmp .Lload_ctr_done; - -.align 4 -.Lload_ctr_carry: - /* construct IVs */ - inc_le128(%ymm0, %ymm15, %ymm13); /* ab: le1 ; cd: le2 */ - inc_le128(%ymm0, %ymm15, %ymm13); /* ab: le2 ; cd: le3 */ - vpshufb %ymm14, %ymm0, %ymm13; - vmovdqu %ymm13, 14 * 32(%rax); - inc_le128(%ymm0, %ymm15, %ymm13); - inc_le128(%ymm0, %ymm15, %ymm13); - vpshufb %ymm14, %ymm0, %ymm13; - vmovdqu %ymm13, 13 * 32(%rax); - inc_le128(%ymm0, %ymm15, %ymm13); - inc_le128(%ymm0, %ymm15, %ymm13); - vpshufb %ymm14, %ymm0, %ymm12; - inc_le128(%ymm0, %ymm15, %ymm13); - inc_le128(%ymm0, %ymm15, %ymm13); - vpshufb %ymm14, %ymm0, %ymm11; - inc_le128(%ymm0, %ymm15, %ymm13); - inc_le128(%ymm0, %ymm15, %ymm13); - vpshufb %ymm14, %ymm0, %ymm10; - inc_le128(%ymm0, %ymm15, %ymm13); - inc_le128(%ymm0, %ymm15, %ymm13); - vpshufb %ymm14, %ymm0, %ymm9; - inc_le128(%ymm0, %ymm15, %ymm13); - inc_le128(%ymm0, %ymm15, %ymm13); - vpshufb %ymm14, %ymm0, %ymm8; - inc_le128(%ymm0, %ymm15, %ymm13); - inc_le128(%ymm0, %ymm15, %ymm13); - vpshufb %ymm14, %ymm0, %ymm7; - inc_le128(%ymm0, %ymm15, %ymm13); - inc_le128(%ymm0, %ymm15, %ymm13); - vpshufb %ymm14, %ymm0, %ymm6; - inc_le128(%ymm0, %ymm15, %ymm13); - inc_le128(%ymm0, %ymm15, %ymm13); - vpshufb %ymm14, %ymm0, %ymm5; - inc_le128(%ymm0, %ymm15, %ymm13); - inc_le128(%ymm0, %ymm15, %ymm13); - vpshufb %ymm14, %ymm0, %ymm4; - inc_le128(%ymm0, %ymm15, %ymm13); - inc_le128(%ymm0, %ymm15, %ymm13); - vpshufb %ymm14, %ymm0, %ymm3; - inc_le128(%ymm0, %ymm15, %ymm13); - inc_le128(%ymm0, %ymm15, %ymm13); - vpshufb %ymm14, %ymm0, %ymm2; - inc_le128(%ymm0, %ymm15, %ymm13); - inc_le128(%ymm0, %ymm15, %ymm13); - vpshufb %ymm14, %ymm0, %ymm1; - inc_le128(%ymm0, %ymm15, %ymm13); - inc_le128(%ymm0, %ymm15, %ymm13); - vextracti128 $1, %ymm0, %xmm13; - vpshufb %ymm14, %ymm0, %ymm0; - inc_le128(%xmm13, %xmm15, %xmm14); - vpshufb .Lbswap128_mask rRIP, %xmm13, %xmm13; - vmovdqu %xmm13, (%rcx); - -.align 4 -.Lload_ctr_done: - /* inpack16_pre: */ - vpbroadcastq (key_table)(CTX), %ymm15; - vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; - vpxor %ymm0, %ymm15, %ymm0; - vpxor %ymm1, %ymm15, %ymm1; - vpxor %ymm2, %ymm15, %ymm2; - vpxor %ymm3, %ymm15, %ymm3; - vpxor %ymm4, %ymm15, %ymm4; - vpxor %ymm5, %ymm15, %ymm5; - vpxor %ymm6, %ymm15, %ymm6; - vpxor %ymm7, %ymm15, %ymm7; - vpxor %ymm8, %ymm15, %ymm8; - vpxor %ymm9, %ymm15, %ymm9; - vpxor %ymm10, %ymm15, %ymm10; - vpxor %ymm11, %ymm15, %ymm11; - vpxor %ymm12, %ymm15, %ymm12; - vpxor 13 * 32(%rax), %ymm15, %ymm13; - vpxor 14 * 32(%rax), %ymm15, %ymm14; - vpxor 15 * 32(%rax), %ymm15, %ymm15; - - call __camellia_enc_blk32; - - vpxor 0 * 32(%rdx), %ymm7, %ymm7; - vpxor 1 * 32(%rdx), %ymm6, %ymm6; - vpxor 2 * 32(%rdx), %ymm5, %ymm5; - vpxor 3 * 32(%rdx), %ymm4, %ymm4; - vpxor 4 * 32(%rdx), %ymm3, %ymm3; - vpxor 5 * 32(%rdx), %ymm2, %ymm2; - vpxor 6 * 32(%rdx), %ymm1, %ymm1; - vpxor 7 * 32(%rdx), %ymm0, %ymm0; - vpxor 8 * 32(%rdx), %ymm15, %ymm15; - vpxor 9 * 32(%rdx), %ymm14, %ymm14; - vpxor 10 * 32(%rdx), %ymm13, %ymm13; - vpxor 11 * 32(%rdx), %ymm12, %ymm12; - vpxor 12 * 32(%rdx), %ymm11, %ymm11; - vpxor 13 * 32(%rdx), %ymm10, %ymm10; - vpxor 14 * 32(%rdx), %ymm9, %ymm9; - 
vpxor 15 * 32(%rdx), %ymm8, %ymm8; - leaq 32 * 16(%rdx), %rdx; - - write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, - %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, - %ymm8, %rsi); - - vzeroall; - - leave; - CFI_LEAVE(); - ret; - CFI_ENDPROC(); -ELF(.size _gcry_camellia_aesni_avx2_ctr_enc,.-_gcry_camellia_aesni_avx2_ctr_enc;) - -.align 8 -.globl _gcry_camellia_aesni_avx2_cbc_dec -ELF(.type _gcry_camellia_aesni_avx2_cbc_dec,@function;) - -_gcry_camellia_aesni_avx2_cbc_dec: - /* input: - * %rdi: ctx, CTX - * %rsi: dst (32 blocks) - * %rdx: src (32 blocks) - * %rcx: iv - */ - CFI_STARTPROC(); - - pushq %rbp; - CFI_PUSH(%rbp); - movq %rsp, %rbp; - CFI_DEF_CFA_REGISTER(%rbp); - - vzeroupper; - - movq %rcx, %r9; - - cmpl $128, key_bitlength(CTX); - movl $32, %r8d; - movl $24, %eax; - cmovel %eax, %r8d; /* max */ - - subq $(16 * 32), %rsp; - andq $~63, %rsp; - movq %rsp, %rax; - - inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, - %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, - %ymm15, %rdx, (key_table)(CTX, %r8, 8)); - - call __camellia_dec_blk32; - - /* XOR output with IV */ - vmovdqu %ymm8, (%rax); - vmovdqu (%r9), %xmm8; - vinserti128 $1, (%rdx), %ymm8, %ymm8; - vpxor %ymm8, %ymm7, %ymm7; - vmovdqu (%rax), %ymm8; - vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6; - vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5; - vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4; - vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3; - vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2; - vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1; - vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0; - vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15; - vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14; - vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13; - vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12; - vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11; - vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10; - vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9; - vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8; - movq (15 * 32 + 16 + 0)(%rdx), %rax; - movq (15 * 32 + 16 + 8)(%rdx), %rcx; - - write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, - %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, - %ymm8, %rsi); - - /* store new IV */ - movq %rax, (0)(%r9); - movq %rcx, (8)(%r9); - - vzeroall; - - leave; - CFI_LEAVE(); - ret; - CFI_ENDPROC(); -ELF(.size _gcry_camellia_aesni_avx2_cbc_dec,.-_gcry_camellia_aesni_avx2_cbc_dec;) - -.align 8 -.globl _gcry_camellia_aesni_avx2_cfb_dec -ELF(.type _gcry_camellia_aesni_avx2_cfb_dec,@function;) - -_gcry_camellia_aesni_avx2_cfb_dec: - /* input: - * %rdi: ctx, CTX - * %rsi: dst (32 blocks) - * %rdx: src (32 blocks) - * %rcx: iv - */ - CFI_STARTPROC(); - - pushq %rbp; - CFI_PUSH(%rbp); - movq %rsp, %rbp; - CFI_DEF_CFA_REGISTER(%rbp); - - vzeroupper; - - cmpl $128, key_bitlength(CTX); - movl $32, %r8d; - movl $24, %eax; - cmovel %eax, %r8d; /* max */ - - subq $(16 * 32), %rsp; - andq $~63, %rsp; - movq %rsp, %rax; - - /* inpack16_pre: */ - vpbroadcastq (key_table)(CTX), %ymm0; - vpshufb .Lpack_bswap rRIP, %ymm0, %ymm0; - vmovdqu (%rcx), %xmm15; - vinserti128 $1, (%rdx), %ymm15, %ymm15; - vpxor %ymm15, %ymm0, %ymm15; - vmovdqu (15 * 32 + 16)(%rdx), %xmm1; - vmovdqu %xmm1, (%rcx); /* store new IV */ - vpxor (0 * 32 + 16)(%rdx), %ymm0, %ymm14; - vpxor (1 * 32 + 16)(%rdx), %ymm0, %ymm13; - vpxor (2 * 32 + 16)(%rdx), %ymm0, %ymm12; - vpxor (3 * 32 + 16)(%rdx), %ymm0, %ymm11; - vpxor (4 * 32 + 16)(%rdx), %ymm0, %ymm10; - vpxor (5 * 32 + 16)(%rdx), %ymm0, %ymm9; - vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm8; - vpxor (7 * 32 + 16)(%rdx), %ymm0, 
%ymm7; - vpxor (8 * 32 + 16)(%rdx), %ymm0, %ymm6; - vpxor (9 * 32 + 16)(%rdx), %ymm0, %ymm5; - vpxor (10 * 32 + 16)(%rdx), %ymm0, %ymm4; - vpxor (11 * 32 + 16)(%rdx), %ymm0, %ymm3; - vpxor (12 * 32 + 16)(%rdx), %ymm0, %ymm2; - vpxor (13 * 32 + 16)(%rdx), %ymm0, %ymm1; - vpxor (14 * 32 + 16)(%rdx), %ymm0, %ymm0; - - call __camellia_enc_blk32; - - vpxor 0 * 32(%rdx), %ymm7, %ymm7; - vpxor 1 * 32(%rdx), %ymm6, %ymm6; - vpxor 2 * 32(%rdx), %ymm5, %ymm5; - vpxor 3 * 32(%rdx), %ymm4, %ymm4; - vpxor 4 * 32(%rdx), %ymm3, %ymm3; - vpxor 5 * 32(%rdx), %ymm2, %ymm2; - vpxor 6 * 32(%rdx), %ymm1, %ymm1; - vpxor 7 * 32(%rdx), %ymm0, %ymm0; - vpxor 8 * 32(%rdx), %ymm15, %ymm15; - vpxor 9 * 32(%rdx), %ymm14, %ymm14; - vpxor 10 * 32(%rdx), %ymm13, %ymm13; - vpxor 11 * 32(%rdx), %ymm12, %ymm12; - vpxor 12 * 32(%rdx), %ymm11, %ymm11; - vpxor 13 * 32(%rdx), %ymm10, %ymm10; - vpxor 14 * 32(%rdx), %ymm9, %ymm9; - vpxor 15 * 32(%rdx), %ymm8, %ymm8; - - write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, - %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, - %ymm8, %rsi); - - vzeroall; - - leave; - CFI_LEAVE(); - ret; - CFI_ENDPROC(); -ELF(.size _gcry_camellia_aesni_avx2_cfb_dec,.-_gcry_camellia_aesni_avx2_cfb_dec;) - -.align 8 -.globl _gcry_camellia_aesni_avx2_ocb_enc -ELF(.type _gcry_camellia_aesni_avx2_ocb_enc,@function;) - -_gcry_camellia_aesni_avx2_ocb_enc: - /* input: - * %rdi: ctx, CTX - * %rsi: dst (32 blocks) - * %rdx: src (32 blocks) - * %rcx: offset - * %r8 : checksum - * %r9 : L pointers (void *L[32]) - */ - CFI_STARTPROC(); - - pushq %rbp; - CFI_PUSH(%rbp); - movq %rsp, %rbp; - CFI_DEF_CFA_REGISTER(%rbp); - - vzeroupper; - - subq $(16 * 32 + 4 * 8), %rsp; - andq $~63, %rsp; - movq %rsp, %rax; - - movq %r10, (16 * 32 + 0 * 8)(%rsp); - movq %r11, (16 * 32 + 1 * 8)(%rsp); - movq %r12, (16 * 32 + 2 * 8)(%rsp); - movq %r13, (16 * 32 + 3 * 8)(%rsp); - CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8); - CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8); - CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8); - CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8); - - vmovdqu (%rcx), %xmm14; - vmovdqu (%r8), %xmm13; - - /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ - /* Checksum_i = Checksum_{i-1} xor P_i */ - /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ - -#define OCB_INPUT(n, l0reg, l1reg, yreg) \ - vmovdqu (n * 32)(%rdx), yreg; \ - vpxor (l0reg), %xmm14, %xmm15; \ - vpxor (l1reg), %xmm15, %xmm14; \ - vinserti128 $1, %xmm14, %ymm15, %ymm15; \ - vpxor yreg, %ymm13, %ymm13; \ - vpxor yreg, %ymm15, yreg; \ - vmovdqu %ymm15, (n * 32)(%rsi); - - movq (0 * 8)(%r9), %r10; - movq (1 * 8)(%r9), %r11; - movq (2 * 8)(%r9), %r12; - movq (3 * 8)(%r9), %r13; - OCB_INPUT(0, %r10, %r11, %ymm0); - vmovdqu %ymm0, (15 * 32)(%rax); - OCB_INPUT(1, %r12, %r13, %ymm0); - vmovdqu %ymm0, (14 * 32)(%rax); - movq (4 * 8)(%r9), %r10; - movq (5 * 8)(%r9), %r11; - movq (6 * 8)(%r9), %r12; - movq (7 * 8)(%r9), %r13; - OCB_INPUT(2, %r10, %r11, %ymm0); - vmovdqu %ymm0, (13 * 32)(%rax); - OCB_INPUT(3, %r12, %r13, %ymm12); - movq (8 * 8)(%r9), %r10; - movq (9 * 8)(%r9), %r11; - movq (10 * 8)(%r9), %r12; - movq (11 * 8)(%r9), %r13; - OCB_INPUT(4, %r10, %r11, %ymm11); - OCB_INPUT(5, %r12, %r13, %ymm10); - movq (12 * 8)(%r9), %r10; - movq (13 * 8)(%r9), %r11; - movq (14 * 8)(%r9), %r12; - movq (15 * 8)(%r9), %r13; - OCB_INPUT(6, %r10, %r11, %ymm9); - OCB_INPUT(7, %r12, %r13, %ymm8); - movq (16 * 8)(%r9), %r10; - movq (17 * 8)(%r9), %r11; - movq (18 * 8)(%r9), %r12; - movq (19 * 8)(%r9), %r13; - OCB_INPUT(8, %r10, %r11, %ymm7); - OCB_INPUT(9, %r12, %r13, %ymm6); 
- movq (20 * 8)(%r9), %r10; - movq (21 * 8)(%r9), %r11; - movq (22 * 8)(%r9), %r12; - movq (23 * 8)(%r9), %r13; - OCB_INPUT(10, %r10, %r11, %ymm5); - OCB_INPUT(11, %r12, %r13, %ymm4); - movq (24 * 8)(%r9), %r10; - movq (25 * 8)(%r9), %r11; - movq (26 * 8)(%r9), %r12; - movq (27 * 8)(%r9), %r13; - OCB_INPUT(12, %r10, %r11, %ymm3); - OCB_INPUT(13, %r12, %r13, %ymm2); - movq (28 * 8)(%r9), %r10; - movq (29 * 8)(%r9), %r11; - movq (30 * 8)(%r9), %r12; - movq (31 * 8)(%r9), %r13; - OCB_INPUT(14, %r10, %r11, %ymm1); - OCB_INPUT(15, %r12, %r13, %ymm0); -#undef OCB_INPUT - - vextracti128 $1, %ymm13, %xmm15; - vmovdqu %xmm14, (%rcx); - vpxor %xmm13, %xmm15, %xmm15; - vmovdqu %xmm15, (%r8); - - cmpl $128, key_bitlength(CTX); - movl $32, %r8d; - movl $24, %r10d; - cmovel %r10d, %r8d; /* max */ - - /* inpack16_pre: */ - vpbroadcastq (key_table)(CTX), %ymm15; - vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; - vpxor %ymm0, %ymm15, %ymm0; - vpxor %ymm1, %ymm15, %ymm1; - vpxor %ymm2, %ymm15, %ymm2; - vpxor %ymm3, %ymm15, %ymm3; - vpxor %ymm4, %ymm15, %ymm4; - vpxor %ymm5, %ymm15, %ymm5; - vpxor %ymm6, %ymm15, %ymm6; - vpxor %ymm7, %ymm15, %ymm7; - vpxor %ymm8, %ymm15, %ymm8; - vpxor %ymm9, %ymm15, %ymm9; - vpxor %ymm10, %ymm15, %ymm10; - vpxor %ymm11, %ymm15, %ymm11; - vpxor %ymm12, %ymm15, %ymm12; - vpxor 13 * 32(%rax), %ymm15, %ymm13; - vpxor 14 * 32(%rax), %ymm15, %ymm14; - vpxor 15 * 32(%rax), %ymm15, %ymm15; - - call __camellia_enc_blk32; - - vpxor 0 * 32(%rsi), %ymm7, %ymm7; - vpxor 1 * 32(%rsi), %ymm6, %ymm6; - vpxor 2 * 32(%rsi), %ymm5, %ymm5; - vpxor 3 * 32(%rsi), %ymm4, %ymm4; - vpxor 4 * 32(%rsi), %ymm3, %ymm3; - vpxor 5 * 32(%rsi), %ymm2, %ymm2; - vpxor 6 * 32(%rsi), %ymm1, %ymm1; - vpxor 7 * 32(%rsi), %ymm0, %ymm0; - vpxor 8 * 32(%rsi), %ymm15, %ymm15; - vpxor 9 * 32(%rsi), %ymm14, %ymm14; - vpxor 10 * 32(%rsi), %ymm13, %ymm13; - vpxor 11 * 32(%rsi), %ymm12, %ymm12; - vpxor 12 * 32(%rsi), %ymm11, %ymm11; - vpxor 13 * 32(%rsi), %ymm10, %ymm10; - vpxor 14 * 32(%rsi), %ymm9, %ymm9; - vpxor 15 * 32(%rsi), %ymm8, %ymm8; - - write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, - %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, - %ymm8, %rsi); - - vzeroall; - - movq (16 * 32 + 0 * 8)(%rsp), %r10; - movq (16 * 32 + 1 * 8)(%rsp), %r11; - movq (16 * 32 + 2 * 8)(%rsp), %r12; - movq (16 * 32 + 3 * 8)(%rsp), %r13; - CFI_RESTORE(%r10); - CFI_RESTORE(%r11); - CFI_RESTORE(%r12); - CFI_RESTORE(%r13); - - leave; - CFI_LEAVE(); - ret; - CFI_ENDPROC(); -ELF(.size _gcry_camellia_aesni_avx2_ocb_enc,.-_gcry_camellia_aesni_avx2_ocb_enc;) - -.align 8 -.globl _gcry_camellia_aesni_avx2_ocb_dec -ELF(.type _gcry_camellia_aesni_avx2_ocb_dec,@function;) - -_gcry_camellia_aesni_avx2_ocb_dec: - /* input: - * %rdi: ctx, CTX - * %rsi: dst (32 blocks) - * %rdx: src (32 blocks) - * %rcx: offset - * %r8 : checksum - * %r9 : L pointers (void *L[32]) - */ - CFI_STARTPROC(); - - pushq %rbp; - CFI_PUSH(%rbp); - movq %rsp, %rbp; - CFI_DEF_CFA_REGISTER(%rbp); - - vzeroupper; - - subq $(16 * 32 + 4 * 8), %rsp; - andq $~63, %rsp; - movq %rsp, %rax; - - movq %r10, (16 * 32 + 0 * 8)(%rsp); - movq %r11, (16 * 32 + 1 * 8)(%rsp); - movq %r12, (16 * 32 + 2 * 8)(%rsp); - movq %r13, (16 * 32 + 3 * 8)(%rsp); - CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8); - CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8); - CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8); - CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8); - - vmovdqu (%rcx), %xmm14; - - /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ - /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ - -#define OCB_INPUT(n, 
l0reg, l1reg, yreg) \ - vmovdqu (n * 32)(%rdx), yreg; \ - vpxor (l0reg), %xmm14, %xmm15; \ - vpxor (l1reg), %xmm15, %xmm14; \ - vinserti128 $1, %xmm14, %ymm15, %ymm15; \ - vpxor yreg, %ymm15, yreg; \ - vmovdqu %ymm15, (n * 32)(%rsi); - - movq (0 * 8)(%r9), %r10; - movq (1 * 8)(%r9), %r11; - movq (2 * 8)(%r9), %r12; - movq (3 * 8)(%r9), %r13; - OCB_INPUT(0, %r10, %r11, %ymm0); - vmovdqu %ymm0, (15 * 32)(%rax); - OCB_INPUT(1, %r12, %r13, %ymm0); - vmovdqu %ymm0, (14 * 32)(%rax); - movq (4 * 8)(%r9), %r10; - movq (5 * 8)(%r9), %r11; - movq (6 * 8)(%r9), %r12; - movq (7 * 8)(%r9), %r13; - OCB_INPUT(2, %r10, %r11, %ymm13); - OCB_INPUT(3, %r12, %r13, %ymm12); - movq (8 * 8)(%r9), %r10; - movq (9 * 8)(%r9), %r11; - movq (10 * 8)(%r9), %r12; - movq (11 * 8)(%r9), %r13; - OCB_INPUT(4, %r10, %r11, %ymm11); - OCB_INPUT(5, %r12, %r13, %ymm10); - movq (12 * 8)(%r9), %r10; - movq (13 * 8)(%r9), %r11; - movq (14 * 8)(%r9), %r12; - movq (15 * 8)(%r9), %r13; - OCB_INPUT(6, %r10, %r11, %ymm9); - OCB_INPUT(7, %r12, %r13, %ymm8); - movq (16 * 8)(%r9), %r10; - movq (17 * 8)(%r9), %r11; - movq (18 * 8)(%r9), %r12; - movq (19 * 8)(%r9), %r13; - OCB_INPUT(8, %r10, %r11, %ymm7); - OCB_INPUT(9, %r12, %r13, %ymm6); - movq (20 * 8)(%r9), %r10; - movq (21 * 8)(%r9), %r11; - movq (22 * 8)(%r9), %r12; - movq (23 * 8)(%r9), %r13; - OCB_INPUT(10, %r10, %r11, %ymm5); - OCB_INPUT(11, %r12, %r13, %ymm4); - movq (24 * 8)(%r9), %r10; - movq (25 * 8)(%r9), %r11; - movq (26 * 8)(%r9), %r12; - movq (27 * 8)(%r9), %r13; - OCB_INPUT(12, %r10, %r11, %ymm3); - OCB_INPUT(13, %r12, %r13, %ymm2); - movq (28 * 8)(%r9), %r10; - movq (29 * 8)(%r9), %r11; - movq (30 * 8)(%r9), %r12; - movq (31 * 8)(%r9), %r13; - OCB_INPUT(14, %r10, %r11, %ymm1); - OCB_INPUT(15, %r12, %r13, %ymm0); -#undef OCB_INPUT - - vmovdqu %xmm14, (%rcx); - - movq %r8, %r10; - - cmpl $128, key_bitlength(CTX); - movl $32, %r8d; - movl $24, %r9d; - cmovel %r9d, %r8d; /* max */ - - /* inpack16_pre: */ - vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15; - vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; - vpxor %ymm0, %ymm15, %ymm0; - vpxor %ymm1, %ymm15, %ymm1; - vpxor %ymm2, %ymm15, %ymm2; - vpxor %ymm3, %ymm15, %ymm3; - vpxor %ymm4, %ymm15, %ymm4; - vpxor %ymm5, %ymm15, %ymm5; - vpxor %ymm6, %ymm15, %ymm6; - vpxor %ymm7, %ymm15, %ymm7; - vpxor %ymm8, %ymm15, %ymm8; - vpxor %ymm9, %ymm15, %ymm9; - vpxor %ymm10, %ymm15, %ymm10; - vpxor %ymm11, %ymm15, %ymm11; - vpxor %ymm12, %ymm15, %ymm12; - vpxor %ymm13, %ymm15, %ymm13; - vpxor 14 * 32(%rax), %ymm15, %ymm14; - vpxor 15 * 32(%rax), %ymm15, %ymm15; - - call __camellia_dec_blk32; - - vpxor 0 * 32(%rsi), %ymm7, %ymm7; - vpxor 1 * 32(%rsi), %ymm6, %ymm6; - vpxor 2 * 32(%rsi), %ymm5, %ymm5; - vpxor 3 * 32(%rsi), %ymm4, %ymm4; - vpxor 4 * 32(%rsi), %ymm3, %ymm3; - vpxor 5 * 32(%rsi), %ymm2, %ymm2; - vpxor 6 * 32(%rsi), %ymm1, %ymm1; - vpxor 7 * 32(%rsi), %ymm0, %ymm0; - vmovdqu %ymm7, (7 * 32)(%rax); - vmovdqu %ymm6, (6 * 32)(%rax); - vpxor 8 * 32(%rsi), %ymm15, %ymm15; - vpxor 9 * 32(%rsi), %ymm14, %ymm14; - vpxor 10 * 32(%rsi), %ymm13, %ymm13; - vpxor 11 * 32(%rsi), %ymm12, %ymm12; - vpxor 12 * 32(%rsi), %ymm11, %ymm11; - vpxor 13 * 32(%rsi), %ymm10, %ymm10; - vpxor 14 * 32(%rsi), %ymm9, %ymm9; - vpxor 15 * 32(%rsi), %ymm8, %ymm8; - - /* Checksum_i = Checksum_{i-1} xor P_i */ - - vpxor %ymm5, %ymm7, %ymm7; - vpxor %ymm4, %ymm6, %ymm6; - vpxor %ymm3, %ymm7, %ymm7; - vpxor %ymm2, %ymm6, %ymm6; - vpxor %ymm1, %ymm7, %ymm7; - vpxor %ymm0, %ymm6, %ymm6; - vpxor %ymm15, %ymm7, %ymm7; - vpxor %ymm14, %ymm6, %ymm6; - vpxor %ymm13, %ymm7, 
%ymm7; - vpxor %ymm12, %ymm6, %ymm6; - vpxor %ymm11, %ymm7, %ymm7; - vpxor %ymm10, %ymm6, %ymm6; - vpxor %ymm9, %ymm7, %ymm7; - vpxor %ymm8, %ymm6, %ymm6; - vpxor %ymm7, %ymm6, %ymm7; - - vextracti128 $1, %ymm7, %xmm6; - vpxor %xmm6, %xmm7, %xmm7; - vpxor (%r10), %xmm7, %xmm7; - vmovdqu %xmm7, (%r10); - - vmovdqu 7 * 32(%rax), %ymm7; - vmovdqu 6 * 32(%rax), %ymm6; - - write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, - %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, - %ymm8, %rsi); - - vzeroall; - - movq (16 * 32 + 0 * 8)(%rsp), %r10; - movq (16 * 32 + 1 * 8)(%rsp), %r11; - movq (16 * 32 + 2 * 8)(%rsp), %r12; - movq (16 * 32 + 3 * 8)(%rsp), %r13; - CFI_RESTORE(%r10); - CFI_RESTORE(%r11); - CFI_RESTORE(%r12); - CFI_RESTORE(%r13); - - leave; - CFI_LEAVE(); - ret; - CFI_ENDPROC(); -ELF(.size _gcry_camellia_aesni_avx2_ocb_dec,.-_gcry_camellia_aesni_avx2_ocb_dec;) - -.align 8 -.globl _gcry_camellia_aesni_avx2_ocb_auth -ELF(.type _gcry_camellia_aesni_avx2_ocb_auth,@function;) - -_gcry_camellia_aesni_avx2_ocb_auth: - /* input: - * %rdi: ctx, CTX - * %rsi: abuf (16 blocks) - * %rdx: offset - * %rcx: checksum - * %r8 : L pointers (void *L[16]) - */ - CFI_STARTPROC(); - - pushq %rbp; - CFI_PUSH(%rbp); - movq %rsp, %rbp; - CFI_DEF_CFA_REGISTER(%rbp); - - vzeroupper; - - subq $(16 * 32 + 4 * 8), %rsp; - andq $~63, %rsp; - movq %rsp, %rax; - - movq %r10, (16 * 32 + 0 * 8)(%rsp); - movq %r11, (16 * 32 + 1 * 8)(%rsp); - movq %r12, (16 * 32 + 2 * 8)(%rsp); - movq %r13, (16 * 32 + 3 * 8)(%rsp); - CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8); - CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8); - CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8); - CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8); - - vmovdqu (%rdx), %xmm14; - - /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ - /* Checksum_i = Checksum_{i-1} xor P_i */ - /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ - -#define OCB_INPUT(n, l0reg, l1reg, yreg) \ - vmovdqu (n * 32)(%rsi), yreg; \ - vpxor (l0reg), %xmm14, %xmm15; \ - vpxor (l1reg), %xmm15, %xmm14; \ - vinserti128 $1, %xmm14, %ymm15, %ymm15; \ - vpxor yreg, %ymm15, yreg; - - movq (0 * 8)(%r8), %r10; - movq (1 * 8)(%r8), %r11; - movq (2 * 8)(%r8), %r12; - movq (3 * 8)(%r8), %r13; - OCB_INPUT(0, %r10, %r11, %ymm0); - vmovdqu %ymm0, (15 * 32)(%rax); - OCB_INPUT(1, %r12, %r13, %ymm0); - vmovdqu %ymm0, (14 * 32)(%rax); - movq (4 * 8)(%r8), %r10; - movq (5 * 8)(%r8), %r11; - movq (6 * 8)(%r8), %r12; - movq (7 * 8)(%r8), %r13; - OCB_INPUT(2, %r10, %r11, %ymm13); - OCB_INPUT(3, %r12, %r13, %ymm12); - movq (8 * 8)(%r8), %r10; - movq (9 * 8)(%r8), %r11; - movq (10 * 8)(%r8), %r12; - movq (11 * 8)(%r8), %r13; - OCB_INPUT(4, %r10, %r11, %ymm11); - OCB_INPUT(5, %r12, %r13, %ymm10); - movq (12 * 8)(%r8), %r10; - movq (13 * 8)(%r8), %r11; - movq (14 * 8)(%r8), %r12; - movq (15 * 8)(%r8), %r13; - OCB_INPUT(6, %r10, %r11, %ymm9); - OCB_INPUT(7, %r12, %r13, %ymm8); - movq (16 * 8)(%r8), %r10; - movq (17 * 8)(%r8), %r11; - movq (18 * 8)(%r8), %r12; - movq (19 * 8)(%r8), %r13; - OCB_INPUT(8, %r10, %r11, %ymm7); - OCB_INPUT(9, %r12, %r13, %ymm6); - movq (20 * 8)(%r8), %r10; - movq (21 * 8)(%r8), %r11; - movq (22 * 8)(%r8), %r12; - movq (23 * 8)(%r8), %r13; - OCB_INPUT(10, %r10, %r11, %ymm5); - OCB_INPUT(11, %r12, %r13, %ymm4); - movq (24 * 8)(%r8), %r10; - movq (25 * 8)(%r8), %r11; - movq (26 * 8)(%r8), %r12; - movq (27 * 8)(%r8), %r13; - OCB_INPUT(12, %r10, %r11, %ymm3); - OCB_INPUT(13, %r12, %r13, %ymm2); - movq (28 * 8)(%r8), %r10; - movq (29 * 8)(%r8), %r11; - movq (30 * 8)(%r8), %r12; - movq (31 * 8)(%r8), %r13; - 
OCB_INPUT(14, %r10, %r11, %ymm1); - OCB_INPUT(15, %r12, %r13, %ymm0); -#undef OCB_INPUT - - vmovdqu %xmm14, (%rdx); - - cmpl $128, key_bitlength(CTX); - movl $32, %r8d; - movl $24, %r10d; - cmovel %r10d, %r8d; /* max */ - - movq %rcx, %r10; - - /* inpack16_pre: */ - vpbroadcastq (key_table)(CTX), %ymm15; - vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; - vpxor %ymm0, %ymm15, %ymm0; - vpxor %ymm1, %ymm15, %ymm1; - vpxor %ymm2, %ymm15, %ymm2; - vpxor %ymm3, %ymm15, %ymm3; - vpxor %ymm4, %ymm15, %ymm4; - vpxor %ymm5, %ymm15, %ymm5; - vpxor %ymm6, %ymm15, %ymm6; - vpxor %ymm7, %ymm15, %ymm7; - vpxor %ymm8, %ymm15, %ymm8; - vpxor %ymm9, %ymm15, %ymm9; - vpxor %ymm10, %ymm15, %ymm10; - vpxor %ymm11, %ymm15, %ymm11; - vpxor %ymm12, %ymm15, %ymm12; - vpxor %ymm13, %ymm15, %ymm13; - vpxor 14 * 32(%rax), %ymm15, %ymm14; - vpxor 15 * 32(%rax), %ymm15, %ymm15; - - call __camellia_enc_blk32; - - vpxor %ymm7, %ymm6, %ymm6; - vpxor %ymm5, %ymm4, %ymm4; - vpxor %ymm3, %ymm2, %ymm2; - vpxor %ymm1, %ymm0, %ymm0; - vpxor %ymm15, %ymm14, %ymm14; - vpxor %ymm13, %ymm12, %ymm12; - vpxor %ymm11, %ymm10, %ymm10; - vpxor %ymm9, %ymm8, %ymm8; - - vpxor %ymm6, %ymm4, %ymm4; - vpxor %ymm2, %ymm0, %ymm0; - vpxor %ymm14, %ymm12, %ymm12; - vpxor %ymm10, %ymm8, %ymm8; - - vpxor %ymm4, %ymm0, %ymm0; - vpxor %ymm12, %ymm8, %ymm8; - - vpxor %ymm0, %ymm8, %ymm0; - - vextracti128 $1, %ymm0, %xmm1; - vpxor (%r10), %xmm0, %xmm0; - vpxor %xmm0, %xmm1, %xmm0; - vmovdqu %xmm0, (%r10); - - vzeroall; - - movq (16 * 32 + 0 * 8)(%rsp), %r10; - movq (16 * 32 + 1 * 8)(%rsp), %r11; - movq (16 * 32 + 2 * 8)(%rsp), %r12; - movq (16 * 32 + 3 * 8)(%rsp), %r13; - CFI_RESTORE(%r10); - CFI_RESTORE(%r11); - CFI_RESTORE(%r12); - CFI_RESTORE(%r13); - - leave; - CFI_LEAVE(); - ret; - CFI_ENDPROC(); -ELF(.size _gcry_camellia_aesni_avx2_ocb_auth,.-_gcry_camellia_aesni_avx2_ocb_auth;) - -#endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)*/ -#endif /*__x86_64*/ +#endif /* defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) */ +#endif /* __x86_64 */ diff --git a/cipher/camellia-aesni-avx2-amd64.h b/cipher/camellia-aesni-avx2-amd64.h new file mode 100644 index 00000000..be7bb0aa --- /dev/null +++ b/cipher/camellia-aesni-avx2-amd64.h @@ -0,0 +1,1794 @@ +/* camellia-aesni-avx2-amd64.h - AES-NI/VAES/AVX2 implementation of Camellia + * + * Copyright (C) 2013-2015,2020-2021 Jussi Kivilinna <[hidden email]> + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef GCRY_CAMELLIA_AESNI_AVX2_AMD64_H +#define GCRY_CAMELLIA_AESNI_AVX2_AMD64_H + +#include "asm-common-amd64.h" + +#define CAMELLIA_TABLE_BYTE_LEN 272 + +/* struct CAMELLIA_context: */ +#define key_table 0 +#define key_bitlength CAMELLIA_TABLE_BYTE_LEN + +/* register macros */ +#define CTX %rdi +#define RIO %r8 + +/********************************************************************** + helper macros + **********************************************************************/ +#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ + vpand x, mask4bit, tmp0; \ + vpandn x, mask4bit, x; \ + vpsrld $4, x, x; \ + \ + vpshufb tmp0, lo_t, tmp0; \ + vpshufb x, hi_t, x; \ + vpxor tmp0, x, x; + +#define ymm0_x xmm0 +#define ymm1_x xmm1 +#define ymm2_x xmm2 +#define ymm3_x xmm3 +#define ymm4_x xmm4 +#define ymm5_x xmm5 +#define ymm6_x xmm6 +#define ymm7_x xmm7 +#define ymm8_x xmm8 +#define ymm9_x xmm9 +#define ymm10_x xmm10 +#define ymm11_x xmm11 +#define ymm12_x xmm12 +#define ymm13_x xmm13 +#define ymm14_x xmm14 +#define ymm15_x xmm15 + +#ifdef CAMELLIA_VAES_BUILD +# define IF_AESNI(...) +# define IF_VAES(...) __VA_ARGS__ +#else +# define IF_AESNI(...) __VA_ARGS__ +# define IF_VAES(...) +#endif + +/********************************************************************** + 32-way camellia + **********************************************************************/ + +/* + * IN: + * x0..x7: byte-sliced AB state + * mem_cd: register pointer storing CD state + * key: index for key material + * OUT: + * x0..x7: new byte-sliced CD state + */ + +#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, \ + t6, t7, mem_cd, key) \ + /* \ + * S-function with AES subbytes \ + */ \ + vbroadcasti128 .Linv_shift_row rRIP, t4; \ + vpbroadcastd .L0f0f0f0f rRIP, t7; \ + vbroadcasti128 .Lpre_tf_lo_s1 rRIP, t5; \ + vbroadcasti128 .Lpre_tf_hi_s1 rRIP, t6; \ + vbroadcasti128 .Lpre_tf_lo_s4 rRIP, t2; \ + vbroadcasti128 .Lpre_tf_hi_s4 rRIP, t3; \ + \ + /* AES inverse shift rows */ \ + vpshufb t4, x0, x0; \ + vpshufb t4, x7, x7; \ + vpshufb t4, x3, x3; \ + vpshufb t4, x6, x6; \ + vpshufb t4, x2, x2; \ + vpshufb t4, x5, x5; \ + vpshufb t4, x1, x1; \ + vpshufb t4, x4, x4; \ + \ + /* prefilter sboxes 1, 2 and 3 */ \ + /* prefilter sbox 4 */ \ + filter_8bit(x0, t5, t6, t7, t4); \ + filter_8bit(x7, t5, t6, t7, t4); \ + IF_AESNI(vextracti128 $1, x0, t0##_x); \ + IF_AESNI(vextracti128 $1, x7, t1##_x); \ + filter_8bit(x3, t2, t3, t7, t4); \ + filter_8bit(x6, t2, t3, t7, t4); \ + IF_AESNI(vextracti128 $1, x3, t3##_x); \ + IF_AESNI(vextracti128 $1, x6, t2##_x); \ + filter_8bit(x2, t5, t6, t7, t4); \ + filter_8bit(x5, t5, t6, t7, t4); \ + filter_8bit(x1, t5, t6, t7, t4); \ + filter_8bit(x4, t5, t6, t7, t4); \ + \ + vpxor t4##_x, t4##_x, t4##_x; \ + \ + /* AES subbytes + AES shift rows */ \ + IF_AESNI(vextracti128 $1, x2, t6##_x; \ + vextracti128 $1, x5, t5##_x; \ + vaesenclast t4##_x, x0##_x, x0##_x; \ + vaesenclast t4##_x, t0##_x, t0##_x; \ + vaesenclast t4##_x, x7##_x, x7##_x; \ + vaesenclast t4##_x, t1##_x, t1##_x; \ + vaesenclast t4##_x, x3##_x, x3##_x; \ + vaesenclast t4##_x, t3##_x, t3##_x; \ + vaesenclast t4##_x, x6##_x, x6##_x; \ + vaesenclast t4##_x, t2##_x, t2##_x; \ + vinserti128 $1, t0##_x, x0, x0; \ + vinserti128 $1, t1##_x, x7, x7; \ + vinserti128 $1, t3##_x, x3, x3; \ + vinserti128 $1, t2##_x, x6, x6; \ + vextracti128 $1, x1, t3##_x; \ + vextracti128 $1, x4, t2##_x); \ + vbroadcasti128 .Lpost_tf_lo_s1 rRIP, t0; \ + vbroadcasti128 .Lpost_tf_hi_s1 rRIP, t1; \ + IF_AESNI(vaesenclast t4##_x, x2##_x, 
x2##_x; \ + vaesenclast t4##_x, t6##_x, t6##_x; \ + vaesenclast t4##_x, x5##_x, x5##_x; \ + vaesenclast t4##_x, t5##_x, t5##_x; \ + vaesenclast t4##_x, x1##_x, x1##_x; \ + vaesenclast t4##_x, t3##_x, t3##_x; \ + vaesenclast t4##_x, x4##_x, x4##_x; \ + vaesenclast t4##_x, t2##_x, t2##_x; \ + vinserti128 $1, t6##_x, x2, x2; \ + vinserti128 $1, t5##_x, x5, x5; \ + vinserti128 $1, t3##_x, x1, x1; \ + vinserti128 $1, t2##_x, x4, x4); \ + IF_VAES(vaesenclast t4, x0, x0; \ + vaesenclast t4, x7, x7; \ + vaesenclast t4, x3, x3; \ + vaesenclast t4, x6, x6; \ + vaesenclast t4, x2, x2; \ + vaesenclast t4, x5, x5; \ + vaesenclast t4, x1, x1; \ + vaesenclast t4, x4, x4); \ + \ + /* postfilter sboxes 1 and 4 */ \ + vbroadcasti128 .Lpost_tf_lo_s3 rRIP, t2; \ + vbroadcasti128 .Lpost_tf_hi_s3 rRIP, t3; \ + filter_8bit(x0, t0, t1, t7, t4); \ + filter_8bit(x7, t0, t1, t7, t4); \ + filter_8bit(x3, t0, t1, t7, t6); \ + filter_8bit(x6, t0, t1, t7, t6); \ + \ + /* postfilter sbox 3 */ \ + vbroadcasti128 .Lpost_tf_lo_s2 rRIP, t4; \ + vbroadcasti128 .Lpost_tf_hi_s2 rRIP, t5; \ + filter_8bit(x2, t2, t3, t7, t6); \ + filter_8bit(x5, t2, t3, t7, t6); \ + \ + vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \ + \ + /* postfilter sbox 2 */ \ + filter_8bit(x1, t4, t5, t7, t2); \ + filter_8bit(x4, t4, t5, t7, t2); \ + vpxor t7, t7, t7; \ + \ + vpsrldq $1, t0, t1; \ + vpsrldq $2, t0, t2; \ + vpshufb t7, t1, t1; \ + vpsrldq $3, t0, t3; \ + \ + /* P-function */ \ + vpxor x5, x0, x0; \ + vpxor x6, x1, x1; \ + vpxor x7, x2, x2; \ + vpxor x4, x3, x3; \ + \ + vpshufb t7, t2, t2; \ + vpsrldq $4, t0, t4; \ + vpshufb t7, t3, t3; \ + vpsrldq $5, t0, t5; \ + vpshufb t7, t4, t4; \ + \ + vpxor x2, x4, x4; \ + vpxor x3, x5, x5; \ + vpxor x0, x6, x6; \ + vpxor x1, x7, x7; \ + \ + vpsrldq $6, t0, t6; \ + vpshufb t7, t5, t5; \ + vpshufb t7, t6, t6; \ + \ + vpxor x7, x0, x0; \ + vpxor x4, x1, x1; \ + vpxor x5, x2, x2; \ + vpxor x6, x3, x3; \ + \ + vpxor x3, x4, x4; \ + vpxor x0, x5, x5; \ + vpxor x1, x6, x6; \ + vpxor x2, x7, x7; /* note: high and low parts swapped */ \ + \ + /* Add key material and result to CD (x becomes new CD) */ \ + \ + vpxor t6, x1, x1; \ + vpxor 5 * 32(mem_cd), x1, x1; \ + \ + vpsrldq $7, t0, t6; \ + vpshufb t7, t0, t0; \ + vpshufb t7, t6, t7; \ + \ + vpxor t7, x0, x0; \ + vpxor 4 * 32(mem_cd), x0, x0; \ + \ + vpxor t5, x2, x2; \ + vpxor 6 * 32(mem_cd), x2, x2; \ + \ + vpxor t4, x3, x3; \ + vpxor 7 * 32(mem_cd), x3, x3; \ + \ + vpxor t3, x4, x4; \ + vpxor 0 * 32(mem_cd), x4, x4; \ + \ + vpxor t2, x5, x5; \ + vpxor 1 * 32(mem_cd), x5, x5; \ + \ + vpxor t1, x6, x6; \ + vpxor 2 * 32(mem_cd), x6, x6; \ + \ + vpxor t0, x7, x7; \ + vpxor 3 * 32(mem_cd), x7, x7; + +/* + * IN/OUT: + * x0..x7: byte-sliced AB state preloaded + * mem_ab: byte-sliced AB state in memory + * mem_cb: byte-sliced CD state in memory + */ +#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, i, dir, store_ab) \ + roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_cd, (key_table + (i) * 8)(CTX)); \ + \ + vmovdqu x0, 4 * 32(mem_cd); \ + vmovdqu x1, 5 * 32(mem_cd); \ + vmovdqu x2, 6 * 32(mem_cd); \ + vmovdqu x3, 7 * 32(mem_cd); \ + vmovdqu x4, 0 * 32(mem_cd); \ + vmovdqu x5, 1 * 32(mem_cd); \ + vmovdqu x6, 2 * 32(mem_cd); \ + vmovdqu x7, 3 * 32(mem_cd); \ + \ + roundsm32(x4, x5, x6, x7, x0, x1, x2, x3, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, (key_table + ((i) + (dir)) * 8)(CTX)); \ + \ + store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab); + +#define 
dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */ + +#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \ + /* Store new AB state */ \ + vmovdqu x4, 4 * 32(mem_ab); \ + vmovdqu x5, 5 * 32(mem_ab); \ + vmovdqu x6, 6 * 32(mem_ab); \ + vmovdqu x7, 7 * 32(mem_ab); \ + vmovdqu x0, 0 * 32(mem_ab); \ + vmovdqu x1, 1 * 32(mem_ab); \ + vmovdqu x2, 2 * 32(mem_ab); \ + vmovdqu x3, 3 * 32(mem_ab); + +#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, i) \ + two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \ + two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \ + two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store); + +#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, i) \ + two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \ + two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \ + two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store); + +/* + * IN: + * v0..3: byte-sliced 32-bit integers + * OUT: + * v0..3: (IN <<< 1) + */ +#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \ + vpcmpgtb v0, zero, t0; \ + vpaddb v0, v0, v0; \ + vpabsb t0, t0; \ + \ + vpcmpgtb v1, zero, t1; \ + vpaddb v1, v1, v1; \ + vpabsb t1, t1; \ + \ + vpcmpgtb v2, zero, t2; \ + vpaddb v2, v2, v2; \ + vpabsb t2, t2; \ + \ + vpor t0, v1, v1; \ + \ + vpcmpgtb v3, zero, t0; \ + vpaddb v3, v3, v3; \ + vpabsb t0, t0; \ + \ + vpor t1, v2, v2; \ + vpor t2, v3, v3; \ + vpor t0, v0, v0; + +/* + * IN: + * r: byte-sliced AB state in memory + * l: byte-sliced CD state in memory + * OUT: + * x0..x7: new byte-sliced CD state + */ +#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \ + tt1, tt2, tt3, kll, klr, krl, krr) \ + /* \ + * t0 = kll; \ + * t0 &= ll; \ + * lr ^= rol32(t0, 1); \ + */ \ + vpbroadcastd kll, t0; /* only lowest 32-bit used */ \ + vpxor tt0, tt0, tt0; \ + vpshufb tt0, t0, t3; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t2; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t1; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t0; \ + \ + vpand l0, t0, t0; \ + vpand l1, t1, t1; \ + vpand l2, t2, t2; \ + vpand l3, t3, t3; \ + \ + rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ + \ + vpxor l4, t0, l4; \ + vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ + vmovdqu l4, 4 * 32(l); \ + vpxor l5, t1, l5; \ + vmovdqu l5, 5 * 32(l); \ + vpxor l6, t2, l6; \ + vmovdqu l6, 6 * 32(l); \ + vpxor l7, t3, l7; \ + vmovdqu l7, 7 * 32(l); \ + \ + /* \ + * t2 = krr; \ + * t2 |= rr; \ + * rl ^= t2; \ + */ \ + \ + vpshufb tt0, t0, t3; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t2; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t1; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t0; \ + \ + vpor 4 * 32(r), t0, t0; \ + vpor 5 * 32(r), t1, t1; \ + vpor 6 * 32(r), t2, t2; \ + vpor 7 * 32(r), t3, t3; \ + \ + vpxor 0 * 32(r), t0, t0; \ + vpxor 1 * 32(r), t1, t1; \ + vpxor 2 * 32(r), t2, t2; \ + vpxor 3 * 32(r), t3, t3; \ + vmovdqu t0, 0 * 32(r); \ + vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ + vmovdqu t1, 1 * 32(r); \ + vmovdqu t2, 2 * 32(r); \ + 
vmovdqu t3, 3 * 32(r); \ + \ + /* \ + * t2 = krl; \ + * t2 &= rl; \ + * rr ^= rol32(t2, 1); \ + */ \ + vpshufb tt0, t0, t3; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t2; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t1; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t0; \ + \ + vpand 0 * 32(r), t0, t0; \ + vpand 1 * 32(r), t1, t1; \ + vpand 2 * 32(r), t2, t2; \ + vpand 3 * 32(r), t3, t3; \ + \ + rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ + \ + vpxor 4 * 32(r), t0, t0; \ + vpxor 5 * 32(r), t1, t1; \ + vpxor 6 * 32(r), t2, t2; \ + vpxor 7 * 32(r), t3, t3; \ + vmovdqu t0, 4 * 32(r); \ + vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ + vmovdqu t1, 5 * 32(r); \ + vmovdqu t2, 6 * 32(r); \ + vmovdqu t3, 7 * 32(r); \ + \ + /* \ + * t0 = klr; \ + * t0 |= lr; \ + * ll ^= t0; \ + */ \ + \ + vpshufb tt0, t0, t3; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t2; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t1; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t0; \ + \ + vpor l4, t0, t0; \ + vpor l5, t1, t1; \ + vpor l6, t2, t2; \ + vpor l7, t3, t3; \ + \ + vpxor l0, t0, l0; \ + vmovdqu l0, 0 * 32(l); \ + vpxor l1, t1, l1; \ + vmovdqu l1, 1 * 32(l); \ + vpxor l2, t2, l2; \ + vmovdqu l2, 2 * 32(l); \ + vpxor l3, t3, l3; \ + vmovdqu l3, 3 * 32(l); + +#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x1, x0, x0; \ + \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x2; \ + \ + vpunpckhqdq t1, x0, x1; \ + vpunpcklqdq t1, x0, x0; \ + \ + vpunpckhqdq x2, t2, x3; \ + vpunpcklqdq x2, t2, x2; + +#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \ + a3, b3, c3, d3, st0, st1) \ + vmovdqu d2, st0; \ + vmovdqu d3, st1; \ + transpose_4x4(a0, a1, a2, a3, d2, d3); \ + transpose_4x4(b0, b1, b2, b3, d2, d3); \ + vmovdqu st0, d2; \ + vmovdqu st1, d3; \ + \ + vmovdqu a0, st0; \ + vmovdqu a1, st1; \ + transpose_4x4(c0, c1, c2, c3, a0, a1); \ + transpose_4x4(d0, d1, d2, d3, a0, a1); \ + \ + vbroadcasti128 .Lshufb_16x16b rRIP, a0; \ + vmovdqu st1, a1; \ + vpshufb a0, a2, a2; \ + vpshufb a0, a3, a3; \ + vpshufb a0, b0, b0; \ + vpshufb a0, b1, b1; \ + vpshufb a0, b2, b2; \ + vpshufb a0, b3, b3; \ + vpshufb a0, a1, a1; \ + vpshufb a0, c0, c0; \ + vpshufb a0, c1, c1; \ + vpshufb a0, c2, c2; \ + vpshufb a0, c3, c3; \ + vpshufb a0, d0, d0; \ + vpshufb a0, d1, d1; \ + vpshufb a0, d2, d2; \ + vpshufb a0, d3, d3; \ + vmovdqu d3, st1; \ + vmovdqu st0, d3; \ + vpshufb a0, d3, a0; \ + vmovdqu d2, st0; \ + \ + transpose_4x4(a0, b0, c0, d0, d2, d3); \ + transpose_4x4(a1, b1, c1, d1, d2, d3); \ + vmovdqu st0, d2; \ + vmovdqu st1, d3; \ + \ + vmovdqu b0, st0; \ + vmovdqu b1, st1; \ + transpose_4x4(a2, b2, c2, d2, b0, b1); \ + transpose_4x4(a3, b3, c3, d3, b0, b1); \ + vmovdqu st0, b0; \ + vmovdqu st1, b1; \ + /* does not adjust output bytes inside vectors */ + +/* load blocks to registers and apply pre-whitening */ +#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, rio, key) \ + vpbroadcastq key, x0; \ + vpshufb .Lpack_bswap rRIP, x0, x0; \ + \ + vpxor 0 * 32(rio), x0, y7; \ + vpxor 1 * 32(rio), x0, y6; \ + vpxor 2 * 32(rio), x0, y5; \ + vpxor 3 * 32(rio), x0, y4; \ + vpxor 4 * 32(rio), x0, y3; \ + vpxor 5 * 32(rio), x0, y2; \ + vpxor 6 * 32(rio), x0, y1; \ + vpxor 7 * 32(rio), x0, y0; \ + vpxor 8 * 32(rio), x0, x7; \ + vpxor 9 * 32(rio), x0, x6; \ + vpxor 10 * 32(rio), x0, x5; \ + vpxor 11 * 32(rio), x0, x4; \ + vpxor 12 * 32(rio), x0, x3; \ + vpxor 13 * 32(rio), x0, x2; \ + vpxor 14 * 32(rio), x0, x1; \ + vpxor 15 * 32(rio), 
x0, x0; + +/* byteslice pre-whitened blocks and store to temporary memory */ +#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd) \ + byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \ + y4, y5, y6, y7, (mem_ab), (mem_cd)); \ + \ + vmovdqu x0, 0 * 32(mem_ab); \ + vmovdqu x1, 1 * 32(mem_ab); \ + vmovdqu x2, 2 * 32(mem_ab); \ + vmovdqu x3, 3 * 32(mem_ab); \ + vmovdqu x4, 4 * 32(mem_ab); \ + vmovdqu x5, 5 * 32(mem_ab); \ + vmovdqu x6, 6 * 32(mem_ab); \ + vmovdqu x7, 7 * 32(mem_ab); \ + vmovdqu y0, 0 * 32(mem_cd); \ + vmovdqu y1, 1 * 32(mem_cd); \ + vmovdqu y2, 2 * 32(mem_cd); \ + vmovdqu y3, 3 * 32(mem_cd); \ + vmovdqu y4, 4 * 32(mem_cd); \ + vmovdqu y5, 5 * 32(mem_cd); \ + vmovdqu y6, 6 * 32(mem_cd); \ + vmovdqu y7, 7 * 32(mem_cd); + +/* de-byteslice, apply post-whitening and store blocks */ +#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \ + y5, y6, y7, key, stack_tmp0, stack_tmp1) \ + byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \ + y3, y7, x3, x7, stack_tmp0, stack_tmp1); \ + \ + vmovdqu x0, stack_tmp0; \ + \ + vpbroadcastq key, x0; \ + vpshufb .Lpack_bswap rRIP, x0, x0; \ + \ + vpxor x0, y7, y7; \ + vpxor x0, y6, y6; \ + vpxor x0, y5, y5; \ + vpxor x0, y4, y4; \ + vpxor x0, y3, y3; \ + vpxor x0, y2, y2; \ + vpxor x0, y1, y1; \ + vpxor x0, y0, y0; \ + vpxor x0, x7, x7; \ + vpxor x0, x6, x6; \ + vpxor x0, x5, x5; \ + vpxor x0, x4, x4; \ + vpxor x0, x3, x3; \ + vpxor x0, x2, x2; \ + vpxor x0, x1, x1; \ + vpxor stack_tmp0, x0, x0; + +#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, rio) \ + vmovdqu x0, 0 * 32(rio); \ + vmovdqu x1, 1 * 32(rio); \ + vmovdqu x2, 2 * 32(rio); \ + vmovdqu x3, 3 * 32(rio); \ + vmovdqu x4, 4 * 32(rio); \ + vmovdqu x5, 5 * 32(rio); \ + vmovdqu x6, 6 * 32(rio); \ + vmovdqu x7, 7 * 32(rio); \ + vmovdqu y0, 8 * 32(rio); \ + vmovdqu y1, 9 * 32(rio); \ + vmovdqu y2, 10 * 32(rio); \ + vmovdqu y3, 11 * 32(rio); \ + vmovdqu y4, 12 * 32(rio); \ + vmovdqu y5, 13 * 32(rio); \ + vmovdqu y6, 14 * 32(rio); \ + vmovdqu y7, 15 * 32(rio); + +.text +.align 32 + +#define SHUFB_BYTES(idx) \ + 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) + +.Lshufb_16x16b: + .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) + .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) + +.Lpack_bswap: + .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 + .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 + +/* For CTR-mode IV byteswap */ +.Lbswap128_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +/* + * pre-SubByte transform + * + * pre-lookup for sbox1, sbox2, sbox3: + * swap_bitendianness( + * isom_map_camellia_to_aes( + * camellia_f( + * swap_bitendianess(in) + * ) + * ) + * ) + * + * (note: '⊕ 0xc5' inside camellia_f()) + */ +.Lpre_tf_lo_s1: + .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86 + .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88 +.Lpre_tf_hi_s1: + .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a + .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23 + +/* + * pre-SubByte transform + * + * pre-lookup for sbox4: + * swap_bitendianness( + * isom_map_camellia_to_aes( + * camellia_f( + * swap_bitendianess(in <<< 1) + * ) + * ) + * ) + * + * (note: '⊕ 0xc5' inside camellia_f()) + */ +.Lpre_tf_lo_s4: + .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25 + .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74 +.Lpre_tf_hi_s4: + .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 
0xf8, 0x83, 0x72 + .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf + +/* + * post-SubByte transform + * + * post-lookup for sbox1, sbox4: + * swap_bitendianness( + * camellia_h( + * isom_map_aes_to_camellia( + * swap_bitendianness( + * aes_inverse_affine_transform(in) + * ) + * ) + * ) + * ) + * + * (note: '⊕ 0x6e' inside camellia_h()) + */ +.Lpost_tf_lo_s1: + .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31 + .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1 +.Lpost_tf_hi_s1: + .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8 + .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c + +/* + * post-SubByte transform + * + * post-lookup for sbox2: + * swap_bitendianness( + * camellia_h( + * isom_map_aes_to_camellia( + * swap_bitendianness( + * aes_inverse_affine_transform(in) + * ) + * ) + * ) + * ) <<< 1 + * + * (note: '⊕ 0x6e' inside camellia_h()) + */ +.Lpost_tf_lo_s2: + .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62 + .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3 +.Lpost_tf_hi_s2: + .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51 + .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18 + +/* + * post-SubByte transform + * + * post-lookup for sbox3: + * swap_bitendianness( + * camellia_h( + * isom_map_aes_to_camellia( + * swap_bitendianness( + * aes_inverse_affine_transform(in) + * ) + * ) + * ) + * ) >>> 1 + * + * (note: '⊕ 0x6e' inside camellia_h()) + */ +.Lpost_tf_lo_s3: + .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98 + .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8 +.Lpost_tf_hi_s3: + .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54 + .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06 + +/* For isolating SubBytes from AESENCLAST, inverse shift row */ +.Linv_shift_row: + .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b + .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 + +.align 4 +/* 4-bit mask */ +.L0f0f0f0f: + .long 0x0f0f0f0f + + +.align 8 +ELF(.type __camellia_enc_blk32,@function;) + +__camellia_enc_blk32: + /* input: + * %rdi: ctx, CTX + * %rax: temporary storage, 512 bytes + * %r8d: 24 for 16 byte key, 32 for larger + * %ymm0..%ymm15: 32 plaintext blocks + * output: + * %ymm0..%ymm15: 32 encrypted blocks, order swapped: + * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 + */ + CFI_STARTPROC(); + + leaq 8 * 32(%rax), %rcx; + + leaq (-8 * 8)(CTX, %r8, 8), %r8; + + inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %rcx); + +.align 8 +.Lenc_loop: + enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %rcx, 0); + + cmpq %r8, CTX; + je .Lenc_done; + leaq (8 * 8)(CTX), CTX; + + fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, + ((key_table) + 0)(CTX), + ((key_table) + 4)(CTX), + ((key_table) + 8)(CTX), + ((key_table) + 12)(CTX)); + jmp .Lenc_loop; + +.align 8 +.Lenc_done: + /* load CD for output */ + vmovdqu 0 * 32(%rcx), %ymm8; + vmovdqu 1 * 32(%rcx), %ymm9; + vmovdqu 2 * 32(%rcx), %ymm10; + vmovdqu 3 * 32(%rcx), %ymm11; + vmovdqu 4 * 32(%rcx), %ymm12; + vmovdqu 5 * 32(%rcx), %ymm13; + vmovdqu 6 * 32(%rcx), %ymm14; + vmovdqu 7 * 32(%rcx), %ymm15; + + outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, ((key_table) + 8 * 8)(%r8), (%rax), 1 * 32(%rax)); + + ret; + 
CFI_ENDPROC(); +ELF(.size __camellia_enc_blk32,.-__camellia_enc_blk32;) + +.align 8 +ELF(.type __camellia_dec_blk32,@function;) + +__camellia_dec_blk32: + /* input: + * %rdi: ctx, CTX + * %rax: temporary storage, 512 bytes + * %r8d: 24 for 16 byte key, 32 for larger + * %ymm0..%ymm15: 16 encrypted blocks + * output: + * %ymm0..%ymm15: 16 plaintext blocks, order swapped: + * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 + */ + CFI_STARTPROC(); + + movq %r8, %rcx; + movq CTX, %r8 + leaq (-8 * 8)(CTX, %rcx, 8), CTX; + + leaq 8 * 32(%rax), %rcx; + + inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %rcx); + +.align 8 +.Ldec_loop: + dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %rcx, 0); + + cmpq %r8, CTX; + je .Ldec_done; + + fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, + ((key_table) + 8)(CTX), + ((key_table) + 12)(CTX), + ((key_table) + 0)(CTX), + ((key_table) + 4)(CTX)); + + leaq (-8 * 8)(CTX), CTX; + jmp .Ldec_loop; + +.align 8 +.Ldec_done: + /* load CD for output */ + vmovdqu 0 * 32(%rcx), %ymm8; + vmovdqu 1 * 32(%rcx), %ymm9; + vmovdqu 2 * 32(%rcx), %ymm10; + vmovdqu 3 * 32(%rcx), %ymm11; + vmovdqu 4 * 32(%rcx), %ymm12; + vmovdqu 5 * 32(%rcx), %ymm13; + vmovdqu 6 * 32(%rcx), %ymm14; + vmovdqu 7 * 32(%rcx), %ymm15; + + outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax)); + + ret; + CFI_ENDPROC(); +ELF(.size __camellia_dec_blk32,.-__camellia_dec_blk32;) + +#define inc_le128(x, minus_one, tmp) \ + vpcmpeqq minus_one, x, tmp; \ + vpsubq minus_one, x, x; \ + vpslldq $8, tmp, tmp; \ + vpsubq tmp, x, x; + +.align 8 +.globl FUNC_NAME(ctr_enc) +ELF(.type FUNC_NAME(ctr_enc),@function;) + +FUNC_NAME(ctr_enc): + /* input: + * %rdi: ctx, CTX + * %rsi: dst (32 blocks) + * %rdx: src (32 blocks) + * %rcx: iv (big endian, 128bit) + */ + CFI_STARTPROC(); + + pushq %rbp; + CFI_PUSH(%rbp); + movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); + + movq 8(%rcx), %r11; + bswapq %r11; + + vzeroupper; + + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max */ + + subq $(16 * 32), %rsp; + andq $~63, %rsp; + movq %rsp, %rax; + + vpcmpeqd %ymm15, %ymm15, %ymm15; + vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */ + + /* load IV and byteswap */ + vmovdqu (%rcx), %xmm0; + vpshufb .Lbswap128_mask rRIP, %xmm0, %xmm0; + vmovdqa %xmm0, %xmm1; + inc_le128(%xmm0, %xmm15, %xmm14); + vbroadcasti128 .Lbswap128_mask rRIP, %ymm14; + vinserti128 $1, %xmm0, %ymm1, %ymm0; + vpshufb %ymm14, %ymm0, %ymm13; + vmovdqu %ymm13, 15 * 32(%rax); + + /* check need for handling 64-bit overflow and carry */ + cmpq $(0xffffffffffffffff - 32), %r11; + ja .Lload_ctr_carry; + + /* construct IVs */ + vpaddq %ymm15, %ymm15, %ymm15; /* ab: -2:0 ; cd: -2:0 */ + vpsubq %ymm15, %ymm0, %ymm0; + vpshufb %ymm14, %ymm0, %ymm13; + vmovdqu %ymm13, 14 * 32(%rax); + vpsubq %ymm15, %ymm0, %ymm0; + vpshufb %ymm14, %ymm0, %ymm13; + vmovdqu %ymm13, 13 * 32(%rax); + vpsubq %ymm15, %ymm0, %ymm0; + vpshufb %ymm14, %ymm0, %ymm12; + vpsubq %ymm15, %ymm0, %ymm0; + vpshufb %ymm14, %ymm0, %ymm11; + vpsubq %ymm15, %ymm0, %ymm0; + vpshufb %ymm14, %ymm0, %ymm10; + vpsubq %ymm15, %ymm0, %ymm0; + vpshufb %ymm14, %ymm0, %ymm9; + 
vpsubq %ymm15, %ymm0, %ymm0; + vpshufb %ymm14, %ymm0, %ymm8; + vpsubq %ymm15, %ymm0, %ymm0; + vpshufb %ymm14, %ymm0, %ymm7; + vpsubq %ymm15, %ymm0, %ymm0; + vpshufb %ymm14, %ymm0, %ymm6; + vpsubq %ymm15, %ymm0, %ymm0; + vpshufb %ymm14, %ymm0, %ymm5; + vpsubq %ymm15, %ymm0, %ymm0; + vpshufb %ymm14, %ymm0, %ymm4; + vpsubq %ymm15, %ymm0, %ymm0; + vpshufb %ymm14, %ymm0, %ymm3; + vpsubq %ymm15, %ymm0, %ymm0; + vpshufb %ymm14, %ymm0, %ymm2; + vpsubq %ymm15, %ymm0, %ymm0; + vpshufb %ymm14, %ymm0, %ymm1; + vpsubq %ymm15, %ymm0, %ymm0; /* +30 ; +31 */ + vpsubq %xmm15, %xmm0, %xmm13; /* +32 */ + vpshufb %ymm14, %ymm0, %ymm0; + vpshufb %xmm14, %xmm13, %xmm13; + vmovdqu %xmm13, (%rcx); + + jmp .Lload_ctr_done; + +.align 4 +.Lload_ctr_carry: + /* construct IVs */ + inc_le128(%ymm0, %ymm15, %ymm13); /* ab: le1 ; cd: le2 */ + inc_le128(%ymm0, %ymm15, %ymm13); /* ab: le2 ; cd: le3 */ + vpshufb %ymm14, %ymm0, %ymm13; + vmovdqu %ymm13, 14 * 32(%rax); + inc_le128(%ymm0, %ymm15, %ymm13); + inc_le128(%ymm0, %ymm15, %ymm13); + vpshufb %ymm14, %ymm0, %ymm13; + vmovdqu %ymm13, 13 * 32(%rax); + inc_le128(%ymm0, %ymm15, %ymm13); + inc_le128(%ymm0, %ymm15, %ymm13); + vpshufb %ymm14, %ymm0, %ymm12; + inc_le128(%ymm0, %ymm15, %ymm13); + inc_le128(%ymm0, %ymm15, %ymm13); + vpshufb %ymm14, %ymm0, %ymm11; + inc_le128(%ymm0, %ymm15, %ymm13); + inc_le128(%ymm0, %ymm15, %ymm13); + vpshufb %ymm14, %ymm0, %ymm10; + inc_le128(%ymm0, %ymm15, %ymm13); + inc_le128(%ymm0, %ymm15, %ymm13); + vpshufb %ymm14, %ymm0, %ymm9; + inc_le128(%ymm0, %ymm15, %ymm13); + inc_le128(%ymm0, %ymm15, %ymm13); + vpshufb %ymm14, %ymm0, %ymm8; + inc_le128(%ymm0, %ymm15, %ymm13); + inc_le128(%ymm0, %ymm15, %ymm13); + vpshufb %ymm14, %ymm0, %ymm7; + inc_le128(%ymm0, %ymm15, %ymm13); + inc_le128(%ymm0, %ymm15, %ymm13); + vpshufb %ymm14, %ymm0, %ymm6; + inc_le128(%ymm0, %ymm15, %ymm13); + inc_le128(%ymm0, %ymm15, %ymm13); + vpshufb %ymm14, %ymm0, %ymm5; + inc_le128(%ymm0, %ymm15, %ymm13); + inc_le128(%ymm0, %ymm15, %ymm13); + vpshufb %ymm14, %ymm0, %ymm4; + inc_le128(%ymm0, %ymm15, %ymm13); + inc_le128(%ymm0, %ymm15, %ymm13); + vpshufb %ymm14, %ymm0, %ymm3; + inc_le128(%ymm0, %ymm15, %ymm13); + inc_le128(%ymm0, %ymm15, %ymm13); + vpshufb %ymm14, %ymm0, %ymm2; + inc_le128(%ymm0, %ymm15, %ymm13); + inc_le128(%ymm0, %ymm15, %ymm13); + vpshufb %ymm14, %ymm0, %ymm1; + inc_le128(%ymm0, %ymm15, %ymm13); + inc_le128(%ymm0, %ymm15, %ymm13); + vextracti128 $1, %ymm0, %xmm13; + vpshufb %ymm14, %ymm0, %ymm0; + inc_le128(%xmm13, %xmm15, %xmm14); + vpshufb .Lbswap128_mask rRIP, %xmm13, %xmm13; + vmovdqu %xmm13, (%rcx); + +.align 4 +.Lload_ctr_done: + /* inpack16_pre: */ + vpbroadcastq (key_table)(CTX), %ymm15; + vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; + vpxor %ymm0, %ymm15, %ymm0; + vpxor %ymm1, %ymm15, %ymm1; + vpxor %ymm2, %ymm15, %ymm2; + vpxor %ymm3, %ymm15, %ymm3; + vpxor %ymm4, %ymm15, %ymm4; + vpxor %ymm5, %ymm15, %ymm5; + vpxor %ymm6, %ymm15, %ymm6; + vpxor %ymm7, %ymm15, %ymm7; + vpxor %ymm8, %ymm15, %ymm8; + vpxor %ymm9, %ymm15, %ymm9; + vpxor %ymm10, %ymm15, %ymm10; + vpxor %ymm11, %ymm15, %ymm11; + vpxor %ymm12, %ymm15, %ymm12; + vpxor 13 * 32(%rax), %ymm15, %ymm13; + vpxor 14 * 32(%rax), %ymm15, %ymm14; + vpxor 15 * 32(%rax), %ymm15, %ymm15; + + call __camellia_enc_blk32; + + vpxor 0 * 32(%rdx), %ymm7, %ymm7; + vpxor 1 * 32(%rdx), %ymm6, %ymm6; + vpxor 2 * 32(%rdx), %ymm5, %ymm5; + vpxor 3 * 32(%rdx), %ymm4, %ymm4; + vpxor 4 * 32(%rdx), %ymm3, %ymm3; + vpxor 5 * 32(%rdx), %ymm2, %ymm2; + vpxor 6 * 32(%rdx), %ymm1, %ymm1; + vpxor 7 * 32(%rdx), %ymm0, 
%ymm0; + vpxor 8 * 32(%rdx), %ymm15, %ymm15; + vpxor 9 * 32(%rdx), %ymm14, %ymm14; + vpxor 10 * 32(%rdx), %ymm13, %ymm13; + vpxor 11 * 32(%rdx), %ymm12, %ymm12; + vpxor 12 * 32(%rdx), %ymm11, %ymm11; + vpxor 13 * 32(%rdx), %ymm10, %ymm10; + vpxor 14 * 32(%rdx), %ymm9, %ymm9; + vpxor 15 * 32(%rdx), %ymm8, %ymm8; + leaq 32 * 16(%rdx), %rdx; + + write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, + %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, + %ymm8, %rsi); + + vzeroall; + + leave; + CFI_LEAVE(); + ret; + CFI_ENDPROC(); +ELF(.size FUNC_NAME(ctr_enc),.-FUNC_NAME(ctr_enc);) + +.align 8 +.globl FUNC_NAME(cbc_dec) +ELF(.type FUNC_NAME(cbc_dec),@function;) + +FUNC_NAME(cbc_dec): + /* input: + * %rdi: ctx, CTX + * %rsi: dst (32 blocks) + * %rdx: src (32 blocks) + * %rcx: iv + */ + CFI_STARTPROC(); + + pushq %rbp; + CFI_PUSH(%rbp); + movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); + + vzeroupper; + + movq %rcx, %r9; + + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max */ + + subq $(16 * 32), %rsp; + andq $~63, %rsp; + movq %rsp, %rax; + + inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rdx, (key_table)(CTX, %r8, 8)); + + call __camellia_dec_blk32; + + /* XOR output with IV */ + vmovdqu %ymm8, (%rax); + vmovdqu (%r9), %xmm8; + vinserti128 $1, (%rdx), %ymm8, %ymm8; + vpxor %ymm8, %ymm7, %ymm7; + vmovdqu (%rax), %ymm8; + vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6; + vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5; + vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4; + vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3; + vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2; + vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1; + vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0; + vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15; + vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14; + vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13; + vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12; + vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11; + vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10; + vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9; + vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8; + movq (15 * 32 + 16 + 0)(%rdx), %rax; + movq (15 * 32 + 16 + 8)(%rdx), %rcx; + + write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, + %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, + %ymm8, %rsi); + + /* store new IV */ + movq %rax, (0)(%r9); + movq %rcx, (8)(%r9); + + vzeroall; + + leave; + CFI_LEAVE(); + ret; + CFI_ENDPROC(); +ELF(.size FUNC_NAME(cbc_dec),.-FUNC_NAME(cbc_dec);) + +.align 8 +.globl FUNC_NAME(cfb_dec) +ELF(.type FUNC_NAME(cfb_dec),@function;) + +FUNC_NAME(cfb_dec): + /* input: + * %rdi: ctx, CTX + * %rsi: dst (32 blocks) + * %rdx: src (32 blocks) + * %rcx: iv + */ + CFI_STARTPROC(); + + pushq %rbp; + CFI_PUSH(%rbp); + movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); + + vzeroupper; + + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max */ + + subq $(16 * 32), %rsp; + andq $~63, %rsp; + movq %rsp, %rax; + + /* inpack16_pre: */ + vpbroadcastq (key_table)(CTX), %ymm0; + vpshufb .Lpack_bswap rRIP, %ymm0, %ymm0; + vmovdqu (%rcx), %xmm15; + vinserti128 $1, (%rdx), %ymm15, %ymm15; + vpxor %ymm15, %ymm0, %ymm15; + vmovdqu (15 * 32 + 16)(%rdx), %xmm1; + vmovdqu %xmm1, (%rcx); /* store new IV */ + vpxor (0 * 32 + 16)(%rdx), %ymm0, %ymm14; + vpxor (1 * 32 + 16)(%rdx), %ymm0, %ymm13; + vpxor (2 * 32 + 16)(%rdx), %ymm0, %ymm12; + vpxor (3 * 32 + 16)(%rdx), %ymm0, %ymm11; + vpxor (4 * 32 + 16)(%rdx), %ymm0, 
%ymm10; + vpxor (5 * 32 + 16)(%rdx), %ymm0, %ymm9; + vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm8; + vpxor (7 * 32 + 16)(%rdx), %ymm0, %ymm7; + vpxor (8 * 32 + 16)(%rdx), %ymm0, %ymm6; + vpxor (9 * 32 + 16)(%rdx), %ymm0, %ymm5; + vpxor (10 * 32 + 16)(%rdx), %ymm0, %ymm4; + vpxor (11 * 32 + 16)(%rdx), %ymm0, %ymm3; + vpxor (12 * 32 + 16)(%rdx), %ymm0, %ymm2; + vpxor (13 * 32 + 16)(%rdx), %ymm0, %ymm1; + vpxor (14 * 32 + 16)(%rdx), %ymm0, %ymm0; + + call __camellia_enc_blk32; + + vpxor 0 * 32(%rdx), %ymm7, %ymm7; + vpxor 1 * 32(%rdx), %ymm6, %ymm6; + vpxor 2 * 32(%rdx), %ymm5, %ymm5; + vpxor 3 * 32(%rdx), %ymm4, %ymm4; + vpxor 4 * 32(%rdx), %ymm3, %ymm3; + vpxor 5 * 32(%rdx), %ymm2, %ymm2; + vpxor 6 * 32(%rdx), %ymm1, %ymm1; + vpxor 7 * 32(%rdx), %ymm0, %ymm0; + vpxor 8 * 32(%rdx), %ymm15, %ymm15; + vpxor 9 * 32(%rdx), %ymm14, %ymm14; + vpxor 10 * 32(%rdx), %ymm13, %ymm13; + vpxor 11 * 32(%rdx), %ymm12, %ymm12; + vpxor 12 * 32(%rdx), %ymm11, %ymm11; + vpxor 13 * 32(%rdx), %ymm10, %ymm10; + vpxor 14 * 32(%rdx), %ymm9, %ymm9; + vpxor 15 * 32(%rdx), %ymm8, %ymm8; + + write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, + %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, + %ymm8, %rsi); + + vzeroall; + + leave; + CFI_LEAVE(); + ret; + CFI_ENDPROC(); +ELF(.size FUNC_NAME(cfb_dec),.-FUNC_NAME(cfb_dec);) + +.align 8 +.globl FUNC_NAME(ocb_enc) +ELF(.type FUNC_NAME(ocb_enc),@function;) + +FUNC_NAME(ocb_enc): + /* input: + * %rdi: ctx, CTX + * %rsi: dst (32 blocks) + * %rdx: src (32 blocks) + * %rcx: offset + * %r8 : checksum + * %r9 : L pointers (void *L[32]) + */ + CFI_STARTPROC(); + + pushq %rbp; + CFI_PUSH(%rbp); + movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); + + vzeroupper; + + subq $(16 * 32 + 4 * 8), %rsp; + andq $~63, %rsp; + movq %rsp, %rax; + + movq %r10, (16 * 32 + 0 * 8)(%rsp); + movq %r11, (16 * 32 + 1 * 8)(%rsp); + movq %r12, (16 * 32 + 2 * 8)(%rsp); + movq %r13, (16 * 32 + 3 * 8)(%rsp); + CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8); + CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8); + CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8); + CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8); + + vmovdqu (%rcx), %xmm14; + vmovdqu (%r8), %xmm13; + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* Checksum_i = Checksum_{i-1} xor P_i */ + /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ + +#define OCB_INPUT(n, l0reg, l1reg, yreg) \ + vmovdqu (n * 32)(%rdx), yreg; \ + vpxor (l0reg), %xmm14, %xmm15; \ + vpxor (l1reg), %xmm15, %xmm14; \ + vinserti128 $1, %xmm14, %ymm15, %ymm15; \ + vpxor yreg, %ymm13, %ymm13; \ + vpxor yreg, %ymm15, yreg; \ + vmovdqu %ymm15, (n * 32)(%rsi); + + movq (0 * 8)(%r9), %r10; + movq (1 * 8)(%r9), %r11; + movq (2 * 8)(%r9), %r12; + movq (3 * 8)(%r9), %r13; + OCB_INPUT(0, %r10, %r11, %ymm0); + vmovdqu %ymm0, (15 * 32)(%rax); + OCB_INPUT(1, %r12, %r13, %ymm0); + vmovdqu %ymm0, (14 * 32)(%rax); + movq (4 * 8)(%r9), %r10; + movq (5 * 8)(%r9), %r11; + movq (6 * 8)(%r9), %r12; + movq (7 * 8)(%r9), %r13; + OCB_INPUT(2, %r10, %r11, %ymm0); + vmovdqu %ymm0, (13 * 32)(%rax); + OCB_INPUT(3, %r12, %r13, %ymm12); + movq (8 * 8)(%r9), %r10; + movq (9 * 8)(%r9), %r11; + movq (10 * 8)(%r9), %r12; + movq (11 * 8)(%r9), %r13; + OCB_INPUT(4, %r10, %r11, %ymm11); + OCB_INPUT(5, %r12, %r13, %ymm10); + movq (12 * 8)(%r9), %r10; + movq (13 * 8)(%r9), %r11; + movq (14 * 8)(%r9), %r12; + movq (15 * 8)(%r9), %r13; + OCB_INPUT(6, %r10, %r11, %ymm9); + OCB_INPUT(7, %r12, %r13, %ymm8); + movq (16 * 8)(%r9), %r10; + movq (17 * 8)(%r9), %r11; + movq (18 * 8)(%r9), %r12; + movq (19 * 8)(%r9), %r13; + OCB_INPUT(8, 
%r10, %r11, %ymm7); + OCB_INPUT(9, %r12, %r13, %ymm6); + movq (20 * 8)(%r9), %r10; + movq (21 * 8)(%r9), %r11; + movq (22 * 8)(%r9), %r12; + movq (23 * 8)(%r9), %r13; + OCB_INPUT(10, %r10, %r11, %ymm5); + OCB_INPUT(11, %r12, %r13, %ymm4); + movq (24 * 8)(%r9), %r10; + movq (25 * 8)(%r9), %r11; + movq (26 * 8)(%r9), %r12; + movq (27 * 8)(%r9), %r13; + OCB_INPUT(12, %r10, %r11, %ymm3); + OCB_INPUT(13, %r12, %r13, %ymm2); + movq (28 * 8)(%r9), %r10; + movq (29 * 8)(%r9), %r11; + movq (30 * 8)(%r9), %r12; + movq (31 * 8)(%r9), %r13; + OCB_INPUT(14, %r10, %r11, %ymm1); + OCB_INPUT(15, %r12, %r13, %ymm0); +#undef OCB_INPUT + + vextracti128 $1, %ymm13, %xmm15; + vmovdqu %xmm14, (%rcx); + vpxor %xmm13, %xmm15, %xmm15; + vmovdqu %xmm15, (%r8); + + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %r10d; + cmovel %r10d, %r8d; /* max */ + + /* inpack16_pre: */ + vpbroadcastq (key_table)(CTX), %ymm15; + vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; + vpxor %ymm0, %ymm15, %ymm0; + vpxor %ymm1, %ymm15, %ymm1; + vpxor %ymm2, %ymm15, %ymm2; + vpxor %ymm3, %ymm15, %ymm3; + vpxor %ymm4, %ymm15, %ymm4; + vpxor %ymm5, %ymm15, %ymm5; + vpxor %ymm6, %ymm15, %ymm6; + vpxor %ymm7, %ymm15, %ymm7; + vpxor %ymm8, %ymm15, %ymm8; + vpxor %ymm9, %ymm15, %ymm9; + vpxor %ymm10, %ymm15, %ymm10; + vpxor %ymm11, %ymm15, %ymm11; + vpxor %ymm12, %ymm15, %ymm12; + vpxor 13 * 32(%rax), %ymm15, %ymm13; + vpxor 14 * 32(%rax), %ymm15, %ymm14; + vpxor 15 * 32(%rax), %ymm15, %ymm15; + + call __camellia_enc_blk32; + + vpxor 0 * 32(%rsi), %ymm7, %ymm7; + vpxor 1 * 32(%rsi), %ymm6, %ymm6; + vpxor 2 * 32(%rsi), %ymm5, %ymm5; + vpxor 3 * 32(%rsi), %ymm4, %ymm4; + vpxor 4 * 32(%rsi), %ymm3, %ymm3; + vpxor 5 * 32(%rsi), %ymm2, %ymm2; + vpxor 6 * 32(%rsi), %ymm1, %ymm1; + vpxor 7 * 32(%rsi), %ymm0, %ymm0; + vpxor 8 * 32(%rsi), %ymm15, %ymm15; + vpxor 9 * 32(%rsi), %ymm14, %ymm14; + vpxor 10 * 32(%rsi), %ymm13, %ymm13; + vpxor 11 * 32(%rsi), %ymm12, %ymm12; + vpxor 12 * 32(%rsi), %ymm11, %ymm11; + vpxor 13 * 32(%rsi), %ymm10, %ymm10; + vpxor 14 * 32(%rsi), %ymm9, %ymm9; + vpxor 15 * 32(%rsi), %ymm8, %ymm8; + + write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, + %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, + %ymm8, %rsi); + + vzeroall; + + movq (16 * 32 + 0 * 8)(%rsp), %r10; + movq (16 * 32 + 1 * 8)(%rsp), %r11; + movq (16 * 32 + 2 * 8)(%rsp), %r12; + movq (16 * 32 + 3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + + leave; + CFI_LEAVE(); + ret; + CFI_ENDPROC(); +ELF(.size FUNC_NAME(ocb_enc),.-FUNC_NAME(ocb_enc);) + +.align 8 +.globl FUNC_NAME(ocb_dec) +ELF(.type FUNC_NAME(ocb_dec),@function;) + +FUNC_NAME(ocb_dec): + /* input: + * %rdi: ctx, CTX + * %rsi: dst (32 blocks) + * %rdx: src (32 blocks) + * %rcx: offset + * %r8 : checksum + * %r9 : L pointers (void *L[32]) + */ + CFI_STARTPROC(); + + pushq %rbp; + CFI_PUSH(%rbp); + movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); + + vzeroupper; + + subq $(16 * 32 + 4 * 8), %rsp; + andq $~63, %rsp; + movq %rsp, %rax; + + movq %r10, (16 * 32 + 0 * 8)(%rsp); + movq %r11, (16 * 32 + 1 * 8)(%rsp); + movq %r12, (16 * 32 + 2 * 8)(%rsp); + movq %r13, (16 * 32 + 3 * 8)(%rsp); + CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8); + CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8); + CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8); + CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8); + + vmovdqu (%rcx), %xmm14; + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ + +#define OCB_INPUT(n, l0reg, l1reg, yreg) 
\ + vmovdqu (n * 32)(%rdx), yreg; \ + vpxor (l0reg), %xmm14, %xmm15; \ + vpxor (l1reg), %xmm15, %xmm14; \ + vinserti128 $1, %xmm14, %ymm15, %ymm15; \ + vpxor yreg, %ymm15, yreg; \ + vmovdqu %ymm15, (n * 32)(%rsi); + + movq (0 * 8)(%r9), %r10; + movq (1 * 8)(%r9), %r11; + movq (2 * 8)(%r9), %r12; + movq (3 * 8)(%r9), %r13; + OCB_INPUT(0, %r10, %r11, %ymm0); + vmovdqu %ymm0, (15 * 32)(%rax); + OCB_INPUT(1, %r12, %r13, %ymm0); + vmovdqu %ymm0, (14 * 32)(%rax); + movq (4 * 8)(%r9), %r10; + movq (5 * 8)(%r9), %r11; + movq (6 * 8)(%r9), %r12; + movq (7 * 8)(%r9), %r13; + OCB_INPUT(2, %r10, %r11, %ymm13); + OCB_INPUT(3, %r12, %r13, %ymm12); + movq (8 * 8)(%r9), %r10; + movq (9 * 8)(%r9), %r11; + movq (10 * 8)(%r9), %r12; + movq (11 * 8)(%r9), %r13; + OCB_INPUT(4, %r10, %r11, %ymm11); + OCB_INPUT(5, %r12, %r13, %ymm10); + movq (12 * 8)(%r9), %r10; + movq (13 * 8)(%r9), %r11; + movq (14 * 8)(%r9), %r12; + movq (15 * 8)(%r9), %r13; + OCB_INPUT(6, %r10, %r11, %ymm9); + OCB_INPUT(7, %r12, %r13, %ymm8); + movq (16 * 8)(%r9), %r10; + movq (17 * 8)(%r9), %r11; + movq (18 * 8)(%r9), %r12; + movq (19 * 8)(%r9), %r13; + OCB_INPUT(8, %r10, %r11, %ymm7); + OCB_INPUT(9, %r12, %r13, %ymm6); + movq (20 * 8)(%r9), %r10; + movq (21 * 8)(%r9), %r11; + movq (22 * 8)(%r9), %r12; + movq (23 * 8)(%r9), %r13; + OCB_INPUT(10, %r10, %r11, %ymm5); + OCB_INPUT(11, %r12, %r13, %ymm4); + movq (24 * 8)(%r9), %r10; + movq (25 * 8)(%r9), %r11; + movq (26 * 8)(%r9), %r12; + movq (27 * 8)(%r9), %r13; + OCB_INPUT(12, %r10, %r11, %ymm3); + OCB_INPUT(13, %r12, %r13, %ymm2); + movq (28 * 8)(%r9), %r10; + movq (29 * 8)(%r9), %r11; + movq (30 * 8)(%r9), %r12; + movq (31 * 8)(%r9), %r13; + OCB_INPUT(14, %r10, %r11, %ymm1); + OCB_INPUT(15, %r12, %r13, %ymm0); +#undef OCB_INPUT + + vmovdqu %xmm14, (%rcx); + + movq %r8, %r10; + + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %r9d; + cmovel %r9d, %r8d; /* max */ + + /* inpack16_pre: */ + vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15; + vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; + vpxor %ymm0, %ymm15, %ymm0; + vpxor %ymm1, %ymm15, %ymm1; + vpxor %ymm2, %ymm15, %ymm2; + vpxor %ymm3, %ymm15, %ymm3; + vpxor %ymm4, %ymm15, %ymm4; + vpxor %ymm5, %ymm15, %ymm5; + vpxor %ymm6, %ymm15, %ymm6; + vpxor %ymm7, %ymm15, %ymm7; + vpxor %ymm8, %ymm15, %ymm8; + vpxor %ymm9, %ymm15, %ymm9; + vpxor %ymm10, %ymm15, %ymm10; + vpxor %ymm11, %ymm15, %ymm11; + vpxor %ymm12, %ymm15, %ymm12; + vpxor %ymm13, %ymm15, %ymm13; + vpxor 14 * 32(%rax), %ymm15, %ymm14; + vpxor 15 * 32(%rax), %ymm15, %ymm15; + + call __camellia_dec_blk32; + + vpxor 0 * 32(%rsi), %ymm7, %ymm7; + vpxor 1 * 32(%rsi), %ymm6, %ymm6; + vpxor 2 * 32(%rsi), %ymm5, %ymm5; + vpxor 3 * 32(%rsi), %ymm4, %ymm4; + vpxor 4 * 32(%rsi), %ymm3, %ymm3; + vpxor 5 * 32(%rsi), %ymm2, %ymm2; + vpxor 6 * 32(%rsi), %ymm1, %ymm1; + vpxor 7 * 32(%rsi), %ymm0, %ymm0; + vmovdqu %ymm7, (7 * 32)(%rax); + vmovdqu %ymm6, (6 * 32)(%rax); + vpxor 8 * 32(%rsi), %ymm15, %ymm15; + vpxor 9 * 32(%rsi), %ymm14, %ymm14; + vpxor 10 * 32(%rsi), %ymm13, %ymm13; + vpxor 11 * 32(%rsi), %ymm12, %ymm12; + vpxor 12 * 32(%rsi), %ymm11, %ymm11; + vpxor 13 * 32(%rsi), %ymm10, %ymm10; + vpxor 14 * 32(%rsi), %ymm9, %ymm9; + vpxor 15 * 32(%rsi), %ymm8, %ymm8; + + /* Checksum_i = Checksum_{i-1} xor P_i */ + + vpxor %ymm5, %ymm7, %ymm7; + vpxor %ymm4, %ymm6, %ymm6; + vpxor %ymm3, %ymm7, %ymm7; + vpxor %ymm2, %ymm6, %ymm6; + vpxor %ymm1, %ymm7, %ymm7; + vpxor %ymm0, %ymm6, %ymm6; + vpxor %ymm15, %ymm7, %ymm7; + vpxor %ymm14, %ymm6, %ymm6; + vpxor %ymm13, %ymm7, %ymm7; + vpxor %ymm12, 
%ymm6, %ymm6; + vpxor %ymm11, %ymm7, %ymm7; + vpxor %ymm10, %ymm6, %ymm6; + vpxor %ymm9, %ymm7, %ymm7; + vpxor %ymm8, %ymm6, %ymm6; + vpxor %ymm7, %ymm6, %ymm7; + + vextracti128 $1, %ymm7, %xmm6; + vpxor %xmm6, %xmm7, %xmm7; + vpxor (%r10), %xmm7, %xmm7; + vmovdqu %xmm7, (%r10); + + vmovdqu 7 * 32(%rax), %ymm7; + vmovdqu 6 * 32(%rax), %ymm6; + + write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, + %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, + %ymm8, %rsi); + + vzeroall; + + movq (16 * 32 + 0 * 8)(%rsp), %r10; + movq (16 * 32 + 1 * 8)(%rsp), %r11; + movq (16 * 32 + 2 * 8)(%rsp), %r12; + movq (16 * 32 + 3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + + leave; + CFI_LEAVE(); + ret; + CFI_ENDPROC(); +ELF(.size FUNC_NAME(ocb_dec),.-FUNC_NAME(ocb_dec);) + +.align 8 +.globl FUNC_NAME(ocb_auth) +ELF(.type FUNC_NAME(ocb_auth),@function;) + +FUNC_NAME(ocb_auth): + /* input: + * %rdi: ctx, CTX + * %rsi: abuf (16 blocks) + * %rdx: offset + * %rcx: checksum + * %r8 : L pointers (void *L[16]) + */ + CFI_STARTPROC(); + + pushq %rbp; + CFI_PUSH(%rbp); + movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); + + vzeroupper; + + subq $(16 * 32 + 4 * 8), %rsp; + andq $~63, %rsp; + movq %rsp, %rax; + + movq %r10, (16 * 32 + 0 * 8)(%rsp); + movq %r11, (16 * 32 + 1 * 8)(%rsp); + movq %r12, (16 * 32 + 2 * 8)(%rsp); + movq %r13, (16 * 32 + 3 * 8)(%rsp); + CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8); + CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8); + CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8); + CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8); + + vmovdqu (%rdx), %xmm14; + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* Checksum_i = Checksum_{i-1} xor P_i */ + /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ + +#define OCB_INPUT(n, l0reg, l1reg, yreg) \ + vmovdqu (n * 32)(%rsi), yreg; \ + vpxor (l0reg), %xmm14, %xmm15; \ + vpxor (l1reg), %xmm15, %xmm14; \ + vinserti128 $1, %xmm14, %ymm15, %ymm15; \ + vpxor yreg, %ymm15, yreg; + + movq (0 * 8)(%r8), %r10; + movq (1 * 8)(%r8), %r11; + movq (2 * 8)(%r8), %r12; + movq (3 * 8)(%r8), %r13; + OCB_INPUT(0, %r10, %r11, %ymm0); + vmovdqu %ymm0, (15 * 32)(%rax); + OCB_INPUT(1, %r12, %r13, %ymm0); + vmovdqu %ymm0, (14 * 32)(%rax); + movq (4 * 8)(%r8), %r10; + movq (5 * 8)(%r8), %r11; + movq (6 * 8)(%r8), %r12; + movq (7 * 8)(%r8), %r13; + OCB_INPUT(2, %r10, %r11, %ymm13); + OCB_INPUT(3, %r12, %r13, %ymm12); + movq (8 * 8)(%r8), %r10; + movq (9 * 8)(%r8), %r11; + movq (10 * 8)(%r8), %r12; + movq (11 * 8)(%r8), %r13; + OCB_INPUT(4, %r10, %r11, %ymm11); + OCB_INPUT(5, %r12, %r13, %ymm10); + movq (12 * 8)(%r8), %r10; + movq (13 * 8)(%r8), %r11; + movq (14 * 8)(%r8), %r12; + movq (15 * 8)(%r8), %r13; + OCB_INPUT(6, %r10, %r11, %ymm9); + OCB_INPUT(7, %r12, %r13, %ymm8); + movq (16 * 8)(%r8), %r10; + movq (17 * 8)(%r8), %r11; + movq (18 * 8)(%r8), %r12; + movq (19 * 8)(%r8), %r13; + OCB_INPUT(8, %r10, %r11, %ymm7); + OCB_INPUT(9, %r12, %r13, %ymm6); + movq (20 * 8)(%r8), %r10; + movq (21 * 8)(%r8), %r11; + movq (22 * 8)(%r8), %r12; + movq (23 * 8)(%r8), %r13; + OCB_INPUT(10, %r10, %r11, %ymm5); + OCB_INPUT(11, %r12, %r13, %ymm4); + movq (24 * 8)(%r8), %r10; + movq (25 * 8)(%r8), %r11; + movq (26 * 8)(%r8), %r12; + movq (27 * 8)(%r8), %r13; + OCB_INPUT(12, %r10, %r11, %ymm3); + OCB_INPUT(13, %r12, %r13, %ymm2); + movq (28 * 8)(%r8), %r10; + movq (29 * 8)(%r8), %r11; + movq (30 * 8)(%r8), %r12; + movq (31 * 8)(%r8), %r13; + OCB_INPUT(14, %r10, %r11, %ymm1); + OCB_INPUT(15, %r12, %r13, %ymm0); +#undef OCB_INPUT + + vmovdqu 
%xmm14, (%rdx); + + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %r10d; + cmovel %r10d, %r8d; /* max */ + + movq %rcx, %r10; + + /* inpack16_pre: */ + vpbroadcastq (key_table)(CTX), %ymm15; + vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; + vpxor %ymm0, %ymm15, %ymm0; + vpxor %ymm1, %ymm15, %ymm1; + vpxor %ymm2, %ymm15, %ymm2; + vpxor %ymm3, %ymm15, %ymm3; + vpxor %ymm4, %ymm15, %ymm4; + vpxor %ymm5, %ymm15, %ymm5; + vpxor %ymm6, %ymm15, %ymm6; + vpxor %ymm7, %ymm15, %ymm7; + vpxor %ymm8, %ymm15, %ymm8; + vpxor %ymm9, %ymm15, %ymm9; + vpxor %ymm10, %ymm15, %ymm10; + vpxor %ymm11, %ymm15, %ymm11; + vpxor %ymm12, %ymm15, %ymm12; + vpxor %ymm13, %ymm15, %ymm13; + vpxor 14 * 32(%rax), %ymm15, %ymm14; + vpxor 15 * 32(%rax), %ymm15, %ymm15; + + call __camellia_enc_blk32; + + vpxor %ymm7, %ymm6, %ymm6; + vpxor %ymm5, %ymm4, %ymm4; + vpxor %ymm3, %ymm2, %ymm2; + vpxor %ymm1, %ymm0, %ymm0; + vpxor %ymm15, %ymm14, %ymm14; + vpxor %ymm13, %ymm12, %ymm12; + vpxor %ymm11, %ymm10, %ymm10; + vpxor %ymm9, %ymm8, %ymm8; + + vpxor %ymm6, %ymm4, %ymm4; + vpxor %ymm2, %ymm0, %ymm0; + vpxor %ymm14, %ymm12, %ymm12; + vpxor %ymm10, %ymm8, %ymm8; + + vpxor %ymm4, %ymm0, %ymm0; + vpxor %ymm12, %ymm8, %ymm8; + + vpxor %ymm0, %ymm8, %ymm0; + + vextracti128 $1, %ymm0, %xmm1; + vpxor (%r10), %xmm0, %xmm0; + vpxor %xmm0, %xmm1, %xmm0; + vmovdqu %xmm0, (%r10); + + vzeroall; + + movq (16 * 32 + 0 * 8)(%rsp), %r10; + movq (16 * 32 + 1 * 8)(%rsp), %r11; + movq (16 * 32 + 2 * 8)(%rsp), %r12; + movq (16 * 32 + 3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + + leave; + CFI_LEAVE(); + ret; + CFI_ENDPROC(); +ELF(.size FUNC_NAME(ocb_auth),.-FUNC_NAME(ocb_auth);) + +#endif /* GCRY_CAMELLIA_AESNI_AVX2_AMD64_H */ diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index 6577b651..23cbec81 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -91,6 +91,12 @@ # endif #endif +/* USE_VAES_AVX2 inidicates whether to compile with Intel VAES/AVX2 code. */ +#undef USE_VAES_AVX2 +#if defined(USE_AESNI_AVX2) && defined(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL) +# define USE_VAES_AVX2 1 +#endif + typedef struct { KEY_TABLE_TYPE keytable; @@ -100,6 +106,7 @@ typedef struct #endif /*USE_AESNI_AVX*/ #ifdef USE_AESNI_AVX2 unsigned int use_aesni_avx2:1;/* AES-NI/AVX2 implementation shall be used. */ + unsigned int use_vaes_avx2:1; /* VAES/AVX2 implementation shall be used. */ #endif /*USE_AESNI_AVX2*/ } CAMELLIA_context; @@ -201,6 +208,46 @@ extern void _gcry_camellia_aesni_avx2_ocb_auth(CAMELLIA_context *ctx, const u64 Ls[32]) ASM_FUNC_ABI; #endif +#ifdef USE_VAES_AVX2 +/* Assembler implementations of Camellia using VAES and AVX2. Process data + in 32 block same time. 
+ */ +extern void _gcry_camellia_vaes_avx2_ctr_enc(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *ctr) ASM_FUNC_ABI; + +extern void _gcry_camellia_vaes_avx2_cbc_dec(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *iv) ASM_FUNC_ABI; + +extern void _gcry_camellia_vaes_avx2_cfb_dec(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *iv) ASM_FUNC_ABI; + +extern void _gcry_camellia_vaes_avx2_ocb_enc(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *offset, + unsigned char *checksum, + const u64 Ls[32]) ASM_FUNC_ABI; + +extern void _gcry_camellia_vaes_avx2_ocb_dec(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *offset, + unsigned char *checksum, + const u64 Ls[32]) ASM_FUNC_ABI; + +extern void _gcry_camellia_vaes_avx2_ocb_auth(CAMELLIA_context *ctx, + const unsigned char *abuf, + unsigned char *offset, + unsigned char *checksum, + const u64 Ls[32]) ASM_FUNC_ABI; +#endif + static const char *selftest(void); static void _gcry_camellia_ctr_enc (void *context, unsigned char *ctr, @@ -225,7 +272,7 @@ camellia_setkey(void *c, const byte *key, unsigned keylen, CAMELLIA_context *ctx=c; static int initialized=0; static const char *selftest_failed=NULL; -#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) +#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) || defined(USE_VAES_AVX2) unsigned int hwf = _gcry_get_hw_features (); #endif @@ -248,6 +295,10 @@ camellia_setkey(void *c, const byte *key, unsigned keylen, #endif #ifdef USE_AESNI_AVX2 ctx->use_aesni_avx2 = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX2); + ctx->use_vaes_avx2 = 0; +#endif +#ifdef USE_VAES_AVX2 + ctx->use_vaes_avx2 = (hwf & HWF_INTEL_VAES_VPCLMUL) && (hwf & HWF_INTEL_AVX2); #endif ctx->keybitlength=keylen*8; @@ -389,11 +440,19 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr, if (ctx->use_aesni_avx2) { int did_use_aesni_avx2 = 0; +#ifdef USE_VAES_AVX2 + int use_vaes = ctx->use_vaes_avx2; +#endif /* Process data in 32 block chunks. */ while (nblocks >= 32) { - _gcry_camellia_aesni_avx2_ctr_enc(ctx, outbuf, inbuf, ctr); +#ifdef USE_VAES_AVX2 + if (use_vaes) + _gcry_camellia_vaes_avx2_ctr_enc(ctx, outbuf, inbuf, ctr); + else +#endif + _gcry_camellia_aesni_avx2_ctr_enc(ctx, outbuf, inbuf, ctr); nblocks -= 32; outbuf += 32 * CAMELLIA_BLOCK_SIZE; @@ -478,11 +537,19 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv, if (ctx->use_aesni_avx2) { int did_use_aesni_avx2 = 0; +#ifdef USE_VAES_AVX2 + int use_vaes = ctx->use_vaes_avx2; +#endif /* Process data in 32 block chunks. */ while (nblocks >= 32) { - _gcry_camellia_aesni_avx2_cbc_dec(ctx, outbuf, inbuf, iv); +#ifdef USE_VAES_AVX2 + if (use_vaes) + _gcry_camellia_vaes_avx2_cbc_dec(ctx, outbuf, inbuf, iv); + else +#endif + _gcry_camellia_aesni_avx2_cbc_dec(ctx, outbuf, inbuf, iv); nblocks -= 32; outbuf += 32 * CAMELLIA_BLOCK_SIZE; @@ -564,11 +631,19 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv, if (ctx->use_aesni_avx2) { int did_use_aesni_avx2 = 0; +#ifdef USE_VAES_AVX2 + int use_vaes = ctx->use_vaes_avx2; +#endif /* Process data in 32 block chunks. 
*/ while (nblocks >= 32) { - _gcry_camellia_aesni_avx2_cfb_dec(ctx, outbuf, inbuf, iv); +#ifdef USE_VAES_AVX2 + if (use_vaes) + _gcry_camellia_vaes_avx2_cfb_dec(ctx, outbuf, inbuf, iv); + else +#endif + _gcry_camellia_aesni_avx2_cfb_dec(ctx, outbuf, inbuf, iv); nblocks -= 32; outbuf += 32 * CAMELLIA_BLOCK_SIZE; @@ -654,6 +729,10 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, if (ctx->use_aesni_avx2) { int did_use_aesni_avx2 = 0; +#ifdef USE_VAES_AVX2 + int encrypt_use_vaes = encrypt && ctx->use_vaes_avx2; + int decrypt_use_vaes = !encrypt && ctx->use_vaes_avx2; +#endif u64 Ls[32]; unsigned int n = 32 - (blkn % 32); u64 *l; @@ -685,7 +764,16 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, blkn += 32; *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 32); - if (encrypt) + if (0) {} +#ifdef USE_VAES_AVX2 + else if (encrypt_use_vaes) + _gcry_camellia_vaes_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, + c->u_ctr.ctr, Ls); + else if (decrypt_use_vaes) + _gcry_camellia_vaes_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, + c->u_ctr.ctr, Ls); +#endif + else if (encrypt) _gcry_camellia_aesni_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls); else @@ -803,6 +891,9 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, if (ctx->use_aesni_avx2) { int did_use_aesni_avx2 = 0; +#ifdef USE_VAES_AVX2 + int use_vaes = ctx->use_vaes_avx2; +#endif u64 Ls[32]; unsigned int n = 32 - (blkn % 32); u64 *l; @@ -834,9 +925,16 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, blkn += 32; *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 32); - _gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf, - c->u_mode.ocb.aad_offset, - c->u_mode.ocb.aad_sum, Ls); +#ifdef USE_VAES_AVX2 + if (use_vaes) + _gcry_camellia_vaes_avx2_ocb_auth(ctx, abuf, + c->u_mode.ocb.aad_offset, + c->u_mode.ocb.aad_sum, Ls); + else +#endif + _gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf, + c->u_mode.ocb.aad_offset, + c->u_mode.ocb.aad_sum, Ls); nblocks -= 32; abuf += 32 * CAMELLIA_BLOCK_SIZE; diff --git a/cipher/camellia-vaes-avx2-amd64.S b/cipher/camellia-vaes-avx2-amd64.S new file mode 100644 index 00000000..e6e0c78e --- /dev/null +++ b/cipher/camellia-vaes-avx2-amd64.S @@ -0,0 +1,35 @@ +/* camellia-vaes-avx2-amd64.S - VAES/AVX2 implementation of Camellia cipher + * + * Copyright (C) 2021 Jussi Kivilinna <[hidden email]> + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <config.h> + +#ifdef __x86_64 +#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ + defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) && \ + defined(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL) + +#define CAMELLIA_VAES_BUILD 1 +#define FUNC_NAME(func) _gcry_camellia_vaes_avx2_ ## func + +#include "camellia-aesni-avx2-amd64.h" + +#endif /* defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) */ +#endif /* __x86_64 */ -- 2.27.0
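The small wrapper above is what makes the shared header dual-purpose: camellia-aesni-avx2-amd64.h is assembled a second time with CAMELLIA_VAES_BUILD defined, so the IF_VAES() bodies are emitted instead of the IF_AESNI() ones and every function receives the _gcry_camellia_vaes_avx2_ prefix via FUNC_NAME. A minimal C sketch of the same single-source, dual-build pattern (illustrative names only, not code from this patch):

  /* impl.h -- shared body.  WIDE_BUILD and FUNC_NAME are set by the
   * including file, just as CAMELLIA_VAES_BUILD and FUNC_NAME are set
   * by the two .S wrappers.  All names here are hypothetical. */
  #ifdef WIDE_BUILD
  # define IF_WIDE(...)   __VA_ARGS__
  # define IF_NARROW(...)
  #else
  # define IF_WIDE(...)
  # define IF_NARROW(...) __VA_ARGS__
  #endif

  int FUNC_NAME(transform) (int x)
  {
    IF_NARROW(x = x + x;)   /* emitted only in the narrow build */
    IF_WIDE(x = x << 1;)    /* emitted only in the wide build   */
    return x;
  }

  /* narrow.c:  #define FUNC_NAME(f) narrow_ ## f
   *            #include "impl.h"
   * wide.c:    #define WIDE_BUILD 1
   *            #define FUNC_NAME(f) wide_ ## f
   *            #include "impl.h"
   * One source file thus yields both narrow_transform() and
   * wide_transform(), with the variant selected at run time by the
   * glue code, as camellia-glue.c does with use_vaes_avx2. */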
* cipher/rijndael-aesni.c (xts_gfmul_const): Fix array size from 16
to 2. (_gcry_aes_aesni_xts_enc, _gcry_aes_aesni_xts_dec) [__x86_64__]: Add 8-block parallel code paths. -- Signed-off-by: Jussi Kivilinna <[hidden email]> --- cipher/rijndael-aesni.c | 596 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 595 insertions(+), 1 deletion(-) diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c index 95ec4c2b..9dde0489 100644 --- a/cipher/rijndael-aesni.c +++ b/cipher/rijndael-aesni.c @@ -3661,7 +3661,7 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, } -static const u64 xts_gfmul_const[16] __attribute__ ((aligned (16))) = +static const u64 xts_gfmul_const[2] __attribute__ ((aligned (16))) = { 0x87, 0x01 }; @@ -3683,6 +3683,303 @@ _gcry_aes_aesni_xts_enc (RIJNDAEL_context *ctx, unsigned char *tweak, [gfmul] "m" (*xts_gfmul_const) : "memory" ); +#ifdef __x86_64__ + if (nblocks >= 8) + { + aesni_prepare_8_15_variable; + + aesni_prepare_8_15(); + + for ( ;nblocks >= 8 ; nblocks -= 8 ) + { + asm volatile ("pshufd $0x13, %%xmm5, %%xmm11\n\t" + "movdqu %[inbuf0], %%xmm1\n\t" + "pxor %%xmm5, %%xmm1\n\t" + "movdqa %%xmm5, %%xmm7\n\t" + + "movdqa %%xmm11, %%xmm0\n\t" + "paddd %%xmm11, %%xmm11\n\t" + "psrad $31, %%xmm0\n\t" + "paddq %%xmm5, %%xmm5\n\t" + "pand %%xmm6, %%xmm0\n\t" + "pxor %%xmm0, %%xmm5\n\t" + : + : [inbuf0] "m" (*(inbuf + 0 * 16)) + : "memory" ); + + asm volatile ("movdqu %[inbuf1], %%xmm2\n\t" + "pxor %%xmm5, %%xmm2\n\t" + "movdqa %%xmm5, %%xmm12\n\t" + + "movdqa %%xmm11, %%xmm0\n\t" + "paddd %%xmm11, %%xmm11\n\t" + "psrad $31, %%xmm0\n\t" + "paddq %%xmm5, %%xmm5\n\t" + "pand %%xmm6, %%xmm0\n\t" + "pxor %%xmm0, %%xmm5\n\t" + : + : [inbuf1] "m" (*(inbuf + 1 * 16)) + : "memory" ); + + asm volatile ("movdqu %[inbuf2], %%xmm3\n\t" + "pxor %%xmm5, %%xmm3\n\t" + "movdqa %%xmm5, %%xmm13\n\t" + + "movdqa %%xmm11, %%xmm0\n\t" + "paddd %%xmm11, %%xmm11\n\t" + "psrad $31, %%xmm0\n\t" + "paddq %%xmm5, %%xmm5\n\t" + "pand %%xmm6, %%xmm0\n\t" + "pxor %%xmm0, %%xmm5\n\t" + : + : [inbuf2] "m" (*(inbuf + 2 * 16)) + : "memory" ); + + asm volatile ("movdqu %[inbuf3], %%xmm4\n\t" + "pxor %%xmm5, %%xmm4\n\t" + "movdqa %%xmm5, %%xmm14\n\t" + + "movdqa %%xmm11, %%xmm0\n\t" + "paddd %%xmm11, %%xmm11\n\t" + "psrad $31, %%xmm0\n\t" + "paddq %%xmm5, %%xmm5\n\t" + "pand %%xmm6, %%xmm0\n\t" + "pxor %%xmm0, %%xmm5\n\t" + : + : [inbuf3] "m" (*(inbuf + 3 * 16)) + : "memory" ); + + asm volatile ("movdqu %[inbuf4], %%xmm8\n\t" + "pxor %%xmm5, %%xmm8\n\t" + "movdqa %%xmm5, %%xmm15\n\t" + + "movdqa %%xmm11, %%xmm0\n\t" + "paddd %%xmm11, %%xmm11\n\t" + "psrad $31, %%xmm0\n\t" + "paddq %%xmm5, %%xmm5\n\t" + "pand %%xmm6, %%xmm0\n\t" + "pxor %%xmm0, %%xmm5\n\t" + : + : [inbuf4] "m" (*(inbuf + 4 * 16)) + : "memory" ); + + asm volatile ("movdqu %[inbuf5], %%xmm9\n\t" + "pxor %%xmm5, %%xmm9\n\t" + "movdqu %%xmm5, %[outbuf5]\n\t" + + "movdqa %%xmm11, %%xmm0\n\t" + "paddd %%xmm11, %%xmm11\n\t" + "psrad $31, %%xmm0\n\t" + "paddq %%xmm5, %%xmm5\n\t" + "pand %%xmm6, %%xmm0\n\t" + "pxor %%xmm0, %%xmm5\n\t" + : [outbuf5] "=m" (*(outbuf + 5 * 16)) + : [inbuf5] "m" (*(inbuf + 5 * 16)) + : "memory" ); + + asm volatile ("movdqu %[inbuf6], %%xmm10\n\t" + "pxor %%xmm5, %%xmm10\n\t" + "movdqu %%xmm5, %[outbuf6]\n\t" + + "movdqa %%xmm11, %%xmm0\n\t" + "paddd %%xmm11, %%xmm11\n\t" + "psrad $31, %%xmm0\n\t" + "paddq %%xmm5, %%xmm5\n\t" + "pand %%xmm6, %%xmm0\n\t" + "pxor %%xmm0, %%xmm5\n\t" + : [outbuf6] "=m" (*(outbuf + 6 * 16)) + : [inbuf6] "m" (*(inbuf + 6 * 16)) + : "memory" ); + + asm volatile ("movdqa %%xmm11, %%xmm0\n\t" + "movdqu %[inbuf7], %%xmm11\n\t" + "pxor 
%%xmm5, %%xmm11\n\t" + "movdqu %%xmm5, %[outbuf7]\n\t" + + "psrad $31, %%xmm0\n\t" + "paddq %%xmm5, %%xmm5\n\t" + "pand %%xmm6, %%xmm0\n\t" + "pxor %%xmm0, %%xmm5\n\t" + : [outbuf7] "=m" (*(outbuf + 7 * 16)) + : [inbuf7] "m" (*(inbuf + 7 * 16)) + : "memory" ); + + asm volatile ("cmpl $12, %[rounds]\n\t" + "movdqa (%[key]), %%xmm0\n\t" + "pxor %%xmm0, %%xmm1\n\t" + "pxor %%xmm0, %%xmm2\n\t" + "pxor %%xmm0, %%xmm3\n\t" + "pxor %%xmm0, %%xmm4\n\t" + "pxor %%xmm0, %%xmm8\n\t" + "pxor %%xmm0, %%xmm9\n\t" + "pxor %%xmm0, %%xmm10\n\t" + "pxor %%xmm0, %%xmm11\n\t" + "movdqa 0x10(%[key]), %%xmm0\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0x20(%[key]), %%xmm0\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0x30(%[key]), %%xmm0\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0x40(%[key]), %%xmm0\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0x50(%[key]), %%xmm0\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0x60(%[key]), %%xmm0\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0x70(%[key]), %%xmm0\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0x80(%[key]), %%xmm0\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0x90(%[key]), %%xmm0\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0xa0(%[key]), %%xmm0\n\t" + "jb .Lenclast%=\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0xb0(%[key]), %%xmm0\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + 
"aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0xc0(%[key]), %%xmm0\n\t" + "je .Lenclast%=\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0xd0(%[key]), %%xmm0\n\t" + "aesenc %%xmm0, %%xmm1\n\t" + "aesenc %%xmm0, %%xmm2\n\t" + "aesenc %%xmm0, %%xmm3\n\t" + "aesenc %%xmm0, %%xmm4\n\t" + "aesenc %%xmm0, %%xmm8\n\t" + "aesenc %%xmm0, %%xmm9\n\t" + "aesenc %%xmm0, %%xmm10\n\t" + "aesenc %%xmm0, %%xmm11\n\t" + "movdqa 0xe0(%[key]), %%xmm0\n\t" + + ".Lenclast%=:\n\t" + : + : [key] "r" (ctx->keyschenc), + [rounds] "rm" (ctx->rounds) + : "cc", "memory"); + + asm volatile ("pxor %%xmm0, %%xmm7\n\t" + "pxor %%xmm0, %%xmm12\n\t" + "pxor %%xmm0, %%xmm13\n\t" + "pxor %%xmm0, %%xmm14\n\t" + "aesenclast %%xmm7, %%xmm1\n\t" + "aesenclast %%xmm12, %%xmm2\n\t" + "aesenclast %%xmm13, %%xmm3\n\t" + "aesenclast %%xmm14, %%xmm4\n\t" + "movdqu 5*16(%[outbuf]), %%xmm12\n\t" + "movdqu 6*16(%[outbuf]), %%xmm13\n\t" + "movdqu 7*16(%[outbuf]), %%xmm14\n\t" + "pxor %%xmm0, %%xmm15\n\t" + "pxor %%xmm0, %%xmm12\n\t" + "pxor %%xmm0, %%xmm13\n\t" + "pxor %%xmm0, %%xmm14\n\t" + "aesenclast %%xmm15, %%xmm8\n\t" + "aesenclast %%xmm12, %%xmm9\n\t" + "aesenclast %%xmm13, %%xmm10\n\t" + "aesenclast %%xmm14, %%xmm11\n\t" + "movdqu %%xmm1, 0*16(%[outbuf])\n\t" + "movdqu %%xmm2, 1*16(%[outbuf])\n\t" + "movdqu %%xmm3, 2*16(%[outbuf])\n\t" + "movdqu %%xmm4, 3*16(%[outbuf])\n\t" + "movdqu %%xmm8, 4*16(%[outbuf])\n\t" + "movdqu %%xmm9, 5*16(%[outbuf])\n\t" + "movdqu %%xmm10, 6*16(%[outbuf])\n\t" + "movdqu %%xmm11, 7*16(%[outbuf])\n\t" + : + : [outbuf] "r" (outbuf) + : "memory" ); + + outbuf += 8*BLOCKSIZE; + inbuf += 8*BLOCKSIZE; + } + + aesni_cleanup_8_15(); + } +#endif + for ( ;nblocks >= 4; nblocks -= 4 ) { asm volatile ("pshufd $0x13, %%xmm5, %%xmm4\n\t" @@ -3827,6 +4124,303 @@ _gcry_aes_aesni_xts_dec (RIJNDAEL_context *ctx, unsigned char *tweak, [gfmul] "m" (*xts_gfmul_const) : "memory" ); +#ifdef __x86_64__ + if (nblocks >= 8) + { + aesni_prepare_8_15_variable; + + aesni_prepare_8_15(); + + for ( ;nblocks >= 8 ; nblocks -= 8 ) + { + asm volatile ("pshufd $0x13, %%xmm5, %%xmm11\n\t" + "movdqu %[inbuf0], %%xmm1\n\t" + "pxor %%xmm5, %%xmm1\n\t" + "movdqa %%xmm5, %%xmm7\n\t" + + "movdqa %%xmm11, %%xmm0\n\t" + "paddd %%xmm11, %%xmm11\n\t" + "psrad $31, %%xmm0\n\t" + "paddq %%xmm5, %%xmm5\n\t" + "pand %%xmm6, %%xmm0\n\t" + "pxor %%xmm0, %%xmm5\n\t" + : + : [inbuf0] "m" (*(inbuf + 0 * 16)) + : "memory" ); + + asm volatile ("movdqu %[inbuf1], %%xmm2\n\t" + "pxor %%xmm5, %%xmm2\n\t" + "movdqa %%xmm5, %%xmm12\n\t" + + "movdqa %%xmm11, %%xmm0\n\t" + "paddd %%xmm11, %%xmm11\n\t" + "psrad $31, %%xmm0\n\t" + "paddq %%xmm5, %%xmm5\n\t" + "pand %%xmm6, %%xmm0\n\t" + "pxor %%xmm0, %%xmm5\n\t" + : + : [inbuf1] "m" (*(inbuf + 1 * 16)) + : "memory" ); + + asm volatile ("movdqu %[inbuf2], %%xmm3\n\t" + "pxor %%xmm5, %%xmm3\n\t" + "movdqa %%xmm5, %%xmm13\n\t" + + "movdqa %%xmm11, %%xmm0\n\t" + "paddd %%xmm11, %%xmm11\n\t" + "psrad $31, %%xmm0\n\t" + "paddq %%xmm5, %%xmm5\n\t" + "pand %%xmm6, %%xmm0\n\t" + "pxor %%xmm0, %%xmm5\n\t" + : + : [inbuf2] "m" (*(inbuf + 2 * 16)) + : "memory" ); + + asm volatile ("movdqu %[inbuf3], %%xmm4\n\t" + "pxor %%xmm5, %%xmm4\n\t" + "movdqa %%xmm5, %%xmm14\n\t" + + "movdqa %%xmm11, %%xmm0\n\t" + "paddd %%xmm11, %%xmm11\n\t" + "psrad $31, %%xmm0\n\t" + "paddq 
%%xmm5, %%xmm5\n\t" + "pand %%xmm6, %%xmm0\n\t" + "pxor %%xmm0, %%xmm5\n\t" + : + : [inbuf3] "m" (*(inbuf + 3 * 16)) + : "memory" ); + + asm volatile ("movdqu %[inbuf4], %%xmm8\n\t" + "pxor %%xmm5, %%xmm8\n\t" + "movdqa %%xmm5, %%xmm15\n\t" + + "movdqa %%xmm11, %%xmm0\n\t" + "paddd %%xmm11, %%xmm11\n\t" + "psrad $31, %%xmm0\n\t" + "paddq %%xmm5, %%xmm5\n\t" + "pand %%xmm6, %%xmm0\n\t" + "pxor %%xmm0, %%xmm5\n\t" + : + : [inbuf4] "m" (*(inbuf + 4 * 16)) + : "memory" ); + + asm volatile ("movdqu %[inbuf5], %%xmm9\n\t" + "pxor %%xmm5, %%xmm9\n\t" + "movdqu %%xmm5, %[outbuf5]\n\t" + + "movdqa %%xmm11, %%xmm0\n\t" + "paddd %%xmm11, %%xmm11\n\t" + "psrad $31, %%xmm0\n\t" + "paddq %%xmm5, %%xmm5\n\t" + "pand %%xmm6, %%xmm0\n\t" + "pxor %%xmm0, %%xmm5\n\t" + : [outbuf5] "=m" (*(outbuf + 5 * 16)) + : [inbuf5] "m" (*(inbuf + 5 * 16)) + : "memory" ); + + asm volatile ("movdqu %[inbuf6], %%xmm10\n\t" + "pxor %%xmm5, %%xmm10\n\t" + "movdqu %%xmm5, %[outbuf6]\n\t" + + "movdqa %%xmm11, %%xmm0\n\t" + "paddd %%xmm11, %%xmm11\n\t" + "psrad $31, %%xmm0\n\t" + "paddq %%xmm5, %%xmm5\n\t" + "pand %%xmm6, %%xmm0\n\t" + "pxor %%xmm0, %%xmm5\n\t" + : [outbuf6] "=m" (*(outbuf + 6 * 16)) + : [inbuf6] "m" (*(inbuf + 6 * 16)) + : "memory" ); + + asm volatile ("movdqa %%xmm11, %%xmm0\n\t" + "movdqu %[inbuf7], %%xmm11\n\t" + "pxor %%xmm5, %%xmm11\n\t" + "movdqu %%xmm5, %[outbuf7]\n\t" + + "psrad $31, %%xmm0\n\t" + "paddq %%xmm5, %%xmm5\n\t" + "pand %%xmm6, %%xmm0\n\t" + "pxor %%xmm0, %%xmm5\n\t" + : [outbuf7] "=m" (*(outbuf + 7 * 16)) + : [inbuf7] "m" (*(inbuf + 7 * 16)) + : "memory" ); + + asm volatile ("cmpl $12, %[rounds]\n\t" + "movdqa (%[key]), %%xmm0\n\t" + "pxor %%xmm0, %%xmm1\n\t" + "pxor %%xmm0, %%xmm2\n\t" + "pxor %%xmm0, %%xmm3\n\t" + "pxor %%xmm0, %%xmm4\n\t" + "pxor %%xmm0, %%xmm8\n\t" + "pxor %%xmm0, %%xmm9\n\t" + "pxor %%xmm0, %%xmm10\n\t" + "pxor %%xmm0, %%xmm11\n\t" + "movdqa 0x10(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0x20(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0x30(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0x40(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0x50(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0x60(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + 
"movdqa 0x70(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0x80(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0x90(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0xa0(%[key]), %%xmm0\n\t" + "jb .Ldeclast%=\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0xb0(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0xc0(%[key]), %%xmm0\n\t" + "je .Ldeclast%=\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0xd0(%[key]), %%xmm0\n\t" + "aesdec %%xmm0, %%xmm1\n\t" + "aesdec %%xmm0, %%xmm2\n\t" + "aesdec %%xmm0, %%xmm3\n\t" + "aesdec %%xmm0, %%xmm4\n\t" + "aesdec %%xmm0, %%xmm8\n\t" + "aesdec %%xmm0, %%xmm9\n\t" + "aesdec %%xmm0, %%xmm10\n\t" + "aesdec %%xmm0, %%xmm11\n\t" + "movdqa 0xe0(%[key]), %%xmm0\n\t" + + ".Ldeclast%=:\n\t" + : + : [key] "r" (ctx->keyschdec), + [rounds] "rm" (ctx->rounds) + : "cc", "memory"); + + asm volatile ("pxor %%xmm0, %%xmm7\n\t" + "pxor %%xmm0, %%xmm12\n\t" + "pxor %%xmm0, %%xmm13\n\t" + "pxor %%xmm0, %%xmm14\n\t" + "aesdeclast %%xmm7, %%xmm1\n\t" + "aesdeclast %%xmm12, %%xmm2\n\t" + "aesdeclast %%xmm13, %%xmm3\n\t" + "aesdeclast %%xmm14, %%xmm4\n\t" + "movdqu 5*16(%[outbuf]), %%xmm12\n\t" + "movdqu 6*16(%[outbuf]), %%xmm13\n\t" + "movdqu 7*16(%[outbuf]), %%xmm14\n\t" + "pxor %%xmm0, %%xmm15\n\t" + "pxor %%xmm0, %%xmm12\n\t" + "pxor %%xmm0, %%xmm13\n\t" + "pxor %%xmm0, %%xmm14\n\t" + "aesdeclast %%xmm15, %%xmm8\n\t" + "aesdeclast %%xmm12, %%xmm9\n\t" + "aesdeclast %%xmm13, %%xmm10\n\t" + "aesdeclast %%xmm14, %%xmm11\n\t" + "movdqu %%xmm1, 0*16(%[outbuf])\n\t" + "movdqu %%xmm2, 1*16(%[outbuf])\n\t" + "movdqu %%xmm3, 2*16(%[outbuf])\n\t" + "movdqu %%xmm4, 3*16(%[outbuf])\n\t" + "movdqu %%xmm8, 4*16(%[outbuf])\n\t" + "movdqu %%xmm9, 5*16(%[outbuf])\n\t" + "movdqu %%xmm10, 6*16(%[outbuf])\n\t" + "movdqu %%xmm11, 7*16(%[outbuf])\n\t" + : + : [outbuf] "r" (outbuf) + : "memory" ); + + outbuf += 8*BLOCKSIZE; + inbuf += 8*BLOCKSIZE; + } + + aesni_cleanup_8_15(); + } +#endif + for ( ;nblocks >= 4; nblocks -= 4 ) { asm volatile ("pshufd $0x13, %%xmm5, %%xmm4\n\t" -- 2.27.0 _______________________________________________ Gcrypt-devel mailing list [hidden email] http://lists.gnupg.org/mailman/listinfo/gcrypt-devel |
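The VAES/AVX2 patch below builds its round macros (VAESENC8/VAESENC4/VAESENC2 and the aesdec counterparts) around vaesenc operating on 256-bit registers, with each 128-bit round key broadcast to both lanes via vbroadcasti128. As a rough intrinsics sketch of that pattern (illustrative only, assuming VAES and AVX2 are available; the names are not libgcrypt's):

/* Sketch only: one VAES round applied to four 256-bit registers,
 * i.e. eight AES blocks, with the 128-bit round key broadcast to
 * both lanes -- the pattern the VAESENC4 macro expresses in assembly.
 * Build with -mvaes -mavx2 (or equivalent). */
#include <immintrin.h>

static inline void
vaes_round_x4 (__m256i blk[4], const void *round_key)
{
  /* vbroadcasti128: replicate the 128-bit round key into both lanes. */
  __m256i rk = _mm256_broadcastsi128_si256 (
                 _mm_loadu_si128 ((const __m128i *) round_key));

  /* vaesenc on a ymm register performs one AES round per 128-bit
   * lane, so each instruction advances two blocks. */
  blk[0] = _mm256_aesenc_epi128 (blk[0], rk);
  blk[1] = _mm256_aesenc_epi128 (blk[1], rk);
  blk[2] = _mm256_aesenc_epi128 (blk[2], rk);
  blk[3] = _mm256_aesenc_epi128 (blk[3], rk);
}

Each ymm register holds two AES blocks, so the eight-register variant used in the 16-block main loops is the same idea unrolled twice per round-key load.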
* cipher/Makefile.am: Add 'rijndael-vaes.c' and
'rijndael-vaes-avx2-amd64.S'. * cipher/rijndael-internal.h (USE_VAES): New. * cipher/rijndael-vaes-avx2-amd64.S: New. * cipher/rijndael-vaes.c: New. * cipher/rijndael.c (_gcry_aes_vaes_cfb_dec, _gcry_aes_vaes_cbc_dec) (_gcry_aes_vaes_ctr_enc, _gcry_aes_vaes_ocb_crypt) (_gcry_aes_vaes_xts_crypt): New. (do_setkey) [USE_VAES]: Add detection for VAES. (selftest_ctr_128, selftest_cbc_128, selftest_cfb_128) [USE_VAES]: Increase number of selftest blocks. * configure.ac: Add 'rijndael-vaes.lo' and 'rijndael-vaes-avx2-amd64.lo'. -- Patch adds VAES/AVX2 accelerated implementation for CBC-decryption, CFB-decryption, CTR-encryption, OCB-en/decryption and XTS-en/decryption. Benchmarks on AMD Ryzen 5800X: Before: AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz CBC dec | 0.067 ns/B 14314 MiB/s 0.323 c/B 4850 CFB dec | 0.067 ns/B 14322 MiB/s 0.323 c/B 4850 CTR enc | 0.066 ns/B 14429 MiB/s 0.321 c/B 4850 CTR dec | 0.066 ns/B 14433 MiB/s 0.320 c/B 4850 XTS enc | 0.087 ns/B 10910 MiB/s 0.424 c/B 4850 XTS dec | 0.088 ns/B 10856 MiB/s 0.426 c/B 4850 OCB enc | 0.070 ns/B 13633 MiB/s 0.339 c/B 4850 OCB dec | 0.069 ns/B 13911 MiB/s 0.332 c/B 4850 After (XTS ~1.7x faster, others ~1.9x faster): AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz CBC dec | 0.034 ns/B 28159 MiB/s 0.164 c/B 4850 CFB dec | 0.034 ns/B 27955 MiB/s 0.165 c/B 4850 CTR enc | 0.034 ns/B 28214 MiB/s 0.164 c/B 4850 CTR dec | 0.034 ns/B 28146 MiB/s 0.164 c/B 4850 XTS enc | 0.051 ns/B 18539 MiB/s 0.249 c/B 4850 XTS dec | 0.051 ns/B 18655 MiB/s 0.248 c/B 4850 GCM auth | 0.088 ns/B 10817 MiB/s 0.428 c/B 4850 OCB enc | 0.037 ns/B 25824 MiB/s 0.179 c/B 4850 OCB dec | 0.038 ns/B 25359 MiB/s 0.182 c/B 4850 Signed-off-by: Jussi Kivilinna <[hidden email]> --- cipher/Makefile.am | 1 + cipher/rijndael-internal.h | 10 + cipher/rijndael-vaes-avx2-amd64.S | 2693 +++++++++++++++++++++++++++++ cipher/rijndael-vaes.c | 176 ++ cipher/rijndael.c | 45 + 5 files changed, 2925 insertions(+) create mode 100644 cipher/rijndael-vaes-avx2-amd64.S create mode 100644 cipher/rijndael-vaes.c diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 75680fcd..e2100cf3 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -100,6 +100,7 @@ EXTRA_libcipher_la_SOURCES = \ rijndael-aesni.c rijndael-padlock.c \ rijndael-amd64.S rijndael-arm.S \ rijndael-ssse3-amd64.c rijndael-ssse3-amd64-asm.S \ + rijndael-vaes.c rijndael-vaes-avx2-amd64.S \ rijndael-armv8-ce.c rijndael-armv8-aarch32-ce.S \ rijndael-armv8-aarch64-ce.S rijndael-aarch64.S \ rijndael-ppc.c rijndael-ppc9le.c \ diff --git a/cipher/rijndael-internal.h b/cipher/rijndael-internal.h index 7e01f6b0..30604088 100644 --- a/cipher/rijndael-internal.h +++ b/cipher/rijndael-internal.h @@ -89,6 +89,16 @@ # endif #endif /* ENABLE_AESNI_SUPPORT */ +/* USE_VAES inidicates whether to compile with Intel VAES code. */ +#undef USE_VAES +#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ + defined(__x86_64__) && defined(ENABLE_AVX2_SUPPORT) && \ + defined(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL) && \ + defined(USE_AESNI) +# define USE_VAES 1 +#endif + /* USE_ARM_CE indicates whether to enable ARMv8 Crypto Extension assembly * code. 
*/ #undef USE_ARM_CE diff --git a/cipher/rijndael-vaes-avx2-amd64.S b/cipher/rijndael-vaes-avx2-amd64.S new file mode 100644 index 00000000..c4deea9b --- /dev/null +++ b/cipher/rijndael-vaes-avx2-amd64.S @@ -0,0 +1,2693 @@ +/* VAES/AVX2 AMD64 accelerated AES for Libgcrypt + * Copyright (C) 2021 Jussi Kivilinna <[hidden email]> + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#if defined(__x86_64__) +#include <config.h> +#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ + defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) && \ + defined(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL) + +#include "asm-common-amd64.h" + +.text + +/********************************************************************** + helper macros + **********************************************************************/ +#define no(...) /*_*/ +#define yes(...) __VA_ARGS__ + +#define AES_OP8(op, key, b0, b1, b2, b3, b4, b5, b6, b7) \ + op key, b0, b0; \ + op key, b1, b1; \ + op key, b2, b2; \ + op key, b3, b3; \ + op key, b4, b4; \ + op key, b5, b5; \ + op key, b6, b6; \ + op key, b7, b7; + +#define VAESENC8(key, b0, b1, b2, b3, b4, b5, b6, b7) \ + AES_OP8(vaesenc, key, b0, b1, b2, b3, b4, b5, b6, b7) + +#define VAESDEC8(key, b0, b1, b2, b3, b4, b5, b6, b7) \ + AES_OP8(vaesdec, key, b0, b1, b2, b3, b4, b5, b6, b7) + +#define XOR8(key, b0, b1, b2, b3, b4, b5, b6, b7) \ + AES_OP8(vpxor, key, b0, b1, b2, b3, b4, b5, b6, b7) + +#define AES_OP4(op, key, b0, b1, b2, b3) \ + op key, b0, b0; \ + op key, b1, b1; \ + op key, b2, b2; \ + op key, b3, b3; + +#define VAESENC4(key, b0, b1, b2, b3) \ + AES_OP4(vaesenc, key, b0, b1, b2, b3) + +#define VAESDEC4(key, b0, b1, b2, b3) \ + AES_OP4(vaesdec, key, b0, b1, b2, b3) + +#define XOR4(key, b0, b1, b2, b3) \ + AES_OP4(vpxor, key, b0, b1, b2, b3) + +#define AES_OP2(op, key, b0, b1) \ + op key, b0, b0; \ + op key, b1, b1; + +#define VAESENC2(key, b0, b1) \ + AES_OP2(vaesenc, key, b0, b1) + +#define VAESDEC2(key, b0, b1) \ + AES_OP2(vaesdec, key, b0, b1) + +#define XOR2(key, b0, b1) \ + AES_OP2(vpxor, key, b0, b1) + +/********************************************************************** + CBC-mode decryption + **********************************************************************/ +ELF(.type _gcry_vaes_avx2_cbc_dec_amd64,@function) +.globl _gcry_vaes_avx2_cbc_dec_amd64 +_gcry_vaes_avx2_cbc_dec_amd64: + /* input: + * %rdi: round keys + * %rsi: iv + * %rdx: dst + * %rcx: src + * %r8: nblocks + * %r9: nrounds + */ + CFI_STARTPROC(); + + /* Load IV. */ + vmovdqu (%rsi), %xmm15; + + /* Process 16 blocks per loop. */ +.align 8 +.Lcbc_dec_blk16: + cmpq $16, %r8; + jb .Lcbc_dec_blk8; + + leaq -16(%r8), %r8; + + /* Load input and xor first key. Update IV. 
*/ + vbroadcasti128 (0 * 16)(%rdi), %ymm8; + vmovdqu (0 * 16)(%rcx), %ymm0; + vmovdqu (2 * 16)(%rcx), %ymm1; + vmovdqu (4 * 16)(%rcx), %ymm2; + vmovdqu (6 * 16)(%rcx), %ymm3; + vmovdqu (8 * 16)(%rcx), %ymm4; + vmovdqu (10 * 16)(%rcx), %ymm5; + vmovdqu (12 * 16)(%rcx), %ymm6; + vmovdqu (14 * 16)(%rcx), %ymm7; + vpxor %ymm8, %ymm0, %ymm0; + vpxor %ymm8, %ymm1, %ymm1; + vpxor %ymm8, %ymm2, %ymm2; + vpxor %ymm8, %ymm3, %ymm3; + vpxor %ymm8, %ymm4, %ymm4; + vpxor %ymm8, %ymm5, %ymm5; + vpxor %ymm8, %ymm6, %ymm6; + vpxor %ymm8, %ymm7, %ymm7; + vbroadcasti128 (1 * 16)(%rdi), %ymm8; + vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm9; + vmovdqu (1 * 16)(%rcx), %ymm10; + vmovdqu (3 * 16)(%rcx), %ymm11; + vmovdqu (5 * 16)(%rcx), %ymm12; + vmovdqu (7 * 16)(%rcx), %ymm13; + vmovdqu (9 * 16)(%rcx), %ymm14; + vmovdqu (15 * 16)(%rcx), %xmm15; + leaq (16 * 16)(%rcx), %rcx; + + /* AES rounds */ + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (2 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (3 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (4 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (5 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (6 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (7 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (8 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (9 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (10 * 16)(%rdi), %ymm8; + cmpl $12, %r9d; + jb .Lcbc_dec_blk16_last; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (11 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (12 * 16)(%rdi), %ymm8; + jz .Lcbc_dec_blk16_last; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (13 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (14 * 16)(%rdi), %ymm8; + + /* Last round and output handling. */ + .Lcbc_dec_blk16_last: + vpxor %ymm8, %ymm9, %ymm9; + vpxor %ymm8, %ymm10, %ymm10; + vpxor %ymm8, %ymm11, %ymm11; + vpxor %ymm8, %ymm12, %ymm12; + vpxor %ymm8, %ymm13, %ymm13; + vpxor %ymm8, %ymm14, %ymm14; + vaesdeclast %ymm9, %ymm0, %ymm0; + vaesdeclast %ymm10, %ymm1, %ymm1; + vpxor (-5 * 16)(%rcx), %ymm8, %ymm9; + vpxor (-3 * 16)(%rcx), %ymm8, %ymm10; + vaesdeclast %ymm11, %ymm2, %ymm2; + vaesdeclast %ymm12, %ymm3, %ymm3; + vaesdeclast %ymm13, %ymm4, %ymm4; + vaesdeclast %ymm14, %ymm5, %ymm5; + vaesdeclast %ymm9, %ymm6, %ymm6; + vaesdeclast %ymm10, %ymm7, %ymm7; + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + vmovdqu %ymm2, (4 * 16)(%rdx); + vmovdqu %ymm3, (6 * 16)(%rdx); + vmovdqu %ymm4, (8 * 16)(%rdx); + vmovdqu %ymm5, (10 * 16)(%rdx); + vmovdqu %ymm6, (12 * 16)(%rdx); + vmovdqu %ymm7, (14 * 16)(%rdx); + leaq (16 * 16)(%rdx), %rdx; + + jmp .Lcbc_dec_blk16; + + /* Handle trailing eight blocks. 
*/ +.align 8 +.Lcbc_dec_blk8: + cmpq $8, %r8; + jb .Lcbc_dec_blk4; + + leaq -8(%r8), %r8; + + /* Load input and xor first key. Update IV. */ + vbroadcasti128 (0 * 16)(%rdi), %ymm4; + vmovdqu (0 * 16)(%rcx), %ymm0; + vmovdqu (2 * 16)(%rcx), %ymm1; + vmovdqu (4 * 16)(%rcx), %ymm2; + vmovdqu (6 * 16)(%rcx), %ymm3; + vpxor %ymm4, %ymm0, %ymm0; + vpxor %ymm4, %ymm1, %ymm1; + vpxor %ymm4, %ymm2, %ymm2; + vpxor %ymm4, %ymm3, %ymm3; + vbroadcasti128 (1 * 16)(%rdi), %ymm4; + vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm10; + vmovdqu (1 * 16)(%rcx), %ymm11; + vmovdqu (3 * 16)(%rcx), %ymm12; + vmovdqu (5 * 16)(%rcx), %ymm13; + vmovdqu (7 * 16)(%rcx), %xmm15; + leaq (8 * 16)(%rcx), %rcx; + + /* AES rounds */ + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (2 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (3 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (4 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (5 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (6 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (7 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (8 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (9 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (10 * 16)(%rdi), %ymm4; + cmpl $12, %r9d; + jb .Lcbc_dec_blk8_last; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (11 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (12 * 16)(%rdi), %ymm4; + jz .Lcbc_dec_blk8_last; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (13 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (14 * 16)(%rdi), %ymm4; + + /* Last round and output handling. */ + .Lcbc_dec_blk8_last: + vpxor %ymm4, %ymm10, %ymm10; + vpxor %ymm4, %ymm11, %ymm11; + vpxor %ymm4, %ymm12, %ymm12; + vpxor %ymm4, %ymm13, %ymm13; + vaesdeclast %ymm10, %ymm0, %ymm0; + vaesdeclast %ymm11, %ymm1, %ymm1; + vaesdeclast %ymm12, %ymm2, %ymm2; + vaesdeclast %ymm13, %ymm3, %ymm3; + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + vmovdqu %ymm2, (4 * 16)(%rdx); + vmovdqu %ymm3, (6 * 16)(%rdx); + leaq (8 * 16)(%rdx), %rdx; + + /* Handle trailing four blocks. */ +.align 8 +.Lcbc_dec_blk4: + cmpq $4, %r8; + jb .Lcbc_dec_blk1; + + leaq -4(%r8), %r8; + + /* Load input and xor first key. Update IV. 
*/ + vbroadcasti128 (0 * 16)(%rdi), %ymm4; + vmovdqu (0 * 16)(%rcx), %ymm0; + vmovdqu (2 * 16)(%rcx), %ymm1; + vpxor %ymm4, %ymm0, %ymm0; + vpxor %ymm4, %ymm1, %ymm1; + vbroadcasti128 (1 * 16)(%rdi), %ymm4; + vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm10; + vmovdqu (1 * 16)(%rcx), %ymm11; + vmovdqu (3 * 16)(%rcx), %xmm15; + leaq (4 * 16)(%rcx), %rcx; + + /* AES rounds */ + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (2 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (3 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (4 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (5 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (6 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (7 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (8 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (9 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (10 * 16)(%rdi), %ymm4; + cmpl $12, %r9d; + jb .Lcbc_dec_blk4_last; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (11 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (12 * 16)(%rdi), %ymm4; + jz .Lcbc_dec_blk4_last; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (13 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (14 * 16)(%rdi), %ymm4; + + /* Last round and output handling. */ + .Lcbc_dec_blk4_last: + vpxor %ymm4, %ymm10, %ymm10; + vpxor %ymm4, %ymm11, %ymm11; + vaesdeclast %ymm10, %ymm0, %ymm0; + vaesdeclast %ymm11, %ymm1, %ymm1; + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + leaq (4 * 16)(%rdx), %rdx; + + /* Process trailing one to three blocks, one per loop. */ +.align 8 +.Lcbc_dec_blk1: + cmpq $1, %r8; + jb .Ldone_cbc_dec; + + leaq -1(%r8), %r8; + + /* Load input. */ + vmovdqu (%rcx), %xmm2; + leaq 16(%rcx), %rcx; + + /* Xor first key. */ + vpxor (0 * 16)(%rdi), %xmm2, %xmm0; + + /* AES rounds. */ + vaesdec (1 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (2 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (3 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (4 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (5 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (6 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (7 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (8 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (9 * 16)(%rdi), %xmm0, %xmm0; + vmovdqa (10 * 16)(%rdi), %xmm1; + cmpl $12, %r9d; + jb .Lcbc_dec_blk1_last; + vaesdec %xmm1, %xmm0, %xmm0; + vaesdec (11 * 16)(%rdi), %xmm0, %xmm0; + vmovdqa (12 * 16)(%rdi), %xmm1; + jz .Lcbc_dec_blk1_last; + vaesdec %xmm1, %xmm0, %xmm0; + vaesdec (13 * 16)(%rdi), %xmm0, %xmm0; + vmovdqa (14 * 16)(%rdi), %xmm1; + + /* Last round and output handling. */ + .Lcbc_dec_blk1_last: + vpxor %xmm1, %xmm15, %xmm15; + vaesdeclast %xmm15, %xmm0, %xmm0; + vmovdqa %xmm2, %xmm15; + vmovdqu %xmm0, (%rdx); + leaq 16(%rdx), %rdx; + + jmp .Lcbc_dec_blk1; + +.align 8 +.Ldone_cbc_dec: + /* Store IV. */ + vmovdqu %xmm15, (%rsi); + + vzeroall; + ret + CFI_ENDPROC(); +ELF(.size _gcry_vaes_avx2_cbc_dec_amd64,.-_gcry_vaes_avx2_cbc_dec_amd64) + +/********************************************************************** + CFB-mode decryption + **********************************************************************/ +ELF(.type _gcry_vaes_avx2_cfb_dec_amd64,@function) +.globl _gcry_vaes_avx2_cfb_dec_amd64 +_gcry_vaes_avx2_cfb_dec_amd64: + /* input: + * %rdi: round keys + * %rsi: iv + * %rdx: dst + * %rcx: src + * %r8: nblocks + * %r9: nrounds + */ + CFI_STARTPROC(); + + /* Load IV. 
*/ + vmovdqu (%rsi), %xmm15; + + /* Process 16 blocks per loop. */ +.align 8 +.Lcfb_dec_blk16: + cmpq $16, %r8; + jb .Lcfb_dec_blk8; + + leaq -16(%r8), %r8; + + /* Load input and xor first key. Update IV. */ + vbroadcasti128 (0 * 16)(%rdi), %ymm8; + vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm0; + vmovdqu (1 * 16)(%rcx), %ymm1; + vmovdqu (3 * 16)(%rcx), %ymm2; + vmovdqu (5 * 16)(%rcx), %ymm3; + vmovdqu (7 * 16)(%rcx), %ymm4; + vmovdqu (9 * 16)(%rcx), %ymm5; + vmovdqu (11 * 16)(%rcx), %ymm6; + vmovdqu (13 * 16)(%rcx), %ymm7; + vmovdqu (15 * 16)(%rcx), %xmm15; + vpxor %ymm8, %ymm0, %ymm0; + vpxor %ymm8, %ymm1, %ymm1; + vpxor %ymm8, %ymm2, %ymm2; + vpxor %ymm8, %ymm3, %ymm3; + vpxor %ymm8, %ymm4, %ymm4; + vpxor %ymm8, %ymm5, %ymm5; + vpxor %ymm8, %ymm6, %ymm6; + vpxor %ymm8, %ymm7, %ymm7; + vbroadcasti128 (1 * 16)(%rdi), %ymm8; + vmovdqu (0 * 16)(%rcx), %ymm9; + vmovdqu (2 * 16)(%rcx), %ymm10; + vmovdqu (4 * 16)(%rcx), %ymm11; + vmovdqu (6 * 16)(%rcx), %ymm12; + vmovdqu (8 * 16)(%rcx), %ymm13; + vmovdqu (10 * 16)(%rcx), %ymm14; + + leaq (16 * 16)(%rcx), %rcx; + + /* AES rounds */ + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (2 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (3 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (4 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (5 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (6 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (7 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (8 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (9 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (10 * 16)(%rdi), %ymm8; + cmpl $12, %r9d; + jb .Lcfb_dec_blk16_last; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (11 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (12 * 16)(%rdi), %ymm8; + jz .Lcfb_dec_blk16_last; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (13 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (14 * 16)(%rdi), %ymm8; + + /* Last round and output handling. 
*/ + .Lcfb_dec_blk16_last: + vpxor %ymm8, %ymm9, %ymm9; + vpxor %ymm8, %ymm10, %ymm10; + vpxor %ymm8, %ymm11, %ymm11; + vpxor %ymm8, %ymm12, %ymm12; + vpxor %ymm8, %ymm13, %ymm13; + vpxor %ymm8, %ymm14, %ymm14; + vaesenclast %ymm9, %ymm0, %ymm0; + vaesenclast %ymm10, %ymm1, %ymm1; + vpxor (-4 * 16)(%rcx), %ymm8, %ymm9; + vpxor (-2 * 16)(%rcx), %ymm8, %ymm10; + vaesenclast %ymm11, %ymm2, %ymm2; + vaesenclast %ymm12, %ymm3, %ymm3; + vaesenclast %ymm13, %ymm4, %ymm4; + vaesenclast %ymm14, %ymm5, %ymm5; + vaesenclast %ymm9, %ymm6, %ymm6; + vaesenclast %ymm10, %ymm7, %ymm7; + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + vmovdqu %ymm2, (4 * 16)(%rdx); + vmovdqu %ymm3, (6 * 16)(%rdx); + vmovdqu %ymm4, (8 * 16)(%rdx); + vmovdqu %ymm5, (10 * 16)(%rdx); + vmovdqu %ymm6, (12 * 16)(%rdx); + vmovdqu %ymm7, (14 * 16)(%rdx); + leaq (16 * 16)(%rdx), %rdx; + + jmp .Lcfb_dec_blk16; + + /* Handle trailing eight blocks. */ +.align 8 +.Lcfb_dec_blk8: + cmpq $8, %r8; + jb .Lcfb_dec_blk4; + + leaq -8(%r8), %r8; + + /* Load input and xor first key. Update IV. */ + vbroadcasti128 (0 * 16)(%rdi), %ymm4; + vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm0; + vmovdqu (1 * 16)(%rcx), %ymm1; + vmovdqu (3 * 16)(%rcx), %ymm2; + vmovdqu (5 * 16)(%rcx), %ymm3; + vmovdqu (7 * 16)(%rcx), %xmm15; + vpxor %ymm4, %ymm0, %ymm0; + vpxor %ymm4, %ymm1, %ymm1; + vpxor %ymm4, %ymm2, %ymm2; + vpxor %ymm4, %ymm3, %ymm3; + vbroadcasti128 (1 * 16)(%rdi), %ymm4; + vmovdqu (0 * 16)(%rcx), %ymm10; + vmovdqu (2 * 16)(%rcx), %ymm11; + vmovdqu (4 * 16)(%rcx), %ymm12; + vmovdqu (6 * 16)(%rcx), %ymm13; + + leaq (8 * 16)(%rcx), %rcx; + + /* AES rounds */ + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (2 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (3 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (4 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (5 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (6 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (7 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (8 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (9 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (10 * 16)(%rdi), %ymm4; + cmpl $12, %r9d; + jb .Lcfb_dec_blk8_last; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (11 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (12 * 16)(%rdi), %ymm4; + jz .Lcfb_dec_blk8_last; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (13 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (14 * 16)(%rdi), %ymm4; + + /* Last round and output handling. */ + .Lcfb_dec_blk8_last: + vpxor %ymm4, %ymm10, %ymm10; + vpxor %ymm4, %ymm11, %ymm11; + vpxor %ymm4, %ymm12, %ymm12; + vpxor %ymm4, %ymm13, %ymm13; + vaesenclast %ymm10, %ymm0, %ymm0; + vaesenclast %ymm11, %ymm1, %ymm1; + vaesenclast %ymm12, %ymm2, %ymm2; + vaesenclast %ymm13, %ymm3, %ymm3; + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + vmovdqu %ymm2, (4 * 16)(%rdx); + vmovdqu %ymm3, (6 * 16)(%rdx); + leaq (8 * 16)(%rdx), %rdx; + + /* Handle trailing four blocks. */ +.align 8 +.Lcfb_dec_blk4: + cmpq $4, %r8; + jb .Lcfb_dec_blk1; + + leaq -4(%r8), %r8; + + /* Load input and xor first key. Update IV. 
*/ + vbroadcasti128 (0 * 16)(%rdi), %ymm4; + vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm0; + vmovdqu (1 * 16)(%rcx), %ymm1; + vmovdqu (3 * 16)(%rcx), %xmm15; + vpxor %ymm4, %ymm0, %ymm0; + vpxor %ymm4, %ymm1, %ymm1; + vbroadcasti128 (1 * 16)(%rdi), %ymm4; + vmovdqu (0 * 16)(%rcx), %ymm10; + vmovdqu (2 * 16)(%rcx), %ymm11; + + leaq (4 * 16)(%rcx), %rcx; + + /* AES rounds */ + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (2 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (3 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (4 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (5 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (6 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (7 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (8 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (9 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (10 * 16)(%rdi), %ymm4; + cmpl $12, %r9d; + jb .Lcfb_dec_blk4_last; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (11 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (12 * 16)(%rdi), %ymm4; + jz .Lcfb_dec_blk4_last; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (13 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (14 * 16)(%rdi), %ymm4; + + /* Last round and output handling. */ + .Lcfb_dec_blk4_last: + vpxor %ymm4, %ymm10, %ymm10; + vpxor %ymm4, %ymm11, %ymm11; + vaesenclast %ymm10, %ymm0, %ymm0; + vaesenclast %ymm11, %ymm1, %ymm1; + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + leaq (4 * 16)(%rdx), %rdx; + + /* Process trailing one to three blocks, one per loop. */ +.align 8 +.Lcfb_dec_blk1: + cmpq $1, %r8; + jb .Ldone_cfb_dec; + + leaq -1(%r8), %r8; + + /* Xor first key. */ + vpxor (0 * 16)(%rdi), %xmm15, %xmm0; + + /* Load input as next IV. */ + vmovdqu (%rcx), %xmm15; + leaq 16(%rcx), %rcx; + + /* AES rounds. */ + vaesenc (1 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (2 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (3 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (4 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (5 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (6 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (7 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (8 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (9 * 16)(%rdi), %xmm0, %xmm0; + vmovdqa (10 * 16)(%rdi), %xmm1; + cmpl $12, %r9d; + jb .Lcfb_dec_blk1_last; + vaesenc %xmm1, %xmm0, %xmm0; + vaesenc (11 * 16)(%rdi), %xmm0, %xmm0; + vmovdqa (12 * 16)(%rdi), %xmm1; + jz .Lcfb_dec_blk1_last; + vaesenc %xmm1, %xmm0, %xmm0; + vaesenc (13 * 16)(%rdi), %xmm0, %xmm0; + vmovdqa (14 * 16)(%rdi), %xmm1; + + /* Last round and output handling. */ + .Lcfb_dec_blk1_last: + vpxor %xmm15, %xmm1, %xmm1; + vaesenclast %xmm1, %xmm0, %xmm0; + vmovdqu %xmm0, (%rdx); + leaq 16(%rdx), %rdx; + + jmp .Lcfb_dec_blk1; + +.align 8 +.Ldone_cfb_dec: + /* Store IV. 
*/ + vmovdqu %xmm15, (%rsi); + + vzeroall; + ret + CFI_ENDPROC(); +ELF(.size _gcry_vaes_avx2_cfb_dec_amd64,.-_gcry_vaes_avx2_cfb_dec_amd64) + +/********************************************************************** + CTR-mode encryption + **********************************************************************/ +ELF(.type _gcry_vaes_avx2_ctr_enc_amd64,@function) +.globl _gcry_vaes_avx2_ctr_enc_amd64 +_gcry_vaes_avx2_ctr_enc_amd64: + /* input: + * %rdi: round keys + * %rsi: counter + * %rdx: dst + * %rcx: src + * %r8: nblocks + * %r9: nrounds + */ + CFI_STARTPROC(); + + movq 8(%rsi), %r10; + movq 0(%rsi), %r11; + bswapq %r10; + bswapq %r11; + + vpcmpeqd %ymm15, %ymm15, %ymm15; + vpsrldq $8, %ymm15, %ymm15; // 0:-1 + vpaddq %ymm15, %ymm15, %ymm14; // 0:-2 + vbroadcasti128 .Lbswap128_mask rRIP, %ymm13; + +#define inc_le128(x, minus_one, tmp) \ + vpcmpeqq minus_one, x, tmp; \ + vpsubq minus_one, x, x; \ + vpslldq $8, tmp, tmp; \ + vpsubq tmp, x, x; + +#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \ + vpcmpeqq minus_one, x, tmp1; \ + vpcmpeqq minus_two, x, tmp2; \ + vpor tmp1, tmp2, tmp2; \ + vpsubq minus_two, x, x; \ + vpslldq $8, tmp2, tmp2; \ + vpsubq tmp2, x, x; + + /* Process 16 blocks per loop. */ +.align 8 +.Lctr_enc_blk16: + cmpq $16, %r8; + jb .Lctr_enc_blk8; + + leaq -16(%r8), %r8; + + vbroadcasti128 (%rsi), %ymm7; + vbroadcasti128 (0 * 16)(%rdi), %ymm8; + + /* detect if carry handling is needed */ + addb $16, 15(%rsi); + jc .Lctr_enc_blk16_handle_carry; + + /* Increment counters. */ + vpaddb .Lbige_addb_0 rRIP, %ymm7, %ymm0; + vpaddb .Lbige_addb_2 rRIP, %ymm7, %ymm1; + vpaddb .Lbige_addb_4 rRIP, %ymm7, %ymm2; + vpaddb .Lbige_addb_6 rRIP, %ymm7, %ymm3; + vpaddb .Lbige_addb_8 rRIP, %ymm7, %ymm4; + vpaddb .Lbige_addb_10 rRIP, %ymm7, %ymm5; + vpaddb .Lbige_addb_12 rRIP, %ymm7, %ymm6; + vpaddb .Lbige_addb_14 rRIP, %ymm7, %ymm7; + leaq 16(%r10), %r10; + + .Lctr_enc_blk16_rounds: + /* AES rounds */ + XOR8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (1 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (2 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (3 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (4 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (5 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (6 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (7 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (8 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (9 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (10 * 16)(%rdi), %ymm8; + cmpl $12, %r9d; + jb .Lctr_enc_blk16_last; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (11 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (12 * 16)(%rdi), %ymm8; + jz .Lctr_enc_blk16_last; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (13 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, 
%ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (14 * 16)(%rdi), %ymm8; + + /* Last round and output handling. */ + .Lctr_enc_blk16_last: + vpxor (0 * 16)(%rcx), %ymm8, %ymm9; /* Xor src to last round key. */ + vpxor (2 * 16)(%rcx), %ymm8, %ymm10; + vpxor (4 * 16)(%rcx), %ymm8, %ymm11; + vpxor (6 * 16)(%rcx), %ymm8, %ymm12; + vaesenclast %ymm9, %ymm0, %ymm0; + vaesenclast %ymm10, %ymm1, %ymm1; + vaesenclast %ymm11, %ymm2, %ymm2; + vaesenclast %ymm12, %ymm3, %ymm3; + vpxor (8 * 16)(%rcx), %ymm8, %ymm9; + vpxor (10 * 16)(%rcx), %ymm8, %ymm10; + vpxor (12 * 16)(%rcx), %ymm8, %ymm11; + vpxor (14 * 16)(%rcx), %ymm8, %ymm8; + leaq (16 * 16)(%rcx), %rcx; + vaesenclast %ymm9, %ymm4, %ymm4; + vaesenclast %ymm10, %ymm5, %ymm5; + vaesenclast %ymm11, %ymm6, %ymm6; + vaesenclast %ymm8, %ymm7, %ymm7; + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + vmovdqu %ymm2, (4 * 16)(%rdx); + vmovdqu %ymm3, (6 * 16)(%rdx); + vmovdqu %ymm4, (8 * 16)(%rdx); + vmovdqu %ymm5, (10 * 16)(%rdx); + vmovdqu %ymm6, (12 * 16)(%rdx); + vmovdqu %ymm7, (14 * 16)(%rdx); + leaq (16 * 16)(%rdx), %rdx; + + jmp .Lctr_enc_blk16; + + .align 8 + .Lctr_enc_blk16_handle_carry: + /* Increment counters (handle carry). */ + vpshufb %xmm13, %xmm7, %xmm1; /* be => le */ + vmovdqa %xmm1, %xmm0; + inc_le128(%xmm1, %xmm15, %xmm5); + vinserti128 $1, %xmm1, %ymm0, %ymm7; /* ctr: +1:+0 */ + vpshufb %ymm13, %ymm7, %ymm0; + addq $16, %r10; + adcq $0, %r11; + bswapq %r10; + bswapq %r11; + movq %r10, 8(%rsi); + movq %r11, 0(%rsi); + bswapq %r10; + bswapq %r11; + add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +3:+2 */ + vpshufb %ymm13, %ymm7, %ymm1; + add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +5:+4 */ + vpshufb %ymm13, %ymm7, %ymm2; + add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +7:+6 */ + vpshufb %ymm13, %ymm7, %ymm3; + add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +9:+8 */ + vpshufb %ymm13, %ymm7, %ymm4; + add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +11:+10 */ + vpshufb %ymm13, %ymm7, %ymm5; + add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +13:+12 */ + vpshufb %ymm13, %ymm7, %ymm6; + add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +15:+14 */ + vpshufb %ymm13, %ymm7, %ymm7; + + jmp .Lctr_enc_blk16_rounds; + + /* Handle trailing eight blocks. */ +.align 8 +.Lctr_enc_blk8: + cmpq $8, %r8; + jb .Lctr_enc_blk4; + + leaq -8(%r8), %r8; + + vbroadcasti128 (%rsi), %ymm3; + vbroadcasti128 (0 * 16)(%rdi), %ymm4; + + /* detect if carry handling is needed */ + addb $8, 15(%rsi); + jc .Lctr_enc_blk8_handle_carry; + + /* Increment counters. 
*/ + vpaddb .Lbige_addb_0 rRIP, %ymm3, %ymm0; + vpaddb .Lbige_addb_2 rRIP, %ymm3, %ymm1; + vpaddb .Lbige_addb_4 rRIP, %ymm3, %ymm2; + vpaddb .Lbige_addb_6 rRIP, %ymm3, %ymm3; + leaq 8(%r10), %r10; + + .Lctr_enc_blk8_rounds: + /* AES rounds */ + XOR4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (1 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (2 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (3 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (4 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (5 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (6 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (7 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (8 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (9 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (10 * 16)(%rdi), %ymm4; + cmpl $12, %r9d; + jb .Lctr_enc_blk8_last; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (11 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (12 * 16)(%rdi), %ymm4; + jz .Lctr_enc_blk8_last; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (13 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (14 * 16)(%rdi), %ymm4; + + /* Last round and output handling. */ + .Lctr_enc_blk8_last: + vpxor (0 * 16)(%rcx), %ymm4, %ymm5; /* Xor src to last round key. */ + vpxor (2 * 16)(%rcx), %ymm4, %ymm6; + vpxor (4 * 16)(%rcx), %ymm4, %ymm7; + vpxor (6 * 16)(%rcx), %ymm4, %ymm4; + leaq (8 * 16)(%rcx), %rcx; + vaesenclast %ymm5, %ymm0, %ymm0; + vaesenclast %ymm6, %ymm1, %ymm1; + vaesenclast %ymm7, %ymm2, %ymm2; + vaesenclast %ymm4, %ymm3, %ymm3; + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + vmovdqu %ymm2, (4 * 16)(%rdx); + vmovdqu %ymm3, (6 * 16)(%rdx); + leaq (8 * 16)(%rdx), %rdx; + + jmp .Lctr_enc_blk4; + + .align 8 + .Lctr_enc_blk8_handle_carry: + /* Increment counters (handle carry). */ + vpshufb %xmm13, %xmm3, %xmm1; /* be => le */ + vmovdqa %xmm1, %xmm0; + inc_le128(%xmm1, %xmm15, %xmm5); + vinserti128 $1, %xmm1, %ymm0, %ymm3; /* ctr: +1:+0 */ + vpshufb %ymm13, %ymm3, %ymm0; + addq $8, %r10; + adcq $0, %r11; + bswapq %r10; + bswapq %r11; + movq %r10, 8(%rsi); + movq %r11, 0(%rsi); + bswapq %r10; + bswapq %r11; + add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +3:+2 */ + vpshufb %ymm13, %ymm3, %ymm1; + add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +5:+4 */ + vpshufb %ymm13, %ymm3, %ymm2; + add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +7:+6 */ + vpshufb %ymm13, %ymm3, %ymm3; + + jmp .Lctr_enc_blk8_rounds; + + /* Handle trailing four blocks. */ +.align 8 +.Lctr_enc_blk4: + cmpq $4, %r8; + jb .Lctr_enc_blk1; + + leaq -4(%r8), %r8; + + vbroadcasti128 (%rsi), %ymm3; + vbroadcasti128 (0 * 16)(%rdi), %ymm4; + + /* detect if carry handling is needed */ + addb $4, 15(%rsi); + jc .Lctr_enc_blk4_handle_carry; + + /* Increment counters. 
*/ + vpaddb .Lbige_addb_0 rRIP, %ymm3, %ymm0; + vpaddb .Lbige_addb_2 rRIP, %ymm3, %ymm1; + leaq 4(%r10), %r10; + + .Lctr_enc_blk4_rounds: + /* AES rounds */ + XOR2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (1 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (2 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (3 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (4 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (5 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (6 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (7 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (8 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (9 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (10 * 16)(%rdi), %ymm4; + cmpl $12, %r9d; + jb .Lctr_enc_blk4_last; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (11 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (12 * 16)(%rdi), %ymm4; + jz .Lctr_enc_blk4_last; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (13 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (14 * 16)(%rdi), %ymm4; + + /* Last round and output handling. */ + .Lctr_enc_blk4_last: + vpxor (0 * 16)(%rcx), %ymm4, %ymm5; /* Xor src to last round key. */ + vpxor (2 * 16)(%rcx), %ymm4, %ymm6; + leaq (4 * 16)(%rcx), %rcx; + vaesenclast %ymm5, %ymm0, %ymm0; + vaesenclast %ymm6, %ymm1, %ymm1; + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + leaq (4 * 16)(%rdx), %rdx; + + jmp .Lctr_enc_blk1; + + .align 8 + .Lctr_enc_blk4_handle_carry: + /* Increment counters (handle carry). */ + vpshufb %xmm13, %xmm3, %xmm1; /* be => le */ + vmovdqa %xmm1, %xmm0; + inc_le128(%xmm1, %xmm15, %xmm5); + vinserti128 $1, %xmm1, %ymm0, %ymm3; /* ctr: +1:+0 */ + vpshufb %ymm13, %ymm3, %ymm0; + addq $4, %r10; + adcq $0, %r11; + bswapq %r10; + bswapq %r11; + movq %r10, 8(%rsi); + movq %r11, 0(%rsi); + bswapq %r10; + bswapq %r11; + add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +3:+2 */ + vpshufb %ymm13, %ymm3, %ymm1; + + jmp .Lctr_enc_blk4_rounds; + + /* Process trailing one to three blocks, one per loop. */ +.align 8 +.Lctr_enc_blk1: + cmpq $1, %r8; + jb .Ldone_ctr_enc; + + leaq -1(%r8), %r8; + + /* Load and increament counter. */ + vmovdqu (%rsi), %xmm0; + addq $1, %r10; + adcq $0, %r11; + bswapq %r10; + bswapq %r11; + movq %r10, 8(%rsi); + movq %r11, 0(%rsi); + bswapq %r10; + bswapq %r11; + + /* AES rounds. */ + vpxor (0 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (1 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (2 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (3 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (4 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (5 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (6 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (7 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (8 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (9 * 16)(%rdi), %xmm0, %xmm0; + vmovdqa (10 * 16)(%rdi), %xmm1; + cmpl $12, %r9d; + jb .Lctr_enc_blk1_last; + vaesenc %xmm1, %xmm0, %xmm0; + vaesenc (11 * 16)(%rdi), %xmm0, %xmm0; + vmovdqa (12 * 16)(%rdi), %xmm1; + jz .Lctr_enc_blk1_last; + vaesenc %xmm1, %xmm0, %xmm0; + vaesenc (13 * 16)(%rdi), %xmm0, %xmm0; + vmovdqa (14 * 16)(%rdi), %xmm1; + + /* Last round and output handling. */ + .Lctr_enc_blk1_last: + vpxor (%rcx), %xmm1, %xmm1; /* Xor src to last round key. */ + leaq 16(%rcx), %rcx; + vaesenclast %xmm1, %xmm0, %xmm0; /* Last round and xor with xmm1. 
*/ + vmovdqu %xmm0, (%rdx); + leaq 16(%rdx), %rdx; + + jmp .Lctr_enc_blk1; + +.align 8 +.Ldone_ctr_enc: + vzeroall; + xorl %r10d, %r10d; + xorl %r11d, %r11d; + ret + CFI_ENDPROC(); +ELF(.size _gcry_vaes_avx2_ctr_enc_amd64,.-_gcry_vaes_avx2_ctr_enc_amd64) + +/********************************************************************** + OCB-mode encryption/decryption + **********************************************************************/ +ELF(.type _gcry_vaes_avx2_ocb_checksum,@function) +_gcry_vaes_avx2_ocb_checksum: + /* input: + * %rax: offset pointer + * %r10: plaintext pointer + * %r11: nblocks + */ + CFI_STARTPROC(); + + vpxor %xmm0, %xmm0, %xmm0; + cmpq $4, %r11; + jb .Locb_checksum_blk1; + vpxor %xmm1, %xmm1, %xmm1; + vpxor %xmm2, %xmm2, %xmm2; + vpxor %xmm3, %xmm3, %xmm3; + cmpq $16, %r11; + jb .Locb_checksum_blk4; + vpxor %xmm4, %xmm4, %xmm4; + vpxor %xmm5, %xmm5, %xmm5; + vpxor %xmm6, %xmm6, %xmm6; + vpxor %xmm7, %xmm7, %xmm7; + cmpq $32, %r11; + jb .Locb_checksum_blk16; + vpxor %xmm8, %xmm8, %xmm8; + vpxor %xmm9, %xmm9, %xmm9; + vpxor %xmm10, %xmm10, %xmm10; + vpxor %xmm11, %xmm11, %xmm11; + vpxor %xmm12, %xmm12, %xmm12; + vpxor %xmm13, %xmm13, %xmm13; + vpxor %xmm14, %xmm14, %xmm14; + vpxor %xmm15, %xmm15, %xmm15; + +.align 8 +.Locb_checksum_blk32: + cmpq $32, %r11; + jb .Locb_checksum_blk32_done; + + leaq -32(%r11), %r11; + + vpxor (0 * 16)(%r10), %ymm0, %ymm0; + vpxor (2 * 16)(%r10), %ymm1, %ymm1; + vpxor (4 * 16)(%r10), %ymm2, %ymm2; + vpxor (6 * 16)(%r10), %ymm3, %ymm3; + vpxor (8 * 16)(%r10), %ymm4, %ymm4; + vpxor (10 * 16)(%r10), %ymm5, %ymm5; + vpxor (12 * 16)(%r10), %ymm6, %ymm6; + vpxor (14 * 16)(%r10), %ymm7, %ymm7; + vpxor (16 * 16)(%r10), %ymm8, %ymm8; + vpxor (18 * 16)(%r10), %ymm9, %ymm9; + vpxor (20 * 16)(%r10), %ymm10, %ymm10; + vpxor (22 * 16)(%r10), %ymm11, %ymm11; + vpxor (24 * 16)(%r10), %ymm12, %ymm12; + vpxor (26 * 16)(%r10), %ymm13, %ymm13; + vpxor (28 * 16)(%r10), %ymm14, %ymm14; + vpxor (30 * 16)(%r10), %ymm15, %ymm15; + leaq (32 * 16)(%r10), %r10; + + jmp .Locb_checksum_blk32; + +.align 8 +.Locb_checksum_blk32_done: + vpxor %ymm8, %ymm0, %ymm0; + vpxor %ymm9, %ymm1, %ymm1; + vpxor %ymm10, %ymm2, %ymm2; + vpxor %ymm11, %ymm3, %ymm3; + vpxor %ymm12, %ymm4, %ymm4; + vpxor %ymm13, %ymm5, %ymm5; + vpxor %ymm14, %ymm6, %ymm6; + vpxor %ymm15, %ymm7, %ymm7; + +.align 8 +.Locb_checksum_blk16: + cmpq $16, %r11; + jb .Locb_checksum_blk16_done; + + leaq -16(%r11), %r11; + + vpxor (0 * 16)(%r10), %ymm0, %ymm0; + vpxor (2 * 16)(%r10), %ymm1, %ymm1; + vpxor (4 * 16)(%r10), %ymm2, %ymm2; + vpxor (6 * 16)(%r10), %ymm3, %ymm3; + vpxor (8 * 16)(%r10), %ymm4, %ymm4; + vpxor (10 * 16)(%r10), %ymm5, %ymm5; + vpxor (12 * 16)(%r10), %ymm6, %ymm6; + vpxor (14 * 16)(%r10), %ymm7, %ymm7; + leaq (16 * 16)(%r10), %r10; + + jmp .Locb_checksum_blk16; + +.align 8 +.Locb_checksum_blk16_done: + vpxor %ymm4, %ymm0, %ymm0; + vpxor %ymm5, %ymm1, %ymm1; + vpxor %ymm6, %ymm2, %ymm2; + vpxor %ymm7, %ymm3, %ymm3; + vextracti128 $1, %ymm0, %xmm4; + vextracti128 $1, %ymm1, %xmm5; + vextracti128 $1, %ymm2, %xmm6; + vextracti128 $1, %ymm3, %xmm7; + vpxor %xmm4, %xmm0, %xmm0; + vpxor %xmm5, %xmm1, %xmm1; + vpxor %xmm6, %xmm2, %xmm2; + vpxor %xmm7, %xmm3, %xmm3; + +.align 8 +.Locb_checksum_blk4: + cmpq $4, %r11; + jb .Locb_checksum_blk4_done; + + leaq -4(%r11), %r11; + + vpxor (0 * 16)(%r10), %xmm0, %xmm0; + vpxor (1 * 16)(%r10), %xmm1, %xmm1; + vpxor (2 * 16)(%r10), %xmm2, %xmm2; + vpxor (3 * 16)(%r10), %xmm3, %xmm3; + leaq (4 * 16)(%r10), %r10; + + jmp .Locb_checksum_blk4; + +.align 8 
+.Locb_checksum_blk4_done: + vpxor %xmm1, %xmm0, %xmm0; + vpxor %xmm3, %xmm2, %xmm2; + vpxor %xmm2, %xmm0, %xmm0; + +.align 8 +.Locb_checksum_blk1: + cmpq $1, %r11; + jb .Locb_checksum_done; + + leaq -1(%r11), %r11; + + vpxor (%r10), %xmm0, %xmm0; + leaq 16(%r10), %r10; + + jmp .Locb_checksum_blk1; + +.align 8 +.Locb_checksum_done: + vpxor (%rax), %xmm0, %xmm0; + vmovdqu %xmm0, (%rax); + ret; + CFI_ENDPROC(); +ELF(.size _gcry_vaes_avx2_ocb_checksum,.-_gcry_vaes_avx2_ocb_checksum) + +ELF(.type _gcry_vaes_avx2_ocb_crypt_amd64,@function) +.globl _gcry_vaes_avx2_ocb_crypt_amd64 +_gcry_vaes_avx2_ocb_crypt_amd64: + /* input: + * %rdi: round keys + * %esi: nblk + * %rdx: dst + * %rcx: src + * %r8: nblocks + * %r9: nrounds + * 16(%rbp): offset + * 24(%rbp): checksum + * 32(%rbp): L-array + * 40(%rbp): encrypt (%r15d) + */ + CFI_STARTPROC(); + +#define STACK_REGS_POS (16 * 16 + 4 * 16) +#define STACK_ALLOC (STACK_REGS_POS + 6 * 8) + + pushq %rbp; + CFI_PUSH(%rbp); + movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); + + subq $STACK_ALLOC, %rsp; + andq $~63, %rsp; + + movq %r12, (STACK_REGS_POS + 0 * 8)(%rsp); + CFI_REG_ON_STACK(r12, STACK_REGS_POS + 0 * 8); + movq %r13, (STACK_REGS_POS + 1 * 8)(%rsp); + CFI_REG_ON_STACK(r13, STACK_REGS_POS + 1 * 8); + movq %r14, (STACK_REGS_POS + 2 * 8)(%rsp); + CFI_REG_ON_STACK(r14, STACK_REGS_POS + 2 * 8); + movq %r15, (STACK_REGS_POS + 3 * 8)(%rsp); + CFI_REG_ON_STACK(r15, STACK_REGS_POS + 3 * 8); + + movl 40(%rbp), %r15d; /* encrypt-flag. */ + movq 16(%rbp), %r14; /* offset ptr. */ + + /* Handle encryption checksumming. */ + testl %r15d, %r15d; + jz .Locb_dec_checksum_prepare; + movq 24(%rbp), %rax; /* checksum ptr. */ + movq %rcx, %r10; + movq %r8, %r11; + call _gcry_vaes_avx2_ocb_checksum; + jmp .Locb_enc_checksum_done; +.Locb_dec_checksum_prepare: + /* Store plaintext address and number of blocks for decryption + * checksumming. */ + movq %rdx, (STACK_REGS_POS + 4 * 8)(%rsp); + movq %r8, (STACK_REGS_POS + 5 * 8)(%rsp); +.Locb_enc_checksum_done: + + vmovdqu (%r14), %xmm15; /* Load offset. */ + movq 32(%rbp), %r14; /* L-array ptr. */ + vmovdqa (0 * 16)(%rdi), %xmm0; /* first key */ + movl $(10 * 16), %eax; + cmpl $12, %r9d; + jb .Llast_key_ptr; + movl $(12 * 16), %eax; + je .Llast_key_ptr; + movl $(14 * 16), %eax; + .align 8 + .Llast_key_ptr: + vpxor (%rdi, %rax), %xmm0, %xmm0; /* first key ^ last key */ + vpxor (0 * 16)(%rdi), %xmm15, %xmm15; /* offset ^ first key */ + vmovdqa %xmm0, (14 * 16)(%rsp); + vmovdqa %xmm0, (15 * 16)(%rsp); + +.align 8 +.Lhandle_unaligned_ocb: + /* Get number of blocks to align nblk to 16 (and L-array optimization). */ + movl %esi, %r10d; + negl %r10d; + andl $15, %r10d; + cmpq %r8, %r10; + cmovaq %r8, %r10; + cmpq $1, %r10; + jb .Lunaligned_ocb_done; + + /* Number of blocks after alignment. */ + movq %r8, %r11; + subq %r10, %r11; + + /* If number after alignment is less than 16, skip aligned handling + * completely. */ + cmp $16, %r11; + cmovbq %r8, %r10; + + /* Unaligned: Process eight blocks per loop. 
*/ +.align 8 +.Locb_unaligned_blk8: + cmpq $8, %r10; + jb .Locb_unaligned_blk4; + + leaq -8(%r8), %r8; + leaq -8(%r10), %r10; + + leal 1(%esi), %r11d; + leal 2(%esi), %r12d; + leal 3(%esi), %r13d; + leal 4(%esi), %eax; + tzcntl %r11d, %r11d; + tzcntl %r12d, %r12d; + tzcntl %r13d, %r13d; + tzcntl %eax, %eax; + shll $4, %r11d; + shll $4, %r12d; + shll $4, %r13d; + shll $4, %eax; + vpxor (%r14, %r11), %xmm15, %xmm5; + vpxor (%r14, %r12), %xmm5, %xmm6; + vpxor (%r14, %r13), %xmm6, %xmm7; + vpxor (%r14, %rax), %xmm7, %xmm8; + + leal 5(%esi), %r11d; + leal 6(%esi), %r12d; + leal 7(%esi), %r13d; + leal 8(%esi), %esi; + tzcntl %r11d, %r11d; + tzcntl %r12d, %r12d; + tzcntl %r13d, %r13d; + tzcntl %esi, %eax; + shll $4, %r11d; + shll $4, %r12d; + shll $4, %r13d; + shll $4, %eax; + vpxor (%r14, %r11), %xmm8, %xmm9; + vpxor (%r14, %r12), %xmm9, %xmm10; + vpxor (%r14, %r13), %xmm10, %xmm11; + vpxor (%r14, %rax), %xmm11, %xmm15; + + vinserti128 $1, %xmm6, %ymm5, %ymm5; + vinserti128 $1, %xmm8, %ymm7, %ymm6; + vinserti128 $1, %xmm10, %ymm9, %ymm7; + vinserti128 $1, %xmm15, %ymm11, %ymm8; + + vpxor (0 * 16)(%rcx), %ymm5, %ymm0; + vpxor (2 * 16)(%rcx), %ymm6, %ymm1; + vpxor (4 * 16)(%rcx), %ymm7, %ymm2; + vpxor (6 * 16)(%rcx), %ymm8, %ymm3; + leaq (8 * 16)(%rcx), %rcx; + + vmovdqa (14 * 16)(%rsp), %ymm9; + + testl %r15d, %r15d; + jz .Locb_unaligned_blk8_dec; + /* AES rounds */ + vbroadcasti128 (1 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (2 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (3 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (4 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (5 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (6 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (7 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (8 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (9 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + cmpl $12, %r9d; + jb .Locb_unaligned_blk8_enc_last; + vbroadcasti128 (10 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (11 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + jz .Locb_unaligned_blk8_enc_last; + vbroadcasti128 (12 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (13 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + + /* Last round and output handling. */ + .Locb_unaligned_blk8_enc_last: + vpxor %ymm5, %ymm9, %ymm5; /* Xor src to last round key. 
*/ + vpxor %ymm6, %ymm9, %ymm6; + vpxor %ymm7, %ymm9, %ymm7; + vpxor %ymm8, %ymm9, %ymm4; + vaesenclast %ymm5, %ymm0, %ymm0; + vaesenclast %ymm6, %ymm1, %ymm1; + vaesenclast %ymm7, %ymm2, %ymm2; + vaesenclast %ymm4, %ymm3, %ymm3; + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + vmovdqu %ymm2, (4 * 16)(%rdx); + vmovdqu %ymm3, (6 * 16)(%rdx); + leaq (8 * 16)(%rdx), %rdx; + + jmp .Locb_unaligned_blk8; + + .align 8 + .Locb_unaligned_blk8_dec: + /* AES rounds */ + vbroadcasti128 (1 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (2 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (3 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (4 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (5 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (6 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (7 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (8 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (9 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + cmpl $12, %r9d; + jb .Locb_unaligned_blk8_dec_last; + vbroadcasti128 (10 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (11 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + jz .Locb_unaligned_blk8_dec_last; + vbroadcasti128 (12 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (13 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + + /* Last round and output handling. */ + .Locb_unaligned_blk8_dec_last: + vpxor %ymm5, %ymm9, %ymm5; /* Xor src to last round key. */ + vpxor %ymm6, %ymm9, %ymm6; + vpxor %ymm7, %ymm9, %ymm7; + vpxor %ymm8, %ymm9, %ymm4; + vaesdeclast %ymm5, %ymm0, %ymm0; + vaesdeclast %ymm6, %ymm1, %ymm1; + vaesdeclast %ymm7, %ymm2, %ymm2; + vaesdeclast %ymm4, %ymm3, %ymm3; + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + vmovdqu %ymm2, (4 * 16)(%rdx); + vmovdqu %ymm3, (6 * 16)(%rdx); + leaq (8 * 16)(%rdx), %rdx; + + jmp .Locb_unaligned_blk8; + + /* Unaligned: Process four blocks. 
*/ +.align 8 +.Locb_unaligned_blk4: + cmpq $4, %r10; + jb .Locb_unaligned_blk1; + + leaq -4(%r8), %r8; + leaq -4(%r10), %r10; + + leal 1(%esi), %r11d; + leal 2(%esi), %r12d; + leal 3(%esi), %r13d; + leal 4(%esi), %esi; + tzcntl %r11d, %r11d; + tzcntl %r12d, %r12d; + tzcntl %r13d, %r13d; + tzcntl %esi, %eax; + shll $4, %r11d; + shll $4, %r12d; + shll $4, %r13d; + shll $4, %eax; + + vpxor (%r14, %r11), %xmm15, %xmm5; + vpxor (%r14, %r12), %xmm5, %xmm6; + vinserti128 $1, %xmm6, %ymm5, %ymm5; + vpxor (%r14, %r13), %xmm6, %xmm7; + vpxor (%r14, %rax), %xmm7, %xmm15; + vinserti128 $1, %xmm15, %ymm7, %ymm6; + + vpxor (0 * 16)(%rcx), %ymm5, %ymm0; + vpxor (2 * 16)(%rcx), %ymm6, %ymm1; + leaq (4 * 16)(%rcx), %rcx; + + testl %r15d, %r15d; + jz .Locb_unaligned_blk4_dec; + /* AES rounds */ + vbroadcasti128 (1 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (2 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (3 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (4 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (5 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (6 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (7 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (8 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (9 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + cmpl $12, %r9d; + jb .Locb_unaligned_blk4_enc_last; + vbroadcasti128 (10 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (11 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + jz .Locb_unaligned_blk4_enc_last; + vbroadcasti128 (12 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (13 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + + /* Last round and output handling. */ + .Locb_unaligned_blk4_enc_last: + vmovdqa (14 * 16)(%rsp), %ymm8; + vpxor %ymm5, %ymm8, %ymm5; /* Xor src to last round key. */ + vpxor %ymm6, %ymm8, %ymm6; + vaesenclast %ymm5, %ymm0, %ymm0; + vaesenclast %ymm6, %ymm1, %ymm1; + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + leaq (4 * 16)(%rdx), %rdx; + + jmp .Locb_unaligned_blk1; + + .align 8 + .Locb_unaligned_blk4_dec: + /* AES rounds */ + vbroadcasti128 (1 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (2 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (3 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (4 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (5 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (6 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (7 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (8 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (9 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + cmpl $12, %r9d; + jb .Locb_unaligned_blk4_dec_last; + vbroadcasti128 (10 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (11 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + jz .Locb_unaligned_blk4_dec_last; + vbroadcasti128 (12 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (13 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + + /* Last round and output handling. */ + .Locb_unaligned_blk4_dec_last: + vmovdqa (14 * 16)(%rsp), %ymm8; + vpxor %ymm5, %ymm8, %ymm5; /* Xor src to last round key. 
*/ + vpxor %ymm6, %ymm8, %ymm6; + vaesdeclast %ymm5, %ymm0, %ymm0; + vaesdeclast %ymm6, %ymm1, %ymm1; + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + leaq (4 * 16)(%rdx), %rdx; + + /* Unaligned: Process one block per loop. */ +.align 8 +.Locb_unaligned_blk1: + cmpq $1, %r10; + jb .Lunaligned_ocb_done; + + leaq -1(%r8), %r8; + leaq -1(%r10), %r10; + + leal 1(%esi), %esi; + tzcntl %esi, %r11d; + shll $4, %r11d; + vpxor (%r14, %r11), %xmm15, %xmm15; + vpxor (%rcx), %xmm15, %xmm0; + leaq 16(%rcx), %rcx; + + testl %r15d, %r15d; + jz .Locb_unaligned_blk1_dec; + /* AES rounds. */ + vaesenc (1 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (2 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (3 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (4 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (5 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (6 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (7 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (8 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (9 * 16)(%rdi), %xmm0, %xmm0; + cmpl $12, %r9d; + jb .Locb_unaligned_blk1_enc_last; + vaesenc (10 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (11 * 16)(%rdi), %xmm0, %xmm0; + jz .Locb_unaligned_blk1_enc_last; + vaesenc (12 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (13 * 16)(%rdi), %xmm0, %xmm0; + + /* Last round and output handling. */ + .Locb_unaligned_blk1_enc_last: + vpxor (14 * 16)(%rsp), %xmm15, %xmm1; + vaesenclast %xmm1, %xmm0, %xmm0; + vmovdqu %xmm0, (%rdx); + leaq 16(%rdx), %rdx; + + jmp .Locb_unaligned_blk1; + + .align 8 + .Locb_unaligned_blk1_dec: + /* AES rounds. */ + vaesdec (1 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (2 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (3 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (4 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (5 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (6 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (7 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (8 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (9 * 16)(%rdi), %xmm0, %xmm0; + cmpl $12, %r9d; + jb .Locb_unaligned_blk1_dec_last; + vaesdec (10 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (11 * 16)(%rdi), %xmm0, %xmm0; + jz .Locb_unaligned_blk1_dec_last; + vaesdec (12 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (13 * 16)(%rdi), %xmm0, %xmm0; + + /* Last round and output handling. */ + .Locb_unaligned_blk1_dec_last: + vpxor (14 * 16)(%rsp), %xmm15, %xmm1; + vaesdeclast %xmm1, %xmm0, %xmm0; + vmovdqu %xmm0, (%rdx); + leaq 16(%rdx), %rdx; + + jmp .Locb_unaligned_blk1; + +.align 8 +.Lunaligned_ocb_done: + cmpq $1, %r8; + jb .Ldone_ocb; + + /* Short buffers do not benefit from L-array optimization. */ + movq %r8, %r10; + cmpq $16, %r8; + jb .Locb_unaligned_blk8; + + vinserti128 $1, %xmm15, %ymm15, %ymm15; + + /* Prepare L-array optimization. 
+ * Since nblk is aligned to 16, offsets will have following + * construction: + * - block1 = ntz{0} = offset ^ L[0] + * - block2 = ntz{1} = offset ^ L[0] ^ L[1] + * - block3 = ntz{0} = offset ^ L[1] + * - block4 = ntz{2} = offset ^ L[1] ^ L[2] + * - block5 = ntz{0} = offset ^ L[0] ^ L[1] ^ L[2] + * - block6 = ntz{1} = offset ^ L[0] ^ L[2] + * - block7 = ntz{0} = offset ^ L[2] + * - block8 = ntz{3} = offset ^ L[2] ^ L[3] + * - block9 = ntz{0} = offset ^ L[0] ^ L[2] ^ L[3] + * - block10 = ntz{1} = offset ^ L[0] ^ L[1] ^ L[2] ^ L[3] + * - block11 = ntz{0} = offset ^ L[1] ^ L[2] ^ L[3] + * - block12 = ntz{2} = offset ^ L[1] ^ L[3] + * - block13 = ntz{0} = offset ^ L[0] ^ L[1] ^ L[3] + * - block14 = ntz{1} = offset ^ L[0] ^ L[3] + * - block15 = ntz{0} = offset ^ L[3] + * - block16 = ntz{x} = offset ^ L[3] ^ L[ntz{x}] + */ + vmovdqu (0 * 16)(%r14), %xmm0; + vmovdqu (1 * 16)(%r14), %xmm1; + vmovdqu (2 * 16)(%r14), %xmm2; + vmovdqu (3 * 16)(%r14), %xmm3; + vpxor %xmm0, %xmm1, %xmm4; /* L[0] ^ L[1] */ + vpxor %xmm0, %xmm2, %xmm5; /* L[0] ^ L[2] */ + vpxor %xmm0, %xmm3, %xmm6; /* L[0] ^ L[3] */ + vpxor %xmm1, %xmm2, %xmm7; /* L[1] ^ L[2] */ + vpxor %xmm1, %xmm3, %xmm8; /* L[1] ^ L[3] */ + vpxor %xmm2, %xmm3, %xmm9; /* L[2] ^ L[3] */ + vpxor %xmm4, %xmm2, %xmm10; /* L[0] ^ L[1] ^ L[2] */ + vpxor %xmm5, %xmm3, %xmm11; /* L[0] ^ L[2] ^ L[3] */ + vpxor %xmm7, %xmm3, %xmm12; /* L[1] ^ L[2] ^ L[3] */ + vpxor %xmm0, %xmm8, %xmm13; /* L[0] ^ L[1] ^ L[3] */ + vpxor %xmm4, %xmm9, %xmm14; /* L[0] ^ L[1] ^ L[2] ^ L[3] */ + vinserti128 $1, %xmm4, %ymm0, %ymm0; + vinserti128 $1, %xmm7, %ymm1, %ymm1; + vinserti128 $1, %xmm5, %ymm10, %ymm10; + vinserti128 $1, %xmm9, %ymm2, %ymm2; + vinserti128 $1, %xmm14, %ymm11, %ymm11; + vinserti128 $1, %xmm8, %ymm12, %ymm12; + vinserti128 $1, %xmm6, %ymm13, %ymm13; + vmovdqa %ymm0, (0 * 16)(%rsp); + vmovdqa %ymm1, (2 * 16)(%rsp); + vmovdqa %ymm10, (4 * 16)(%rsp); + vmovdqa %ymm2, (6 * 16)(%rsp); + vmovdqa %ymm11, (8 * 16)(%rsp); + vmovdqa %ymm12, (10 * 16)(%rsp); + vmovdqa %ymm13, (12 * 16)(%rsp); + + /* Aligned: Process 16 blocks per loop. 
*/ +.align 8 +.Locb_aligned_blk16: + cmpq $16, %r8; + jb .Locb_aligned_blk8; + + leaq -16(%r8), %r8; + + leal 16(%esi), %esi; + tzcntl %esi, %eax; + shll $4, %eax; + + vpxor (0 * 16)(%rsp), %ymm15, %ymm8; + vpxor (2 * 16)(%rsp), %ymm15, %ymm9; + vpxor (4 * 16)(%rsp), %ymm15, %ymm10; + vpxor (6 * 16)(%rsp), %ymm15, %ymm11; + vpxor (8 * 16)(%rsp), %ymm15, %ymm12; + + vpxor (3 * 16)(%r14), %xmm15, %xmm13; /* offset ^ first key ^ L[3] */ + vpxor (%r14, %rax), %xmm13, %xmm14; /* offset ^ first key ^ L[3] ^ L[ntz{nblk+16}] */ + vinserti128 $1, %xmm14, %ymm13, %ymm14; + + vpxor (10 * 16)(%rsp), %ymm15, %ymm13; + vpxor (14 * 16)(%rcx), %ymm14, %ymm7; + + vpxor (0 * 16)(%rcx), %ymm8, %ymm0; + vpxor (2 * 16)(%rcx), %ymm9, %ymm1; + vpxor (4 * 16)(%rcx), %ymm10, %ymm2; + vpxor (6 * 16)(%rcx), %ymm11, %ymm3; + vpxor (8 * 16)(%rcx), %ymm12, %ymm4; + vpxor (10 * 16)(%rcx), %ymm13, %ymm5; + vmovdqa %ymm13, (16 * 16)(%rsp); + vpxor (12 * 16)(%rsp), %ymm15, %ymm13; + vpxor (12 * 16)(%rcx), %ymm13, %ymm6; + vmovdqa %ymm13, (18 * 16)(%rsp); + + leaq (16 * 16)(%rcx), %rcx; + + vperm2i128 $0x11, %ymm14, %ymm14, %ymm15; + + testl %r15d, %r15d; + jz .Locb_aligned_blk16_dec; + /* AES rounds */ + vbroadcasti128 (1 * 16)(%rdi), %ymm13; + VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (2 * 16)(%rdi), %ymm13; + VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (3 * 16)(%rdi), %ymm13; + VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (4 * 16)(%rdi), %ymm13; + VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (5 * 16)(%rdi), %ymm13; + VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (6 * 16)(%rdi), %ymm13; + VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (7 * 16)(%rdi), %ymm13; + VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (8 * 16)(%rdi), %ymm13; + VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (9 * 16)(%rdi), %ymm13; + VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + cmpl $12, %r9d; + jb .Locb_aligned_blk16_enc_last; + vbroadcasti128 (10 * 16)(%rdi), %ymm13; + VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (11 * 16)(%rdi), %ymm13; + VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + jz .Locb_aligned_blk16_enc_last; + vbroadcasti128 (12 * 16)(%rdi), %ymm13; + VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (13 * 16)(%rdi), %ymm13; + VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + + /* Last round and output handling. 
*/ + .Locb_aligned_blk16_enc_last: + vmovdqa (14 * 16)(%rsp), %ymm13; + vpxor %ymm8, %ymm13, %ymm8; + vpxor %ymm9, %ymm13, %ymm9; + vpxor %ymm10, %ymm13, %ymm10; + vpxor %ymm11, %ymm13, %ymm11; + vaesenclast %ymm8, %ymm0, %ymm0; + vaesenclast %ymm9, %ymm1, %ymm1; + vaesenclast %ymm10, %ymm2, %ymm2; + vaesenclast %ymm11, %ymm3, %ymm3; + vpxor %ymm12, %ymm13, %ymm12; + vpxor (16 * 16)(%rsp), %ymm13, %ymm8; + vpxor (18 * 16)(%rsp), %ymm13, %ymm9; + vpxor %ymm14, %ymm13, %ymm13; + vaesenclast %ymm12, %ymm4, %ymm4; + vaesenclast %ymm8, %ymm5, %ymm5; + vaesenclast %ymm9, %ymm6, %ymm6; + vaesenclast %ymm13, %ymm7, %ymm7; + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + vmovdqu %ymm2, (4 * 16)(%rdx); + vmovdqu %ymm3, (6 * 16)(%rdx); + vmovdqu %ymm4, (8 * 16)(%rdx); + vmovdqu %ymm5, (10 * 16)(%rdx); + vmovdqu %ymm6, (12 * 16)(%rdx); + vmovdqu %ymm7, (14 * 16)(%rdx); + leaq (16 * 16)(%rdx), %rdx; + + jmp .Locb_aligned_blk16; + + .align 8 + .Locb_aligned_blk16_dec: + /* AES rounds */ + vbroadcasti128 (1 * 16)(%rdi), %ymm13; + VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (2 * 16)(%rdi), %ymm13; + VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (3 * 16)(%rdi), %ymm13; + VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (4 * 16)(%rdi), %ymm13; + VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (5 * 16)(%rdi), %ymm13; + VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (6 * 16)(%rdi), %ymm13; + VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (7 * 16)(%rdi), %ymm13; + VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (8 * 16)(%rdi), %ymm13; + VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (9 * 16)(%rdi), %ymm13; + VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + cmpl $12, %r9d; + jb .Locb_aligned_blk16_dec_last; + vbroadcasti128 (10 * 16)(%rdi), %ymm13; + VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (11 * 16)(%rdi), %ymm13; + VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + jz .Locb_aligned_blk16_dec_last; + vbroadcasti128 (12 * 16)(%rdi), %ymm13; + VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (13 * 16)(%rdi), %ymm13; + VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + + /* Last round and output handling. 
*/ + .Locb_aligned_blk16_dec_last: + vmovdqa (14 * 16)(%rsp), %ymm13; + vpxor %ymm8, %ymm13, %ymm8; + vpxor %ymm9, %ymm13, %ymm9; + vpxor %ymm10, %ymm13, %ymm10; + vpxor %ymm11, %ymm13, %ymm11; + vaesdeclast %ymm8, %ymm0, %ymm0; + vaesdeclast %ymm9, %ymm1, %ymm1; + vaesdeclast %ymm10, %ymm2, %ymm2; + vaesdeclast %ymm11, %ymm3, %ymm3; + vpxor %ymm12, %ymm13, %ymm12; + vpxor (16 * 16)(%rsp), %ymm13, %ymm8; + vpxor (18 * 16)(%rsp), %ymm13, %ymm9; + vpxor %ymm14, %ymm13, %ymm13; + vaesdeclast %ymm12, %ymm4, %ymm4; + vaesdeclast %ymm8, %ymm5, %ymm5; + vaesdeclast %ymm9, %ymm6, %ymm6; + vaesdeclast %ymm13, %ymm7, %ymm7; + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + vmovdqu %ymm2, (4 * 16)(%rdx); + vmovdqu %ymm3, (6 * 16)(%rdx); + vmovdqu %ymm4, (8 * 16)(%rdx); + vmovdqu %ymm5, (10 * 16)(%rdx); + vmovdqu %ymm6, (12 * 16)(%rdx); + vmovdqu %ymm7, (14 * 16)(%rdx); + leaq (16 * 16)(%rdx), %rdx; + + jmp .Locb_aligned_blk16; + + /* Aligned: Process trailing eight blocks. */ +.align 8 +.Locb_aligned_blk8: + cmpq $8, %r8; + jb .Locb_aligned_done; + + leaq -8(%r8), %r8; + + leal 8(%esi), %esi; + tzcntl %esi, %eax; + shll $4, %eax; + + vpxor (0 * 16)(%rsp), %ymm15, %ymm5; + vpxor (2 * 16)(%rsp), %ymm15, %ymm6; + vpxor (4 * 16)(%rsp), %ymm15, %ymm7; + + vpxor (2 * 16)(%r14), %xmm15, %xmm13; /* offset ^ first key ^ L[2] */ + vpxor (%r14, %rax), %xmm13, %xmm14; /* offset ^ first key ^ L[2] ^ L[ntz{nblk+8}] */ + vinserti128 $1, %xmm14, %ymm13, %ymm14; + + vpxor (0 * 16)(%rcx), %ymm5, %ymm0; + vpxor (2 * 16)(%rcx), %ymm6, %ymm1; + vpxor (4 * 16)(%rcx), %ymm7, %ymm2; + vpxor (6 * 16)(%rcx), %ymm14, %ymm3; + leaq (8 * 16)(%rcx), %rcx; + + vperm2i128 $0x11, %ymm14, %ymm14, %ymm15; + + vmovdqa (14 * 16)(%rsp), %ymm8; + + testl %r15d, %r15d; + jz .Locb_aligned_blk8_dec; + /* AES rounds */ + vbroadcasti128 (1 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (2 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (3 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (4 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (5 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (6 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (7 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (8 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (9 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + cmpl $12, %r9d; + jb .Locb_aligned_blk8_enc_last; + vbroadcasti128 (10 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (11 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + jz .Locb_aligned_blk8_enc_last; + vbroadcasti128 (12 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (13 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + + /* Last round and output handling. 
*/ + .Locb_aligned_blk8_enc_last: + vpxor %ymm5, %ymm8, %ymm5; + vpxor %ymm6, %ymm8, %ymm6; + vpxor %ymm7, %ymm8, %ymm7; + vpxor %ymm14, %ymm8, %ymm4; + vaesenclast %ymm5, %ymm0, %ymm0; + vaesenclast %ymm6, %ymm1, %ymm1; + vaesenclast %ymm7, %ymm2, %ymm2; + vaesenclast %ymm4, %ymm3, %ymm3; + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + vmovdqu %ymm2, (4 * 16)(%rdx); + vmovdqu %ymm3, (6 * 16)(%rdx); + leaq (8 * 16)(%rdx), %rdx; + + jmp .Locb_aligned_done; + + .align 8 + .Locb_aligned_blk8_dec: + /* AES rounds */ + vbroadcasti128 (1 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (2 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (3 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (4 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (5 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (6 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (7 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (8 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (9 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + cmpl $12, %r9d; + jb .Locb_aligned_blk8_dec_last; + vbroadcasti128 (10 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (11 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + jz .Locb_aligned_blk8_dec_last; + vbroadcasti128 (12 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (13 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (14 * 16)(%rdi), %ymm4; + + /* Last round and output handling. */ + .Locb_aligned_blk8_dec_last: + vpxor %ymm5, %ymm8, %ymm5; + vpxor %ymm6, %ymm8, %ymm6; + vpxor %ymm7, %ymm8, %ymm7; + vpxor %ymm14, %ymm8, %ymm4; + vaesdeclast %ymm5, %ymm0, %ymm0; + vaesdeclast %ymm6, %ymm1, %ymm1; + vaesdeclast %ymm7, %ymm2, %ymm2; + vaesdeclast %ymm4, %ymm3, %ymm3; + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + vmovdqu %ymm2, (4 * 16)(%rdx); + vmovdqu %ymm3, (6 * 16)(%rdx); + leaq (8 * 16)(%rdx), %rdx; + +.align 8 +.Locb_aligned_done: + /* Burn stack. */ + vpxor %ymm0, %ymm0, %ymm0; + vmovdqa %ymm0, (0 * 16)(%rsp); + vmovdqa %ymm0, (2 * 16)(%rsp); + vmovdqa %ymm0, (4 * 16)(%rsp); + vmovdqa %ymm0, (6 * 16)(%rsp); + vmovdqa %ymm0, (8 * 16)(%rsp); + vmovdqa %ymm0, (10 * 16)(%rsp); + vmovdqa %ymm0, (12 * 16)(%rsp); + vmovdqa %ymm0, (16 * 16)(%rsp); + vmovdqa %ymm0, (18 * 16)(%rsp); + + /* Handle tailing 1…7 blocks in nblk-unaligned loop. */ + movq %r8, %r10; + cmpq $1, %r8; + jnb .Locb_unaligned_blk8; + +.align 8 +.Ldone_ocb: + movq 16(%rbp), %r14; /* offset ptr. */ + vpxor (0 * 16)(%rdi), %xmm15, %xmm15; /* offset ^ first key ^ first key */ + vmovdqu %xmm15, (%r14); /* Store offset. */ + + /* Handle decryption checksumming. */ + + testl %r15d, %r15d; + jnz .Locb_dec_checksum_done; + movq 24(%rbp), %rax; /* checksum ptr. */ + movq (STACK_REGS_POS + 4 * 8)(%rsp), %r10; + movq (STACK_REGS_POS + 5 * 8)(%rsp), %r11; + call _gcry_vaes_avx2_ocb_checksum; +.Locb_dec_checksum_done: + + /* Burn stack. 
*/ + vpxor %ymm0, %ymm0, %ymm0; + vmovdqa %ymm0, (14 * 16)(%rsp); + + vzeroall; + + movq (STACK_REGS_POS + 0 * 8)(%rsp), %r12; + CFI_RESTORE(%r12); + movq (STACK_REGS_POS + 1 * 8)(%rsp), %r13; + CFI_RESTORE(%r13); + movq (STACK_REGS_POS + 2 * 8)(%rsp), %r14; + CFI_RESTORE(%r14); + movq (STACK_REGS_POS + 3 * 8)(%rsp), %r15; + CFI_RESTORE(%r15); + + leave; + CFI_LEAVE(); + ret + +#undef STACK_REGS_POS +#undef STACK_ALLOC + + CFI_ENDPROC(); +ELF(.size _gcry_vaes_avx2_ocb_crypt_amd64,.-_gcry_vaes_avx2_ocb_crypt_amd64) + +/********************************************************************** + CTR-mode encryption + **********************************************************************/ +ELF(.type _gcry_vaes_avx2_xts_crypt_amd64,@function) +.globl _gcry_vaes_avx2_xts_crypt_amd64 +_gcry_vaes_avx2_xts_crypt_amd64: + /* input: + * %rdi: round keys + * %rsi: tweak + * %rdx: dst + * %rcx: src + * %r8: nblocks + * %r9: nrounds + * 8(%rsp): encrypt + */ + CFI_STARTPROC(); + + movl 8(%rsp), %eax; + +#define tweak_clmul(shift, out, tweak, hi_tweak, tmp1, tmp2) \ + vpsrld $(32-(shift)), hi_tweak, tmp2; \ + vpsllq $(shift), tweak, out; \ + vpclmulqdq $0, .Lxts_gfmul_clmul rRIP, tmp2, tmp1; \ + vpunpckhqdq tmp2, tmp1, tmp1; \ + vpxor tmp1, out, out; + + /* Prepare tweak. */ + vmovdqu (%rsi), %xmm15; + vpshufb .Lxts_high_bit_shuf rRIP, %xmm15, %xmm13; + tweak_clmul(1, %xmm11, %xmm15, %xmm13, %xmm0, %xmm1); + vinserti128 $1, %xmm11, %ymm15, %ymm15; /* tweak:tweak1 */ + vpshufb .Lxts_high_bit_shuf rRIP, %ymm15, %ymm13; + + cmpq $8, %r8; + jb .Lxts_crypt_blk4; + + /* Process eight blocks per loop. */ + leaq -8(%r8), %r8; + + vmovdqa %ymm15, %ymm5; + tweak_clmul(2, %ymm6, %ymm15, %ymm13, %ymm0, %ymm1); + tweak_clmul(4, %ymm7, %ymm15, %ymm13, %ymm0, %ymm1); + tweak_clmul(6, %ymm8, %ymm15, %ymm13, %ymm0, %ymm1); + tweak_clmul(8, %ymm15, %ymm15, %ymm13, %ymm0, %ymm1); + vpshufb .Lxts_high_bit_shuf rRIP, %ymm15, %ymm13; + + vbroadcasti128 (0 * 16)(%rdi), %ymm4; + vpxor (0 * 16)(%rcx), %ymm5, %ymm0; + vpxor (2 * 16)(%rcx), %ymm6, %ymm1; + vpxor (4 * 16)(%rcx), %ymm7, %ymm2; + vpxor (6 * 16)(%rcx), %ymm8, %ymm3; + + leaq (8 * 16)(%rcx), %rcx; + +.align 8 +.Lxts_crypt_blk8_loop: + cmpq $8, %r8; + jb .Lxts_crypt_blk8_tail; + leaq -8(%r8), %r8; + + testl %eax, %eax; + jz .Lxts_dec_blk8; + /* AES rounds */ + XOR4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (1 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (2 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (3 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (4 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vmovdqa %ymm15, %ymm9; + tweak_clmul(2, %ymm10, %ymm15, %ymm13, %ymm12, %ymm14); + tweak_clmul(4, %ymm11, %ymm15, %ymm13, %ymm12, %ymm14); + vbroadcasti128 (5 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (6 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (7 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (8 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (9 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (10 * 16)(%rdi), %ymm4; + cmpl $12, %r9d; + jb .Lxts_enc_blk8_last; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (11 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (12 * 16)(%rdi), %ymm4; + jz 
.Lxts_enc_blk8_last; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (13 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (14 * 16)(%rdi), %ymm4; + + /* Last round and output handling. */ + .Lxts_enc_blk8_last: + vpxor %ymm4, %ymm5, %ymm5; /* Xor tweak to last round key. */ + vpxor %ymm4, %ymm6, %ymm6; + vpxor %ymm4, %ymm7, %ymm7; + vpxor %ymm4, %ymm8, %ymm4; + tweak_clmul(6, %ymm8, %ymm15, %ymm13, %ymm12, %ymm14); + tweak_clmul(8, %ymm15, %ymm15, %ymm13, %ymm12, %ymm14); + vpshufb .Lxts_high_bit_shuf rRIP, %ymm15, %ymm13; + vaesenclast %ymm5, %ymm0, %ymm0; + vaesenclast %ymm6, %ymm1, %ymm1; + vaesenclast %ymm7, %ymm2, %ymm2; + vaesenclast %ymm4, %ymm3, %ymm3; + + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + vmovdqu %ymm2, (4 * 16)(%rdx); + vmovdqu %ymm3, (6 * 16)(%rdx); + leaq (8 * 16)(%rdx), %rdx; + + vbroadcasti128 (0 * 16)(%rdi), %ymm4; + vpxor (0 * 16)(%rcx), %ymm9, %ymm0; + vpxor (2 * 16)(%rcx), %ymm10, %ymm1; + vpxor (4 * 16)(%rcx), %ymm11, %ymm2; + vpxor (6 * 16)(%rcx), %ymm8, %ymm3; + + vmovdqa %ymm9, %ymm5; + vmovdqa %ymm10, %ymm6; + vmovdqa %ymm11, %ymm7; + + leaq (8 * 16)(%rcx), %rcx; + + jmp .Lxts_crypt_blk8_loop; + + .align 8 + .Lxts_dec_blk8: + /* AES rounds */ + XOR4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (1 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (2 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (3 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (4 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vmovdqa %ymm15, %ymm9; + tweak_clmul(2, %ymm10, %ymm15, %ymm13, %ymm12, %ymm14); + tweak_clmul(4, %ymm11, %ymm15, %ymm13, %ymm12, %ymm14); + vbroadcasti128 (5 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (6 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (7 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (8 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (9 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (10 * 16)(%rdi), %ymm4; + cmpl $12, %r9d; + jb .Lxts_dec_blk8_last; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (11 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (12 * 16)(%rdi), %ymm4; + jz .Lxts_dec_blk8_last; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (13 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (14 * 16)(%rdi), %ymm4; + + /* Last round and output handling. */ + .Lxts_dec_blk8_last: + vpxor %ymm4, %ymm5, %ymm5; /* Xor tweak to last round key. 
*/ + vpxor %ymm4, %ymm6, %ymm6; + vpxor %ymm4, %ymm7, %ymm7; + vpxor %ymm4, %ymm8, %ymm4; + tweak_clmul(6, %ymm8, %ymm15, %ymm13, %ymm12, %ymm14); + tweak_clmul(8, %ymm15, %ymm15, %ymm13, %ymm12, %ymm14); + vpshufb .Lxts_high_bit_shuf rRIP, %ymm15, %ymm13; + vaesdeclast %ymm5, %ymm0, %ymm0; + vaesdeclast %ymm6, %ymm1, %ymm1; + vaesdeclast %ymm7, %ymm2, %ymm2; + vaesdeclast %ymm4, %ymm3, %ymm3; + + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + vmovdqu %ymm2, (4 * 16)(%rdx); + vmovdqu %ymm3, (6 * 16)(%rdx); + leaq (8 * 16)(%rdx), %rdx; + + vbroadcasti128 (0 * 16)(%rdi), %ymm4; + vpxor (0 * 16)(%rcx), %ymm9, %ymm0; + vpxor (2 * 16)(%rcx), %ymm10, %ymm1; + vpxor (4 * 16)(%rcx), %ymm11, %ymm2; + vpxor (6 * 16)(%rcx), %ymm8, %ymm3; + + vmovdqa %ymm9, %ymm5; + vmovdqa %ymm10, %ymm6; + vmovdqa %ymm11, %ymm7; + + leaq (8 * 16)(%rcx), %rcx; + + jmp .Lxts_crypt_blk8_loop; + + .align 8 + .Lxts_crypt_blk8_tail: + testl %eax, %eax; + jz .Lxts_dec_tail_blk8; + /* AES rounds */ + XOR4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (1 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (2 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (3 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (4 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (5 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (6 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (7 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (8 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (9 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (10 * 16)(%rdi), %ymm4; + cmpl $12, %r9d; + jb .Lxts_enc_blk8_tail_last; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (11 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (12 * 16)(%rdi), %ymm4; + jz .Lxts_enc_blk8_tail_last; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (13 * 16)(%rdi), %ymm4; + VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (14 * 16)(%rdi), %ymm4; + + /* Last round and output handling. */ + .Lxts_enc_blk8_tail_last: + vpxor %ymm4, %ymm5, %ymm5; /* Xor tweak to last round key. 
*/ + vpxor %ymm4, %ymm6, %ymm6; + vpxor %ymm4, %ymm7, %ymm7; + vpxor %ymm4, %ymm8, %ymm4; + vaesenclast %ymm5, %ymm0, %ymm0; + vaesenclast %ymm6, %ymm1, %ymm1; + vaesenclast %ymm7, %ymm2, %ymm2; + vaesenclast %ymm4, %ymm3, %ymm3; + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + vmovdqu %ymm2, (4 * 16)(%rdx); + vmovdqu %ymm3, (6 * 16)(%rdx); + leaq (8 * 16)(%rdx), %rdx; + + jmp .Lxts_crypt_blk4; + + .align 8 + .Lxts_dec_tail_blk8: + /* AES rounds */ + XOR4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (1 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (2 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (3 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (4 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (5 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (6 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (7 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (8 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (9 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (10 * 16)(%rdi), %ymm4; + cmpl $12, %r9d; + jb .Lxts_dec_blk8_tail_last; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (11 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (12 * 16)(%rdi), %ymm4; + jz .Lxts_dec_blk8_tail_last; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (13 * 16)(%rdi), %ymm4; + VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); + vbroadcasti128 (14 * 16)(%rdi), %ymm4; + + /* Last round and output handling. */ + .Lxts_dec_blk8_tail_last: + vpxor %ymm4, %ymm5, %ymm5; /* Xor tweak to last round key. */ + vpxor %ymm4, %ymm6, %ymm6; + vpxor %ymm4, %ymm7, %ymm7; + vpxor %ymm4, %ymm8, %ymm4; + vaesdeclast %ymm5, %ymm0, %ymm0; + vaesdeclast %ymm6, %ymm1, %ymm1; + vaesdeclast %ymm7, %ymm2, %ymm2; + vaesdeclast %ymm4, %ymm3, %ymm3; + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + vmovdqu %ymm2, (4 * 16)(%rdx); + vmovdqu %ymm3, (6 * 16)(%rdx); + leaq (8 * 16)(%rdx), %rdx; + + /* Handle trailing four blocks. */ +.align 8 +.Lxts_crypt_blk4: + /* Try exit early as typically input length is large power of 2. 
*/ + cmpq $0, %r8; + jb .Ldone_xts_crypt; + cmpq $4, %r8; + jb .Lxts_crypt_blk1; + + leaq -4(%r8), %r8; + + vmovdqa %ymm15, %ymm5; + tweak_clmul(2, %ymm6, %ymm15, %ymm13, %ymm0, %ymm1); + tweak_clmul(4, %ymm15, %ymm15, %ymm13, %ymm0, %ymm1); + vpshufb .Lxts_high_bit_shuf rRIP, %ymm15, %ymm13; + + vbroadcasti128 (0 * 16)(%rdi), %ymm4; + vpxor (0 * 16)(%rcx), %ymm5, %ymm0; + vpxor (2 * 16)(%rcx), %ymm6, %ymm1; + + leaq (4 * 16)(%rcx), %rcx; + + testl %eax, %eax; + jz .Lxts_dec_blk4; + /* AES rounds */ + XOR2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (1 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (2 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (3 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (4 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (5 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (6 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (7 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (8 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (9 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (10 * 16)(%rdi), %ymm4; + cmpl $12, %r9d; + jb .Lxts_enc_blk4_last; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (11 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (12 * 16)(%rdi), %ymm4; + jz .Lxts_enc_blk4_last; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (13 * 16)(%rdi), %ymm4; + VAESENC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (14 * 16)(%rdi), %ymm4; + + /* Last round and output handling. */ + .Lxts_enc_blk4_last: + vpxor %ymm4, %ymm5, %ymm5; /* Xor tweak to last round key. */ + vpxor %ymm4, %ymm6, %ymm6; + vaesenclast %ymm5, %ymm0, %ymm0; + vaesenclast %ymm6, %ymm1, %ymm1; + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + leaq (4 * 16)(%rdx), %rdx; + + jmp .Lxts_crypt_blk1; + + .align 8 + .Lxts_dec_blk4: + /* AES rounds */ + XOR2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (1 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (2 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (3 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (4 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (5 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (6 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (7 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (8 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (9 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (10 * 16)(%rdi), %ymm4; + cmpl $12, %r9d; + jb .Lxts_dec_blk4_last; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (11 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (12 * 16)(%rdi), %ymm4; + jz .Lxts_dec_blk4_last; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (13 * 16)(%rdi), %ymm4; + VAESDEC2(%ymm4, %ymm0, %ymm1); + vbroadcasti128 (14 * 16)(%rdi), %ymm4; + + /* Last round and output handling. */ + .Lxts_dec_blk4_last: + vpxor %ymm4, %ymm5, %ymm5; /* Xor tweak to last round key. */ + vpxor %ymm4, %ymm6, %ymm6; + vaesdeclast %ymm5, %ymm0, %ymm0; + vaesdeclast %ymm6, %ymm1, %ymm1; + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + leaq (4 * 16)(%rdx), %rdx; + + /* Process trailing one to three blocks, one per loop. 
*/ +.align 8 +.Lxts_crypt_blk1: + cmpq $1, %r8; + jb .Ldone_xts_crypt; + + leaq -1(%r8), %r8; + + vpxor (%rcx), %xmm15, %xmm0; + vmovdqa %xmm15, %xmm5; + tweak_clmul(1, %xmm15, %xmm15, %xmm13, %xmm2, %xmm3); + vpshufb .Lxts_high_bit_shuf rRIP, %xmm15, %xmm13; + + leaq 16(%rcx), %rcx; + + testl %eax, %eax; + jz .Lxts_dec_blk1; + /* AES rounds. */ + vpxor (0 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (1 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (2 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (3 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (4 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (5 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (6 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (7 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (8 * 16)(%rdi), %xmm0, %xmm0; + vaesenc (9 * 16)(%rdi), %xmm0, %xmm0; + vmovdqa (10 * 16)(%rdi), %xmm1; + cmpl $12, %r9d; + jb .Lxts_enc_blk1_last; + vaesenc %xmm1, %xmm0, %xmm0; + vaesenc (11 * 16)(%rdi), %xmm0, %xmm0; + vmovdqa (12 * 16)(%rdi), %xmm1; + jz .Lxts_enc_blk1_last; + vaesenc %xmm1, %xmm0, %xmm0; + vaesenc (13 * 16)(%rdi), %xmm0, %xmm0; + vmovdqa (14 * 16)(%rdi), %xmm1; + + /* Last round and output handling. */ + .Lxts_enc_blk1_last: + vpxor %xmm1, %xmm5, %xmm5; /* Xor tweak to last round key. */ + vaesenclast %xmm5, %xmm0, %xmm0; + vmovdqu %xmm0, (%rdx); + leaq 16(%rdx), %rdx; + + jmp .Lxts_crypt_blk1; + + .align 8 + .Lxts_dec_blk1: + /* AES rounds. */ + vpxor (0 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (1 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (2 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (3 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (4 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (5 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (6 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (7 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (8 * 16)(%rdi), %xmm0, %xmm0; + vaesdec (9 * 16)(%rdi), %xmm0, %xmm0; + vmovdqa (10 * 16)(%rdi), %xmm1; + cmpl $12, %r9d; + jb .Lxts_dec_blk1_last; + vaesdec %xmm1, %xmm0, %xmm0; + vaesdec (11 * 16)(%rdi), %xmm0, %xmm0; + vmovdqa (12 * 16)(%rdi), %xmm1; + jz .Lxts_dec_blk1_last; + vaesdec %xmm1, %xmm0, %xmm0; + vaesdec (13 * 16)(%rdi), %xmm0, %xmm0; + vmovdqa (14 * 16)(%rdi), %xmm1; + + /* Last round and output handling. */ + .Lxts_dec_blk1_last: + vpxor %xmm1, %xmm5, %xmm5; /* Xor tweak to last round key. */ + vaesdeclast %xmm5, %xmm0, %xmm0; + vmovdqu %xmm0, (%rdx); + leaq 16(%rdx), %rdx; + + jmp .Lxts_crypt_blk1; + +.align 8 +.Ldone_xts_crypt: + /* Store IV. 
*/ + vmovdqu %xmm15, (%rsi); + + vzeroall; + + xorl %eax, %eax + ret + CFI_ENDPROC(); +ELF(.size _gcry_vaes_avx2_xts_crypt_amd64,.-_gcry_vaes_avx2_xts_crypt_amd64) + +/********************************************************************** + constants + **********************************************************************/ +ELF(.type _gcry_vaes_consts,@object) +_gcry_vaes_consts: +.align 32 +.Lbige_addb_0: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +.Lbige_addb_1: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 +.Lbige_addb_2: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 +.Lbige_addb_3: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 +.Lbige_addb_4: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 +.Lbige_addb_5: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 +.Lbige_addb_6: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 +.Lbige_addb_7: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 +.Lbige_addb_8: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 +.Lbige_addb_9: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9 +.Lbige_addb_10: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10 +.Lbige_addb_11: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11 +.Lbige_addb_12: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12 +.Lbige_addb_13: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13 +.Lbige_addb_14: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14 +.Lbige_addb_15: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15 +.Lxts_gfmul_clmul: + .long 0x00, 0x87, 0x00, 0x00 + .long 0x00, 0x87, 0x00, 0x00 +.Lxts_high_bit_shuf: + .byte -1, -1, -1, -1, 12, 13, 14, 15 + .byte 4, 5, 6, 7, -1, -1, -1, -1 + .byte -1, -1, -1, -1, 12, 13, 14, 15 + .byte 4, 5, 6, 7, -1, -1, -1, -1 +.Lbswap128_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +ELF(.size _gcry_vaes_consts,.-_gcry_vaes_consts) + +#endif /* HAVE_GCC_INLINE_ASM_VAES */ +#endif /* __x86_64__ */ diff --git a/cipher/rijndael-vaes.c b/cipher/rijndael-vaes.c new file mode 100644 index 00000000..56afce17 --- /dev/null +++ b/cipher/rijndael-vaes.c @@ -0,0 +1,176 @@ +/* VAES/AVX2 accelerated AES for Libgcrypt + * Copyright (C) 2021 Jussi Kivilinna <[hidden email]> + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ * + */ + +#include <config.h> +#include <stdio.h> +#include <stdlib.h> + +#include "types.h" /* for byte and u32 typedefs */ +#include "g10lib.h" +#include "cipher.h" +#include "bufhelp.h" +#include "cipher-selftest.h" +#include "rijndael-internal.h" +#include "./cipher-internal.h" + + +#ifdef USE_VAES + + +# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS +# define ASM_FUNC_ABI __attribute__((sysv_abi)) +# else +# define ASM_FUNC_ABI +# endif + + +extern void _gcry_aes_aesni_prepare_decryption(RIJNDAEL_context *ctx); + + +extern void _gcry_vaes_avx2_cbc_dec_amd64 (const void *keysched, + unsigned char *iv, + void *outbuf_arg, + const void *inbuf_arg, + size_t nblocks, + unsigned int nrounds) ASM_FUNC_ABI; + +extern void _gcry_vaes_avx2_cfb_dec_amd64 (const void *keysched, + unsigned char *iv, + void *outbuf_arg, + const void *inbuf_arg, + size_t nblocks, + unsigned int nrounds) ASM_FUNC_ABI; + +extern void _gcry_vaes_avx2_ctr_enc_amd64 (const void *keysched, + unsigned char *ctr, + void *outbuf_arg, + const void *inbuf_arg, + size_t nblocks, + unsigned int nrounds) ASM_FUNC_ABI; + +extern void _gcry_vaes_avx2_ocb_crypt_amd64 (const void *keysched, + unsigned int blkn, + void *outbuf_arg, + const void *inbuf_arg, + size_t nblocks, + unsigned int nrounds, + unsigned char *offset, + unsigned char *checksum, + unsigned char *L_table, + int encrypt) ASM_FUNC_ABI; + +extern void _gcry_vaes_avx2_xts_crypt_amd64 (const void *keysched, + unsigned char *tweak, + void *outbuf_arg, + const void *inbuf_arg, + size_t nblocks, + unsigned int nrounds, + int encrypt) ASM_FUNC_ABI; + + +void +_gcry_aes_vaes_cbc_dec (void *context, unsigned char *iv, + void *outbuf, const void *inbuf, + size_t nblocks) +{ + RIJNDAEL_context *ctx = context; + const void *keysched = ctx->keyschdec32; + unsigned int nrounds = ctx->rounds; + + if (!ctx->decryption_prepared) + { + _gcry_aes_aesni_prepare_decryption (ctx); + ctx->decryption_prepared = 1; + } + + _gcry_vaes_avx2_cbc_dec_amd64 (keysched, iv, outbuf, inbuf, nblocks, nrounds); +} + +void +_gcry_aes_vaes_cfb_dec (void *context, unsigned char *iv, + void *outbuf, const void *inbuf, + size_t nblocks) +{ + RIJNDAEL_context *ctx = context; + const void *keysched = ctx->keyschenc32; + unsigned int nrounds = ctx->rounds; + + _gcry_vaes_avx2_cfb_dec_amd64 (keysched, iv, outbuf, inbuf, nblocks, nrounds); +} + +void +_gcry_aes_vaes_ctr_enc (void *context, unsigned char *iv, + void *outbuf, const void *inbuf, + size_t nblocks) +{ + RIJNDAEL_context *ctx = context; + const void *keysched = ctx->keyschenc32; + unsigned int nrounds = ctx->rounds; + + _gcry_vaes_avx2_ctr_enc_amd64 (keysched, iv, outbuf, inbuf, nblocks, nrounds); +} + +size_t +_gcry_aes_vaes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt) +{ + RIJNDAEL_context *ctx = (void *)&c->context.c; + const void *keysched = encrypt ? 
ctx->keyschenc32 : ctx->keyschdec32; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + unsigned int nrounds = ctx->rounds; + u64 blkn = c->u_mode.ocb.data_nblocks; + + if (!encrypt && !ctx->decryption_prepared) + { + _gcry_aes_aesni_prepare_decryption (ctx); + ctx->decryption_prepared = 1; + } + + c->u_mode.ocb.data_nblocks = blkn + nblocks; + + _gcry_vaes_avx2_ocb_crypt_amd64 (keysched, (unsigned int)blkn, outbuf, inbuf, + nblocks, nrounds, c->u_iv.iv, c->u_ctr.ctr, + c->u_mode.ocb.L[0], encrypt); + + return 0; +} + +void +_gcry_aes_vaes_xts_crypt (void *context, unsigned char *tweak, + void *outbuf, const void *inbuf, + size_t nblocks, int encrypt) +{ + RIJNDAEL_context *ctx = context; + const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32; + unsigned int nrounds = ctx->rounds; + + if (!encrypt && !ctx->decryption_prepared) + { + _gcry_aes_aesni_prepare_decryption (ctx); + ctx->decryption_prepared = 1; + } + + _gcry_vaes_avx2_xts_crypt_amd64 (keysched, tweak, outbuf, inbuf, nblocks, + nrounds, encrypt); +} + +#endif /* USE_VAES */ diff --git a/cipher/rijndael.c b/cipher/rijndael.c index fe137327..0b529030 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -102,6 +102,26 @@ extern void _gcry_aes_aesni_xts_crypt (void *context, unsigned char *tweak, size_t nblocks, int encrypt); #endif +#ifdef USE_VAES +/* VAES (AMD64) accelerated implementation of AES */ + +extern void _gcry_aes_vaes_cfb_dec (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); +extern void _gcry_aes_vaes_cbc_dec (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); +extern void _gcry_aes_vaes_ctr_enc (void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); +extern size_t _gcry_aes_vaes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt); +extern void _gcry_aes_vaes_xts_crypt (void *context, unsigned char *tweak, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int encrypt); +#endif + #ifdef USE_SSSE3 /* SSSE3 (AMD64) vector permutation implementation of AES */ extern void _gcry_aes_ssse3_do_setkey(RIJNDAEL_context *ctx, const byte *key); @@ -480,6 +500,19 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen, bulk_ops->ocb_crypt = _gcry_aes_aesni_ocb_crypt; bulk_ops->ocb_auth = _gcry_aes_aesni_ocb_auth; bulk_ops->xts_crypt = _gcry_aes_aesni_xts_crypt; + +#ifdef USE_VAES + if ((hwfeatures & HWF_INTEL_VAES_VPCLMUL) && + (hwfeatures & HWF_INTEL_AVX2)) + { + /* Setup VAES bulk encryption routines. 
*/ + bulk_ops->cfb_dec = _gcry_aes_vaes_cfb_dec; + bulk_ops->cbc_dec = _gcry_aes_vaes_cbc_dec; + bulk_ops->ctr_enc = _gcry_aes_vaes_ctr_enc; + bulk_ops->ocb_crypt = _gcry_aes_vaes_ocb_crypt; + bulk_ops->xts_crypt = _gcry_aes_vaes_xts_crypt; + } +#endif } #endif #ifdef USE_PADLOCK @@ -1644,7 +1677,11 @@ selftest_basic_256 (void) static const char* selftest_ctr_128 (void) { +#ifdef USE_VAES + const int nblocks = 16+1; +#else const int nblocks = 8+1; +#endif const int blocksize = BLOCKSIZE; const int context_size = sizeof(RIJNDAEL_context); @@ -1658,7 +1695,11 @@ selftest_ctr_128 (void) static const char* selftest_cbc_128 (void) { +#ifdef USE_VAES + const int nblocks = 16+2; +#else const int nblocks = 8+2; +#endif const int blocksize = BLOCKSIZE; const int context_size = sizeof(RIJNDAEL_context); @@ -1672,7 +1713,11 @@ selftest_cbc_128 (void) static const char* selftest_cfb_128 (void) { +#ifdef USE_VAES + const int nblocks = 16+2; +#else const int nblocks = 8+2; +#endif const int blocksize = BLOCKSIZE; const int context_size = sizeof(RIJNDAEL_context); -- 2.27.0 _______________________________________________ Gcrypt-devel mailing list [hidden email] http://lists.gnupg.org/mailman/listinfo/gcrypt-devel |
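For reference, the _gcry_vaes_avx2_ocb_checksum helper in the assembly above only accumulates the OCB checksum: it XOR-folds every plaintext block into the 16-byte value whose address is passed in %rax, using wide ymm accumulators (up to 32 blocks per iteration) before folding down to a single xmm register. A scalar C sketch of the same computation (illustration only, not part of the patch; the function name is hypothetical):

#include <stddef.h>

static void
ocb_checksum_ref (unsigned char checksum[16],
                  const unsigned char *plaintext, size_t nblocks)
{
  /* checksum ^= P[0] ^ P[1] ^ ... ^ P[nblocks-1], 16-byte blocks. */
  while (nblocks--)
    {
      for (int i = 0; i < 16; i++)
        checksum[i] ^= plaintext[i];
      plaintext += 16;
    }
}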
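The unaligned OCB path advances the offset one block at a time with the usual Offset_i = Offset_{i-1} xor L[ntz(i)] rule; that is what the leal/tzcntl/shll/vpxor sequences implement, indexing the L-table in 16-byte steps. A C sketch of the per-block update (illustration only; it assumes the L entries are laid out as consecutive 16-byte blocks, which is how the assembly addresses them, and uses a GCC builtin for the trailing-zero count):

#include <stdint.h>

static void
ocb_next_offset (unsigned char offset[16],
                 const unsigned char (*L)[16], uint64_t blkn)
{
  /* blkn is the 1-based index of the block being processed. */
  unsigned int ntz = __builtin_ctzll (blkn);

  for (int i = 0; i < 16; i++)
    offset[i] ^= L[ntz][i];
}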
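The aligned 16-block path relies on the fact that once nblk is a multiple of 16, ntz(nblk + k) for k = 1..15 depends only on k, so the L[0]..L[3] XOR combinations listed in the in-line comment can be precomputed once per call and kept on the stack; only the 16th block of each chunk still needs a tzcnt. A quick stand-alone check of that property (illustration only):

#include <stdio.h>

int
main (void)
{
  /* For nblk aligned to 16 the low four bits of nblk + k equal k,
     and 1 <= k <= 15 keeps the trailing-zero count below 4.  */
  for (unsigned int nblk = 0; nblk <= 1024; nblk += 16)
    for (unsigned int k = 1; k < 16; k++)
      if (__builtin_ctz (nblk + k) != __builtin_ctz (k))
        printf ("mismatch at block %u\n", nblk + k);
  return 0;   /* prints nothing */
}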
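The tweak_clmul macro in the XTS routine multiplies the running tweak by x^1 through x^8 in GF(2^128) in one step, using vpclmulqdq with the 0x87 constant from .Lxts_gfmul_clmul to fold the carries back in. The per-block scalar operation it batches is the standard XTS tweak doubling (reference sketch only, not constant-time and not part of the patch):

static void
xts_mul_x (unsigned char tweak[16])
{
  /* Treat the tweak as a little-endian 128-bit value; multiply by x
     modulo x^128 + x^7 + x^2 + x + 1 (the 0x87 reduction).  */
  unsigned int carry = tweak[15] >> 7;
  int i;

  for (i = 15; i > 0; i--)
    tweak[i] = (tweak[i] << 1) | (tweak[i - 1] >> 7);
  tweak[0] = (tweak[0] << 1) ^ (carry ? 0x87 : 0);
}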
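Throughout the assembly, the recurring "cmpl $12, %r9d / jb ... / jz ..." pattern dispatches on the round count in %r9d: below 12 rounds means AES-128, exactly 12 means AES-192, otherwise AES-256, and the last round key always sits at nrounds * 16 bytes into the schedule (the .Llast_key_ptr block in the OCB entry computes the same offset). The C glue passes ctx->rounds for this; the mapping from key size is (sketch only, hypothetical helper names):

static unsigned int
aes_nrounds (unsigned int keybits)     /* 128, 192 or 256 */
{
  return keybits / 32 + 6;             /* 10, 12 or 14 rounds */
}

static const unsigned char *
aes_last_round_key (const unsigned char *keysched, unsigned int nrounds)
{
  return keysched + nrounds * 16;
}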