[PATCH 1/2] configure.ac: run assembler checks through linker for better LTO support


Jussi Kivilinna
* configure.ac (gcry_cv_gcc_arm_platform_as_ok)
(gcry_cv_gcc_aarch64_platform_as_ok)
(gcry_cv_gcc_inline_asm_ssse3, gcry_cv_gcc_inline_asm_pclmul)
(gcry_cv_gcc_inline_asm_shaext, gcry_cv_gcc_inline_asm_sse41)
(gcry_cv_gcc_inline_asm_avx, gcry_cv_gcc_inline_asm_avx2)
(gcry_cv_gcc_inline_asm_bmi2, gcry_cv_gcc_as_const_division_ok)
(gcry_cv_gcc_as_const_division_with_wadivide_ok)
(gcry_cv_gcc_amd64_platform_as_ok, gcry_cv_gcc_win64_platform_as_ok)
(gcry_cv_gcc_platform_as_ok_for_intel_syntax)
(gcry_cv_gcc_inline_asm_neon, gcry_cv_gcc_inline_asm_aarch32_crypto)
(gcry_cv_gcc_inline_asm_aarch64_neon)
(gcry_cv_gcc_inline_asm_aarch64_crypto)
(gcry_cv_gcc_inline_asm_ppc_altivec)
(gcry_cv_gcc_inline_asm_ppc_arch_3_00)
(gcry_cv_gcc_inline_asm_s390x, gcry_cv_gcc_inline_asm_s390x_vx): Use
AC_LINK_IFELSE check instead of AC_COMPILE_IFELSE.
--

LTO may defer assembly checking to the link stage, so we need to use
AC_LINK_IFELSE instead of AC_COMPILE_IFELSE for these checks.
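
To illustrate the pattern (a minimal sketch with hypothetical cache-variable
and symbol names, not taken verbatim from configure.ac): the assembly goes
into the AC_LANG_PROGRAM prologue with an explicit ".text" section and a
label, and the program body calls that label, so the fragment still reaches
the assembler and linker even when LTO makes the compile step emit only IR:

  # Sketch only; 'gcry_cv_example_asm_ok' and 'examplefn' are illustrative.
  AC_CACHE_CHECK([whether the assembler accepts the test code],
        [gcry_cv_example_asm_ok],
        [gcry_cv_example_asm_ok=no
         AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[__asm__(".text\n\t"        /* assemble into the text section */
                     "examplefn:\n\t"   /* symbol referenced from the body */
                     "nop\n\t");]],
           [ examplefn(); ])],
           [gcry_cv_example_asm_ok=yes])])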

Signed-off-by: Jussi Kivilinna <[hidden email]>
---
 configure.ac | 111 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 65 insertions(+), 46 deletions(-)

diff --git a/configure.ac b/configure.ac
index 97abcf54..f7339a3e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1203,11 +1203,12 @@ AC_CACHE_CHECK([whether GCC assembler is compatible for ARM assembly implementat
           gcry_cv_gcc_arm_platform_as_ok="n/a"
         else
           gcry_cv_gcc_arm_platform_as_ok=no
-          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
             [[__asm__(
                 /* Test if assembler supports UAL syntax.  */
                 ".syntax unified\n\t"
                 ".arm\n\t" /* our assembly code is in ARM mode  */
+                ".text\n\t"
                 /* Following causes error if assembler ignored '.syntax unified'.  */
                 "asmfunc:\n\t"
                 "add %r0, %r0, %r4, ror #12;\n\t"
@@ -1215,7 +1216,7 @@ AC_CACHE_CHECK([whether GCC assembler is compatible for ARM assembly implementat
                 /* Test if '.type' and '.size' are supported.  */
                 ".size asmfunc,.-asmfunc;\n\t"
                 ".type asmfunc,%function;\n\t"
-              );]])],
+              );]], [ asmfunc(); ] )],
             [gcry_cv_gcc_arm_platform_as_ok=yes])
         fi])
 if test "$gcry_cv_gcc_arm_platform_as_ok" = "yes" ; then
@@ -1235,13 +1236,14 @@ AC_CACHE_CHECK([whether GCC assembler is compatible for ARMv8/Aarch64 assembly i
           gcry_cv_gcc_aarch64_platform_as_ok="n/a"
         else
           gcry_cv_gcc_aarch64_platform_as_ok=no
-          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
             [[__asm__(
+                ".text\n\t"
                 "asmfunc:\n\t"
                 "eor x0, x0, x30, ror #12;\n\t"
                 "add x0, x0, x30, asr #12;\n\t"
                 "eor v0.16b, v0.16b, v31.16b;\n\t"
-              );]])],
+              );]], [ asmfunc(); ] )],
             [gcry_cv_gcc_aarch64_platform_as_ok=yes])
         fi])
 if test "$gcry_cv_gcc_aarch64_platform_as_ok" = "yes" ; then
@@ -1287,6 +1289,7 @@ AC_CACHE_CHECK([whether GCC assembler supports for ELF directives],
         AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[__asm__(
                 /* Test if ELF directives '.type' and '.size' are supported. */
+                ".text\n\t"
                 "asmfunc:\n\t"
                 ".size asmfunc,.-asmfunc;\n\t"
                 ".type asmfunc,STT_FUNC;\n\t"
@@ -1474,12 +1477,12 @@ AC_CACHE_CHECK([whether GCC inline assembler supports SSSE3 instructions],
           gcry_cv_gcc_inline_asm_ssse3="n/a"
         else
           gcry_cv_gcc_inline_asm_ssse3=no
-          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[static unsigned char be_mask[16] __attribute__ ((aligned (16))) =
               { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
             void a(void) {
               __asm__("pshufb %[mask], %%xmm2\n\t"::[mask]"m"(*be_mask):);
-            }]])],
+            }]], [ a(); ] )],
           [gcry_cv_gcc_inline_asm_ssse3=yes])
         fi])
 if test "$gcry_cv_gcc_inline_asm_ssse3" = "yes" ; then
@@ -1498,10 +1501,10 @@ AC_CACHE_CHECK([whether GCC inline assembler supports PCLMUL instructions],
           gcry_cv_gcc_inline_asm_pclmul="n/a"
         else
           gcry_cv_gcc_inline_asm_pclmul=no
-          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[void a(void) {
               __asm__("pclmulqdq \$0, %%xmm1, %%xmm3\n\t":::"cc");
-            }]])],
+            }]], [ a(); ] )],
           [gcry_cv_gcc_inline_asm_pclmul=yes])
         fi])
 if test "$gcry_cv_gcc_inline_asm_pclmul" = "yes" ; then
@@ -1520,7 +1523,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports SHA Extensions instruction
           gcry_cv_gcc_inline_asm_shaext="n/a"
         else
           gcry_cv_gcc_inline_asm_shaext=no
-          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[void a(void) {
               __asm__("sha1rnds4 \$0, %%xmm1, %%xmm3\n\t":::"cc");
               __asm__("sha1nexte %%xmm1, %%xmm3\n\t":::"cc");
@@ -1529,7 +1532,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports SHA Extensions instruction
               __asm__("sha256rnds2 %%xmm0, %%xmm1, %%xmm3\n\t":::"cc");
               __asm__("sha256msg1 %%xmm1, %%xmm3\n\t":::"cc");
               __asm__("sha256msg2 %%xmm1, %%xmm3\n\t":::"cc");
-            }]])],
+            }]], [ a(); ] )],
           [gcry_cv_gcc_inline_asm_shaext=yes])
         fi])
 if test "$gcry_cv_gcc_inline_asm_shaext" = "yes" ; then
@@ -1548,11 +1551,11 @@ AC_CACHE_CHECK([whether GCC inline assembler supports SSE4.1 instructions],
           gcry_cv_gcc_inline_asm_sse41="n/a"
         else
           gcry_cv_gcc_inline_asm_sse41=no
-          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[void a(void) {
               int i;
               __asm__("pextrd \$2, %%xmm0, %[out]\n\t" : [out] "=m" (i));
-            }]])],
+            }]], [ a(); ] )],
           [gcry_cv_gcc_inline_asm_sse41=yes])
         fi])
 if test "$gcry_cv_gcc_inline_asm_sse41" = "yes" ; then
@@ -1571,10 +1574,10 @@ AC_CACHE_CHECK([whether GCC inline assembler supports AVX instructions],
           gcry_cv_gcc_inline_asm_avx="n/a"
         else
           gcry_cv_gcc_inline_asm_avx=no
-          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[void a(void) {
               __asm__("xgetbv; vaesdeclast (%[mem]),%%xmm0,%%xmm7\n\t"::[mem]"r"(0):);
-            }]])],
+            }]], [ a(); ] )],
           [gcry_cv_gcc_inline_asm_avx=yes])
         fi])
 if test "$gcry_cv_gcc_inline_asm_avx" = "yes" ; then
@@ -1593,10 +1596,10 @@ AC_CACHE_CHECK([whether GCC inline assembler supports AVX2 instructions],
           gcry_cv_gcc_inline_asm_avx2="n/a"
         else
           gcry_cv_gcc_inline_asm_avx2=no
-          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[void a(void) {
               __asm__("xgetbv; vpbroadcastb %%xmm7,%%ymm1\n\t":::"cc");
-            }]])],
+            }]], [ a(); ] )],
           [gcry_cv_gcc_inline_asm_avx2=yes])
         fi])
 if test "$gcry_cv_gcc_inline_asm_avx2" = "yes" ; then
@@ -1615,7 +1618,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports BMI2 instructions],
           gcry_cv_gcc_inline_asm_bmi2="n/a"
         else
           gcry_cv_gcc_inline_asm_bmi2=no
-          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[unsigned int a(unsigned int x, unsigned int y) {
               unsigned int tmp1, tmp2;
               asm ("rorxl %2, %1, %0"
@@ -1625,7 +1628,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports BMI2 instructions],
                    : "=r" (tmp2)
                    : "r0" (x), "rm" (y));
               return tmp1 + tmp2;
-            }]])],
+            }]], [ a(1, 2); ] )],
           [gcry_cv_gcc_inline_asm_bmi2=yes])
         fi])
 if test "$gcry_cv_gcc_inline_asm_bmi2" = "yes" ; then
@@ -1642,8 +1645,9 @@ if test $amd64_as_feature_detection = yes; then
   AC_CACHE_CHECK([whether GCC assembler handles division correctly],
        [gcry_cv_gcc_as_const_division_ok],
        [gcry_cv_gcc_as_const_division_ok=no
-        AC_COMPILE_IFELSE([AC_LANG_SOURCE(
-          [[__asm__("xorl \$(123456789/12345678), %ebp;\n\t");]])],
+        AC_LINK_IFELSE([AC_LANG_PROGRAM(
+          [[__asm__(".text\n\tfn:\n\t xorl \$(123456789/12345678), %ebp;\n\t");]],
+            [fn();])],
           [gcry_cv_gcc_as_const_division_ok=yes])])
   if test "$gcry_cv_gcc_as_const_division_ok" = "no" ; then
     #
@@ -1654,8 +1658,9 @@ if test $amd64_as_feature_detection = yes; then
     AC_CACHE_CHECK([whether GCC assembler handles division correctly with "-Wa,--divide"],
          [gcry_cv_gcc_as_const_division_with_wadivide_ok],
          [gcry_cv_gcc_as_const_division_with_wadivide_ok=no
-          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
-            [[__asm__("xorl \$(123456789/12345678), %ebp;\n\t");]])],
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
+            [[__asm__(".text\n\tfn:\n\t xorl \$(123456789/12345678), %ebp;\n\t");]],
+              [fn();])],
             [gcry_cv_gcc_as_const_division_with_wadivide_ok=yes])])
     if test "$gcry_cv_gcc_as_const_division_with_wadivide_ok" = "no" ; then
       # '-Wa,--divide' did not work, restore old flags.
@@ -1677,10 +1682,11 @@ if test $amd64_as_feature_detection = yes; then
           gcry_cv_gcc_amd64_platform_as_ok="n/a"
         else
           gcry_cv_gcc_amd64_platform_as_ok=no
-          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[__asm__(
                 /* Test if '.type' and '.size' are supported.  */
                 /* These work only on ELF targets. */
+                ".text\n\t"
  "asmfunc:\n\t"
                 ".size asmfunc,.-asmfunc;\n\t"
                 ".type asmfunc,@function;\n\t"
@@ -1689,7 +1695,7 @@ if test $amd64_as_feature_detection = yes; then
  * and "-Wa,--divide" workaround failed, this causes assembly
  * to be disable on this machine. */
  "xorl \$(123456789/12345678), %ebp;\n\t"
-            );]])],
+            );]], [ asmfunc(); ])],
           [gcry_cv_gcc_amd64_platform_as_ok=yes])
         fi])
   if test "$gcry_cv_gcc_amd64_platform_as_ok" = "yes" ; then
@@ -1702,12 +1708,13 @@ if test $amd64_as_feature_detection = yes; then
     AC_CACHE_CHECK([whether GCC assembler is compatible for WIN64 assembly implementations],
       [gcry_cv_gcc_win64_platform_as_ok],
       [gcry_cv_gcc_win64_platform_as_ok=no
-      AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+      AC_LINK_IFELSE([AC_LANG_PROGRAM(
         [[__asm__(
+              ".text\n\t"
               ".globl asmfunc\n\t"
               "asmfunc:\n\t"
               "xorq \$(1234), %rbp;\n\t"
-          );]])],
+          );]], [ asmfunc(); ])],
         [gcry_cv_gcc_win64_platform_as_ok=yes])])
     if test "$gcry_cv_gcc_win64_platform_as_ok" = "yes" ; then
       AC_DEFINE(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS,1,
@@ -1728,9 +1735,11 @@ AC_CACHE_CHECK([whether GCC assembler is compatible for Intel syntax assembly im
           gcry_cv_gcc_platform_as_ok_for_intel_syntax="n/a"
         else
           gcry_cv_gcc_platform_as_ok_for_intel_syntax=no
-          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[__asm__(
                 ".intel_syntax noprefix\n\t"
+                ".text\n\t"
+                "actest:\n\t"
                 "pxor xmm1, xmm7;\n\t"
                 /* Intel syntax implementation also use GAS macros, so check
                  * for them here. */
@@ -1747,7 +1756,8 @@ AC_CACHE_CHECK([whether GCC assembler is compatible for Intel syntax assembly im
                 "SET_VAL_B ebp\n\t"
                 "add VAL_A, VAL_B;\n\t"
                 "add VAL_B, 0b10101;\n\t"
-            );]])],
+                ".att_syntax prefix\n\t"
+            );]], [ actest(); ])],
           [gcry_cv_gcc_platform_as_ok_for_intel_syntax=yes])
         fi])
 if test "$gcry_cv_gcc_platform_as_ok_for_intel_syntax" = "yes" ; then
@@ -1800,17 +1810,19 @@ AC_CACHE_CHECK([whether GCC inline assembler supports NEON instructions],
           gcry_cv_gcc_inline_asm_neon="n/a"
         else
           gcry_cv_gcc_inline_asm_neon=no
-          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[__asm__(
                 ".syntax unified\n\t"
                 ".arm\n\t"
                 ".fpu neon\n\t"
+                ".text\n\t"
+                "testfn:\n\t"
                 "vld1.64 {%q0-%q1}, [%r0]!;\n\t"
                 "vrev64.8 %q0, %q3;\n\t"
                 "vadd.u64 %q0, %q1;\n\t"
                 "vadd.s64 %d3, %d2, %d3;\n\t"
                 );
-            ]])],
+            ]], [ testfn(); ])],
           [gcry_cv_gcc_inline_asm_neon=yes])
         fi])
 if test "$gcry_cv_gcc_inline_asm_neon" = "yes" ; then
@@ -1829,13 +1841,15 @@ AC_CACHE_CHECK([whether GCC inline assembler supports AArch32 Crypto Extension i
           gcry_cv_gcc_inline_asm_aarch32_crypto="n/a"
         else
           gcry_cv_gcc_inline_asm_aarch32_crypto=no
-          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[__asm__(
                 ".syntax unified\n\t"
                 ".arch armv8-a\n\t"
                 ".arm\n\t"
                 ".fpu crypto-neon-fp-armv8\n\t"
+                ".text\n\t"
 
+                "testfn:\n\t"
                 "sha1h.32 q0, q0;\n\t"
                 "sha1c.32 q0, q0, q0;\n\t"
                 "sha1p.32 q0, q0, q0;\n\t"
@@ -1855,7 +1869,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports AArch32 Crypto Extension i
 
                 "vmull.p64 q0, d0, d0;\n\t"
                 );
-            ]])],
+            ]], [ testfn(); ])],
           [gcry_cv_gcc_inline_asm_aarch32_crypto=yes])
         fi])
 if test "$gcry_cv_gcc_inline_asm_aarch32_crypto" = "yes" ; then
@@ -1874,14 +1888,16 @@ AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 NEON instructions]
           gcry_cv_gcc_inline_asm_aarch64_neon="n/a"
         else
           gcry_cv_gcc_inline_asm_aarch64_neon=no
-          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[__asm__(
                 ".cpu generic+simd\n\t"
+                ".text\n\t"
+                "testfn:\n\t"
                 "mov w0, \#42;\n\t"
                 "dup v0.8b, w0;\n\t"
                 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b},[x0],\#32;\n\t"
                 );
-            ]])],
+            ]], [ testfn(); ])],
           [gcry_cv_gcc_inline_asm_aarch64_neon=yes])
         fi])
 if test "$gcry_cv_gcc_inline_asm_aarch64_neon" = "yes" ; then
@@ -1900,10 +1916,11 @@ AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 Crypto Extension i
           gcry_cv_gcc_inline_asm_aarch64_crypto="n/a"
         else
           gcry_cv_gcc_inline_asm_aarch64_crypto=no
-          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[__asm__(
                 ".cpu generic+simd+crypto\n\t"
-
+                ".text\n\t"
+                "testfn:\n\t"
                 "mov w0, \#42;\n\t"
                 "dup v0.8b, w0;\n\t"
                 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b},[x0],\#32;\n\t"
@@ -1928,7 +1945,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 Crypto Extension i
                 "pmull v0.1q, v0.1d, v31.1d;\n\t"
                 "pmull2 v0.1q, v0.2d, v31.2d;\n\t"
                 );
-            ]])],
+            ]], [ testfn(); ])],
           [gcry_cv_gcc_inline_asm_aarch64_crypto=yes])
         fi])
 if test "$gcry_cv_gcc_inline_asm_aarch64_crypto" = "yes" ; then
@@ -2010,8 +2027,9 @@ AC_CACHE_CHECK([whether GCC inline assembler supports PowerPC AltiVec/VSX/crypto
           gcry_cv_gcc_inline_asm_ppc_altivec="n/a"
         else
           gcry_cv_gcc_inline_asm_ppc_altivec=no
-          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[__asm__(".globl testfn;\n"
+                    ".text\n\t"
     "testfn:\n"
     "stvx %v31,%r12,%r0;\n"
     "lvx  %v20,%r12,%r0;\n"
@@ -2022,7 +2040,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports PowerPC AltiVec/VSX/crypto
     "vshasigmad %v0, %v1, 0, 15;\n"
     "vpmsumd %v11, %v11, %v11;\n"
   );
-            ]])],
+            ]], [ testfn(); ] )],
           [gcry_cv_gcc_inline_asm_ppc_altivec=yes])
         fi])
 if test "$gcry_cv_gcc_inline_asm_ppc_altivec" = "yes" ; then
@@ -2041,12 +2059,13 @@ AC_CACHE_CHECK([whether GCC inline assembler supports PowerISA 3.00 instructions
           gcry_cv_gcc_inline_asm_ppc_arch_3_00="n/a"
         else
           gcry_cv_gcc_inline_asm_ppc_arch_3_00=no
-          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
-          [[__asm__(".globl testfn;\n"
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
+          [[__asm__(".text\n\t"
+    ".globl testfn;\n"
     "testfn:\n"
     "stxvb16x %r1,%v12,%v30;\n"
   );
-            ]])],
+            ]], [ testfn(); ])],
           [gcry_cv_gcc_inline_asm_ppc_arch_3_00=yes])
         fi])
 if test "$gcry_cv_gcc_inline_asm_ppc_arch_3_00" = "yes" ; then
@@ -2065,7 +2084,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports zSeries instructions],
   gcry_cv_gcc_inline_asm_s390x="n/a"
  else
   gcry_cv_gcc_inline_asm_s390x=no
-  AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+  AC_LINK_IFELSE([AC_LANG_PROGRAM(
   [[typedef unsigned int u128_t __attribute__ ((mode (TI)));
     unsigned int testfunc(unsigned int x, void *y, unsigned int z)
     {
@@ -2106,7 +2125,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports zSeries instructions],
     : "memory", "r14");
       return (unsigned int)r1 ^ reg0;
     }
-    ]])],
+    ]] , [ testfunc(0, 0, 0); ])],
   [gcry_cv_gcc_inline_asm_s390x=yes])
  fi])
 if test "$gcry_cv_gcc_inline_asm_s390x" = "yes" ; then
@@ -2126,7 +2145,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports zSeries vector instruction
  else
   gcry_cv_gcc_inline_asm_s390x_vx=no
   if test "$gcry_cv_gcc_inline_asm_s390x" = "yes" ; then
-    AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+    AC_LINK_IFELSE([AC_LANG_PROGRAM(
     [[void testfunc(void)
       {
  asm volatile (".machine \"z13+vx\"\n\t"
@@ -2136,7 +2155,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports zSeries vector instruction
       :
       : "memory");
       }
-      ]])],
+      ]], [ testfunc(); ])],
     [gcry_cv_gcc_inline_asm_s390x_vx=yes])
   fi
  fi])
--
2.27.0



[PATCH 2/2] sha512/sha256: remove assembler macros from AMD64 implementations

Jussi Kivilinna
* configure.ac (gcry_cv_gcc_platform_as_ok_for_intel_syntax): Remove
assembler macro check from Intel syntax assembly support check.
* cipher/sha256-avx-amd64.S: Replace assembler macros with C
preprocessor counterparts.
* cipher/sha256-avx2-bmi2-amd64.S: Ditto.
* cipher/sha256-ssse3-amd64.S: Ditto.
* cipher/sha512-avx-amd64.S: Ditto.
* cipher/sha512-avx2-bmi2-amd64.S: Ditto.
* cipher/sha512-ssse3-amd64.S: Ditto.
--

Removing GNU assembler macros allows building these implementations with
clang.
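
The conversion follows the same pattern in every file, as the diff below
shows: ".macro"/".endm" definitions become C preprocessor macros (the .S
files are already run through cpp), and the old register-rotation trick,
where ROTATE_ARGS/rotate_Xs reassigned the GAS symbols a..h and X0..X3
between rounds, is replaced by permuting the names passed as macro
arguments at each call site. A minimal sketch of the idea (illustrative
round body, not copied from any one file):

  /* Before: GNU as macro; 'h' is a GAS symbol that ROTATE_ARGS rebinds
   * after every round, one of the constructs that kept clang's integrated
   * assembler from building these files. */
  .macro EXAMPLE_ROUND i
      add h, [rsp + \i * 4]
  .endm

  /* After: C preprocessor macro; the rotation is written out explicitly
   * by permuting the register arguments at each call site. */
  #define EXAMPLE_ROUND(i, a, b, c, d, e, f, g, h) \
      add h, [rsp + i * 4];

      EXAMPLE_ROUND(0, a, b, c, d, e, f, g, h)
      EXAMPLE_ROUND(1, h, a, b, c, d, e, f, g) /* old ROTATE_ARGS step, now explicit */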

GnuPG-bug-id: 5255
Signed-off-by: Jussi Kivilinna <[hidden email]>
---
 cipher/sha256-avx-amd64.S       | 516 +++++++++++++++----------------
 cipher/sha256-avx2-bmi2-amd64.S | 421 +++++++++++--------------
 cipher/sha256-ssse3-amd64.S     | 529 +++++++++++++++-----------------
 cipher/sha512-avx-amd64.S       | 456 ++++++++++++++-------------
 cipher/sha512-avx2-bmi2-amd64.S | 498 +++++++++++++-----------------
 cipher/sha512-ssse3-amd64.S     | 455 ++++++++++++++-------------
 configure.ac                    |  20 +-
 7 files changed, 1387 insertions(+), 1508 deletions(-)

diff --git a/cipher/sha256-avx-amd64.S b/cipher/sha256-avx-amd64.S
index 77143ff0..ec945f84 100644
--- a/cipher/sha256-avx-amd64.S
+++ b/cipher/sha256-avx-amd64.S
@@ -65,67 +65,64 @@
 
 #define VMOVDQ vmovdqu /* assume buffers not aligned */
 
-.macro ROR p1 p2
- /* shld is faster than ror on Intel Sandybridge */
- shld \p1, \p1, (32 - \p2)
-.endm
+#define ROR(p1, p2) \
+ /* shld is faster than ror on Intel Sandybridge */ \
+ shld p1, p1, (32 - p2);
 
 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros*/
 
 /* addm [mem], reg
  * Add reg to mem using reg-mem add and store */
-.macro addm p1 p2
- add \p2, \p1
- mov \p1, \p2
-.endm
+#define addm(p1, p2) \
+ add p2, p1; \
+ mov p1, p2;
 
 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
 
 /* COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
  * Load xmm with mem and byte swap each dword */
-.macro COPY_XMM_AND_BSWAP p1 p2 p3
- VMOVDQ \p1, \p2
- vpshufb \p1, \p1, \p3
-.endm
+#define COPY_XMM_AND_BSWAP(p1, p2, p3) \
+ VMOVDQ p1, p2; \
+ vpshufb p1, p1, p3;
 
 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
 
-X0 = xmm4
-X1 = xmm5
-X2 = xmm6
-X3 = xmm7
+#define X0 xmm4
+#define X1 xmm5
+#define X2 xmm6
+#define X3 xmm7
 
-XTMP0 = xmm0
-XTMP1 = xmm1
-XTMP2 = xmm2
-XTMP3 = xmm3
-XTMP4 = xmm8
-XFER  = xmm9
+#define XTMP0 xmm0
+#define XTMP1 xmm1
+#define XTMP2 xmm2
+#define XTMP3 xmm3
+#define XTMP4 xmm8
+#define XFER xmm9
 
-SHUF_00BA = xmm10 /* shuffle xBxA -> 00BA */
-SHUF_DC00 = xmm11 /* shuffle xDxC -> DC00 */
-BYTE_FLIP_MASK = xmm12
+#define SHUF_00BA xmm10 /* shuffle xBxA -> 00BA */
+#define SHUF_DC00 xmm11 /* shuffle xDxC -> DC00 */
+#define BYTE_FLIP_MASK xmm12
 
-NUM_BLKS = rdx /* 3rd arg */
-CTX = rsi /* 2nd arg */
-INP = rdi /* 1st arg */
+#define NUM_BLKS rdx /* 3rd arg */
+#define CTX rsi /* 2nd arg */
+#define INP rdi /* 1st arg */
 
-SRND = rdi /* clobbers INP */
-c = ecx
-d = r8d
-e = edx
+#define SRND rdi /* clobbers INP */
+#define c ecx
+#define d r8d
+#define e edx
 
-TBL = rbp
-a = eax
-b = ebx
+#define TBL rbp
+#define a eax
+#define b ebx
 
-f = r9d
-g = r10d
-h = r11d
+#define f r9d
+#define g r10d
+#define h r11d
 
-y0 = r13d
-y1 = r14d
-y2 = r15d
+#define y0 r13d
+#define y1 r14d
+#define y2 r15d
 
 
 
@@ -142,220 +139,197 @@ y2 = r15d
 #define _XMM_SAVE (_XFER     + _XFER_SIZE + _ALIGN_SIZE)
 #define STACK_SIZE (_XMM_SAVE + _XMM_SAVE_SIZE)
 
-/* rotate_Xs
- * Rotate values of symbols X0...X3 */
-.macro rotate_Xs
-X_ = X0
-X0 = X1
-X1 = X2
-X2 = X3
-X3 = X_
-.endm
-
-/* ROTATE_ARGS
- * Rotate values of symbols a...h */
-.macro ROTATE_ARGS
-TMP_ = h
-h = g
-g = f
-f = e
-e = d
-d = c
-c = b
-b = a
-a = TMP_
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED
- /* compute s0 four at a time and s1 two at a time
- * compute W[-16] + W[-7] 4 at a time */
- mov y0, e /* y0 = e */
- ROR y0, (25-11) /* y0 = e >> (25-11) */
- mov y1, a /* y1 = a */
- vpalignr XTMP0, X3, X2, 4 /* XTMP0 = W[-7] */
- ROR y1, (22-13) /* y1 = a >> (22-13) */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- mov y2, f /* y2 = f */
- ROR y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- xor y2, g /* y2 = f^g */
- vpaddd XTMP0, XTMP0, X0 /* XTMP0 = W[-7] + W[-16] */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- and y2, e /* y2 = (f^g)&e */
- ROR y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- /* compute s0 */
- vpalignr XTMP1, X1, X0, 4 /* XTMP1 = W[-15] */
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- ROR y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- ROR y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- add y2, y0 /* y2 = S1 + CH */
- add y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- vpslld XTMP2, XTMP1, (32-7)
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- vpsrld XTMP3, XTMP1, 7
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- vpor XTMP3, XTMP3, XTMP2 /* XTMP1 = W[-15] ror 7 */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+
+#define FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ /* compute s0 four at a time and s1 two at a time */; \
+ /* compute W[-16] + W[-7] 4 at a time */; \
+ mov y0, e /* y0 = e */; \
+ ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \
+ mov y1, a /* y1 = a */; \
+ vpalignr XTMP0, X3, X2, 4 /* XTMP0 = W[-7] */; \
+ ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ mov y2, f /* y2 = f */; \
+ ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ xor y2, g /* y2 = f^g */; \
+ vpaddd XTMP0, XTMP0, X0 /* XTMP0 = W[-7] + W[-16] */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ /* compute s0 */; \
+ vpalignr XTMP1, X1, X0, 4 /* XTMP1 = W[-15] */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ ROR( y0, 6) /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ add y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ vpslld XTMP2, XTMP1, (32-7); \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ vpsrld XTMP3, XTMP1, 7; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ vpor XTMP3, XTMP3, XTMP2 /* XTMP1 = W[-15] ror 7 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
  lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
 
-ROTATE_ARGS
- mov y0, e /* y0 = e */
- mov y1, a /* y1 = a */
- ROR y0, (25-11) /* y0 = e >> (25-11) */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- mov y2, f /* y2 = f */
- ROR y1, (22-13) /* y1 = a >> (22-13) */
- vpslld XTMP2, XTMP1, (32-18)
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- ROR y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- xor y2, g /* y2 = f^g */
- vpsrld XTMP4, XTMP1, 18
- ROR y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- and y2, e /* y2 = (f^g)&e */
- ROR y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- vpxor XTMP4, XTMP4, XTMP3
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- vpsrld XTMP1, XTMP1, 3 /* XTMP4 = W[-15] >> 3 */
- add y2, y0 /* y2 = S1 + CH */
- add y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */
- ROR y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- vpxor XTMP1, XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- vpxor XTMP1, XTMP1, XTMP4 /* XTMP1 = s0 */
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- /* compute low s1 */
- vpshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- vpaddd XTMP0, XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ mov y0, e /* y0 = e */; \
+ mov y1, a /* y1 = a */; \
+ ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ mov y2, f /* y2 = f */; \
+ ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \
+ vpslld XTMP2, XTMP1, (32-18); \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ xor y2, g /* y2 = f^g */; \
+ vpsrld XTMP4, XTMP1, 18; \
+ ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ ROR( y0, 6) /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ vpxor XTMP4, XTMP4, XTMP3; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ vpsrld XTMP1, XTMP1, 3 /* XTMP4 = W[-15] >> 3 */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ add y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */; \
+ ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ vpxor XTMP1, XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ vpxor XTMP1, XTMP1, XTMP4 /* XTMP1 = s0 */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ /* compute low s1 */; \
+ vpshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ vpaddd XTMP0, XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
  lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
 
-ROTATE_ARGS
- mov y0, e /* y0 = e */
- mov y1, a /* y1 = a */
- ROR y0, (25-11) /* y0 = e >> (25-11) */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- ROR y1, (22-13) /* y1 = a >> (22-13) */
- mov y2, f /* y2 = f */
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- ROR y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- vpsrlq XTMP3, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */
- xor y2, g /* y2 = f^g */
- vpsrlq XTMP4, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- and y2, e /* y2 = (f^g)&e */
- vpsrld XTMP2, XTMP2, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */
- ROR y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- ROR y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- vpxor XTMP2, XTMP2, XTMP3
- add y2, y0 /* y2 = S1 + CH */
- ROR y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- add y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */
- vpxor XTMP4, XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- vpshufb XTMP4, XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- vpaddd XTMP0, XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- /* compute high s1 */
- vpshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ mov y0, e /* y0 = e */; \
+ mov y1, a /* y1 = a */; \
+ ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \
+ mov y2, f /* y2 = f */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ vpsrlq XTMP3, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */; \
+ xor y2, g /* y2 = f^g */; \
+ vpsrlq XTMP4, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ vpsrld XTMP2, XTMP2, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */; \
+ ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ ROR( y0, 6) /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ vpxor XTMP2, XTMP2, XTMP3; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */; \
+ vpxor XTMP4, XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ vpshufb XTMP4, XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ vpaddd XTMP0, XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ /* compute high s1 */; \
+ vpshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
  lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
 
-ROTATE_ARGS
- mov y0, e /* y0 = e */
- ROR y0, (25-11) /* y0 = e >> (25-11) */
- mov y1, a /* y1 = a */
- ROR y1, (22-13) /* y1 = a >> (22-13) */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- mov y2, f /* y2 = f */
- ROR y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- vpsrlq XTMP3, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- xor y2, g /* y2 = f^g */
- vpsrlq X0, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- and y2, e /* y2 = (f^g)&e */
- ROR y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- vpsrld XTMP2, XTMP2,    10 /* X0 = W[-2] >> 10 {DDCC} */
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- ROR y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- vpxor XTMP2, XTMP2, XTMP3
- ROR y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- add y2, y0 /* y2 = S1 + CH */
- add y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */
- vpxor X0, X0, XTMP2 /* X0 = s1 {xDxC} */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- vpshufb X0, X0, SHUF_DC00 /* X0 = s1 {DC00} */
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- vpaddd X0, X0, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ mov y0, e /* y0 = e */; \
+ ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \
+ mov y1, a /* y1 = a */; \
+ ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ mov y2, f /* y2 = f */; \
+ ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ vpsrlq XTMP3, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ xor y2, g /* y2 = f^g */; \
+ vpsrlq X0, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ vpsrld XTMP2, XTMP2,    10 /* X0 = W[-2] >> 10 {DDCC} */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ ROR( y0, 6) /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ vpxor XTMP2, XTMP2, XTMP3; \
+ ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ add y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */; \
+ vpxor X0, X0, XTMP2 /* X0 = s1 {xDxC} */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ vpshufb X0, X0, SHUF_DC00 /* X0 = s1 {DC00} */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ vpaddd X0, X0, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
  lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
 
-ROTATE_ARGS
-rotate_Xs
-.endm
+#define FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h); \
+ FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, h, a, b, c, d, e, f, g); \
+ FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, g, h, a, b, c, d, e, f); \
+ FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, f, g, h, a, b, c, d, e);
 
 /* input is [rsp + _XFER + %1 * 4] */
-.macro DO_ROUND i1
- mov y0, e /* y0 = e */
- ROR y0, (25-11) /* y0 = e >> (25-11) */
- mov y1, a /* y1 = a */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- ROR y1, (22-13) /* y1 = a >> (22-13) */
- mov y2, f /* y2 = f */
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- ROR y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- xor y2, g /* y2 = f^g */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- ROR y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- and y2, e /* y2 = (f^g)&e */
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- ROR y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- add y2, y0 /* y2 = S1 + CH */
- ROR y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- add y2, [rsp + _XFER + \i1 * 4] /* y2 = k + w + S1 + CH */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+#define DO_ROUND(i1, a, b, c, d, e, f, g, h) \
+ mov y0, e /* y0 = e */; \
+ ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \
+ mov y1, a /* y1 = a */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \
+ mov y2, f /* y2 = f */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ xor y2, g /* y2 = f^g */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ ROR( y0, 6) /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, [rsp + _XFER + i1 * 4] /* y2 = k + w + S1 + CH */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
  lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
- ROTATE_ARGS
-.endm
 
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -410,10 +384,10 @@ _gcry_sha256_transform_amd64_avx:
  lea TBL, [.LK256 ADD_RIP]
 
  /* byte swap first 16 dwords */
- COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP(X0, [INP + 0*16], BYTE_FLIP_MASK)
+ COPY_XMM_AND_BSWAP(X1, [INP + 1*16], BYTE_FLIP_MASK)
+ COPY_XMM_AND_BSWAP(X2, [INP + 2*16], BYTE_FLIP_MASK)
+ COPY_XMM_AND_BSWAP(X3, [INP + 3*16], BYTE_FLIP_MASK)
 
  mov [rsp + _INP], INP
 
@@ -423,20 +397,20 @@ _gcry_sha256_transform_amd64_avx:
 .Loop1:
  vpaddd XFER, X0, [TBL + 0*16]
  vmovdqa [rsp + _XFER], XFER
- FOUR_ROUNDS_AND_SCHED
+ FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h)
 
- vpaddd XFER, X0, [TBL + 1*16]
+ vpaddd XFER, X1, [TBL + 1*16]
  vmovdqa [rsp + _XFER], XFER
- FOUR_ROUNDS_AND_SCHED
+ FOUR_ROUNDS_AND_SCHED(X1, X2, X3, X0, e, f, g, h, a, b, c, d)
 
- vpaddd XFER, X0, [TBL + 2*16]
+ vpaddd XFER, X2, [TBL + 2*16]
  vmovdqa [rsp + _XFER], XFER
- FOUR_ROUNDS_AND_SCHED
+ FOUR_ROUNDS_AND_SCHED(X2, X3, X0, X1, a, b, c, d, e, f, g, h)
 
- vpaddd XFER, X0, [TBL + 3*16]
+ vpaddd XFER, X3, [TBL + 3*16]
  vmovdqa [rsp + _XFER], XFER
  add TBL, 4*16
- FOUR_ROUNDS_AND_SCHED
+ FOUR_ROUNDS_AND_SCHED(X3, X0, X1, X2, e, f, g, h, a, b, c, d)
 
  sub SRND, 1
  jne .Loop1
@@ -445,17 +419,17 @@ _gcry_sha256_transform_amd64_avx:
 .Loop2:
  vpaddd X0, X0, [TBL + 0*16]
  vmovdqa [rsp + _XFER], X0
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
+ DO_ROUND(0, a, b, c, d, e, f, g, h)
+ DO_ROUND(1, h, a, b, c, d, e, f, g)
+ DO_ROUND(2, g, h, a, b, c, d, e, f)
+ DO_ROUND(3, f, g, h, a, b, c, d, e)
  vpaddd X1, X1, [TBL + 1*16]
  vmovdqa [rsp + _XFER], X1
  add TBL, 2*16
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
+ DO_ROUND(0, e, f, g, h, a, b, c, d)
+ DO_ROUND(1, d, e, f, g, h, a, b, c)
+ DO_ROUND(2, c, d, e, f, g, h, a, b)
+ DO_ROUND(3, b, c, d, e, f, g, h, a)
 
  vmovdqa X0, X2
  vmovdqa X1, X3
@@ -463,14 +437,14 @@ _gcry_sha256_transform_amd64_avx:
  sub SRND, 1
  jne .Loop2
 
- addm [4*0 + CTX],a
- addm [4*1 + CTX],b
- addm [4*2 + CTX],c
- addm [4*3 + CTX],d
- addm [4*4 + CTX],e
- addm [4*5 + CTX],f
- addm [4*6 + CTX],g
- addm [4*7 + CTX],h
+ addm([4*0 + CTX],a)
+ addm([4*1 + CTX],b)
+ addm([4*2 + CTX],c)
+ addm([4*3 + CTX],d)
+ addm([4*4 + CTX],e)
+ addm([4*5 + CTX],f)
+ addm([4*6 + CTX],g)
+ addm([4*7 + CTX],h)
 
  mov INP, [rsp + _INP]
  add INP, 64
diff --git a/cipher/sha256-avx2-bmi2-amd64.S b/cipher/sha256-avx2-bmi2-amd64.S
index 52be1a07..faefba17 100644
--- a/cipher/sha256-avx2-bmi2-amd64.S
+++ b/cipher/sha256-avx2-bmi2-amd64.S
@@ -70,226 +70,171 @@
 
 /*  addm [mem], reg */
 /*  Add reg to mem using reg-mem add and store */
-.macro addm p1 p2
- add \p2, \p1
- mov \p1, \p2
-.endm
+#define addm(p1, p2) \
+ add p2, p1; \
+ mov p1, p2;
 
 /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
 
-X0 = ymm4
-X1 = ymm5
-X2 = ymm6
-X3 = ymm7
+#define X0 ymm4
+#define X1 ymm5
+#define X2 ymm6
+#define X3 ymm7
 
 /*  XMM versions of above */
-XWORD0 = xmm4
-XWORD1 = xmm5
-XWORD2 = xmm6
-XWORD3 = xmm7
-
-XTMP0 = ymm0
-XTMP1 = ymm1
-XTMP2 = ymm2
-XTMP3 = ymm3
-XTMP4 = ymm8
-XFER =  ymm9
-XTMP5 = ymm11
-
-SHUF_00BA = ymm10 /*  shuffle xBxA -> 00BA */
-SHUF_DC00 = ymm12 /*  shuffle xDxC -> DC00 */
-BYTE_FLIP_MASK = ymm13
-
-X_BYTE_FLIP_MASK = xmm13 /*  XMM version of BYTE_FLIP_MASK */
-
-NUM_BLKS = rdx /*  3rd arg */
-CTX = rsi   /*  2nd arg */
-INP = rdi /*  1st arg */
-c = ecx
-d = r8d
-e = edx /*  clobbers NUM_BLKS */
-y3 = edi /*  clobbers INP */
-
-TBL = rbp
-SRND = CTX /*  SRND is same register as CTX */
-
-a = eax
-b = ebx
-f = r9d
-g = r10d
-h = r11d
-old_h = r11d
-
-T1 = r12d
-y0 = r13d
-y1 = r14d
-y2 = r15d
-
-
-_XFER_SIZE = 2*64*4 /*  2 blocks, 64 rounds, 4 bytes/round */
-_XMM_SAVE_SIZE  = 0
-_INP_END_SIZE = 8
-_INP_SIZE = 8
-_CTX_SIZE = 8
-_RSP_SIZE = 8
-
-_XFER = 0
-_XMM_SAVE = _XFER     + _XFER_SIZE
-_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
-_INP = _INP_END  + _INP_END_SIZE
-_CTX = _INP      + _INP_SIZE
-_RSP = _CTX      + _CTX_SIZE
-STACK_SIZE = _RSP      + _RSP_SIZE
-
-/*  rotate_Xs */
-/*  Rotate values of symbols X0...X3 */
-.macro rotate_Xs
-X_ = X0
-X0 = X1
-X1 = X2
-X2 = X3
-X3 = X_
-.endm
-
-/*  ROTATE_ARGS */
-/*  Rotate values of symbols a...h */
-.macro ROTATE_ARGS
-old_h = h
-TMP_ = h
-h = g
-g = f
-f = e
-e = d
-d = c
-c = b
-b = a
-a = TMP_
-.endm
-
-.macro ONE_ROUND_PART1 XFER
- /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]);
- * d += h;
- * h += Sum0 (a) + Maj (a, b, c);
- *
- * Ch(x, y, z) => ((x & y) + (~x & z))
- * Maj(x, y, z) => ((x & y) + (z & (x ^ y)))
- */
-
- mov y3, e
- add h, [\XFER]
- and y3, f
- rorx y0, e, 25
- rorx y1, e, 11
+#define XWORD0 xmm4
+#define XWORD1 xmm5
+#define XWORD2 xmm6
+#define XWORD3 xmm7
+
+#define XTMP0 ymm0
+#define XTMP1 ymm1
+#define XTMP2 ymm2
+#define XTMP3 ymm3
+#define XTMP4 ymm8
+#define XFER ymm9
+#define XTMP5 ymm11
+
+#define SHUF_00BA ymm10 /*  shuffle xBxA -> 00BA */
+#define SHUF_DC00 ymm12 /*  shuffle xDxC -> DC00 */
+#define BYTE_FLIP_MASK ymm13
+
+#define X_BYTE_FLIP_MASK xmm13 /*  XMM version of BYTE_FLIP_MASK */
+
+#define NUM_BLKS rdx /*  3rd arg */
+#define CTX rsi      /*  2nd arg */
+#define INP rdi      /*  1st arg */
+#define c ecx
+#define d r8d
+#define e edx        /*  clobbers NUM_BLKS */
+#define y3 edi       /*  clobbers INP */
+
+#define TBL rbp
+#define SRND CTX     /*  SRND is same register as CTX */
+
+#define a eax
+#define b ebx
+#define f r9d
+#define g r10d
+#define h r11d
+#define old_h r11d
+
+#define T1 r12d
+#define y0 r13d
+#define y1 r14d
+#define y2 r15d
+
+
+#define _XFER_SIZE 2*64*4 /*  2 blocks, 64 rounds, 4 bytes/round */
+#define _XMM_SAVE_SIZE 0
+#define _INP_END_SIZE 8
+#define _INP_SIZE 8
+#define _CTX_SIZE 8
+#define _RSP_SIZE 8
+
+#define _XFER 0
+#define _XMM_SAVE  _XFER     + _XFER_SIZE
+#define _INP_END   _XMM_SAVE + _XMM_SAVE_SIZE
+#define _INP       _INP_END  + _INP_END_SIZE
+#define _CTX       _INP      + _INP_SIZE
+#define _RSP       _CTX      + _CTX_SIZE
+#define STACK_SIZE _RSP      + _RSP_SIZE
+
+#define ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h) \
+ /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]); */ \
+ /* d += h; */ \
+ /* h += Sum0 (a) + Maj (a, b, c); */ \
+ \
+ /* Ch(x, y, z) => ((x & y) + (~x & z)) */ \
+ /* Maj(x, y, z) => ((x & y) + (z & (x ^ y))) */ \
+ \
+ mov y3, e; \
+ add h, [XFERIN]; \
+ and y3, f; \
+ rorx y0, e, 25; \
+ rorx y1, e, 11; \
+ lea h, [h + y3]; \
+ andn y3, e, g; \
+ rorx T1, a, 13; \
+ xor y0, y1; \
  lea h, [h + y3]
- andn y3, e, g
- rorx T1, a, 13
- xor y0, y1
- lea h, [h + y3]
-.endm
-.macro ONE_ROUND_PART2
- rorx y2, a, 22
- rorx y1, e, 6
- mov y3, a
- xor T1, y2
- xor y0, y1
- xor y3, b
- lea h, [h + y0]
- mov y0, a
- rorx y2, a, 2
- add d, h
- and y3, c
- xor T1, y2
- lea h, [h + y3]
- lea h, [h + T1]
- and y0, b
- lea h, [h + y0]
-.endm
-
-.macro ONE_ROUND XFER
- ONE_ROUND_PART1 \XFER
- ONE_ROUND_PART2
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED XFER, XFEROUT
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
- vpalignr XTMP0, X3, X2, 4 /*  XTMP0 = W[-7] */
- vpaddd XTMP0, XTMP0, X0 /*  XTMP0 = W[-7] + W[-16]; y1 = (e >> 6); S1 */
- vpalignr XTMP1, X1, X0, 4 /*  XTMP1 = W[-15] */
- vpsrld XTMP2, XTMP1, 7
- vpslld XTMP3, XTMP1, (32-7)
- vpor XTMP3, XTMP3, XTMP2 /*  XTMP3 = W[-15] ror 7 */
- vpsrld XTMP2, XTMP1,18
-
- ONE_ROUND 0*4+\XFER
- ROTATE_ARGS
-
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
- vpsrld XTMP4, XTMP1, 3 /*  XTMP4 = W[-15] >> 3 */
- vpslld XTMP1, XTMP1, (32-18)
- vpxor XTMP3, XTMP3, XTMP1
- vpxor XTMP3, XTMP3, XTMP2 /*  XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 */
- vpxor XTMP1, XTMP3, XTMP4 /*  XTMP1 = s0 */
- vpshufd XTMP2, X3, 0b11111010 /*  XTMP2 = W[-2] {BBAA} */
- vpaddd XTMP0, XTMP0, XTMP1 /*  XTMP0 = W[-16] + W[-7] + s0 */
- vpsrld XTMP4, XTMP2, 10 /*  XTMP4 = W[-2] >> 10 {BBAA} */
-
- ONE_ROUND 1*4+\XFER
- ROTATE_ARGS
-
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
 
- vpsrlq XTMP3, XTMP2, 19 /*  XTMP3 = W[-2] ror 19 {xBxA} */
- vpsrlq XTMP2, XTMP2, 17 /*  XTMP2 = W[-2] ror 17 {xBxA} */
- vpxor XTMP2, XTMP2, XTMP3
- vpxor XTMP4, XTMP4, XTMP2 /*  XTMP4 = s1 {xBxA} */
- vpshufb XTMP4, XTMP4, SHUF_00BA /*  XTMP4 = s1 {00BA} */
- vpaddd XTMP0, XTMP0, XTMP4 /*  XTMP0 = {..., ..., W[1], W[0]} */
- vpshufd XTMP2, XTMP0, 0b1010000 /*  XTMP2 = W[-2] {DDCC} */
-
- ONE_ROUND 2*4+\XFER
- ROTATE_ARGS
-
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
- vpsrld XTMP5, XTMP2,   10 /*  XTMP5 = W[-2] >> 10 {DDCC} */
- vpsrlq XTMP3, XTMP2, 19 /*  XTMP3 = W[-2] ror 19 {xDxC} */
- vpsrlq XTMP2, XTMP2, 17 /*  XTMP2 = W[-2] ror 17 {xDxC} */
- vpxor XTMP2, XTMP2, XTMP3
- vpxor XTMP5, XTMP5, XTMP2 /*  XTMP5 = s1 {xDxC} */
- vpshufb XTMP5, XTMP5, SHUF_DC00 /*  XTMP5 = s1 {DC00} */
- vpaddd X0, XTMP5, XTMP0 /*  X0 = {W[3], W[2], W[1], W[0]} */
- vpaddd XFER, X0, [TBL + \XFEROUT]
-
- ONE_ROUND_PART1 3*4+\XFER
- vmovdqa [rsp + _XFER + \XFEROUT], XFER
- ONE_ROUND_PART2
- ROTATE_ARGS
- rotate_Xs
-.endm
-
-.macro DO_4ROUNDS XFER
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
- ONE_ROUND 0*4+\XFER
- ROTATE_ARGS
-
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
- ONE_ROUND 1*4+\XFER
- ROTATE_ARGS
-
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
- ONE_ROUND 2*4+\XFER
- ROTATE_ARGS
-
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */
+#define ONE_ROUND_PART2(a, b, c, d, e, f, g, h) \
+ rorx y2, a, 22; \
+ rorx y1, e, 6; \
+ mov y3, a; \
+ xor T1, y2; \
+ xor y0, y1; \
+ xor y3, b; \
+ lea h, [h + y0]; \
+ mov y0, a; \
+ rorx y2, a, 2; \
+ add d, h; \
+ and y3, c; \
+ xor T1, y2; \
+ lea h, [h + y3]; \
+ lea h, [h + T1]; \
+ and y0, b; \
+ lea h, [h + y0]
 
- ONE_ROUND 3*4+\XFER
- ROTATE_ARGS
-.endm
+#define ONE_ROUND(XFER, a, b, c, d, e, f, g, h) \
+ ONE_ROUND_PART1(XFER, a, b, c, d, e, f, g, h); \
+ ONE_ROUND_PART2(a, b, c, d, e, f, g, h)
+
+#define FOUR_ROUNDS_AND_SCHED(XFERIN, XFEROUT, X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+ vpalignr XTMP0, X3, X2, 4 /*  XTMP0 = W[-7] */; \
+ vpaddd XTMP0, XTMP0, X0 /*  XTMP0 = W[-7] + W[-16]; y1 = (e >> 6); S1 */; \
+ vpalignr XTMP1, X1, X0, 4 /*  XTMP1 = W[-15] */; \
+ vpsrld XTMP2, XTMP1, 7; \
+ vpslld XTMP3, XTMP1, (32-7); \
+ vpor XTMP3, XTMP3, XTMP2 /*  XTMP3 = W[-15] ror 7 */; \
+ vpsrld XTMP2, XTMP1,18; \
+ \
+ ONE_ROUND(0*4+XFERIN, a, b, c, d, e, f, g, h); \
+ \
+ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+ vpsrld XTMP4, XTMP1, 3 /*  XTMP4 = W[-15] >> 3 */; \
+ vpslld XTMP1, XTMP1, (32-18); \
+ vpxor XTMP3, XTMP3, XTMP1; \
+ vpxor XTMP3, XTMP3, XTMP2 /*  XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 */; \
+ vpxor XTMP1, XTMP3, XTMP4 /*  XTMP1 = s0 */; \
+ vpshufd XTMP2, X3, 0b11111010 /*  XTMP2 = W[-2] {BBAA} */; \
+ vpaddd XTMP0, XTMP0, XTMP1 /*  XTMP0 = W[-16] + W[-7] + s0 */; \
+ vpsrld XTMP4, XTMP2, 10 /*  XTMP4 = W[-2] >> 10 {BBAA} */; \
+ \
+ ONE_ROUND(1*4+XFERIN, h, a, b, c, d, e, f, g); \
+ \
+ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+ vpsrlq XTMP3, XTMP2, 19 /*  XTMP3 = W[-2] ror 19 {xBxA} */; \
+ vpsrlq XTMP2, XTMP2, 17 /*  XTMP2 = W[-2] ror 17 {xBxA} */; \
+ vpxor XTMP2, XTMP2, XTMP3; \
+ vpxor XTMP4, XTMP4, XTMP2 /*  XTMP4 = s1 {xBxA} */; \
+ vpshufb XTMP4, XTMP4, SHUF_00BA /*  XTMP4 = s1 {00BA} */; \
+ vpaddd XTMP0, XTMP0, XTMP4 /*  XTMP0 = {..., ..., W[1], W[0]} */; \
+ vpshufd XTMP2, XTMP0, 0b1010000 /*  XTMP2 = W[-2] {DDCC} */; \
+ \
+ ONE_ROUND(2*4+XFERIN, g, h, a, b, c, d, e, f); \
+ \
+ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+ vpsrld XTMP5, XTMP2,   10 /*  XTMP5 = W[-2] >> 10 {DDCC} */; \
+ vpsrlq XTMP3, XTMP2, 19 /*  XTMP3 = W[-2] ror 19 {xDxC} */; \
+ vpsrlq XTMP2, XTMP2, 17 /*  XTMP2 = W[-2] ror 17 {xDxC} */; \
+ vpxor XTMP2, XTMP2, XTMP3; \
+ vpxor XTMP5, XTMP5, XTMP2 /*  XTMP5 = s1 {xDxC} */; \
+ vpshufb XTMP5, XTMP5, SHUF_DC00 /*  XTMP5 = s1 {DC00} */; \
+ vpaddd X0, XTMP5, XTMP0 /*  X0 = {W[3], W[2], W[1], W[0]} */; \
+ vpaddd XFER, X0, [TBL + XFEROUT]; \
+ \
+ ONE_ROUND_PART1(3*4+XFERIN, f, g, h, a, b, c, d, e); \
+ vmovdqa [rsp + _XFER + XFEROUT], XFER; \
+ ONE_ROUND_PART2(f, g, h, a, b, c, d, e);
+
+#define DO_4ROUNDS(XFERIN, a, b, c, d, e, f, g, h) \
+ ONE_ROUND(0*4+XFERIN, a, b, c, d, e, f, g, h); \
+ ONE_ROUND(1*4+XFERIN, h, a, b, c, d, e, f, g); \
+ ONE_ROUND(2*4+XFERIN, g, h, a, b, c, d, e, f); \
+ ONE_ROUND(3*4+XFERIN, f, g, h, a, b, c, d, e)
 
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -391,32 +336,32 @@ _gcry_sha256_transform_amd64_avx2:
 
 .align 16
 .Loop1:
- FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 0*32, SRND + 4*32
- FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 1*32, SRND + 5*32
- FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 2*32, SRND + 6*32
- FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 3*32, SRND + 7*32
+ FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 0*32, SRND + 4*32, X0, X1, X2, X3, a, b, c, d, e, f, g, h)
+ FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 1*32, SRND + 5*32, X1, X2, X3, X0, e, f, g, h, a, b, c, d)
+ FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 2*32, SRND + 6*32, X2, X3, X0, X1, a, b, c, d, e, f, g, h)
+ FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 3*32, SRND + 7*32, X3, X0, X1, X2, e, f, g, h, a, b, c, d)
 
  add SRND, 4*32
  cmp SRND, 3 * 4*32
  jb .Loop1
 
  /* ; Do last 16 rounds with no scheduling */
- DO_4ROUNDS rsp + _XFER + (3*4*32 + 0*32)
- DO_4ROUNDS rsp + _XFER + (3*4*32 + 1*32)
- DO_4ROUNDS rsp + _XFER + (3*4*32 + 2*32)
- DO_4ROUNDS rsp + _XFER + (3*4*32 + 3*32)
+ DO_4ROUNDS(rsp + _XFER + (3*4*32 + 0*32), a, b, c, d, e, f, g, h)
+ DO_4ROUNDS(rsp + _XFER + (3*4*32 + 1*32), e, f, g, h, a, b, c, d)
+ DO_4ROUNDS(rsp + _XFER + (3*4*32 + 2*32), a, b, c, d, e, f, g, h)
+ DO_4ROUNDS(rsp + _XFER + (3*4*32 + 3*32), e, f, g, h, a, b, c, d)
 
  mov CTX, [rsp + _CTX]
  mov INP, [rsp + _INP]
 
- addm [4*0 + CTX],a
- addm [4*1 + CTX],b
- addm [4*2 + CTX],c
- addm [4*3 + CTX],d
- addm [4*4 + CTX],e
- addm [4*5 + CTX],f
- addm [4*6 + CTX],g
- addm [4*7 + CTX],h
+ addm([4*0 + CTX],a)
+ addm([4*1 + CTX],b)
+ addm([4*2 + CTX],c)
+ addm([4*3 + CTX],d)
+ addm([4*4 + CTX],e)
+ addm([4*5 + CTX],f)
+ addm([4*6 + CTX],g)
+ addm([4*7 + CTX],h)
 
  cmp INP, [rsp + _INP_END]
  ja .Ldone_hash
@@ -425,8 +370,8 @@ _gcry_sha256_transform_amd64_avx2:
  xor SRND, SRND
 .align 16
 .Loop3:
- DO_4ROUNDS rsp + _XFER + SRND + 0*32 + 16
- DO_4ROUNDS rsp + _XFER + SRND + 1*32 + 16
+ DO_4ROUNDS(rsp + _XFER + SRND + 0*32 + 16, a, b, c, d, e, f, g, h)
+ DO_4ROUNDS(rsp + _XFER + SRND + 1*32 + 16, e, f, g, h, a, b, c, d)
  add SRND, 2*32
  cmp SRND, 4 * 4*32
  jb .Loop3
@@ -435,14 +380,14 @@ _gcry_sha256_transform_amd64_avx2:
  mov INP, [rsp + _INP]
  add INP, 64
 
- addm [4*0 + CTX],a
- addm [4*1 + CTX],b
- addm [4*2 + CTX],c
- addm [4*3 + CTX],d
- addm [4*4 + CTX],e
- addm [4*5 + CTX],f
- addm [4*6 + CTX],g
- addm [4*7 + CTX],h
+ addm([4*0 + CTX],a)
+ addm([4*1 + CTX],b)
+ addm([4*2 + CTX],c)
+ addm([4*3 + CTX],d)
+ addm([4*4 + CTX],e)
+ addm([4*5 + CTX],f)
+ addm([4*6 + CTX],g)
+ addm([4*7 + CTX],h)
 
  cmp INP, [rsp + _INP_END]
  jb .Loop0
diff --git a/cipher/sha256-ssse3-amd64.S b/cipher/sha256-ssse3-amd64.S
index 0fb94c1b..098b0eb6 100644
--- a/cipher/sha256-ssse3-amd64.S
+++ b/cipher/sha256-ssse3-amd64.S
@@ -70,58 +70,56 @@
 
 /* addm [mem], reg
  * Add reg to mem using reg-mem add and store */
-.macro addm p1 p2
- add \p2, \p1
- mov \p1, \p2
-.endm
+#define addm(p1, p2) \
+ add p2, p1; \
+ mov p1, p2;
 
 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
 
 /* COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
  * Load xmm with mem and byte swap each dword */
-.macro COPY_XMM_AND_BSWAP p1 p2 p3
- MOVDQ \p1, \p2
- pshufb \p1, \p3
-.endm
+#define COPY_XMM_AND_BSWAP(p1, p2, p3) \
+ MOVDQ p1, p2; \
+ pshufb p1, p3;
 
 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
 
-X0 = xmm4
-X1 = xmm5
-X2 = xmm6
-X3 = xmm7
+#define X0 xmm4
+#define X1 xmm5
+#define X2 xmm6
+#define X3 xmm7
 
-XTMP0 = xmm0
-XTMP1 = xmm1
-XTMP2 = xmm2
-XTMP3 = xmm3
-XTMP4 = xmm8
-XFER  = xmm9
+#define XTMP0 xmm0
+#define XTMP1 xmm1
+#define XTMP2 xmm2
+#define XTMP3 xmm3
+#define XTMP4 xmm8
+#define XFER xmm9
 
-SHUF_00BA = xmm10 /* shuffle xBxA -> 00BA */
-SHUF_DC00 = xmm11 /* shuffle xDxC -> DC00 */
-BYTE_FLIP_MASK = xmm12
+#define SHUF_00BA xmm10 /* shuffle xBxA -> 00BA */
+#define SHUF_DC00 xmm11 /* shuffle xDxC -> DC00 */
+#define BYTE_FLIP_MASK xmm12
 
-NUM_BLKS = rdx /* 3rd arg */
-CTX = rsi /* 2nd arg */
-INP = rdi /* 1st arg */
+#define NUM_BLKS rdx /* 3rd arg */
+#define CTX rsi /* 2nd arg */
+#define INP rdi /* 1st arg */
 
-SRND = rdi /* clobbers INP */
-c = ecx
-d = r8d
-e = edx
+#define SRND rdi /* clobbers INP */
+#define c ecx
+#define d r8d
+#define e edx
 
-TBL = rbp
-a = eax
-b = ebx
+#define TBL rbp
+#define a eax
+#define b ebx
 
-f = r9d
-g = r10d
-h = r11d
+#define f r9d
+#define g r10d
+#define h r11d
 
-y0 = r13d
-y1 = r14d
-y2 = r15d
+#define y0 r13d
+#define y1 r14d
+#define y2 r15d
 
 
 
@@ -138,230 +136,207 @@ y2 = r15d
 #define _XMM_SAVE (_XFER     + _XFER_SIZE + _ALIGN_SIZE)
 #define STACK_SIZE (_XMM_SAVE + _XMM_SAVE_SIZE)
 
-/* rotate_Xs
- * Rotate values of symbols X0...X3 */
-.macro rotate_Xs
-X_ = X0
-X0 = X1
-X1 = X2
-X2 = X3
-X3 = X_
-.endm
-
-/* ROTATE_ARGS
- * Rotate values of symbols a...h */
-.macro ROTATE_ARGS
-TMP_ = h
-h = g
-g = f
-f = e
-e = d
-d = c
-c = b
-b = a
-a = TMP_
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED
- /* compute s0 four at a time and s1 two at a time
- * compute W[-16] + W[-7] 4 at a time */
- movdqa XTMP0, X3
- mov y0, e /* y0 = e */
- ror y0, (25-11) /* y0 = e >> (25-11) */
- mov y1, a /* y1 = a */
- palignr XTMP0, X2, 4 /* XTMP0 = W[-7] */
- ror y1, (22-13) /* y1 = a >> (22-13) */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- mov y2, f /* y2 = f */
- ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- movdqa XTMP1, X1
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- xor y2, g /* y2 = f^g */
- paddd XTMP0, X0 /* XTMP0 = W[-7] + W[-16] */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- and y2, e /* y2 = (f^g)&e */
- ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- /* compute s0 */
- palignr XTMP1, X0, 4 /* XTMP1 = W[-15] */
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- movdqa XTMP2, XTMP1 /* XTMP2 = W[-15] */
- ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- add y2, y0 /* y2 = S1 + CH */
- add y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */
- movdqa XTMP3, XTMP1 /* XTMP3 = W[-15] */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- pslld XTMP1, (32-7)
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- psrld XTMP2, 7
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- por XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+
+#define FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ /* compute s0 four at a time and s1 two at a time */; \
+ /* compute W[-16] + W[-7] 4 at a time */; \
+ movdqa XTMP0, X3; \
+ mov y0, e /* y0 = e */; \
+ ror y0, (25-11) /* y0 = e >> (25-11) */; \
+ mov y1, a /* y1 = a */; \
+ palignr XTMP0, X2, 4 /* XTMP0 = W[-7] */; \
+ ror y1, (22-13) /* y1 = a >> (22-13) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ mov y2, f /* y2 = f */; \
+ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ movdqa XTMP1, X1; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ xor y2, g /* y2 = f^g */; \
+ paddd XTMP0, X0 /* XTMP0 = W[-7] + W[-16] */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ /* compute s0 */; \
+ palignr XTMP1, X0, 4 /* XTMP1 = W[-15] */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ movdqa XTMP2, XTMP1 /* XTMP2 = W[-15] */; \
+ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ add y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */; \
+ movdqa XTMP3, XTMP1 /* XTMP3 = W[-15] */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ pslld XTMP1, (32-7); \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ psrld XTMP2, 7; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ por XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
  lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
 
-ROTATE_ARGS
- movdqa XTMP2, XTMP3 /* XTMP2 = W[-15] */
- mov y0, e /* y0 = e */
- mov y1, a /* y1 = a */
- movdqa XTMP4, XTMP3 /* XTMP4 = W[-15] */
- ror y0, (25-11) /* y0 = e >> (25-11) */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- mov y2, f /* y2 = f */
- ror y1, (22-13) /* y1 = a >> (22-13) */
- pslld XTMP3, (32-18)
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- xor y2, g /* y2 = f^g */
- psrld XTMP2, 18
- ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- and y2, e /* y2 = (f^g)&e */
- ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- pxor XTMP1, XTMP3
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- psrld XTMP4, 3 /* XTMP4 = W[-15] >> 3 */
- add y2, y0 /* y2 = S1 + CH */
- add y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */
- ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- pxor XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- pxor XTMP1, XTMP4 /* XTMP1 = s0 */
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- /* compute low s1 */
- pshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- paddd XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ movdqa XTMP2, XTMP3 /* XTMP2 = W[-15] */; \
+ mov y0, e /* y0 = e */; \
+ mov y1, a /* y1 = a */; \
+ movdqa XTMP4, XTMP3 /* XTMP4 = W[-15] */; \
+ ror y0, (25-11) /* y0 = e >> (25-11) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ mov y2, f /* y2 = f */; \
+ ror y1, (22-13) /* y1 = a >> (22-13) */; \
+ pslld XTMP3, (32-18); \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ xor y2, g /* y2 = f^g */; \
+ psrld XTMP2, 18; \
+ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ pxor XTMP1, XTMP3; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ psrld XTMP4, 3 /* XTMP4 = W[-15] >> 3 */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ add y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */; \
+ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ pxor XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ pxor XTMP1, XTMP4 /* XTMP1 = s0 */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ /* compute low s1 */; \
+ pshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ paddd XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
  lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
 
-ROTATE_ARGS
- movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {BBAA} */
- mov y0, e /* y0 = e */
- mov y1, a /* y1 = a */
- ror y0, (25-11) /* y0 = e >> (25-11) */
- movdqa XTMP4, XTMP2 /* XTMP4 = W[-2] {BBAA} */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- ror y1, (22-13) /* y1 = a >> (22-13) */
- mov y2, f /* y2 = f */
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */
- xor y2, g /* y2 = f^g */
- psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- and y2, e /* y2 = (f^g)&e */
- psrld XTMP4, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */
- ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- pxor XTMP2, XTMP3
- add y2, y0 /* y2 = S1 + CH */
- ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- add y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */
- pxor XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- pshufb XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- paddd XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- /* compute high s1 */
- pshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {BBAA} */; \
+ mov y0, e /* y0 = e */; \
+ mov y1, a /* y1 = a */; \
+ ror y0, (25-11) /* y0 = e >> (25-11) */; \
+ movdqa XTMP4, XTMP2 /* XTMP4 = W[-2] {BBAA} */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ ror y1, (22-13) /* y1 = a >> (22-13) */; \
+ mov y2, f /* y2 = f */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */; \
+ xor y2, g /* y2 = f^g */; \
+ psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ psrld XTMP4, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */; \
+ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ pxor XTMP2, XTMP3; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */; \
+ pxor XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ pshufb XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ paddd XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ /* compute high s1 */; \
+ pshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
  lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
 
-ROTATE_ARGS
- movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {DDCC} */
- mov y0, e /* y0 = e */
- ror y0, (25-11) /* y0 = e >> (25-11) */
- mov y1, a /* y1 = a */
- movdqa X0,    XTMP2 /* X0    = W[-2] {DDCC} */
- ror y1, (22-13) /* y1 = a >> (22-13) */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- mov y2, f /* y2 = f */
- ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- xor y2, g /* y2 = f^g */
- psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- and y2, e /* y2 = (f^g)&e */
- ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- psrld X0,    10 /* X0 = W[-2] >> 10 {DDCC} */
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- pxor XTMP2, XTMP3
- ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- add y2, y0 /* y2 = S1 + CH */
- add y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */
- pxor X0, XTMP2 /* X0 = s1 {xDxC} */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- pshufb X0, SHUF_DC00 /* X0 = s1 {DC00} */
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- paddd X0, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {DDCC} */; \
+ mov y0, e /* y0 = e */; \
+ ror y0, (25-11) /* y0 = e >> (25-11) */; \
+ mov y1, a /* y1 = a */; \
+ movdqa X0,    XTMP2 /* X0    = W[-2] {DDCC} */; \
+ ror y1, (22-13) /* y1 = a >> (22-13) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ mov y2, f /* y2 = f */; \
+ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ xor y2, g /* y2 = f^g */; \
+ psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ psrld X0,    10 /* X0 = W[-2] >> 10 {DDCC} */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ pxor XTMP2, XTMP3; \
+ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ add y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */; \
+ pxor X0, XTMP2 /* X0 = s1 {xDxC} */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ pshufb X0, SHUF_DC00 /* X0 = s1 {DC00} */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ paddd X0, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
  lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
 
-ROTATE_ARGS
-rotate_Xs
-.endm
+#define FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h); \
+ FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, h, a, b, c, d, e, f, g); \
+ FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, g, h, a, b, c, d, e, f); \
+ FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, f, g, h, a, b, c, d, e);
 
 /* input is [rsp + _XFER + %1 * 4] */
-.macro DO_ROUND i1
- mov y0, e /* y0 = e */
- ror y0, (25-11) /* y0 = e >> (25-11) */
- mov y1, a /* y1 = a */
- xor y0, e /* y0 = e ^ (e >> (25-11)) */
- ror y1, (22-13) /* y1 = a >> (22-13) */
- mov y2, f /* y2 = f */
- xor y1, a /* y1 = a ^ (a >> (22-13) */
- ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
- xor y2, g /* y2 = f^g */
- xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
- ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
- and y2, e /* y2 = (f^g)&e */
- xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
- ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
- xor y2, g /* y2 = CH = ((f^g)&e)^g */
- add y2, y0 /* y2 = S1 + CH */
- ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
- add y2, [rsp + _XFER + \i1 * 4] /* y2 = k + w + S1 + CH */
- mov y0, a /* y0 = a */
- add h, y2 /* h = h + S1 + CH + k + w */
- mov y2, a /* y2 = a */
- or y0, c /* y0 = a|c */
- add d, h /* d = d + h + S1 + CH + k + w */
- and y2, c /* y2 = a&c */
- and y0, b /* y0 = (a|c)&b */
- add h, y1 /* h = h + S1 + CH + k + w + S0 */
- or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
+#define DO_ROUND(i1, a, b, c, d, e, f, g, h) \
+ mov y0, e /* y0 = e */; \
+ ror y0, (25-11) /* y0 = e >> (25-11) */; \
+ mov y1, a /* y1 = a */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ ror y1, (22-13) /* y1 = a >> (22-13) */; \
+ mov y2, f /* y2 = f */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ xor y2, g /* y2 = f^g */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, [rsp + _XFER + i1 * 4] /* y2 = k + w + S1 + CH */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
  lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
- ROTATE_ARGS
-.endm
 
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -414,10 +389,10 @@ _gcry_sha256_transform_amd64_ssse3:
  lea TBL, [.LK256 ADD_RIP]
 
  /* byte swap first 16 dwords */
- COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP(X0, [INP + 0*16], BYTE_FLIP_MASK)
+ COPY_XMM_AND_BSWAP(X1, [INP + 1*16], BYTE_FLIP_MASK)
+ COPY_XMM_AND_BSWAP(X2, [INP + 2*16], BYTE_FLIP_MASK)
+ COPY_XMM_AND_BSWAP(X3, [INP + 3*16], BYTE_FLIP_MASK)
 
  mov [rsp + _INP], INP
 
@@ -428,23 +403,23 @@ _gcry_sha256_transform_amd64_ssse3:
  movdqa XFER, [TBL + 0*16]
  paddd XFER, X0
  movdqa [rsp + _XFER], XFER
- FOUR_ROUNDS_AND_SCHED
+ FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h)
 
  movdqa XFER, [TBL + 1*16]
- paddd XFER, X0
+ paddd XFER, X1
  movdqa [rsp + _XFER], XFER
- FOUR_ROUNDS_AND_SCHED
+ FOUR_ROUNDS_AND_SCHED(X1, X2, X3, X0, e, f, g, h, a, b, c, d)
 
  movdqa XFER, [TBL + 2*16]
- paddd XFER, X0
+ paddd XFER, X2
  movdqa [rsp + _XFER], XFER
- FOUR_ROUNDS_AND_SCHED
+ FOUR_ROUNDS_AND_SCHED(X2, X3, X0, X1, a, b, c, d, e, f, g, h)
 
  movdqa XFER, [TBL + 3*16]
- paddd XFER, X0
+ paddd XFER, X3
  movdqa [rsp + _XFER], XFER
  add TBL, 4*16
- FOUR_ROUNDS_AND_SCHED
+ FOUR_ROUNDS_AND_SCHED(X3, X0, X1, X2, e, f, g, h, a, b, c, d)
 
  sub SRND, 1
  jne .Loop1
@@ -453,17 +428,17 @@ _gcry_sha256_transform_amd64_ssse3:
 .Loop2:
  paddd X0, [TBL + 0*16]
  movdqa [rsp + _XFER], X0
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
+ DO_ROUND(0, a, b, c, d, e, f, g, h)
+ DO_ROUND(1, h, a, b, c, d, e, f, g)
+ DO_ROUND(2, g, h, a, b, c, d, e, f)
+ DO_ROUND(3, f, g, h, a, b, c, d, e)
  paddd X1, [TBL + 1*16]
  movdqa [rsp + _XFER], X1
  add TBL, 2*16
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
+ DO_ROUND(0, e, f, g, h, a, b, c, d)
+ DO_ROUND(1, d, e, f, g, h, a, b, c)
+ DO_ROUND(2, c, d, e, f, g, h, a, b)
+ DO_ROUND(3, b, c, d, e, f, g, h, a)
 
  movdqa X0, X2
  movdqa X1, X3
@@ -471,14 +446,14 @@ _gcry_sha256_transform_amd64_ssse3:
  sub SRND, 1
  jne .Loop2
 
- addm [4*0 + CTX],a
- addm [4*1 + CTX],b
- addm [4*2 + CTX],c
- addm [4*3 + CTX],d
- addm [4*4 + CTX],e
- addm [4*5 + CTX],f
- addm [4*6 + CTX],g
- addm [4*7 + CTX],h
+ addm([4*0 + CTX],a)
+ addm([4*1 + CTX],b)
+ addm([4*2 + CTX],c)
+ addm([4*3 + CTX],d)
+ addm([4*4 + CTX],e)
+ addm([4*5 + CTX],f)
+ addm([4*6 + CTX],g)
+ addm([4*7 + CTX],h)
 
  mov INP, [rsp + _INP]
  add INP, 64
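
The same mechanical pattern repeats in the sha512 files that follow: each gas .macro body becomes a single logical line whose instructions are joined with ';' (a statement separator for the assembler) and continued with '\', the "sym = reg" assignments become #define aliases, and every invocation gains parentheses. A small stand-alone sketch of that conversion, using hypothetical names (addm_sketch, acc, sketch_addm) so as not to collide with anything in the patch:

/* macro-conversion sketch; hypothetical, not part of the patch */
.intel_syntax noprefix
.text

/* register alias, previously written as "acc = eax" */
#define acc eax

/* previously:
 *   .macro addm_sketch p1 p2
 *    add \p2, \p1
 *    mov \p1, \p2
 *   .endm
 */
#define addm_sketch(mem, reg) \
        add reg, mem; \
        mov mem, reg

.globl sketch_addm
sketch_addm:
        /* caller passes a pointer in rdi; expands to:          */
        /*   add eax, [rdi]; mov [rdi], eax                     */
        addm_sketch([rdi], acc)
        ret

The preprocessor runs before the assembler, so for this sketch the instruction stream gas sees is the same either way; the visible difference is only in how the macros are written and called.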
diff --git a/cipher/sha512-avx-amd64.S b/cipher/sha512-avx-amd64.S
index 991fd639..75f7b070 100644
--- a/cipher/sha512-avx-amd64.S
+++ b/cipher/sha512-avx-amd64.S
@@ -53,32 +53,32 @@
 .text
 
 /* Virtual Registers */
-msg = rdi /* ARG1 */
-digest = rsi /* ARG2 */
-msglen = rdx /* ARG3 */
-T1 = rcx
-T2 = r8
-a_64 = r9
-b_64 = r10
-c_64 = r11
-d_64 = r12
-e_64 = r13
-f_64 = r14
-g_64 = r15
-h_64 = rbx
-tmp0 = rax
+#define msg rdi /* ARG1 */
+#define digest rsi /* ARG2 */
+#define msglen rdx /* ARG3 */
+#define T1 rcx
+#define T2 r8
+#define a_64 r9
+#define b_64 r10
+#define c_64 r11
+#define d_64 r12
+#define e_64 r13
+#define f_64 r14
+#define g_64 r15
+#define h_64 rbx
+#define tmp0 rax
 
 /*
 ; Local variables (stack frame)
 ; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP
 */
-frame_W      = 0 /* Message Schedule */
-frame_W_size = (80 * 8)
-frame_WK      = ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */
-frame_WK_size = (2 * 8)
-frame_GPRSAVE      = ((frame_WK) + (frame_WK_size))
-frame_GPRSAVE_size = (5 * 8)
-frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size))
+#define frame_W 0 /* Message Schedule */
+#define frame_W_size (80 * 8)
+#define frame_WK ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */
+#define frame_WK_size (2 * 8)
+#define frame_GPRSAVE ((frame_WK) + (frame_WK_size))
+#define frame_GPRSAVE_size (5 * 8)
+#define frame_size ((frame_GPRSAVE) + (frame_GPRSAVE_size))
 
 
 /* Useful QWORD "arrays" for simpler memory references */
@@ -90,162 +90,151 @@ frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size))
 /* MSG, DIGEST, K_t, W_t are arrays */
 /* WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even */
 
-.macro RotateState
- /* Rotate symbles a..h right */
- __TMP = h_64
- h_64 =  g_64
- g_64 =  f_64
- f_64 =  e_64
- e_64 =  d_64
- d_64 =  c_64
- c_64 =  b_64
- b_64 =  a_64
- a_64 =  __TMP
-.endm
-
-.macro RORQ p1 p2
- /* shld is faster than ror on Intel Sandybridge */
- shld \p1, \p1, (64 - \p2)
-.endm
-
-.macro SHA512_Round t
- /* Compute Round %%t */
- mov T1,   f_64        /* T1 = f */
- mov tmp0, e_64        /* tmp = e */
- xor T1,   g_64        /* T1 = f ^ g */
- RORQ tmp0, 23 /* 41     ; tmp = e ror 23 */
- and T1,   e_64        /* T1 = (f ^ g) & e */
- xor tmp0, e_64        /* tmp = (e ror 23) ^ e */
- xor T1,   g_64        /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */
- add T1,   [WK_2(\t)] /* W[t] + K[t] from message scheduler */
- RORQ tmp0, 4 /* 18      ; tmp = ((e ror 23) ^ e) ror 4 */
- xor tmp0, e_64        /* tmp = (((e ror 23) ^ e) ror 4) ^ e */
- mov T2,   a_64        /* T2 = a */
- add T1,   h_64        /* T1 = CH(e,f,g) + W[t] + K[t] + h */
- RORQ tmp0, 14 /* 14     ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */
- add T1,   tmp0        /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */
- mov tmp0, a_64        /* tmp = a */
- xor T2,   c_64        /* T2 = a ^ c */
- and tmp0, c_64        /* tmp = a & c */
- and T2,   b_64        /* T2 = (a ^ c) & b */
- xor T2,   tmp0        /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */
- mov tmp0, a_64        /* tmp = a */
- RORQ tmp0, 5 /* 39      ; tmp = a ror 5 */
- xor tmp0, a_64        /* tmp = (a ror 5) ^ a */
- add d_64, T1          /* e(next_state) = d + T1  */
- RORQ tmp0, 6 /* 34      ; tmp = ((a ror 5) ^ a) ror 6 */
- xor tmp0, a_64        /* tmp = (((a ror 5) ^ a) ror 6) ^ a */
- lea h_64, [T1 + T2]   /* a(next_state) = T1 + Maj(a,b,c) */
- RORQ tmp0, 28 /* 28     ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */
- add h_64, tmp0        /* a(next_state) = T1 + Maj(a,b,c) S0(a) */
- RotateState
-.endm
-
-.macro SHA512_2Sched_2Round_avx t
-/* ; Compute rounds %%t-2 and %%t-1
- ; Compute message schedule QWORDS %%t and %%t+1
-
- ;   Two rounds are computed based on the values for K[t-2]+W[t-2] and
- ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message
- ; scheduler.
- ;   The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
- ; They are then added to their respective SHA512 constants at
- ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
- ;   For brievity, the comments following vectored instructions only refer to
- ; the first of a pair of QWORDS.
- ; Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
- ;   The computation of the message schedule and the rounds are tightly
- ; stitched to take advantage of instruction-level parallelism.
- ; For clarity, integer instructions (for the rounds calculation) are indented
- ; by one tab. Vectored instructions (for the message scheduler) are indented
- ; by two tabs. */
-
- vmovdqa xmm4, [W_t(\t-2)]   /* XMM4 = W[t-2] */
- vmovdqu xmm5, [W_t(\t-15)]  /* XMM5 = W[t-15] */
- mov T1,   f_64
- vpsrlq xmm0, xmm4, 61       /* XMM0 = W[t-2]>>61 */
- mov tmp0, e_64
- vpsrlq xmm6, xmm5, 1        /* XMM6 = W[t-15]>>1 */
- xor T1,   g_64
- RORQ tmp0, 23 /* 41 */
- vpsrlq xmm1, xmm4, 19       /* XMM1 = W[t-2]>>19 */
- and T1,   e_64
- xor tmp0, e_64
- vpxor xmm0, xmm0, xmm1           /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 */
- xor T1,   g_64
- add T1,   [WK_2(\t)];
- vpsrlq xmm7, xmm5, 8        /* XMM7 = W[t-15]>>8 */
- RORQ tmp0, 4 /* 18 */
- vpsrlq xmm2, xmm4, 6        /* XMM2 = W[t-2]>>6 */
- xor tmp0, e_64
- mov T2,   a_64
- add T1,   h_64
- vpxor xmm6, xmm6, xmm7           /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 */
- RORQ tmp0, 14 /* 14 */
- add T1,   tmp0
- vpsrlq xmm8, xmm5, 7        /* XMM8 = W[t-15]>>7 */
- mov tmp0, a_64
- xor T2,   c_64
- vpsllq xmm3, xmm4, (64-61)  /* XMM3 = W[t-2]<<3 */
- and tmp0, c_64
- and T2,   b_64
- vpxor xmm2, xmm2, xmm3           /* XMM2 = W[t-2]>>6 ^ W[t-2]<<3 */
- xor T2,   tmp0
- mov tmp0, a_64
- vpsllq xmm9, xmm5, (64-1)   /* XMM9 = W[t-15]<<63 */
- RORQ tmp0, 5 /* 39 */
- vpxor xmm8, xmm8, xmm9           /* XMM8 = W[t-15]>>7 ^ W[t-15]<<63 */
- xor tmp0, a_64
- add d_64, T1
- RORQ tmp0, 6 /* 34 */
- xor tmp0, a_64
- vpxor xmm6, xmm6, xmm8           /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ W[t-15]>>7 ^ W[t-15]<<63 */
- lea h_64, [T1 + T2]
- RORQ tmp0, 28 /* 28 */
- vpsllq xmm4, xmm4, (64-19)        /* XMM4 = W[t-2]<<25 */
- add h_64, tmp0
- RotateState
- vpxor xmm0, xmm0, xmm4           /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ W[t-2]<<25 */
- mov T1, f_64
- vpxor xmm0, xmm0, xmm2           /* XMM0 = s1(W[t-2]) */
- mov tmp0, e_64
- xor T1,   g_64
- vpaddq xmm0, xmm0, [W_t(\t-16)]  /* XMM0 = s1(W[t-2]) + W[t-16] */
- vmovdqu xmm1, [W_t(\t- 7)]  /* XMM1 = W[t-7] */
- RORQ tmp0, 23 /* 41 */
- and T1,   e_64
- xor tmp0, e_64
- xor T1,   g_64
- vpsllq xmm5, xmm5, (64-8)         /* XMM5 = W[t-15]<<56 */
- add T1,   [WK_2(\t+1)]
- vpxor xmm6, xmm6, xmm5           /* XMM6 = s0(W[t-15]) */
- RORQ tmp0, 4 /* 18 */
- vpaddq xmm0, xmm0, xmm6           /* XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) */
- xor tmp0, e_64
- vpaddq xmm0, xmm0, xmm1           /* XMM0 = W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */
- mov T2,   a_64
- add T1,   h_64
- RORQ tmp0, 14 /* 14 */
- add T1,   tmp0
- vmovdqa [W_t(\t)], xmm0      /* Store W[t] */
- vpaddq xmm0, xmm0, [K_t(t)]        /* Compute W[t]+K[t] */
- vmovdqa [WK_2(t)], xmm0       /* Store W[t]+K[t] for next rounds */
- mov tmp0, a_64
- xor T2,   c_64
- and tmp0, c_64
- and T2,   b_64
- xor T2,   tmp0
- mov tmp0, a_64
- RORQ tmp0, 5 /* 39 */
- xor tmp0, a_64
- add d_64, T1
- RORQ tmp0, 6 /* 34 */
- xor tmp0, a_64
- lea h_64, [T1 + T2]
- RORQ tmp0, 28 /* 28 */
- add h_64, tmp0
- RotateState
-.endm
+#define RORQ(p1, p2) \
+ /* shld is faster than ror on Intel Sandybridge */ \
+ shld p1, p1, (64 - p2)
+
+#define SHA512_Round(t, a, b, c, d, e, f, g, h) \
+ /* Compute Round %%t */; \
+ mov T1,   f        /* T1 = f */; \
+ mov tmp0, e        /* tmp = e */; \
+ xor T1,   g        /* T1 = f ^ g */; \
+ RORQ( tmp0, 23) /* 41     ; tmp = e ror 23 */; \
+ and T1,   e        /* T1 = (f ^ g) & e */; \
+ xor tmp0, e        /* tmp = (e ror 23) ^ e */; \
+ xor T1,   g        /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */; \
+ add T1,   [WK_2(t)] /* W[t] + K[t] from message scheduler */; \
+ RORQ( tmp0, 4) /* 18      ; tmp = ((e ror 23) ^ e) ror 4 */; \
+ xor tmp0, e        /* tmp = (((e ror 23) ^ e) ror 4) ^ e */; \
+ mov T2,   a        /* T2 = a */; \
+ add T1,   h        /* T1 = CH(e,f,g) + W[t] + K[t] + h */; \
+ RORQ( tmp0, 14) /* 14     ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */; \
+ add T1,   tmp0        /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */; \
+ mov tmp0, a        /* tmp = a */; \
+ xor T2,   c        /* T2 = a ^ c */; \
+ and tmp0, c        /* tmp = a & c */; \
+ and T2,   b        /* T2 = (a ^ c) & b */; \
+ xor T2,   tmp0        /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */; \
+ mov tmp0, a        /* tmp = a */; \
+ RORQ( tmp0, 5) /* 39      ; tmp = a ror 5 */; \
+ xor tmp0, a        /* tmp = (a ror 5) ^ a */; \
+ add d, T1          /* e(next_state) = d + T1  */; \
+ RORQ( tmp0, 6) /* 34      ; tmp = ((a ror 5) ^ a) ror 6 */; \
+ xor tmp0, a        /* tmp = (((a ror 5) ^ a) ror 6) ^ a */; \
+ lea h, [T1 + T2]   /* a(next_state) = T1 + Maj(a,b,c) */; \
+ RORQ( tmp0, 28) /* 28     ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */; \
+ add h, tmp0        /* a(next_state) = T1 + Maj(a,b,c) S0(a) */
+
+#define SHA512_2Sched_2Round_avx_PART1(t, a, b, c, d, e, f, g, h) \
+ /* \
+ ; Compute rounds %%t-2 and %%t-1 \
+ ; Compute message schedule QWORDS %%t and %%t+1 \
+ ; \
+ ;   Two rounds are computed based on the values for K[t-2]+W[t-2] and \
+ ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message \
+ ; scheduler. \
+ ;   The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. \
+ ; They are then added to their respective SHA512 constants at \
+ ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] \
+ ;   For brevity, the comments following vectored instructions only refer to \
+ ; the first of a pair of QWORDS. \
+ ; Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]} \
+ ;   The computation of the message schedule and the rounds are tightly \
+ ; stitched to take advantage of instruction-level parallelism. \
+ ; For clarity, integer instructions (for the rounds calculation) are indented \
+ ; by one tab. Vectored instructions (for the message scheduler) are indented \
+ ; by two tabs. \
+ */ \
+ \
+ vmovdqa xmm4, [W_t(t-2)]   /* XMM4 = W[t-2] */; \
+ vmovdqu xmm5, [W_t(t-15)]  /* XMM5 = W[t-15] */; \
+ mov T1,   f; \
+ vpsrlq xmm0, xmm4, 61       /* XMM0 = W[t-2]>>61 */; \
+ mov tmp0, e; \
+ vpsrlq xmm6, xmm5, 1        /* XMM6 = W[t-15]>>1 */; \
+ xor T1,   g; \
+ RORQ( tmp0, 23) /* 41 */; \
+ vpsrlq xmm1, xmm4, 19       /* XMM1 = W[t-2]>>19 */; \
+ and T1,   e; \
+ xor tmp0, e; \
+ vpxor xmm0, xmm0, xmm1           /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 */; \
+ xor T1,   g; \
+ add T1,   [WK_2(t)]; \
+ vpsrlq xmm7, xmm5, 8        /* XMM7 = W[t-15]>>8 */; \
+ RORQ( tmp0, 4) /* 18 */; \
+ vpsrlq xmm2, xmm4, 6        /* XMM2 = W[t-2]>>6 */; \
+ xor tmp0, e; \
+ mov T2,   a; \
+ add T1,   h; \
+ vpxor xmm6, xmm6, xmm7           /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 */; \
+ RORQ( tmp0, 14) /* 14 */; \
+ add T1,   tmp0; \
+ vpsrlq xmm8, xmm5, 7        /* XMM8 = W[t-15]>>7 */; \
+ mov tmp0, a; \
+ xor T2,   c; \
+ vpsllq xmm3, xmm4, (64-61)  /* XMM3 = W[t-2]<<3 */; \
+ and tmp0, c; \
+ and T2,   b; \
+ vpxor xmm2, xmm2, xmm3           /* XMM2 = W[t-2]>>6 ^ W[t-2]<<3 */; \
+ xor T2,   tmp0; \
+ mov tmp0, a; \
+ vpsllq xmm9, xmm5, (64-1)   /* XMM9 = W[t-15]<<63 */; \
+ RORQ( tmp0, 5) /* 39 */; \
+ vpxor xmm8, xmm8, xmm9           /* XMM8 = W[t-15]>>7 ^ W[t-15]<<63 */; \
+ xor tmp0, a; \
+ add d, T1; \
+ RORQ( tmp0, 6) /* 34 */; \
+ xor tmp0, a; \
+ vpxor xmm6, xmm6, xmm8           /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ W[t-15]>>7 ^ W[t-15]<<63 */; \
+ lea h, [T1 + T2]; \
+ RORQ( tmp0, 28) /* 28 */; \
+ vpsllq xmm4, xmm4, (64-19)        /* XMM4 = W[t-2]<<25 */; \
+ add h, tmp0
+
+#define SHA512_2Sched_2Round_avx_PART2(t, a, b, c, d, e, f, g, h) \
+ vpxor xmm0, xmm0, xmm4           /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ W[t-2]<<25 */; \
+ mov T1, f; \
+ vpxor xmm0, xmm0, xmm2           /* XMM0 = s1(W[t-2]) */; \
+ mov tmp0, e; \
+ xor T1,   g; \
+ vpaddq xmm0, xmm0, [W_t(t-16)]  /* XMM0 = s1(W[t-2]) + W[t-16] */; \
+ vmovdqu xmm1, [W_t(t- 7)]  /* XMM1 = W[t-7] */; \
+ RORQ( tmp0, 23) /* 41 */; \
+ and T1,   e; \
+ xor tmp0, e; \
+ xor T1,   g; \
+ vpsllq xmm5, xmm5, (64-8)         /* XMM5 = W[t-15]<<56 */; \
+ add T1,   [WK_2(t+1)]; \
+ vpxor xmm6, xmm6, xmm5           /* XMM6 = s0(W[t-15]) */; \
+ RORQ( tmp0, 4) /* 18 */; \
+ vpaddq xmm0, xmm0, xmm6           /* XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) */; \
+ xor tmp0, e; \
+ vpaddq xmm0, xmm0, xmm1           /* XMM0 = W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */; \
+ mov T2,   a; \
+ add T1,   h; \
+ RORQ( tmp0, 14) /* 14 */; \
+ add T1,   tmp0; \
+ vmovdqa [W_t(t)], xmm0      /* Store W[t] */; \
+ vpaddq xmm0, xmm0, [K_t(t)]        /* Compute W[t]+K[t] */; \
+ vmovdqa [WK_2(t)], xmm0       /* Store W[t]+K[t] for next rounds */; \
+ mov tmp0, a; \
+ xor T2,   c; \
+ and tmp0, c; \
+ and T2,   b; \
+ xor T2,   tmp0; \
+ mov tmp0, a; \
+ RORQ( tmp0, 5) /* 39 */; \
+ xor tmp0, a; \
+ add d, T1; \
+ RORQ( tmp0, 6) /* 34 */; \
+ xor tmp0, a; \
+ lea h, [T1 + T2]; \
+ RORQ( tmp0, 28) /* 28 */; \
+ add h, tmp0
+
+#define SHA512_2Sched_2Round_avx(t, a, b, c, d, e, f, g, h) \
+ SHA512_2Sched_2Round_avx_PART1(t, a, b, c, d, e, f, g, h); \
+ SHA512_2Sched_2Round_avx_PART2(t, h, a, b, c, d, e, f, g)
 
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -295,37 +284,77 @@ _gcry_sha512_transform_amd64_avx:
  mov g_64, [DIGEST(6)]
  mov h_64, [DIGEST(7)]
 
- t = 0
- .rept 80/2 + 1
- /* (80 rounds) / (2 rounds/iteration) + (1 iteration) */
- /* +1 iteration because the scheduler leads hashing by 1 iteration */
- .if t < 2
- /* BSWAP 2 QWORDS */
- vmovdqa xmm1, [.LXMM_QWORD_BSWAP ADD_RIP]
- vmovdqu xmm0, [MSG(t)]
- vpshufb xmm0, xmm0, xmm1     /* BSWAP */
- vmovdqa [W_t(t)], xmm0       /* Store Scheduled Pair */
- vpaddq xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */
- vmovdqa [WK_2(t)], xmm0      /* Store into WK for rounds */
- .elseif t < 16
- /* BSWAP 2 QWORDS, Compute 2 Rounds */
- vmovdqu xmm0, [MSG(t)]
- vpshufb xmm0, xmm0, xmm1     /* BSWAP */
- SHA512_Round (t - 2)         /* Round t-2 */
- vmovdqa [W_t(t)], xmm0       /* Store Scheduled Pair */
- vpaddq xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */
- SHA512_Round (t - 1)         /* Round t-1 */
- vmovdqa [WK_2(t)], xmm0      /* W[t]+K[t] into WK */
- .elseif t < 79
- /* Schedule 2 QWORDS; Compute 2 Rounds */
- SHA512_2Sched_2Round_avx t
- .else
- /* Compute 2 Rounds */
- SHA512_Round (t - 2)
- SHA512_Round (t - 1)
- .endif
- t = ((t)+2)
- .endr
+ /* BSWAP 2 QWORDS */
+ vmovdqa xmm1, [.LXMM_QWORD_BSWAP ADD_RIP]
+ vmovdqu xmm0, [MSG(0)]
+ vpshufb xmm0, xmm0, xmm1     /* BSWAP */
+ vmovdqa [W_t(0)], xmm0       /* Store Scheduled Pair */
+ vpaddq xmm0, xmm0, [K_t(0)] /* Compute W[t]+K[t] */
+ vmovdqa [WK_2(0)], xmm0      /* Store into WK for rounds */
+
+ #define T_2_14(t, a, b, c, d, e, f, g, h) \
+ /* BSWAP 2 QWORDS, Compute 2 Rounds */; \
+ vmovdqu xmm0, [MSG(t)]; \
+ vpshufb xmm0, xmm0, xmm1     /* BSWAP */; \
+ SHA512_Round(((t) - 2), a##_64, b##_64, c##_64, d##_64, \
+        e##_64, f##_64, g##_64, h##_64); \
+ vmovdqa [W_t(t)], xmm0       /* Store Scheduled Pair */; \
+ vpaddq xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */; \
+ SHA512_Round(((t) - 1), h##_64, a##_64, b##_64, c##_64, \
+        d##_64, e##_64, f##_64, g##_64); \
+ vmovdqa [WK_2(t)], xmm0      /* W[t]+K[t] into WK */
+
+ #define T_16_78(t, a, b, c, d, e, f, g, h) \
+ SHA512_2Sched_2Round_avx((t), a##_64, b##_64, c##_64, d##_64, \
+      e##_64, f##_64, g##_64, h##_64)
+
+ #define T_80(t, a, b, c, d, e, f, g, h) \
+ /* Compute 2 Rounds */; \
+ SHA512_Round((t - 2), a##_64, b##_64, c##_64, d##_64, \
+      e##_64, f##_64, g##_64, h##_64); \
+ SHA512_Round((t - 1), h##_64, a##_64, b##_64, c##_64, \
+      d##_64, e##_64, f##_64, g##_64)
+
+ T_2_14(2, a, b, c, d, e, f, g, h)
+ T_2_14(4, g, h, a, b, c, d, e, f)
+ T_2_14(6, e, f, g, h, a, b, c, d)
+ T_2_14(8, c, d, e, f, g, h, a, b)
+ T_2_14(10, a, b, c, d, e, f, g, h)
+ T_2_14(12, g, h, a, b, c, d, e, f)
+ T_2_14(14, e, f, g, h, a, b, c, d)
+ T_16_78(16, c, d, e, f, g, h, a, b)
+ T_16_78(18, a, b, c, d, e, f, g, h)
+ T_16_78(20, g, h, a, b, c, d, e, f)
+ T_16_78(22, e, f, g, h, a, b, c, d)
+ T_16_78(24, c, d, e, f, g, h, a, b)
+ T_16_78(26, a, b, c, d, e, f, g, h)
+ T_16_78(28, g, h, a, b, c, d, e, f)
+ T_16_78(30, e, f, g, h, a, b, c, d)
+ T_16_78(32, c, d, e, f, g, h, a, b)
+ T_16_78(34, a, b, c, d, e, f, g, h)
+ T_16_78(36, g, h, a, b, c, d, e, f)
+ T_16_78(38, e, f, g, h, a, b, c, d)
+ T_16_78(40, c, d, e, f, g, h, a, b)
+ T_16_78(42, a, b, c, d, e, f, g, h)
+ T_16_78(44, g, h, a, b, c, d, e, f)
+ T_16_78(46, e, f, g, h, a, b, c, d)
+ T_16_78(48, c, d, e, f, g, h, a, b)
+ T_16_78(50, a, b, c, d, e, f, g, h)
+ T_16_78(52, g, h, a, b, c, d, e, f)
+ T_16_78(54, e, f, g, h, a, b, c, d)
+ T_16_78(56, c, d, e, f, g, h, a, b)
+ T_16_78(58, a, b, c, d, e, f, g, h)
+ T_16_78(60, g, h, a, b, c, d, e, f)
+ T_16_78(62, e, f, g, h, a, b, c, d)
+ T_16_78(64, c, d, e, f, g, h, a, b)
+ T_16_78(66, a, b, c, d, e, f, g, h)
+ T_16_78(68, g, h, a, b, c, d, e, f)
+ T_16_78(70, e, f, g, h, a, b, c, d)
+ T_16_78(72, c, d, e, f, g, h, a, b)
+ T_16_78(74, a, b, c, d, e, f, g, h)
+ T_16_78(76, g, h, a, b, c, d, e, f)
+ T_16_78(78, e, f, g, h, a, b, c, d)
+ T_80(80, c, d, e, f, g, h, a, b)
 
  /* Update digest */
  add [DIGEST(0)], a_64
@@ -357,11 +386,12 @@ _gcry_sha512_transform_amd64_avx:
  vzeroall
 
  /* Burn stack */
- t = 0
- .rept frame_W_size / 32
- vmovups [rsp + frame_W + (t) * 32], ymm0
- t = ((t)+1)
- .endr
+ mov eax, 0
+.Lerase_stack:
+ vmovdqu [rsp + rax], ymm0
+ add eax, 32
+ cmp eax, frame_W_size
+ jne .Lerase_stack
  vmovdqu [rsp + frame_WK], xmm0
  xor     eax, eax
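
Two notes on the hunks above: the assembler-time ".rept"/".if" scheduling loop has no cpp equivalent, so the 80 rounds are unrolled into explicit T_2_14/T_16_78/T_80 invocations whose a..h list rotates by two positions per step, and the ".rept"-generated stack-wipe stores become a small run-time loop. A sketch of that second conversion, with a hypothetical label and a made-up buffer size:

/* stack-wipe sketch; hypothetical, not part of the patch */
.intel_syntax noprefix
.text

#define WIPE_SIZE (4 * 32)          /* made-up size, a multiple of 32 */

.globl wipe_sketch
wipe_sketch:
        sub rsp, WIPE_SIZE
        /* ... work that spills sensitive data to [rsp .. rsp+WIPE_SIZE) ... */

        /* previously the stores were emitted at assembly time:
         *   t = 0
         *   .rept WIPE_SIZE / 32
         *    vmovups [rsp + (t) * 32], ymm0
         *    t = ((t)+1)
         *   .endr
         * the run-time loop below clears the same range instead:          */
        vpxor xmm0, xmm0, xmm0      /* VEX zeroing of xmm0 clears ymm0 too */
        mov eax, 0
.Lwipe_loop:
        vmovdqu [rsp + rax], ymm0
        add eax, 32
        cmp eax, WIPE_SIZE
        jne .Lwipe_loop

        add rsp, WIPE_SIZE
        xor eax, eax
        ret

The hunk above does the same over frame_W_size bytes, reusing the ymm0 that vzeroall has already cleared.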
 
diff --git a/cipher/sha512-avx2-bmi2-amd64.S b/cipher/sha512-avx2-bmi2-amd64.S
index 3b28ab6c..7f119e6c 100644
--- a/cipher/sha512-avx2-bmi2-amd64.S
+++ b/cipher/sha512-avx2-bmi2-amd64.S
@@ -56,46 +56,45 @@
 .text
 
 /* Virtual Registers */
-Y_0 = ymm4
-Y_1 = ymm5
-Y_2 = ymm6
-Y_3 = ymm7
-
-YTMP0 = ymm0
-YTMP1 = ymm1
-YTMP2 = ymm2
-YTMP3 = ymm3
-YTMP4 = ymm8
-XFER =  YTMP0
-
-BYTE_FLIP_MASK =  ymm9
-MASK_YMM_LO    =  ymm10
-MASK_YMM_LOx   =  xmm10
-
-INP =         rdi /* 1st arg */
-CTX =         rsi /* 2nd arg */
-NUM_BLKS =    rdx /* 3rd arg */
-c =           rcx
-d =           r8
-e =           rdx
-y3 =          rdi
-
-TBL =   rbp
-
-a =     rax
-b =     rbx
-
-f =     r9
-g =     r10
-h =     r11
-old_h = rax
-
-T1 =    r12
-y0 =    r13
-y1 =    r14
-y2 =    r15
-
-y4 =    r12
+#define Y_0 ymm4
+#define Y_1 ymm5
+#define Y_2 ymm6
+#define Y_3 ymm7
+
+#define YTMP0 ymm0
+#define YTMP1 ymm1
+#define YTMP2 ymm2
+#define YTMP3 ymm3
+#define YTMP4 ymm8
+#define XFER YTMP0
+
+#define BYTE_FLIP_MASK ymm9
+#define MASK_YMM_LO ymm10
+#define MASK_YMM_LOx xmm10
+
+#define INP rdi /* 1st arg */
+#define CTX rsi /* 2nd arg */
+#define NUM_BLKS rdx /* 3rd arg */
+#define c rcx
+#define d r8
+#define e rdx
+#define y3 rdi
+
+#define TBL rbp
+
+#define a rax
+#define b rbx
+
+#define f r9
+#define g r10
+#define h r11
+
+#define T1 r12
+#define y0 r13
+#define y1 r14
+#define y2 r15
+
+#define y4 r12
 
 /* Local variables (stack frame) */
 #define frame_XFER      0
@@ -116,218 +115,153 @@ y4 =    r12
 
 /* addm [mem], reg */
 /* Add reg to mem using reg-mem add and store */
-.macro addm p1 p2
- add \p2, \p1
- mov \p1, \p2
-.endm
+#define addm(p1, p2) \
+ add p2, p1; \
+ mov p1, p2;
 
 
 /* COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask */
 /* Load ymm with mem and byte swap each dword */
-.macro COPY_YMM_AND_BSWAP p1 p2 p3
- VMOVDQ \p1, \p2
- vpshufb \p1, \p1, \p3
-.endm
-/* rotate_Ys */
-/* Rotate values of symbols Y0...Y3 */
-.macro rotate_Ys
- __Y_ = Y_0
- Y_0 = Y_1
- Y_1 = Y_2
- Y_2 = Y_3
- Y_3 = __Y_
-.endm
-
-/* RotateState */
-.macro RotateState
- /* Rotate symbles a..h right */
- old_h =  h
- __TMP_ = h
- h =      g
- g =      f
- f =      e
- e =      d
- d =      c
- c =      b
- b =      a
- a =      __TMP_
-.endm
+#define COPY_YMM_AND_BSWAP(p1, p2, p3) \
+ VMOVDQ p1, p2; \
+ vpshufb p1, p1, p3
 
 /* %macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL */
 /* YDST = {YSRC1, YSRC2} >> RVAL*8 */
-.macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL
- vperm2f128 \YDST, \YSRC1, \YSRC2, 0x3 /* YDST = {YS1_LO, YS2_HI} */
- vpalignr \YDST, \YDST, \YSRC2, \RVAL /* YDST = {YDS1, YS2} >> RVAL*8 */
-.endm
-
-.macro ONE_ROUND_PART1 XFER
- /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]);
- * d += h;
- * h += Sum0 (a) + Maj (a, b, c);
- *
- * Ch(x, y, z) => ((x & y) + (~x & z))
- * Maj(x, y, z) => ((x & y) + (z & (x ^ y)))
- */
-
- mov y3, e
- add h, [\XFER]
- and y3, f
- rorx y0, e, 41
- rorx y1, e, 18
+#define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \
+ vperm2i128 YDST, YSRC1, YSRC2, 0x3 /* YDST = {YS1_LO, YS2_HI} */; \
+ vpalignr   YDST, YDST, YSRC2, RVAL /* YDST = {YDS1, YS2} >> RVAL*8 */
+
+#define ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h) \
+ /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]); \
+ * d += h; \
+ * h += Sum0 (a) + Maj (a, b, c); \
+ * \
+ * Ch(x, y, z) => ((x & y) + (~x & z)) \
+ * Maj(x, y, z) => ((x & y) + (z & (x ^ y))) \
+ */ \
+ \
+ mov y3, e; \
+ add h, [XFERIN]; \
+ and y3, f; \
+ rorx y0, e, 41; \
+ rorx y1, e, 18; \
+ lea h, [h + y3]; \
+ andn y3, e, g; \
+ rorx T1, a, 34; \
+ xor y0, y1; \
  lea h, [h + y3]
- andn y3, e, g
- rorx T1, a, 34
- xor y0, y1
- lea h, [h + y3]
-.endm
-.macro ONE_ROUND_PART2
- rorx y2, a, 39
- rorx y1, e, 14
- mov y3, a
- xor T1, y2
- xor y0, y1
- xor y3, b
- lea h, [h + y0]
- mov y0, a
- rorx y2, a, 28
- add d, h
- and y3, c
- xor T1, y2
- lea h, [h + y3]
- lea h, [h + T1]
- and y0, b
- lea h, [h + y0]
-.endm
-
-.macro ONE_ROUND XFER
- ONE_ROUND_PART1 \XFER
- ONE_ROUND_PART2
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED X
-/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
- /* Extract w[t-7] */
- MY_VPALIGNR YTMP0, Y_3, Y_2, 8 /* YTMP0 = W[-7] */
- /* Calculate w[t-16] + w[t-7] */
- vpaddq YTMP0, YTMP0, Y_0 /* YTMP0 = W[-7] + W[-16] */
- /* Extract w[t-15] */
- MY_VPALIGNR YTMP1, Y_1, Y_0, 8 /* YTMP1 = W[-15] */
-
- /* Calculate sigma0 */
-
- /* Calculate w[t-15] ror 1 */
- vpsrlq YTMP2, YTMP1, 1
- vpsllq YTMP3, YTMP1, (64-1)
- vpor YTMP3, YTMP3, YTMP2 /* YTMP3 = W[-15] ror 1 */
- /* Calculate w[t-15] shr 7 */
- vpsrlq YTMP4, YTMP1, 7 /* YTMP4 = W[-15] >> 7 */
-
- ONE_ROUND rsp+frame_XFER+0*8+\X*32
- RotateState
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;; */
-
- /* Calculate w[t-15] ror 8 */
- vpsrlq YTMP2, YTMP1, 8
- vpsllq YTMP1, YTMP1, (64-8)
- vpor YTMP1, YTMP1, YTMP2 /* YTMP1 = W[-15] ror 8 */
- /* XOR the three components */
- vpxor YTMP3, YTMP3, YTMP4 /* YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 */
- vpxor YTMP1, YTMP3, YTMP1 /* YTMP1 = s0 */
-
-
- /* Add three components, w[t-16], w[t-7] and sigma0 */
- vpaddq YTMP0, YTMP0, YTMP1 /* YTMP0 = W[-16] + W[-7] + s0 */
- /* Move to appropriate lanes for calculating w[16] and w[17] */
- vperm2f128 Y_0, YTMP0, YTMP0, 0x0 /* Y_0 = W[-16] + W[-7] + s0 {BABA} */
- /* Move to appropriate lanes for calculating w[18] and w[19] */
- vpand YTMP0, YTMP0, MASK_YMM_LO /* YTMP0 = W[-16] + W[-7] + s0 {DC00} */
-
- /* Calculate w[16] and w[17] in both 128 bit lanes */
-
- /* Calculate sigma1 for w[16] and w[17] on both 128 bit lanes */
- vperm2f128 YTMP2, Y_3, Y_3, 0x11 /* YTMP2 = W[-2] {BABA} */
- vpsrlq YTMP4, YTMP2, 6 /* YTMP4 = W[-2] >> 6 {BABA} */
-
- ONE_ROUND rsp+frame_XFER+1*8+\X*32
- RotateState
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;; */
 
+#define ONE_ROUND_PART2(a, b, c, d, e, f, g, h) \
+ rorx y2, a, 39; \
+ rorx y1, e, 14; \
+ mov y3, a; \
+ xor T1, y2; \
+ xor y0, y1; \
+ xor y3, b; \
+ lea h, [h + y0]; \
+ mov y0, a; \
+ rorx y2, a, 28; \
+ add d, h; \
+ and y3, c; \
+ xor T1, y2; \
+ lea h, [h + y3]; \
+ lea h, [h + T1]; \
+ and y0, b; \
+ lea h, [h + y0]
 
- vpsrlq YTMP3, YTMP2, 19 /* YTMP3 = W[-2] >> 19 {BABA} */
- vpsllq YTMP1, YTMP2, (64-19) /* YTMP1 = W[-2] << 19 {BABA} */
- vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 19 {BABA} */
- vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} */
- vpsrlq YTMP3, YTMP2, 61 /* YTMP3 = W[-2] >> 61 {BABA} */
- vpsllq YTMP1, YTMP2, (64-61) /* YTMP1 = W[-2] << 61 {BABA} */
- vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 61 {BABA} */
- vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} */
-
- /* Add sigma1 to the other compunents to get w[16] and w[17] */
- vpaddq Y_0, Y_0, YTMP4 /* Y_0 = {W[1], W[0], W[1], W[0]} */
-
- /* Calculate sigma1 for w[18] and w[19] for upper 128 bit lane */
- vpsrlq YTMP4, Y_0, 6 /* YTMP4 = W[-2] >> 6 {DC--} */
-
- ONE_ROUND rsp+frame_XFER+2*8+\X*32
- RotateState
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;; */
-
- vpsrlq YTMP3, Y_0, 19 /* YTMP3 = W[-2] >> 19 {DC--} */
- vpsllq YTMP1, Y_0, (64-19) /* YTMP1 = W[-2] << 19 {DC--} */
- vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 19 {DC--} */
- vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} */
- vpsrlq YTMP3, Y_0, 61 /* YTMP3 = W[-2] >> 61 {DC--} */
- vpsllq YTMP1, Y_0, (64-61) /* YTMP1 = W[-2] << 61 {DC--} */
- vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 61 {DC--} */
- vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} */
-
- /* Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19] */
- vpaddq YTMP2, YTMP0, YTMP4 /* YTMP2 = {W[3], W[2], --, --} */
-
- /* Form w[19, w[18], w17], w[16] */
- vpblendd Y_0, Y_0, YTMP2, 0xF0 /* Y_0 = {W[3], W[2], W[1], W[0]} */
-
- ONE_ROUND_PART1 rsp+frame_XFER+3*8+\X*32
- vpaddq XFER, Y_0, [TBL + (4+\X)*32]
- vmovdqa [rsp + frame_XFER + \X*32], XFER
- ONE_ROUND_PART2
- RotateState
- rotate_Ys
-.endm
-
-.macro DO_4ROUNDS X
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
- ONE_ROUND rsp+frame_XFER+0*8+\X*32
- RotateState
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
- ONE_ROUND rsp+frame_XFER+1*8+\X*32
- RotateState
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
- ONE_ROUND rsp+frame_XFER+2*8+\X*32
- RotateState
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
- ONE_ROUND rsp+frame_XFER+3*8+\X*32
- RotateState
-
-.endm
+#define ONE_ROUND(XFERIN, a, b, c, d, e, f, g, h) \
+ ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h); \
+ ONE_ROUND_PART2(a, b, c, d, e, f, g, h)
+
+#define FOUR_ROUNDS_AND_SCHED(X, Y_0, Y_1, Y_2, Y_3, a, b, c, d, e, f, g, h) \
+ /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+ /* Extract w[t-7] */; \
+ MY_VPALIGNR( YTMP0, Y_3, Y_2, 8) /* YTMP0 = W[-7] */; \
+ /* Calculate w[t-16] + w[t-7] */; \
+ vpaddq YTMP0, YTMP0, Y_0 /* YTMP0 = W[-7] + W[-16] */; \
+ /* Extract w[t-15] */; \
+ MY_VPALIGNR( YTMP1, Y_1, Y_0, 8) /* YTMP1 = W[-15] */; \
+ \
+ /* Calculate sigma0 */; \
+ \
+ /* Calculate w[t-15] ror 1 */; \
+ vpsrlq YTMP2, YTMP1, 1; \
+ vpsllq YTMP3, YTMP1, (64-1); \
+ vpor YTMP3, YTMP3, YTMP2 /* YTMP3 = W[-15] ror 1 */; \
+ /* Calculate w[t-15] shr 7 */; \
+ vpsrlq YTMP4, YTMP1, 7 /* YTMP4 = W[-15] >> 7 */; \
+ \
+ ONE_ROUND(rsp+frame_XFER+0*8+X*32, a, b, c, d, e, f, g, h); \
+ \
+ /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+ /* Calculate w[t-15] ror 8 */; \
+ vpsrlq YTMP2, YTMP1, 8; \
+ vpsllq YTMP1, YTMP1, (64-8); \
+ vpor YTMP1, YTMP1, YTMP2 /* YTMP1 = W[-15] ror 8 */; \
+ /* XOR the three components */; \
+ vpxor YTMP3, YTMP3, YTMP4 /* YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 */; \
+ vpxor YTMP1, YTMP3, YTMP1 /* YTMP1 = s0 */; \
+ \
+ /* Add three components, w[t-16], w[t-7] and sigma0 */; \
+ vpaddq YTMP0, YTMP0, YTMP1 /* YTMP0 = W[-16] + W[-7] + s0 */; \
+ /* Move to appropriate lanes for calculating w[16] and w[17] */; \
+ vperm2i128 Y_0, YTMP0, YTMP0, 0x0 /* Y_0 = W[-16] + W[-7] + s0 {BABA} */; \
+ /* Move to appropriate lanes for calculating w[18] and w[19] */; \
+ vpand YTMP0, YTMP0, MASK_YMM_LO /* YTMP0 = W[-16] + W[-7] + s0 {DC00} */; \
+ \
+ /* Calculate w[16] and w[17] in both 128 bit lanes */; \
+ \
+ /* Calculate sigma1 for w[16] and w[17] on both 128 bit lanes */; \
+ vperm2i128 YTMP2, Y_3, Y_3, 0x11 /* YTMP2 = W[-2] {BABA} */; \
+ vpsrlq YTMP4, YTMP2, 6 /* YTMP4 = W[-2] >> 6 {BABA} */; \
+ \
+ ONE_ROUND(rsp+frame_XFER+1*8+X*32, h, a, b, c, d, e, f, g); \
+ \
+ /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+ vpsrlq YTMP3, YTMP2, 19 /* YTMP3 = W[-2] >> 19 {BABA} */; \
+ vpsllq YTMP1, YTMP2, (64-19) /* YTMP1 = W[-2] << 19 {BABA} */; \
+ vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 19 {BABA} */; \
+ vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} */; \
+ vpsrlq YTMP3, YTMP2, 61 /* YTMP3 = W[-2] >> 61 {BABA} */; \
+ vpsllq YTMP1, YTMP2, (64-61) /* YTMP1 = W[-2] << 61 {BABA} */; \
+ vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 61 {BABA} */; \
+ vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} */; \
+ \
+ /* Add sigma1 to the other components to get w[16] and w[17] */; \
+ vpaddq Y_0, Y_0, YTMP4 /* Y_0 = {W[1], W[0], W[1], W[0]} */; \
+ \
+ /* Calculate sigma1 for w[18] and w[19] for upper 128 bit lane */; \
+ vpsrlq YTMP4, Y_0, 6 /* YTMP4 = W[-2] >> 6 {DC--} */; \
+ \
+ ONE_ROUND(rsp+frame_XFER+2*8+X*32, g, h, a, b, c, d, e, f); \
+ \
+ /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+ vpsrlq YTMP3, Y_0, 19 /* YTMP3 = W[-2] >> 19 {DC--} */; \
+ vpsllq YTMP1, Y_0, (64-19) /* YTMP1 = W[-2] << 19 {DC--} */; \
+ vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 19 {DC--} */; \
+ vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} */; \
+ vpsrlq YTMP3, Y_0, 61 /* YTMP3 = W[-2] >> 61 {DC--} */; \
+ vpsllq YTMP1, Y_0, (64-61) /* YTMP1 = W[-2] << 61 {DC--} */; \
+ vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 61 {DC--} */; \
+ vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} */; \
+ \
+ /* Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19] */; \
+ vpaddq YTMP2, YTMP0, YTMP4 /* YTMP2 = {W[3], W[2], --, --} */; \
+ \
+ /* Form w[19], w[18], w[17], w[16] */; \
+ vpblendd Y_0, Y_0, YTMP2, 0xF0 /* Y_0 = {W[3], W[2], W[1], W[0]} */; \
+ \
+ ONE_ROUND_PART1(rsp+frame_XFER+3*8+X*32, f, g, h, a, b, c, d, e); \
+ vpaddq XFER, Y_0, [TBL + (4+X)*32]; \
+ vmovdqa [rsp + frame_XFER + X*32], XFER; \
+ ONE_ROUND_PART2(f, g, h, a, b, c, d, e)
+
+#define DO_4ROUNDS(X, a, b, c, d, e, f, g, h) \
+ ONE_ROUND(rsp+frame_XFER+0*8+X*32, a, b, c, d, e, f, g, h); \
+ ONE_ROUND(rsp+frame_XFER+1*8+X*32, h, a, b, c, d, e, f, g); \
+ ONE_ROUND(rsp+frame_XFER+2*8+X*32, g, h, a, b, c, d, e, f); \
+ ONE_ROUND(rsp+frame_XFER+3*8+X*32, f, g, h, a, b, c, d, e)
 
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -390,10 +324,10 @@ _gcry_sha512_transform_amd64_avx2:
  lea TBL,[.LK512 ADD_RIP]
 
  /*; byte swap first 16 dwords */
- COPY_YMM_AND_BSWAP Y_0, [INP + 0*32], BYTE_FLIP_MASK
- COPY_YMM_AND_BSWAP Y_1, [INP + 1*32], BYTE_FLIP_MASK
- COPY_YMM_AND_BSWAP Y_2, [INP + 2*32], BYTE_FLIP_MASK
- COPY_YMM_AND_BSWAP Y_3, [INP + 3*32], BYTE_FLIP_MASK
+ COPY_YMM_AND_BSWAP(Y_0, [INP + 0*32], BYTE_FLIP_MASK)
+ COPY_YMM_AND_BSWAP(Y_1, [INP + 1*32], BYTE_FLIP_MASK)
+ COPY_YMM_AND_BSWAP(Y_2, [INP + 2*32], BYTE_FLIP_MASK)
+ COPY_YMM_AND_BSWAP(Y_3, [INP + 3*32], BYTE_FLIP_MASK)
 
  add INP, 128
  mov [rsp + frame_INP], INP
@@ -408,20 +342,20 @@ _gcry_sha512_transform_amd64_avx2:
  vmovdqa [rsp + frame_XFER + 3*32], XFER
 
  /*; schedule 64 input dwords, by doing 12 rounds of 4 each */
- movq [rsp + frame_SRND],4
+ mov qword ptr [rsp + frame_SRND], 4
 
 .align 16
 .Loop0:
- FOUR_ROUNDS_AND_SCHED 0
- FOUR_ROUNDS_AND_SCHED 1
- FOUR_ROUNDS_AND_SCHED 2
- FOUR_ROUNDS_AND_SCHED 3
+ FOUR_ROUNDS_AND_SCHED(0, Y_0, Y_1, Y_2, Y_3, a, b, c, d, e, f, g, h)
+ FOUR_ROUNDS_AND_SCHED(1, Y_1, Y_2, Y_3, Y_0, e, f, g, h, a, b, c, d)
+ FOUR_ROUNDS_AND_SCHED(2, Y_2, Y_3, Y_0, Y_1, a, b, c, d, e, f, g, h)
+ FOUR_ROUNDS_AND_SCHED(3, Y_3, Y_0, Y_1, Y_2, e, f, g, h, a, b, c, d)
  add TBL, 4*32
 
- subq [rsp + frame_SRND], 1
+ sub qword ptr [rsp + frame_SRND], 1
  jne .Loop0
 
- subq [rsp + frame_NBLKS], 1
+ sub qword ptr [rsp + frame_NBLKS], 1
  je .Ldone_hash
 
  mov INP, [rsp + frame_INP]
@@ -429,62 +363,62 @@ _gcry_sha512_transform_amd64_avx2:
  lea TBL,[.LK512 ADD_RIP]
 
  /* load next block and byte swap */
- COPY_YMM_AND_BSWAP Y_0, [INP + 0*32], BYTE_FLIP_MASK
- COPY_YMM_AND_BSWAP Y_1, [INP + 1*32], BYTE_FLIP_MASK
- COPY_YMM_AND_BSWAP Y_2, [INP + 2*32], BYTE_FLIP_MASK
- COPY_YMM_AND_BSWAP Y_3, [INP + 3*32], BYTE_FLIP_MASK
+ COPY_YMM_AND_BSWAP(Y_0, [INP + 0*32], BYTE_FLIP_MASK)
+ COPY_YMM_AND_BSWAP(Y_1, [INP + 1*32], BYTE_FLIP_MASK)
+ COPY_YMM_AND_BSWAP(Y_2, [INP + 2*32], BYTE_FLIP_MASK)
+ COPY_YMM_AND_BSWAP(Y_3, [INP + 3*32], BYTE_FLIP_MASK)
 
  add INP, 128
  mov [rsp + frame_INP], INP
 
- DO_4ROUNDS 0
+ DO_4ROUNDS(0, a, b, c, d, e, f, g, h)
  vpaddq XFER, Y_0, [TBL + 0*32]
  vmovdqa [rsp + frame_XFER + 0*32], XFER
- DO_4ROUNDS 1
+ DO_4ROUNDS(1, e, f, g, h, a, b, c, d)
  vpaddq XFER, Y_1, [TBL + 1*32]
  vmovdqa [rsp + frame_XFER + 1*32], XFER
- DO_4ROUNDS 2
+ DO_4ROUNDS(2, a, b, c, d, e, f, g, h)
  vpaddq XFER, Y_2, [TBL + 2*32]
  vmovdqa [rsp + frame_XFER + 2*32], XFER
- DO_4ROUNDS 3
+ DO_4ROUNDS(3, e, f, g, h, a, b, c, d)
  vpaddq XFER, Y_3, [TBL + 3*32]
  vmovdqa [rsp + frame_XFER + 3*32], XFER
 
- addm [8*0 + CTX],a
- addm [8*1 + CTX],b
- addm [8*2 + CTX],c
- addm [8*3 + CTX],d
- addm [8*4 + CTX],e
- addm [8*5 + CTX],f
- addm [8*6 + CTX],g
- addm [8*7 + CTX],h
+ addm([8*0 + CTX],a)
+ addm([8*1 + CTX],b)
+ addm([8*2 + CTX],c)
+ addm([8*3 + CTX],d)
+ addm([8*4 + CTX],e)
+ addm([8*5 + CTX],f)
+ addm([8*6 + CTX],g)
+ addm([8*7 + CTX],h)
 
  /*; schedule 64 input dwords, by doing 12 rounds of 4 each */
- movq [rsp + frame_SRND],4
+ mov qword ptr [rsp + frame_SRND],4
 
  jmp .Loop0
 
 .Ldone_hash:
  vzeroall
 
- DO_4ROUNDS 0
+ DO_4ROUNDS(0, a, b, c, d, e, f, g, h)
  vmovdqa [rsp + frame_XFER + 0*32], ymm0 /* burn stack */
- DO_4ROUNDS 1
+ DO_4ROUNDS(1, e, f, g, h, a, b, c, d)
  vmovdqa [rsp + frame_XFER + 1*32], ymm0 /* burn stack */
- DO_4ROUNDS 2
+ DO_4ROUNDS(2, a, b, c, d, e, f, g, h)
  vmovdqa [rsp + frame_XFER + 2*32], ymm0 /* burn stack */
- DO_4ROUNDS 3
+ DO_4ROUNDS(3, e, f, g, h, a, b, c, d)
  vmovdqa [rsp + frame_XFER + 3*32], ymm0 /* burn stack */
 
- addm [8*0 + CTX],a
+ addm([8*0 + CTX],a)
  xor     eax, eax /* burn stack */
- addm [8*1 + CTX],b
- addm [8*2 + CTX],c
- addm [8*3 + CTX],d
- addm [8*4 + CTX],e
- addm [8*5 + CTX],f
- addm [8*6 + CTX],g
- addm [8*7 + CTX],h
+ addm([8*1 + CTX],b)
+ addm([8*2 + CTX],c)
+ addm([8*3 + CTX],d)
+ addm([8*4 + CTX],e)
+ addm([8*5 + CTX],f)
+ addm([8*6 + CTX],g)
+ addm([8*7 + CTX],h)
 
  /* Restore GPRs */
  mov rbp, [rsp + frame_GPRSAVE + 8 * 0]
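
One detail of the avx2 file worth noting: ONE_ROUND stays split into a PART1/PART2 pair, as the removed .macro version already was, so the vpaddq/vmovdqa pair that refills the XFER slot can be scheduled between the two scalar halves of the final round of FOUR_ROUNDS_AND_SCHED. A sketch of that interleaving shape only, with hypothetical macro and register names (HALF_A, HALF_B, interleave_sketch; rdi is assumed to point at 32 readable bytes):

/* interleaving sketch; hypothetical, not part of the patch */
.intel_syntax noprefix
.text

#define HALF_A(h, e) \
        add h, e; \
        ror h, 14

#define HALF_B(h, a) \
        xor h, a; \
        ror h, 28

.globl interleave_sketch
interleave_sketch:
        HALF_A(r11, rdx)             /* first scalar half of the round  */
        vpaddq  ymm0, ymm1, [rdi]    /* vector work slotted in between  */
        vmovdqu [rsp - 32], ymm0     /* red-zone store, sketch only     */
        HALF_B(r11, rax)             /* second scalar half              */
        ret

This is exactly the ONE_ROUND_PART1 / vpaddq / vmovdqa / ONE_ROUND_PART2 sequence at the tail of FOUR_ROUNDS_AND_SCHED in the hunk above.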
diff --git a/cipher/sha512-ssse3-amd64.S b/cipher/sha512-ssse3-amd64.S
index 39bfe362..6a1328a6 100644
--- a/cipher/sha512-ssse3-amd64.S
+++ b/cipher/sha512-ssse3-amd64.S
@@ -56,32 +56,32 @@
 .text
 
 /* Virtual Registers */
-msg = rdi /* ARG1 */
-digest = rsi /* ARG2 */
-msglen = rdx /* ARG3 */
-T1 = rcx
-T2 = r8
-a_64 = r9
-b_64 = r10
-c_64 = r11
-d_64 = r12
-e_64 = r13
-f_64 = r14
-g_64 = r15
-h_64 = rbx
-tmp0 = rax
+#define msg rdi /* ARG1 */
+#define digest rsi /* ARG2 */
+#define msglen rdx /* ARG3 */
+#define T1 rcx
+#define T2 r8
+#define a_64 r9
+#define b_64 r10
+#define c_64 r11
+#define d_64 r12
+#define e_64 r13
+#define f_64 r14
+#define g_64 r15
+#define h_64 rbx
+#define tmp0 rax
 
 /*
 ; Local variables (stack frame)
 ; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP
 */
-frame_W      = 0 /* Message Schedule */
-frame_W_size = (80 * 8)
-frame_WK      = ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */
-frame_WK_size = (2 * 8)
-frame_GPRSAVE      = ((frame_WK) + (frame_WK_size))
-frame_GPRSAVE_size = (5 * 8)
-frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size))
+#define frame_W 0 /* Message Schedule */
+#define frame_W_size (80 * 8)
+#define frame_WK ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */
+#define frame_WK_size (2 * 8)
+#define frame_GPRSAVE ((frame_WK) + (frame_WK_size))
+#define frame_GPRSAVE_size (5 * 8)
+#define frame_size ((frame_GPRSAVE) + (frame_GPRSAVE_size))
 
 
 /* Useful QWORD "arrays" for simpler memory references */
@@ -93,161 +93,151 @@ frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size))
 /* MSG, DIGEST, K_t, W_t are arrays */
 /* WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even */
 
-.macro RotateState
- /* Rotate symbles a..h right */
- __TMP = h_64
- h_64 =  g_64
- g_64 =  f_64
- f_64 =  e_64
- e_64 =  d_64
- d_64 =  c_64
- c_64 =  b_64
- b_64 =  a_64
- a_64 =  __TMP
-.endm
-
-.macro SHA512_Round t
- /* Compute Round %%t */
- mov T1,   f_64        /* T1 = f */
- mov tmp0, e_64        /* tmp = e */
- xor T1,   g_64        /* T1 = f ^ g */
- ror tmp0, 23 /* 41     ; tmp = e ror 23 */
- and T1,   e_64        /* T1 = (f ^ g) & e */
- xor tmp0, e_64        /* tmp = (e ror 23) ^ e */
- xor T1,   g_64        /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */
- add T1,   [WK_2(\t)] /* W[t] + K[t] from message scheduler */
- ror tmp0, 4 /* 18      ; tmp = ((e ror 23) ^ e) ror 4 */
- xor tmp0, e_64        /* tmp = (((e ror 23) ^ e) ror 4) ^ e */
- mov T2,   a_64        /* T2 = a */
- add T1,   h_64        /* T1 = CH(e,f,g) + W[t] + K[t] + h */
- ror tmp0, 14 /* 14     ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */
- add T1,   tmp0        /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */
- mov tmp0, a_64        /* tmp = a */
- xor T2,   c_64        /* T2 = a ^ c */
- and tmp0, c_64        /* tmp = a & c */
- and T2,   b_64        /* T2 = (a ^ c) & b */
- xor T2,   tmp0        /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */
- mov tmp0, a_64        /* tmp = a */
- ror tmp0, 5 /* 39      ; tmp = a ror 5 */
- xor tmp0, a_64        /* tmp = (a ror 5) ^ a */
- add d_64, T1          /* e(next_state) = d + T1  */
- ror tmp0, 6 /* 34      ; tmp = ((a ror 5) ^ a) ror 6 */
- xor tmp0, a_64        /* tmp = (((a ror 5) ^ a) ror 6) ^ a */
- lea h_64, [T1 + T2]   /* a(next_state) = T1 + Maj(a,b,c) */
- ror tmp0, 28 /* 28     ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */
- add h_64, tmp0        /* a(next_state) = T1 + Maj(a,b,c) S0(a) */
- RotateState
-.endm
-
-.macro SHA512_2Sched_2Round_sse t
-/* ; Compute rounds %%t-2 and %%t-1
- ; Compute message schedule QWORDS %%t and %%t+1
-
- ;   Two rounds are computed based on the values for K[t-2]+W[t-2] and
- ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message
- ; scheduler.
- ;   The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
- ; They are then added to their respective SHA512 constants at
- ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
- ;   For brievity, the comments following vectored instructions only refer to
- ; the first of a pair of QWORDS.
- ; Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
- ;   The computation of the message schedule and the rounds are tightly
- ; stitched to take advantage of instruction-level parallelism.
- ; For clarity, integer instructions (for the rounds calculation) are indented
- ; by one tab. Vectored instructions (for the message scheduler) are indented
- ; by two tabs. */
-
- mov T1, f_64
- movdqa xmm2, [W_t(\t-2)]  /* XMM2 = W[t-2] */
- xor T1,   g_64
- and T1,   e_64
- movdqa xmm0, xmm2          /* XMM0 = W[t-2] */
- xor T1,   g_64
- add T1,   [WK_2(\t)]
- movdqu xmm5, [W_t(\t-15)] /* XMM5 = W[t-15] */
- mov tmp0, e_64
- ror tmp0, 23 /* 41 */
- movdqa xmm3, xmm5          /* XMM3 = W[t-15] */
- xor tmp0, e_64
- ror tmp0, 4 /* 18 */
- psrlq xmm0, 61 - 19       /* XMM0 = W[t-2] >> 42 */
- xor tmp0, e_64
- ror tmp0, 14 /* 14 */
- psrlq xmm3, (8 - 7)       /* XMM3 = W[t-15] >> 1 */
- add T1,   tmp0
- add T1,   h_64
- pxor xmm0, xmm2          /* XMM0 = (W[t-2] >> 42) ^ W[t-2] */
- mov T2,   a_64
- xor T2,   c_64
- pxor xmm3, xmm5          /* XMM3 = (W[t-15] >> 1) ^ W[t-15] */
- and T2,   b_64
- mov tmp0, a_64
- psrlq xmm0, 19 - 6        /* XMM0 = ((W[t-2]>>42)^W[t-2])>>13 */
- and tmp0, c_64
- xor T2,   tmp0
- psrlq xmm3, (7 - 1)       /* XMM3 = ((W[t-15]>>1)^W[t-15])>>6 */
- mov tmp0, a_64
- ror tmp0, 5 /* 39 */
- pxor xmm0, xmm2          /* XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] */
- xor tmp0, a_64
- ror tmp0, 6 /* 34 */
- pxor xmm3, xmm5          /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] */
- xor tmp0, a_64
- ror tmp0, 28 /* 28 */
- psrlq xmm0, 6             /* XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 */
- add T2,   tmp0
- add d_64, T1
- psrlq xmm3, 1             /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 */
- lea h_64, [T1 + T2]
- RotateState
- movdqa xmm1, xmm2          /* XMM1 = W[t-2] */
- mov T1, f_64
- xor T1,   g_64
- movdqa xmm4, xmm5          /* XMM4 = W[t-15] */
- and T1,   e_64
- xor T1,   g_64
- psllq xmm1, (64 - 19) - (64 - 61) /* XMM1 = W[t-2] << 42 */
- add T1,   [WK_2(\t+1)]
- mov tmp0, e_64
- psllq xmm4, (64 - 1) - (64 - 8) /* XMM4 = W[t-15] << 7 */
- ror tmp0, 23 /* 41 */
- xor tmp0, e_64
- pxor xmm1, xmm2          /* XMM1 = (W[t-2] << 42)^W[t-2] */
- ror tmp0, 4 /* 18 */
- xor tmp0, e_64
- pxor xmm4, xmm5          /* XMM4 = (W[t-15]<<7)^W[t-15] */
- ror tmp0, 14 /* 14 */
- add T1,   tmp0
- psllq xmm1, (64 - 61)     /* XMM1 = ((W[t-2] << 42)^W[t-2])<<3 */
- add T1,   h_64
- mov T2,   a_64
- psllq xmm4, (64 - 8)      /* XMM4 = ((W[t-15]<<7)^W[t-15])<<56 */
- xor T2,   c_64
- and T2,   b_64
- pxor xmm0, xmm1          /* XMM0 = s1(W[t-2]) */
- mov tmp0, a_64
- and tmp0, c_64
- movdqu xmm1, [W_t(\t- 7)] /* XMM1 = W[t-7] */
- xor T2,   tmp0
- pxor xmm3, xmm4          /* XMM3 = s0(W[t-15]) */
- mov tmp0, a_64
- paddq xmm0, xmm3          /* XMM0 = s1(W[t-2]) + s0(W[t-15]) */
- ror tmp0, 5 /* 39 */
- paddq xmm0, [W_t(\t-16)] /* XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] */
- xor tmp0, a_64
- paddq xmm0, xmm1          /* XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */
- ror tmp0, 6 /* 34 */
- movdqa [W_t(\t)], xmm0     /* Store scheduled qwords */
- xor tmp0, a_64
- paddq xmm0, [K_t(t)]      /* Compute W[t]+K[t] */
- ror tmp0, 28 /* 28 */
- movdqa [WK_2(t)], xmm0     /* Store W[t]+K[t] for next rounds */
- add T2,   tmp0
- add d_64, T1
- lea h_64, [T1 + T2]
- RotateState
-.endm
+#define SHA512_Round(t, a, b, c, d, e, f, g, h) \
+ /* Compute Round %%t */; \
+ mov T1,   f        /* T1 = f */; \
+ mov tmp0, e        /* tmp = e */; \
+ xor T1,   g        /* T1 = f ^ g */; \
+ ror tmp0, 23 /* 41     ; tmp = e ror 23 */; \
+ and T1,   e        /* T1 = (f ^ g) & e */; \
+ xor tmp0, e        /* tmp = (e ror 23) ^ e */; \
+ xor T1,   g        /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */; \
+ add T1,   [WK_2(t)] /* W[t] + K[t] from message scheduler */; \
+ ror tmp0, 4 /* 18      ; tmp = ((e ror 23) ^ e) ror 4 */; \
+ xor tmp0, e        /* tmp = (((e ror 23) ^ e) ror 4) ^ e */; \
+ mov T2,   a        /* T2 = a */; \
+ add T1,   h        /* T1 = CH(e,f,g) + W[t] + K[t] + h */; \
+ ror tmp0, 14 /* 14     ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */; \
+ add T1,   tmp0     /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */; \
+ mov tmp0, a        /* tmp = a */; \
+ xor T2,   c        /* T2 = a ^ c */; \
+ and tmp0, c        /* tmp = a & c */; \
+ and T2,   b        /* T2 = (a ^ c) & b */; \
+ xor T2,   tmp0     /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */; \
+ mov tmp0, a        /* tmp = a */; \
+ ror tmp0, 5 /* 39      ; tmp = a ror 5 */; \
+ xor tmp0, a        /* tmp = (a ror 5) ^ a */; \
+ add d, T1          /* e(next_state) = d + T1  */; \
+ ror tmp0, 6 /* 34      ; tmp = ((a ror 5) ^ a) ror 6 */; \
+ xor tmp0, a        /* tmp = (((a ror 5) ^ a) ror 6) ^ a */; \
+ lea h, [T1 + T2]   /* a(next_state) = T1 + Maj(a,b,c) */; \
+ ror tmp0, 28 /* 28     ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */; \
+ add h, tmp0        /* a(next_state) = T1 + Maj(a,b,c) + S0(a) */
+
+#define SHA512_2Sched_2Round_sse_PART1(t, a, b, c, d, e, f, g, h) \
+ /* \
+ ; Compute rounds %%t-2 and %%t-1 \
+ ; Compute message schedule QWORDS %%t and %%t+1 \
+ ; \
+ ;   Two rounds are computed based on the values for K[t-2]+W[t-2] and \
+ ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message \
+ ; scheduler. \
+ ;   The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. \
+ ; They are then added to their respective SHA512 constants at \
+ ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] \
+ ;   For brevity, the comments following vectored instructions only refer to \
+ ; the first of a pair of QWORDS. \
+ ; E.g. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]} \
+ ;   The computation of the message schedule and the rounds are tightly \
+ ; stitched to take advantage of instruction-level parallelism. \
+ ; For clarity, integer instructions (for the rounds calculation) are indented \
+ ; by one tab. Vectored instructions (for the message scheduler) are indented \
+ ; by two tabs. \
+ */ \
+ \
+ mov T1, f; \
+ movdqa xmm2, [W_t(t-2)]  /* XMM2 = W[t-2] */; \
+ xor T1,   g; \
+ and T1,   e; \
+ movdqa xmm0, xmm2          /* XMM0 = W[t-2] */; \
+ xor T1,   g; \
+ add T1,   [WK_2(t)]; \
+ movdqu xmm5, [W_t(t-15)] /* XMM5 = W[t-15] */; \
+ mov tmp0, e; \
+ ror tmp0, 23 /* 41 */; \
+ movdqa xmm3, xmm5          /* XMM3 = W[t-15] */; \
+ xor tmp0, e; \
+ ror tmp0, 4 /* 18 */; \
+ psrlq xmm0, 61 - 19       /* XMM0 = W[t-2] >> 42 */; \
+ xor tmp0, e; \
+ ror tmp0, 14 /* 14 */; \
+ psrlq xmm3, (8 - 7)       /* XMM3 = W[t-15] >> 1 */; \
+ add T1,   tmp0; \
+ add T1,   h; \
+ pxor xmm0, xmm2          /* XMM0 = (W[t-2] >> 42) ^ W[t-2] */; \
+ mov T2,   a; \
+ xor T2,   c; \
+ pxor xmm3, xmm5          /* XMM3 = (W[t-15] >> 1) ^ W[t-15] */; \
+ and T2,   b; \
+ mov tmp0, a; \
+ psrlq xmm0, 19 - 6        /* XMM0 = ((W[t-2]>>42)^W[t-2])>>13 */; \
+ and tmp0, c; \
+ xor T2,   tmp0; \
+ psrlq xmm3, (7 - 1)       /* XMM3 = ((W[t-15]>>1)^W[t-15])>>6 */; \
+ mov tmp0, a; \
+ ror tmp0, 5 /* 39 */; \
+ pxor xmm0, xmm2          /* XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] */; \
+ xor tmp0, a; \
+ ror tmp0, 6 /* 34 */; \
+ pxor xmm3, xmm5          /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] */; \
+ xor tmp0, a; \
+ ror tmp0, 28 /* 28 */; \
+ psrlq xmm0, 6             /* XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 */; \
+ add T2,   tmp0; \
+ add d, T1; \
+ psrlq xmm3, 1             /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 */; \
+ lea h, [T1 + T2]
+
+#define SHA512_2Sched_2Round_sse_PART2(t, a, b, c, d, e, f, g, h) \
+ movdqa xmm1, xmm2          /* XMM1 = W[t-2] */; \
+ mov T1,   f; \
+ xor T1,   g; \
+ movdqa xmm4, xmm5          /* XMM4 = W[t-15] */; \
+ and T1,   e; \
+ xor T1,   g; \
+ psllq xmm1, (64 - 19) - (64 - 61) /* XMM1 = W[t-2] << 42 */; \
+ add T1,   [WK_2(t+1)]; \
+ mov tmp0, e; \
+ psllq xmm4, (64 - 1) - (64 - 8) /* XMM4 = W[t-15] << 7 */; \
+ ror tmp0, 23 /* 41 */; \
+ xor tmp0, e; \
+ pxor xmm1, xmm2          /* XMM1 = (W[t-2] << 42)^W[t-2] */; \
+ ror tmp0, 4 /* 18 */; \
+ xor tmp0, e; \
+ pxor xmm4, xmm5          /* XMM4 = (W[t-15]<<7)^W[t-15] */; \
+ ror tmp0, 14 /* 14 */; \
+ add T1,   tmp0; \
+ psllq xmm1, (64 - 61)     /* XMM1 = ((W[t-2] << 42)^W[t-2])<<3 */; \
+ add T1,   h; \
+ mov T2,   a; \
+ psllq xmm4, (64 - 8)      /* XMM4 = ((W[t-15]<<7)^W[t-15])<<56 */; \
+ xor T2,   c; \
+ and T2,   b; \
+ pxor xmm0, xmm1          /* XMM0 = s1(W[t-2]) */; \
+ mov tmp0, a; \
+ and tmp0, c; \
+ movdqu xmm1, [W_t(t- 7)] /* XMM1 = W[t-7] */; \
+ xor T2,   tmp0; \
+ pxor xmm3, xmm4          /* XMM3 = s0(W[t-15]) */; \
+ mov tmp0, a; \
+ paddq xmm0, xmm3          /* XMM0 = s1(W[t-2]) + s0(W[t-15]) */; \
+ ror tmp0, 5 /* 39 */; \
+ paddq xmm0, [W_t(t-16)] /* XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] */; \
+ xor tmp0, a; \
+ paddq xmm0, xmm1          /* XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */; \
+ ror tmp0, 6 /* 34 */; \
+ movdqa [W_t(t)], xmm0     /* Store scheduled qwords */; \
+ xor tmp0, a; \
+ paddq xmm0, [K_t(t)]      /* Compute W[t]+K[t] */; \
+ ror tmp0, 28 /* 28 */; \
+ movdqa [WK_2(t)], xmm0     /* Store W[t]+K[t] for next rounds */; \
+ add T2,   tmp0; \
+ add d, T1; \
+ lea h, [T1 + T2]
+
+#define SHA512_2Sched_2Round_sse(t, a, b, c, d, e, f, g, h) \
+ SHA512_2Sched_2Round_sse_PART1(t, a, b, c, d, e, f, g, h); \
+ SHA512_2Sched_2Round_sse_PART2(t, h, a, b, c, d, e, f, g)
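
With the GAS-level RotateState gone, the rotation of the working variables is now done entirely in the argument lists: the round body stays fixed and each consecutive round receives the eight names rotated right by one, which is why PART2 above is invoked with (h, a, ..., g). A small stand-alone C sketch of that pattern, using a placeholder round body rather than the real SHA-512 round:

#include <stdio.h>

/* Placeholder "round": only the argument rotation matters here. */
#define ROUND(t, a, b, c, d, e, f, g, h) \
        do { (h) += (t) + ((e) ^ (f) ^ (g)); (d) += (h); } while (0)

int main(void)
{
    unsigned long s[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };

    /* Instead of renaming s[0..7] after every round, the caller rotates
     * the argument list right by one position per round. */
    ROUND(0, s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7]);
    ROUND(1, s[7], s[0], s[1], s[2], s[3], s[4], s[5], s[6]);
    ROUND(2, s[6], s[7], s[0], s[1], s[2], s[3], s[4], s[5]);
    ROUND(3, s[5], s[6], s[7], s[0], s[1], s[2], s[3], s[4]);

    for (int i = 0; i < 8; i++)
        printf("%lu ", s[i]);
    printf("\n");
    return 0;
}
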
 
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -295,37 +285,77 @@ _gcry_sha512_transform_amd64_ssse3:
  mov g_64, [DIGEST(6)]
  mov h_64, [DIGEST(7)]
 
- t = 0
- .rept 80/2 + 1
- /* (80 rounds) / (2 rounds/iteration) + (1 iteration) */
- /* +1 iteration because the scheduler leads hashing by 1 iteration */
- .if t < 2
- /* BSWAP 2 QWORDS */
- movdqa xmm1, [.LXMM_QWORD_BSWAP ADD_RIP]
- movdqu xmm0, [MSG(t)]
- pshufb xmm0, xmm1      /* BSWAP */
- movdqa [W_t(t)], xmm0  /* Store Scheduled Pair */
- paddq xmm0, [K_t(t)]  /* Compute W[t]+K[t] */
- movdqa [WK_2(t)], xmm0 /* Store into WK for rounds */
- .elseif t < 16
- /* BSWAP 2 QWORDS; Compute 2 Rounds */
- movdqu xmm0, [MSG(t)]
- pshufb xmm0, xmm1      /* BSWAP */
- SHA512_Round (t - 2)    /* Round t-2 */
- movdqa [W_t(t)], xmm0  /* Store Scheduled Pair */
- paddq xmm0, [K_t(t)]  /* Compute W[t]+K[t] */
- SHA512_Round (t - 1)    /* Round t-1 */
- movdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] into WK */
- .elseif t < 79
- /* Schedule 2 QWORDS; Compute 2 Rounds */
- SHA512_2Sched_2Round_sse t
- .else
- /* Compute 2 Rounds */
- SHA512_Round (t - 2)
- SHA512_Round (t - 1)
- .endif
- t = (t)+2
- .endr
+ /* BSWAP 2 QWORDS */
+ movdqa xmm1, [.LXMM_QWORD_BSWAP ADD_RIP]
+ movdqu xmm0, [MSG(0)]
+ pshufb xmm0, xmm1      /* BSWAP */
+ movdqa [W_t(0)], xmm0  /* Store Scheduled Pair */
+ paddq xmm0, [K_t(0)]  /* Compute W[t]+K[t] */
+ movdqa [WK_2(0)], xmm0 /* Store into WK for rounds */
+
+ #define T_2_14(t, a, b, c, d, e, f, g, h) \
+ /* BSWAP 2 QWORDS; Compute 2 Rounds */; \
+ movdqu xmm0, [MSG(t)]; \
+ pshufb xmm0, xmm1      /* BSWAP */; \
+ SHA512_Round(((t) - 2), a##_64, b##_64, c##_64, d##_64, \
+        e##_64, f##_64, g##_64, h##_64); \
+ movdqa [W_t(t)], xmm0  /* Store Scheduled Pair */; \
+ paddq xmm0, [K_t(t)]  /* Compute W[t]+K[t] */; \
+ SHA512_Round(((t) - 1), h##_64, a##_64, b##_64, c##_64, \
+        d##_64, e##_64, f##_64, g##_64); \
+ movdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] into WK */
+
+ #define T_16_78(t, a, b, c, d, e, f, g, h) \
+ SHA512_2Sched_2Round_sse((t), a##_64, b##_64, c##_64, d##_64, \
+      e##_64, f##_64, g##_64, h##_64)
+
+ #define T_80(t, a, b, c, d, e, f, g, h) \
+ /* Compute 2 Rounds */; \
+ SHA512_Round((t - 2), a##_64, b##_64, c##_64, d##_64, \
+      e##_64, f##_64, g##_64, h##_64); \
+ SHA512_Round((t - 1), h##_64, a##_64, b##_64, c##_64, \
+      d##_64, e##_64, f##_64, g##_64)
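
The a##_64 pasting in the helpers above is what lets the call sites below pass single-letter names while the round macros still see the a_64..h_64 register aliases defined at the top of the file. A tiny stand-alone C demo of the same ## behaviour (the strings standing in for registers are hypothetical):

#include <stdio.h>

#define REG(x) x##_64   /* T_2_14(2, a, ...) pastes "a" + "_64" the same way */

static const char *a_64 = "r9";
static const char *b_64 = "r10";

int main(void)
{
    printf("a -> %s, b -> %s\n", REG(a), REG(b));
    return 0;
}
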
+
+ T_2_14(2, a, b, c, d, e, f, g, h)
+ T_2_14(4, g, h, a, b, c, d, e, f)
+ T_2_14(6, e, f, g, h, a, b, c, d)
+ T_2_14(8, c, d, e, f, g, h, a, b)
+ T_2_14(10, a, b, c, d, e, f, g, h)
+ T_2_14(12, g, h, a, b, c, d, e, f)
+ T_2_14(14, e, f, g, h, a, b, c, d)
+ T_16_78(16, c, d, e, f, g, h, a, b)
+ T_16_78(18, a, b, c, d, e, f, g, h)
+ T_16_78(20, g, h, a, b, c, d, e, f)
+ T_16_78(22, e, f, g, h, a, b, c, d)
+ T_16_78(24, c, d, e, f, g, h, a, b)
+ T_16_78(26, a, b, c, d, e, f, g, h)
+ T_16_78(28, g, h, a, b, c, d, e, f)
+ T_16_78(30, e, f, g, h, a, b, c, d)
+ T_16_78(32, c, d, e, f, g, h, a, b)
+ T_16_78(34, a, b, c, d, e, f, g, h)
+ T_16_78(36, g, h, a, b, c, d, e, f)
+ T_16_78(38, e, f, g, h, a, b, c, d)
+ T_16_78(40, c, d, e, f, g, h, a, b)
+ T_16_78(42, a, b, c, d, e, f, g, h)
+ T_16_78(44, g, h, a, b, c, d, e, f)
+ T_16_78(46, e, f, g, h, a, b, c, d)
+ T_16_78(48, c, d, e, f, g, h, a, b)
+ T_16_78(50, a, b, c, d, e, f, g, h)
+ T_16_78(52, g, h, a, b, c, d, e, f)
+ T_16_78(54, e, f, g, h, a, b, c, d)
+ T_16_78(56, c, d, e, f, g, h, a, b)
+ T_16_78(58, a, b, c, d, e, f, g, h)
+ T_16_78(60, g, h, a, b, c, d, e, f)
+ T_16_78(62, e, f, g, h, a, b, c, d)
+ T_16_78(64, c, d, e, f, g, h, a, b)
+ T_16_78(66, a, b, c, d, e, f, g, h)
+ T_16_78(68, g, h, a, b, c, d, e, f)
+ T_16_78(70, e, f, g, h, a, b, c, d)
+ T_16_78(72, c, d, e, f, g, h, a, b)
+ T_16_78(74, a, b, c, d, e, f, g, h)
+ T_16_78(76, g, h, a, b, c, d, e, f)
+ T_16_78(78, e, f, g, h, a, b, c, d)
+ T_80(80, c, d, e, f, g, h, a, b)
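
The unrolled table above follows a fixed pattern: each helper covers two rounds, so the (a..h) tuple rotates by two positions per call and the leading letter cycles a, g, e, c. A throwaway generator, not part of the patch, that reproduces the table and is handy for double-checking the argument order:

#include <stdio.h>

int main(void)
{
    const char names[] = "abcdefgh";

    for (int t = 2; t <= 80; t += 2) {
        const char *m = (t < 16) ? "T_2_14" : (t < 80) ? "T_16_78" : "T_80";
        int start = ((2 - t) % 8 + 8) % 8;   /* rotate right by two per step */

        printf(" %s(%d", m, t);
        for (int i = 0; i < 8; i++)
            printf(", %c", names[(start + i) % 8]);
        printf(")\n");
    }
    return 0;
}
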
 
  /* Update digest */
  add [DIGEST(0)], a_64
@@ -362,11 +392,12 @@ _gcry_sha512_transform_amd64_ssse3:
  pxor xmm5, xmm5
 
  /* Burn stack */
- t = 0
- .rept frame_W_size / 16
- movdqu [rsp + frame_W + (t) * 16], xmm0
- t = ((t)+1)
- .endr
+ mov eax, 0
+.Lerase_stack:
+ movdqu [rsp + rax], xmm0
+ add eax, 16
+ cmp eax, frame_W_size
+ jne .Lerase_stack
  movdqu [rsp + frame_WK], xmm0
  xor     eax, eax
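
The .rept-based wipe becomes a small runtime loop storing xmm0 (zeroed just above) in 16-byte steps; since frame_W is 0 in this file, the loop can index [rsp + rax] without an explicit frame_W offset. In C terms the new code does roughly the following (sketch only; the constants mirror the #defines at the top of the file):

#include <stdint.h>
#include <string.h>

enum { FRAME_W = 0, FRAME_W_SIZE = 80 * 8, FRAME_WK = FRAME_W + FRAME_W_SIZE };

/* Clear the 80-qword message schedule in 16-byte steps, then the WK slot. */
static void erase_stack(uint8_t *frame)
{
    for (unsigned off = 0; off != FRAME_W_SIZE; off += 16)
        memset(frame + FRAME_W + off, 0, 16);
    memset(frame + FRAME_WK, 0, 16);
}

int main(void)
{
    uint8_t frame[FRAME_WK + 16];

    memset(frame, 0xaa, sizeof frame);
    erase_stack(frame);
    return frame[0];   /* 0 once the wipe has run */
}
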
 
diff --git a/configure.ac b/configure.ac
index f7339a3e..e4a10b78 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1741,21 +1741,11 @@ AC_CACHE_CHECK([whether GCC assembler is compatible for Intel syntax assembly im
                 ".text\n\t"
                 "actest:\n\t"
                 "pxor xmm1, xmm7;\n\t"
-                /* Intel syntax implementation also use GAS macros, so check
-                 * for them here. */
-                "VAL_A = xmm4\n\t"
-                "VAL_B = xmm2\n\t"
-                ".macro SET_VAL_A p1\n\t"
-                "  VAL_A = \\\\p1 \n\t"
-                ".endm\n\t"
-                ".macro SET_VAL_B p1\n\t"
-                "  VAL_B = \\\\p1 \n\t"
-                ".endm\n\t"
-                "vmovdqa VAL_A, VAL_B;\n\t"
-                "SET_VAL_A eax\n\t"
-                "SET_VAL_B ebp\n\t"
-                "add VAL_A, VAL_B;\n\t"
-                "add VAL_B, 0b10101;\n\t"
+                "vperm2i128 ymm2, ymm3, ymm0, 1;\n\t"
+                "add eax, ebp;\n\t"
+                "rorx eax, ebp, 1;\n\t"
+                "sub eax, [esp + 4];\n\t"
+                "add dword ptr [esp + eax], 0b10101;\n\t"
                 ".att_syntax prefix\n\t"
             );]], [ actest(); ])],
           [gcry_cv_gcc_platform_as_ok_for_intel_syntax=yes])
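
The reworked probe no longer relies on GAS macro syntax; it only needs the assembler to accept plain Intel-syntax SSE2/AVX2/BMI2 instructions, memory operand forms and a binary literal, linked into a small program that calls actest(). A rough hand-written approximation of the test program follows; it is not the exact configure expansion, the ".intel_syntax noprefix" prologue precedes the hunk and is assumed here, and it is only meant to assemble and link, never to run:

/* Approximation of the Intel-syntax probe (illustration only). */
__asm__(".intel_syntax noprefix\n\t"
        ".text\n\t"
        "actest:\n\t"
        "pxor xmm1, xmm7;\n\t"
        "vperm2i128 ymm2, ymm3, ymm0, 1;\n\t"
        "add eax, ebp;\n\t"
        "rorx eax, ebp, 1;\n\t"
        "sub eax, [esp + 4];\n\t"
        "add dword ptr [esp + eax], 0b10101;\n\t"
        ".att_syntax prefix\n\t");

void actest(void);

int main(void)
{
    actest();
    return 0;
}
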
--
2.27.0

