[PATCH] Add ARMv8/AArch64 implementation of chacha20

[PATCH] Add ARMv8/AArch64 implementation of chacha20

Jussi Kivilinna
* cipher/Makefile.am: Add 'chacha20-aarch64.S'.
* cipher/chacha20-aarch64.S: New.
* cipher/chacha20.c (USE_AARCH64_SIMD): New.
(_gcry_chacha20_aarch64_blocks): New.
(chacha20_do_setkey): Add HWF selection for AArch64 implementation.
* configure.ac: Add 'chacha20-aarch64.lo'.
--

This patch adds an ARMv8/AArch64 SIMD implementation based on the public
domain ARMv7/NEON implementation by Andrew Moon at:
  https://github.com/floodyberry/chacha-opt
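
For context (not part of the patch itself), the scalar ChaCha20 quarter
round that both the general-purpose-register and the NEON paths compute
is sketched below in C. The main loop handles 256 bytes (four blocks)
per iteration, one block kept in w-registers interleaved with three
blocks kept in NEON registers; the "ror #16/#20/#24/#25" immediates in
the scalar code are the right-rotate forms of the usual 16/12/8/7-bit
left rotates.

  /* Reference ChaCha20 double round (sketch only, not part of the
   * patch); the NEON code applies the same operations to 128-bit
   * vectors, one 32-bit lane per block. */
  #include <stdint.h>

  #define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

  #define QUARTERROUND(a, b, c, d) do { \
      a += b; d ^= a; d = ROTL32(d, 16); \
      c += d; b ^= c; b = ROTL32(b, 12); \
      a += b; d ^= a; d = ROTL32(d,  8); \
      c += d; b ^= c; b = ROTL32(b,  7); \
    } while (0)

  static void chacha20_double_round (uint32_t x[16])
  {
    /* Column round */
    QUARTERROUND(x[0], x[4], x[ 8], x[12]);
    QUARTERROUND(x[1], x[5], x[ 9], x[13]);
    QUARTERROUND(x[2], x[6], x[10], x[14]);
    QUARTERROUND(x[3], x[7], x[11], x[15]);
    /* Diagonal round */
    QUARTERROUND(x[0], x[5], x[10], x[15]);
    QUARTERROUND(x[1], x[6], x[11], x[12]);
    QUARTERROUND(x[2], x[7], x[ 8], x[13]);
    QUARTERROUND(x[3], x[4], x[ 9], x[14]);
  }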

Benchmark on ARM Cortex-A53 (1536 MHz):

Before:
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |      5.70 ns/B     167.2 MiB/s      8.76 c/B
     STREAM dec |      5.71 ns/B     166.9 MiB/s      8.78 c/B

After (~1.7x faster):
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |      3.32 ns/B     287.7 MiB/s      5.09 c/B
     STREAM dec |      3.31 ns/B     287.9 MiB/s      5.09 c/B
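
The figures above are in the format printed by tests/bench-slope
(presumably something like "tests/bench-slope --cpu-mhz 1536 cipher
chacha20"). Callers need no changes to pick up the new path, since
chacha20_do_setkey() installs the AArch64 blocks function whenever
HWF_ARM_NEON is reported. A minimal usage sketch through the normal
API follows (error handling omitted; the 96-bit nonce length is an
assumption of the sketch, not something this patch changes):

  #include <gcrypt.h>

  /* Encrypt a buffer in place with ChaCha20; the accelerated blocks
   * function is selected internally from the detected HW features. */
  static void encrypt_chacha20 (const unsigned char key[32],
                                const unsigned char nonce[12],
                                unsigned char *buf, size_t len)
  {
    gcry_cipher_hd_t hd;

    gcry_cipher_open (&hd, GCRY_CIPHER_CHACHA20,
                      GCRY_CIPHER_MODE_STREAM, 0);
    gcry_cipher_setkey (hd, key, 32);
    gcry_cipher_setiv (hd, nonce, 12);
    gcry_cipher_encrypt (hd, buf, len, NULL, 0);  /* in-place */
    gcry_cipher_close (hd);
  }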

Signed-off-by: Jussi Kivilinna <[hidden email]>
---
 0 files changed

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 95c45108..26d25e1a 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -65,7 +65,7 @@ arcfour.c arcfour-amd64.S \
 blowfish.c blowfish-amd64.S blowfish-arm.S \
 cast5.c cast5-amd64.S cast5-arm.S \
 chacha20.c chacha20-sse2-amd64.S chacha20-ssse3-amd64.S chacha20-avx2-amd64.S \
-  chacha20-armv7-neon.S \
+  chacha20-armv7-neon.S chacha20-aarch64.S \
 crc.c \
   crc-intel-pclmul.c \
 des.c des-amd64.S \
diff --git a/cipher/chacha20-aarch64.S b/cipher/chacha20-aarch64.S
new file mode 100644
index 00000000..d07511ff
--- /dev/null
+++ b/cipher/chacha20-aarch64.S
@@ -0,0 +1,772 @@
+/* chacha20-aarch64.S - ARMv8/AArch64 accelerated chacha20 blocks function
+ *
+ * Copyright (C) 2014,2017 Jussi Kivilinna <[hidden email]>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on public domain ARMv7/NEON implementation by Andrew Moon at
+ *  https://github.com/floodyberry/chacha-opt
+ */
+
+#include <config.h>
+
+#if defined(__AARCH64EL__) && \
+    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON) && \
+    defined(USE_CHACHA20)
+
+.cpu generic+simd
+
+.text
+
+#define STMIA8(ptr, l0, l1, l2, l3, l4, l5, l6, l7) \
+ add x17, ptr, #8; \
+ stp l0, l1, [ptr], #16; \
+ stp l2, l3, [x17], #16; \
+ stp l4, l5, [ptr], #16; \
+ stp l6, l7, [x17];
+
+#define LDMIA16(ptr, l0, l1, l2, l3, l4, l5, l6, l7, \
+     l8, l9, l10, l11, l12, l13, l14, l15) \
+ add x17, ptr, #8; \
+ ldp l0, l1, [ptr], #16; \
+ ldp l2, l3, [x17], #16; \
+ ldp l4, l5, [ptr], #16; \
+ ldp l6, l7, [x17], #16; \
+ ldp l8, l9, [ptr], #16; \
+ ldp l10, l11, [x17], #16; \
+ ldp l12, l13, [ptr], #16; \
+ ldp l14, l15, [x17]; \
+
+#define LDMIA8(ptr, l0, l1, l2, l3, l4, l5, l6, l7) \
+ add x17, ptr, #8; \
+ ldp l0, l1, [ptr], #16; \
+ ldp l2, l3, [x17], #16; \
+ ldp l4, l5, [ptr], #16; \
+ ldp l6, l7, [x17];
+
+#define LDMIA4(ptr, l0, l1, l2, l3) \
+ ldp l0, l1, [ptr], #8; \
+ ldp l2, l3, [ptr], #8;
+
+#define EXT32(a,b,c,n) \
+ ext a,b,c,#(n*4);
+
+.text
+
+#define STACK_STATE 48
+#define STACK_SRC 56
+#define STACK_SP 192
+#define STACK_DST 200
+#define STACK_BYTES 208
+#define STACK_DST_TMP   216
+
+.globl _gcry_chacha20_aarch64_blocks
+.type  _gcry_chacha20_aarch64_blocks,%function;
+_gcry_chacha20_aarch64_blocks:
+.Lchacha_blocks_neon_local:
+ tst x3, x3
+ beq .Lchacha_blocks_neon_nobytes
+ mov x16, sp
+ mov x8, sp
+ sub x16, x16, #(216+8)
+ mov v16.16b, v8.16b
+ mov v17.16b, v9.16b
+ and x16, x16, #(-32)
+ mov v18.16b, v10.16b
+ mov v19.16b, v11.16b
+ mov v20.16b, v12.16b
+ mov sp, x16
+ add x16, x16, #64
+ mov v21.16b, v13.16b
+ mov v22.16b, v14.16b
+ mov v23.16b, v15.16b
+ mov w4, #20
+ ld1 {v24.4s-v27.4s}, [x0]
+ str x0, [sp, # STACK_STATE]
+ str x1, [sp, # STACK_SRC]
+ str x2, [sp, # STACK_DST]
+ str x3, [sp, # STACK_BYTES]
+ str x8, [sp, # STACK_SP]
+ st1 {v24.4s-v27.4s}, [x16]
+ str w4, [sp, #44]
+ cmp x3, #256
+ blo .Lchacha_blocks_neon_mainloop2
+.Lchacha_blocks_neon_mainloop1:
+ ldr w0, [sp, #44]
+ add x16, sp, #64
+ str w0, [sp, #0]
+ mov x2, #1
+ eor v12.16b, v12.16b, v12.16b
+ mov v0.16b, v24.16b
+ mov v1.16b, v25.16b
+ mov v2.16b, v26.16b
+ mov v3.16b, v27.16b
+ mov v12.2d[0], x2
+ add v3.2d, v3.2d, v12.2d
+ mov v4.16b, v0.16b
+ mov v5.16b, v1.16b
+ mov v6.16b, v2.16b
+ add v7.2d, v3.2d, v12.2d
+ LDMIA16(x16, w0, w1, w2, w3, w4, w5, w6, w7,
+     w8, w9, w10, w11, w12, w13, w14, w15)
+ mov v8.16b, v0.16b
+ mov v9.16b, v1.16b
+ mov v10.16b, v2.16b
+ add v11.2d, v7.2d, v12.2d
+ str w6, [sp, #8]
+ str w11, [sp, #12]
+ mov w11, w13
+ str w15, [sp, #28]
+.Lchacha_blocks_neon_rounds1:
+ ldr w6, [sp, #0]
+ add v0.4s, v0.4s, v1.4s
+ add w0, w0, w4
+ add v4.4s, v4.4s, v5.4s
+ add w1, w1, w5
+ add v8.4s, v8.4s, v9.4s
+ eor w12, w12, w0
+ eor v12.16b, v3.16b, v0.16b
+ eor w11, w11, w1
+ eor v13.16b, v7.16b, v4.16b
+ ror w12, w12, #16
+ eor v14.16b, v11.16b, v8.16b
+ ror w11, w11, #16
+ rev32 v3.8h, v12.8h
+ subs w6, w6, #2
+ rev32 v7.8h, v13.8h
+ add w8, w8, w12
+ rev32 v11.8h, v14.8h
+ add w9, w9, w11
+ add v2.4s, v2.4s, v3.4s
+ eor w4, w4, w8
+ add v6.4s, v6.4s, v7.4s
+ eor w5, w5, w9
+ add v10.4s, v10.4s, v11.4s
+ str w6, [sp, #0]
+ eor v12.16b, v1.16b, v2.16b
+ ror w4, w4, #20
+ eor v13.16b, v5.16b, v6.16b
+ ror w5, w5, #20
+ eor v14.16b, v9.16b, v10.16b
+ add w0, w0, w4
+ shl v1.4s, v12.4s, #12
+ add w1, w1, w5
+ shl v5.4s, v13.4s, #12
+ ldr w6, [sp, #8]
+ shl v9.4s, v14.4s, #12
+ eor w12, w12, w0
+ sri v1.4s, v12.4s, #20
+ eor w11, w11, w1
+ sri v5.4s, v13.4s, #20
+ ror w12, w12, #24
+ sri v9.4s, v14.4s, #20
+ ror w11, w11, #24
+ add v0.4s, v0.4s, v1.4s
+ add w8, w8, w12
+ add v4.4s, v4.4s, v5.4s
+ add w9, w9, w11
+ add v8.4s, v8.4s, v9.4s
+ eor w4, w4, w8
+ eor v12.16b, v3.16b, v0.16b
+ eor w5, w5, w9
+ eor v13.16b, v7.16b, v4.16b
+ str w11, [sp, #20]
+ eor v14.16b, v11.16b, v8.16b
+ ror w4, w4, #25
+ shl v3.4s, v12.4s, #8
+ ror w5, w5, #25
+ shl v7.4s, v13.4s, #8
+ str w4, [sp, #4]
+ shl v11.4s, v14.4s, #8
+ ldr w4, [sp, #28]
+ sri v3.4s, v12.4s, #24
+ add w2, w2, w6
+ sri v7.4s, v13.4s, #24
+ add w3, w3, w7
+ sri v11.4s, v14.4s, #24
+ ldr w11, [sp, #12]
+ add v2.4s, v2.4s, v3.4s
+ eor w14, w14, w2
+ add v6.4s, v6.4s, v7.4s
+ eor w4, w4, w3
+ add v10.4s, v10.4s, v11.4s
+ ror w14, w14, #16
+ eor v12.16b, v1.16b, v2.16b
+ ror w4, w4, #16
+ eor v13.16b, v5.16b, v6.16b
+ add w10, w10, w14
+ eor v14.16b, v9.16b, v10.16b
+ add w11, w11, w4
+ shl v1.4s, v12.4s, #7
+ eor w6, w6, w10
+ shl v5.4s, v13.4s, #7
+ eor w7, w7, w11
+ shl v9.4s, v14.4s, #7
+ ror w6, w6, #20
+ sri v1.4s, v12.4s, #25
+ ror w7, w7, #20
+ sri v5.4s, v13.4s, #25
+ add w2, w2, w6
+ sri v9.4s, v14.4s, #25
+ add w3, w3, w7
+ EXT32(v3.16b, v3.16b, v3.16b, 3)
+ eor w14, w14, w2
+ EXT32(v7.16b, v7.16b, v7.16b, 3)
+ eor w4, w4, w3
+ EXT32(v11.16b, v11.16b, v11.16b, 3)
+ ror w14, w14, #24
+ EXT32(v1.16b, v1.16b, v1.16b, 1)
+ ror w4, w4, #24
+ EXT32(v5.16b, v5.16b, v5.16b, 1)
+ add w10, w10, w14
+ EXT32(v9.16b, v9.16b, v9.16b, 1)
+ add w11, w11, w4
+ EXT32(v2.16b, v2.16b, v2.16b, 2)
+ eor w6, w6, w10
+ EXT32(v6.16b, v6.16b, v6.16b, 2)
+ eor w7, w7, w11
+ EXT32(v10.16b, v10.16b, v10.16b, 2)
+ ror w6, w6, #25
+ add v0.4s, v0.4s, v1.4s
+ ror w7, w7, #25
+ add v4.4s, v4.4s, v5.4s
+ add w0, w0, w5
+ add v8.4s, v8.4s, v9.4s
+ add w1, w1, w6
+ eor v12.16b, v3.16b, v0.16b
+ eor w4, w4, w0
+ eor v13.16b, v7.16b, v4.16b
+ eor w12, w12, w1
+ eor v14.16b, v11.16b, v8.16b
+ ror w4, w4, #16
+ rev32 v3.8h, v12.8h
+ ror w12, w12, #16
+ rev32 v7.8h, v13.8h
+ add w10, w10, w4
+ rev32 v11.8h, v14.8h
+ add w11, w11, w12
+ add v2.4s, v2.4s, v3.4s
+ eor w5, w5, w10
+ add v6.4s, v6.4s, v7.4s
+ eor w6, w6, w11
+ add v10.4s, v10.4s, v11.4s
+ ror w5, w5, #20
+ eor v12.16b, v1.16b, v2.16b
+ ror w6, w6, #20
+ eor v13.16b, v5.16b, v6.16b
+ add w0, w0, w5
+ eor v14.16b, v9.16b, v10.16b
+ add w1, w1, w6
+ shl v1.4s, v12.4s, #12
+ eor w4, w4, w0
+ shl v5.4s, v13.4s, #12
+ eor w12, w12, w1
+ shl v9.4s, v14.4s, #12
+ ror w4, w4, #24
+ sri v1.4s, v12.4s, #20
+ ror w12, w12, #24
+ sri v5.4s, v13.4s, #20
+ add w10, w10, w4
+ sri v9.4s, v14.4s, #20
+ add w11, w11, w12
+ add v0.4s, v0.4s, v1.4s
+ eor w5, w5, w10
+ add v4.4s, v4.4s, v5.4s
+ eor w6, w6, w11
+ add v8.4s, v8.4s, v9.4s
+ str w11, [sp, #12]
+ eor v12.16b, v3.16b, v0.16b
+ ror w5, w5, #25
+ eor v13.16b, v7.16b, v4.16b
+ ror w6, w6, #25
+ eor v14.16b, v11.16b, v8.16b
+ str w4, [sp, #28]
+ shl v3.4s, v12.4s, #8
+ ldr w4, [sp, #4]
+ shl v7.4s, v13.4s, #8
+ add w2, w2, w7
+ shl v11.4s, v14.4s, #8
+ add w3, w3, w4
+ sri v3.4s, v12.4s, #24
+ ldr w11, [sp, #20]
+ sri v7.4s, v13.4s, #24
+ eor w11, w11, w2
+ sri v11.4s, v14.4s, #24
+ eor w14, w14, w3
+ add v2.4s, v2.4s, v3.4s
+ ror w11, w11, #16
+ add v6.4s, v6.4s, v7.4s
+ ror w14, w14, #16
+ add v10.4s, v10.4s, v11.4s
+ add w8, w8, w11
+ eor v12.16b, v1.16b, v2.16b
+ add w9, w9, w14
+ eor v13.16b, v5.16b, v6.16b
+ eor w7, w7, w8
+ eor v14.16b, v9.16b, v10.16b
+ eor w4, w4, w9
+ shl v1.4s, v12.4s, #7
+ ror w7, w7, #20
+ shl v5.4s, v13.4s, #7
+ ror w4, w4, #20
+ shl v9.4s, v14.4s, #7
+ str w6, [sp, #8]
+ sri v1.4s, v12.4s, #25
+ add w2, w2, w7
+ sri v5.4s, v13.4s, #25
+ add w3, w3, w4
+ sri v9.4s, v14.4s, #25
+ eor w11, w11, w2
+ EXT32(v3.16b, v3.16b, v3.16b, 1)
+ eor w14, w14, w3
+ EXT32(v7.16b, v7.16b, v7.16b, 1)
+ ror w11, w11, #24
+ EXT32(v11.16b, v11.16b, v11.16b, 1)
+ ror w14, w14, #24
+ EXT32(v1.16b, v1.16b, v1.16b, 3)
+ add w8, w8, w11
+ EXT32(v5.16b, v5.16b, v5.16b, 3)
+ add w9, w9, w14
+ EXT32(v9.16b, v9.16b, v9.16b, 3)
+ eor w7, w7, w8
+ EXT32(v2.16b, v2.16b, v2.16b, 2)
+ eor w4, w4, w9
+ EXT32(v6.16b, v6.16b, v6.16b, 2)
+ ror w7, w7, #25
+ EXT32(v10.16b, v10.16b, v10.16b, 2)
+ ror w4, w4, #25
+ bne .Lchacha_blocks_neon_rounds1
+ str w8, [sp, #0]
+ str w9, [sp, #4]
+ mov v12.16b, v24.16b
+ str w10, [sp, #8]
+ str w12, [sp, #16]
+ mov v13.16b, v25.16b
+ str w11, [sp, #20]
+ str w14, [sp, #24]
+ mov v14.16b, v26.16b
+ mov v15.16b, v27.16b
+ ldr x12, [sp, # STACK_SRC]
+ ldr x14, [sp, # STACK_DST]
+ add v0.4s, v0.4s, v12.4s
+ ldr w8, [sp, #(64 +0)]
+ add v4.4s, v4.4s, v12.4s
+ ldr w9, [sp, #(64 +4)]
+ add v8.4s, v8.4s, v12.4s
+ ldr w10, [sp, #(64 +8)]
+ add v1.4s, v1.4s, v13.4s
+ ldr w11, [sp, #(64 +12)]
+ add v5.4s, v5.4s, v13.4s
+ add w0, w0, w8
+ add v9.4s, v9.4s, v13.4s
+ add w1, w1, w9
+ add v2.4s, v2.4s, v14.4s
+ add w2, w2, w10
+ add v6.4s, v6.4s, v14.4s
+ ldr w8, [sp, #(64 +16)]
+ add v10.4s, v10.4s, v14.4s
+ add w3, w3, w11
+ eor v14.16b, v14.16b, v14.16b
+ ldr w9, [sp, #(64 +20)]
+ mov x11, #1
+ add w4, w4, w8
+ mov v14.2d[0], x11
+ ldr w10, [sp, #(64 +24)]
+ add v12.2d, v14.2d, v15.2d
+ add w5, w5, w9
+ add v13.2d, v14.2d, v12.2d
+ ldr w11, [sp, #(64 +28)]
+ add v14.2d, v14.2d, v13.2d
+ add w6, w6, w10
+ add v3.4s, v3.4s, v12.4s
+ tst x12, x12
+ add v7.4s, v7.4s, v13.4s
+ add w7, w7, w11
+ add v11.4s, v11.4s, v14.4s
+ beq .Lchacha_blocks_neon_nomessage11
+ LDMIA4(x12, w8, w9, w10, w11)
+ tst x12, x12
+ eor w0, w0, w8
+ eor w1, w1, w9
+ eor w2, w2, w10
+ ldr w8, [x12, #0]
+ eor w3, w3, w11
+ ldr w9, [x12, #4]
+ eor w4, w4, w8
+ ldr w10, [x12, #8]
+ eor w5, w5, w9
+ ldr w11, [x12, #12]
+ eor w6, w6, w10
+ add x12, x12, #16
+ eor w7, w7, w11
+.Lchacha_blocks_neon_nomessage11:
+ mov x16, sp
+ STMIA8(x14, w0, w1, w2, w3, w4, w5, w6, w7)
+ tst x12, x12
+ LDMIA8(x16, w0, w1, w2, w3, w4, w5, w6, w7)
+ ldr w8, [sp, #(64 +32)]
+ ldr w9, [sp, #(64 +36)]
+ ldr w10, [sp, #(64 +40)]
+ ldr w11, [sp, #(64 +44)]
+ add w0, w0, w8
+ add w1, w1, w9
+ add w2, w2, w10
+ ldr w8, [sp, #(64 +48)]
+ add w3, w3, w11
+ ldr w9, [sp, #(64 +52)]
+ add w4, w4, w8
+ ldr w10, [sp, #(64 +56)]
+ add w5, w5, w9
+ ldr w11, [sp, #(64 +60)]
+ add w6, w6, w10
+ adds w8, w8, #4
+ add w7, w7, w11
+ adc w9, w9, wzr
+ str w8, [sp, #(64 +48)]
+ mov v27.4s[0], w8
+ tst x12, x12
+ str w9, [sp, #(64 +52)]
+ mov v27.4s[1], w9
+ beq .Lchacha_blocks_neon_nomessage12
+ LDMIA4(x12, w8, w9, w10, w11)
+ tst x12, x12
+ eor w0, w0, w8
+ eor w1, w1, w9
+ eor w2, w2, w10
+ ldr w8, [x12, #0]
+ eor w3, w3, w11
+ ldr w9, [x12, #4]
+ eor w4, w4, w8
+ ldr w10, [x12, #8]
+ eor w5, w5, w9
+ ldr w11, [x12, #12]
+ eor w6, w6, w10
+ add x12, x12, #16
+ eor w7, w7, w11
+.Lchacha_blocks_neon_nomessage12:
+ STMIA8(x14, w0, w1, w2, w3, w4, w5, w6, w7)
+ tst x12, x12
+ beq .Lchacha_blocks_neon_nomessage13
+ ld1 {v12.4s-v15.4s}, [x12], #64
+ eor v0.16b, v0.16b, v12.16b
+ eor v1.16b, v1.16b, v13.16b
+ eor v2.16b, v2.16b, v14.16b
+ eor v3.16b, v3.16b, v15.16b
+.Lchacha_blocks_neon_nomessage13:
+ st1 {v0.4s-v3.4s}, [x14], #64
+ beq .Lchacha_blocks_neon_nomessage14
+ ld1 {v12.4s-v15.4s}, [x12], #64
+ eor v4.16b, v4.16b, v12.16b
+ eor v5.16b, v5.16b, v13.16b
+ eor v6.16b, v6.16b, v14.16b
+ eor v7.16b, v7.16b, v15.16b
+.Lchacha_blocks_neon_nomessage14:
+ st1 {v4.4s-v7.4s}, [x14], #64
+ beq .Lchacha_blocks_neon_nomessage15
+ ld1 {v12.4s-v15.4s}, [x12], #64
+ eor v8.16b, v8.16b, v12.16b
+ eor v9.16b, v9.16b, v13.16b
+ eor v10.16b, v10.16b, v14.16b
+ eor v11.16b, v11.16b, v15.16b
+.Lchacha_blocks_neon_nomessage15:
+ st1 {v8.4s-v11.4s}, [x14], #64
+ str x12, [sp, # STACK_SRC]
+ str x14, [sp, # STACK_DST]
+ ldr x3, [sp, # STACK_BYTES]
+ sub x3, x3, #256
+ cmp x3, #256
+ str x3, [sp, # STACK_BYTES]
+ bhs .Lchacha_blocks_neon_mainloop1
+ tst x3, x3
+ beq .Lchacha_blocks_neon_done
+.Lchacha_blocks_neon_mainloop2:
+ ldr x3, [sp, # STACK_BYTES]
+ ldr x1, [sp, # STACK_SRC]
+ cmp x3, #64
+ bhs .Lchacha_blocks_neon_noswap1
+ add x4, sp, #128
+ mov x5, x4
+ tst x1, x1
+ beq .Lchacha_blocks_neon_nocopy1
+.Lchacha_blocks_neon_copyinput1:
+ subs x3, x3, #1
+ ldrb w0, [x1], #1
+ strb w0, [x4], #1
+ bne .Lchacha_blocks_neon_copyinput1
+ str x5, [sp, # STACK_SRC]
+.Lchacha_blocks_neon_nocopy1:
+ ldr x4, [sp, # STACK_DST]
+ str x5, [sp, # STACK_DST]
+ str x4, [sp, # STACK_DST_TMP]
+.Lchacha_blocks_neon_noswap1:
+ add x16, sp, #64
+ ldr w0, [sp, #44]
+ str w0, [sp, #0]
+ LDMIA16(x16, w0, w1, w2, w3, w4, w5, w6, w7,
+     w8, w9, w10, w11, w12, w13, w14, w15)
+ str w6, [sp, #8]
+ str w11, [sp, #12]
+ mov w11, w13
+ str w15, [sp, #28]
+.Lchacha_blocks_neon_rounds2:
+ ldr w6, [sp, #0]
+ add w0, w0, w4
+ add w1, w1, w5
+ eor w12, w12, w0
+ eor w11, w11, w1
+ ror w12, w12, #16
+ ror w11, w11, #16
+ subs w6, w6, #2
+ add w8, w8, w12
+ add w9, w9, w11
+ eor w4, w4, w8
+ eor w5, w5, w9
+ str w6, [sp, #0]
+ ror w4, w4, #20
+ ror w5, w5, #20
+ add w0, w0, w4
+ add w1, w1, w5
+ ldr w6, [sp, #8]
+ eor w12, w12, w0
+ eor w11, w11, w1
+ ror w12, w12, #24
+ ror w11, w11, #24
+ add w8, w8, w12
+ add w9, w9, w11
+ eor w4, w4, w8
+ eor w5, w5, w9
+ str w11, [sp, #20]
+ ror w4, w4, #25
+ ror w5, w5, #25
+ str w4, [sp, #4]
+ ldr w4, [sp, #28]
+ add w2, w2, w6
+ add w3, w3, w7
+ ldr w11, [sp, #12]
+ eor w14, w14, w2
+ eor w4, w4, w3
+ ror w14, w14, #16
+ ror w4, w4, #16
+ add w10, w10, w14
+ add w11, w11, w4
+ eor w6, w6, w10
+ eor w7, w7, w11
+ ror w6, w6, #20
+ ror w7, w7, #20
+ add w2, w2, w6
+ add w3, w3, w7
+ eor w14, w14, w2
+ eor w4, w4, w3
+ ror w14, w14, #24
+ ror w4, w4, #24
+ add w10, w10, w14
+ add w11, w11, w4
+ eor w6, w6, w10
+ eor w7, w7, w11
+ ror w6, w6, #25
+ ror w7, w7, #25
+ add w0, w0, w5
+ add w1, w1, w6
+ eor w4, w4, w0
+ eor w12, w12, w1
+ ror w4, w4, #16
+ ror w12, w12, #16
+ add w10, w10, w4
+ add w11, w11, w12
+ eor w5, w5, w10
+ eor w6, w6, w11
+ ror w5, w5, #20
+ ror w6, w6, #20
+ add w0, w0, w5
+ add w1, w1, w6
+ eor w4, w4, w0
+ eor w12, w12, w1
+ ror w4, w4, #24
+ ror w12, w12, #24
+ add w10, w10, w4
+ add w11, w11, w12
+ eor w5, w5, w10
+ eor w6, w6, w11
+ str w11, [sp, #12]
+ ror w5, w5, #25
+ ror w6, w6, #25
+ str w4, [sp, #28]
+ ldr w4, [sp, #4]
+ add w2, w2, w7
+ add w3, w3, w4
+ ldr w11, [sp, #20]
+ eor w11, w11, w2
+ eor w14, w14, w3
+ ror w11, w11, #16
+ ror w14, w14, #16
+ add w8, w8, w11
+ add w9, w9, w14
+ eor w7, w7, w8
+ eor w4, w4, w9
+ ror w7, w7, #20
+ ror w4, w4, #20
+ str w6, [sp, #8]
+ add w2, w2, w7
+ add w3, w3, w4
+ eor w11, w11, w2
+ eor w14, w14, w3
+ ror w11, w11, #24
+ ror w14, w14, #24
+ add w8, w8, w11
+ add w9, w9, w14
+ eor w7, w7, w8
+ eor w4, w4, w9
+ ror w7, w7, #25
+ ror w4, w4, #25
+ bne .Lchacha_blocks_neon_rounds2
+ str w8, [sp, #0]
+ str w9, [sp, #4]
+ str w10, [sp, #8]
+ str w12, [sp, #16]
+ str w11, [sp, #20]
+ str w14, [sp, #24]
+ ldr x12, [sp, # STACK_SRC]
+ ldr x14, [sp, # STACK_DST]
+ ldr w8, [sp, #(64 +0)]
+ ldr w9, [sp, #(64 +4)]
+ ldr w10, [sp, #(64 +8)]
+ ldr w11, [sp, #(64 +12)]
+ add w0, w0, w8
+ add w1, w1, w9
+ add w2, w2, w10
+ ldr w8, [sp, #(64 +16)]
+ add w3, w3, w11
+ ldr w9, [sp, #(64 +20)]
+ add w4, w4, w8
+ ldr w10, [sp, #(64 +24)]
+ add w5, w5, w9
+ ldr w11, [sp, #(64 +28)]
+ add w6, w6, w10
+ tst x12, x12
+ add w7, w7, w11
+ beq .Lchacha_blocks_neon_nomessage21
+ LDMIA4(x12, w8, w9, w10, w11)
+ tst x12, x12
+ eor w0, w0, w8
+ eor w1, w1, w9
+ eor w2, w2, w10
+ ldr w8, [x12, #0]
+ eor w3, w3, w11
+ ldr w9, [x12, #4]
+ eor w4, w4, w8
+ ldr w10, [x12, #8]
+ eor w5, w5, w9
+ ldr w11, [x12, #12]
+ eor w6, w6, w10
+ add x12, x12, #16
+ eor w7, w7, w11
+.Lchacha_blocks_neon_nomessage21:
+ mov x16, sp
+ STMIA8(x14, w0, w1, w2, w3, w4, w5, w6, w7)
+ LDMIA8(x16, w0, w1, w2, w3, w4, w5, w6, w7)
+ ldr w8, [sp, #(64 +32)]
+ ldr w9, [sp, #(64 +36)]
+ ldr w10, [sp, #(64 +40)]
+ ldr w11, [sp, #(64 +44)]
+ add w0, w0, w8
+ add w1, w1, w9
+ add w2, w2, w10
+ ldr w8, [sp, #(64 +48)]
+ add w3, w3, w11
+ ldr w9, [sp, #(64 +52)]
+ add w4, w4, w8
+ ldr w10, [sp, #(64 +56)]
+ add w5, w5, w9
+ ldr w11, [sp, #(64 +60)]
+ add w6, w6, w10
+ adds w8, w8, #1
+ add w7, w7, w11
+ adc w9, w9, wzr
+ str w8, [sp, #(64 +48)]
+ tst x12, x12
+ str w9, [sp, #(64 +52)]
+ beq .Lchacha_blocks_neon_nomessage22
+ LDMIA4(x12, w8, w9, w10, w11)
+ tst x12, x12
+ eor w0, w0, w8
+ eor w1, w1, w9
+ eor w2, w2, w10
+ ldr w8, [x12, #0]
+ eor w3, w3, w11
+ ldr w9, [x12, #4]
+ eor w4, w4, w8
+ ldr w10, [x12, #8]
+ eor w5, w5, w9
+ ldr w11, [x12, #12]
+ eor w6, w6, w10
+ add x12, x12, #16
+ eor w7, w7, w11
+.Lchacha_blocks_neon_nomessage22:
+ STMIA8(x14, w0, w1, w2, w3, w4, w5, w6, w7)
+ str x12, [sp, # STACK_SRC]
+ str x14, [sp, # STACK_DST]
+ ldr x3, [sp, # STACK_BYTES]
+ cmp x3, #64
+ sub x4, x3, #64
+ str x4, [sp, # STACK_BYTES]
+ bhi .Lchacha_blocks_neon_mainloop2
+ cmp x3, #64
+ beq .Lchacha_blocks_neon_nocopy2
+ ldr x1, [sp, # STACK_DST_TMP]
+ sub x14, x14, #64
+.Lchacha_blocks_neon_copyinput2:
+ subs x3, x3, #1
+ ldrb w0, [x14], #1
+ strb w0, [x1], #1
+ bne .Lchacha_blocks_neon_copyinput2
+.Lchacha_blocks_neon_nocopy2:
+.Lchacha_blocks_neon_done:
+ ldr x16, [sp, # STACK_SP]
+ ldr x7, [sp, # STACK_STATE]
+ ldr w8, [sp, #(64 +48)]
+ ldr w9, [sp, #(64 +52)]
+ str w8, [x7, #(48 + 0)]
+ str w9, [x7, #(48 + 4)]
+ sub x0, sp, #8
+ mov v8.16b, v16.16b
+ mov v9.16b, v17.16b
+ mov v10.16b, v18.16b
+ mov v11.16b, v19.16b
+ mov sp, x16
+ mov v12.16b, v20.16b
+ mov v13.16b, v21.16b
+ mov v14.16b, v22.16b
+ mov v15.16b, v23.16b
+ sub x0, sp, x0
+ eor v0.16b, v0.16b, v0.16b
+ eor v1.16b, v1.16b, v1.16b
+ eor v2.16b, v2.16b, v2.16b
+ eor v3.16b, v3.16b, v3.16b
+ eor v4.16b, v4.16b, v4.16b
+ eor v5.16b, v5.16b, v5.16b
+ eor v6.16b, v6.16b, v6.16b
+ eor v7.16b, v7.16b, v7.16b
+ ret
+.Lchacha_blocks_neon_nobytes:
+ mov x0, xzr;
+ ret
+.ltorg
+.size _gcry_chacha20_aarch64_blocks,.-_gcry_chacha20_aarch64_blocks;
+
+#endif
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index 613fa82a..a11986c1 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -81,6 +81,16 @@
 # endif
 #endif /*ENABLE_NEON_SUPPORT*/
 
+/* USE_AARCH64_SIMD indicates whether to enable ARMv8 SIMD assembly
+ * code. */
+#undef USE_AARCH64_SIMD
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(__AARCH64EL__) \
+       && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \
+       && defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON)
+#  define USE_AARCH64_SIMD 1
+# endif
+#endif
 
 struct CHACHA20_context_s;
 
@@ -144,6 +154,14 @@ unsigned int _gcry_chacha20_armv7_neon_blocks(u32 *state, const byte *in,
 
 #endif /* USE_NEON */
 
+#ifdef USE_AARCH64_SIMD
+
+unsigned int _gcry_chacha20_aarch64_blocks(u32 *state, const byte *in,
+   byte *out,
+   size_t bytes) ASM_FUNC_ABI;
+
+#endif /* USE_AARCH64_SIMD */
+
 
 static void chacha20_setiv (void *context, const byte * iv, size_t ivlen);
 static const char *selftest (void);
@@ -406,6 +424,10 @@ chacha20_do_setkey (CHACHA20_context_t * ctx,
   if (features & HWF_ARM_NEON)
     ctx->blocks = _gcry_chacha20_armv7_neon_blocks;
 #endif
+#ifdef USE_AARCH64_SIMD
+  if (features & HWF_ARM_NEON)
+    ctx->blocks = _gcry_chacha20_aarch64_blocks;
+#endif
 
   (void)features;
 
diff --git a/configure.ac b/configure.ac
index 66e7cd67..1e6ac9d7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2243,6 +2243,10 @@ if test "$found" = "1" ; then
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-ssse3-amd64.lo"
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-avx2-amd64.lo"
       ;;
+      aarch64-*-*)
+         # Build with the assembly implementation
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-aarch64.lo"
+      ;;
    esac
 
    if test x"$neonsupport" = xyes ; then
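
For an A/B comparison against the generic C implementation on the same
binary (e.g. to reproduce the "before" numbers above), the relevant
hardware feature can be masked before libgcrypt initializes. A sketch;
the "arm-neon" string is my assumption for the feature name that
corresponds to HWF_ARM_NEON:

  #include <gcrypt.h>

  /* Disable NEON-based acceleration before initialization so that the
   * generic ChaCha20 code is used instead of the new SIMD path. */
  static void force_generic_chacha20 (void)
  {
    gcry_control (GCRYCTL_DISABLE_HWF, "arm-neon", NULL);
    gcry_check_version (GCRYPT_VERSION);
    gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0);
  }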


Re: [PATCH] Add ARMv8/AArch64 implementation of chacha20

Werner Koch
On Sun,  6 Aug 2017 14:09, [hidden email] said:

> Patch adds ARMv8/AArch64 SIMD implementation based on public domain
> ARMv7/NEON implementation by Andrew Moon at:
>   https://github.com/floodyberry/chacha-opt

Can you please contact the author and ask to clarify the license?  I
only found this in the README:

  Public Domain. or MIT

This is not sufficient.  We need to know who has put this into the PD.
There are several MIT licenses.  We need to know which one.  And also
the copyright holder.


Salam-Shalom,

   Werner

--
Thoughts are free.  Exceptions are regulated by a federal law.

Re: [PATCH] Add ARMv8/AArch64 implementation of chacha20

Jussi Kivilinna
On 07.08.2017 17:52, Werner Koch wrote:

> On Sun,  6 Aug 2017 14:09, [hidden email] said:
>
>> Patch adds ARMv8/AArch64 SIMD implementation based on public domain
>> ARMv7/NEON implementation by Andrew Moon at:
>>   https://github.com/floodyberry/chacha-opt
>
> Can you please contact the author and ask to clarify the license?  I
> only found this in the README:
>
>   Public Domain. or MIT
>
> This is not sufficient.  We need to know who has put this into the PD.
> There are several MIT licenses.  We need to know which one.  And also
> the copyright holder.
>

I've sent the author an e-mail about this issue and am now waiting for a reply.

-Jussi
