[PATCH 1/3] chacha20-ppc: fix 32-bit counter overflow handling

classic Classic list List threaded Threaded
3 messages Options
Reply | Threaded
Open this post in threaded view
|

[PATCH 1/3] chacha20-ppc: fix 32-bit counter overflow handling

Jussi Kivilinna-2
* cipher/chacha20-ppc.c (vec_add_ctr_u64, ADD_U64): New.
(_gcry_chacha20_ppc8_blocks1, _gcry_chacha20_ppc8_blocks4)
(_gcry_chacha20_poly1305_ppc8_blocks4): Use ADD_U64 when incrementing
counter.
--

Patch fixes 32-bit overflow for PowerPC ChaCha20 implementation.
In the typical use case, overflow happens after 256 GiB of output.

Typical use case here means use of a 96-bit or 64-bit IV, which causes
the lower 32 bits of the counter to start from zero.

Signed-off-by: Jussi Kivilinna <[hidden email]>
---
 cipher/chacha20-ppc.c | 33 +++++++++++++++++++++++++++------
 1 file changed, 27 insertions(+), 6 deletions(-)

diff --git a/cipher/chacha20-ppc.c b/cipher/chacha20-ppc.c
index 985f2fcd..4a21b837 100644
--- a/cipher/chacha20-ppc.c
+++ b/cipher/chacha20-ppc.c
@@ -88,6 +88,24 @@ vec_store_le(vector4x_u32 vec, unsigned long offset, unsigned char *ptr)
 }
 
 
+static ASM_FUNC_ATTR_INLINE vector4x_u32
+vec_add_ctr_u64(vector4x_u32 v, vector4x_u32 a)
+{
+#ifdef WORDS_BIGENDIAN
+  static const vector16x_u8 swap32 =
+    { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 };
+  vector2x_u64 vec, add, sum;
+
+  vec = (vector2x_u64)vec_perm((vector16x_u8)v, (vector16x_u8)v, swap32);
+  add = (vector2x_u64)vec_perm((vector16x_u8)a, (vector16x_u8)a, swap32);
+  sum = vec + add;
+  return (vector4x_u32)vec_perm((vector16x_u8)sum, (vector16x_u8)sum, swap32);
+#else
+  return (vector4x_u32)((vector2x_u64)(v) + (vector2x_u64)(a));
+#endif
+}
+
+
 /**********************************************************************
   2-way && 1-way chacha20
  **********************************************************************/
@@ -115,6 +133,9 @@ vec_store_le(vector4x_u32 vec, unsigned long offset, unsigned char *ptr)
    ROTATE(x1, rotate_7); \
   WORD_ROL(x1, rol_x1);
 
+#define ADD_U64(v,a) \
+ (v = vec_add_ctr_u64(v, a))
+
 unsigned int ASM_FUNC_ATTR
 _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src,
     size_t nblks)
@@ -152,7 +173,7 @@ _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src,
       v5 = state1;
       v6 = state2;
       v7 = state3;
-      v7 += counter_1;
+      ADD_U64(v7, counter_1);
 
       for (i = 20; i > 0; i -= 2)
  {
@@ -166,12 +187,12 @@ _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src,
       v1 += state1;
       v2 += state2;
       v3 += state3;
-      state3 += counter_1; /* update counter */
+      ADD_U64(state3, counter_1); /* update counter */
       v4 += state0;
       v5 += state1;
       v6 += state2;
       v7 += state3;
-      state3 += counter_1; /* update counter */
+      ADD_U64(state3, counter_1); /* update counter */
 
       v0 ^= vec_load_le(0 * 16, src);
       v1 ^= vec_load_le(1 * 16, src);
@@ -214,7 +235,7 @@ _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src,
       v1 += state1;
       v2 += state2;
       v3 += state3;
-      state3 += counter_1; /* update counter */
+      ADD_U64(state3, counter_1); /* update counter */
 
       v0 ^= vec_load_le(0 * 16, src);
       v1 ^= vec_load_le(1 * 16, src);
@@ -339,7 +360,7 @@ _gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
       v13 += vec_splat(state3, 1) - vec_cmplt(tmp, counters_0123);
       v14 += vec_splat(state3, 2);
       v15 += vec_splat(state3, 3);
-      state3 += counter_4; /* update counter */
+      ADD_U64(state3, counter_4); /* update counter */
 
       transpose_4x4(v0, v1, v2, v3);
       transpose_4x4(v4, v5, v6, v7);
@@ -554,7 +575,7 @@ _gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
       v13 += vec_splat(state3, 1) - vec_cmplt(tmp, counters_0123);
       v14 += vec_splat(state3, 2);
       v15 += vec_splat(state3, 3);
-      state3 += counter_4; /* update counter */
+      ADD_U64(state3, counter_4); /* update counter */
 
       transpose_4x4(v0, v1, v2, v3);
       transpose_4x4(v4, v5, v6, v7);
--
2.27.0


_______________________________________________
Gcrypt-devel mailing list
[hidden email]
http://lists.gnupg.org/mailman/listinfo/gcrypt-devel
Reply | Threaded
Open this post in threaded view
|

[PATCH 2/3] tests/basic: check 32-bit and 64-bit overflow for CTR and ChaCha20

Jussi Kivilinna-2
* tests/basic.c (check_one_cipher_ctr_reset)
(check_one_cipher_ctr_overflow): New.
(check_one_cipher): Add counter overflow tests for ChaCha20 and CTR
mode.
--

Patch adds counter overflow tests to check for correct counter handling
in bulk processing implementations.

Signed-off-by: Jussi Kivilinna <[hidden email]>
---
 tests/basic.c | 232 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 232 insertions(+)

diff --git a/tests/basic.c b/tests/basic.c
index 1d12c4a2..4beeeed9 100644
--- a/tests/basic.c
+++ b/tests/basic.c
@@ -9415,6 +9415,210 @@ err_out_free:
 
 
 
+static int
+check_one_cipher_ctr_reset (gcry_cipher_hd_t hd, int algo, int mode,
+    u32 ctr_high_bits, int be_ctr,
+    int pass)
+{
+  unsigned char iv[16] = { 0 };
+  unsigned char swap;
+  unsigned int ivlen;
+  u32 ctr_low_bits;
+  int err;
+  int i;
+
+  /* This should be the largest parallel block processing count of any
+   * implementation, negated. Currently the count is 32 for CTR mode
+   * and 8 for ChaCha20. */
+  ctr_low_bits = (mode == GCRY_CIPHER_MODE_CTR) ? -32 : -8;
+
+  gcry_cipher_reset (hd);
+
+  if (mode == GCRY_CIPHER_MODE_CTR)
+    ivlen = get_algo_mode_blklen(algo, GCRY_CIPHER_MODE_ECB);
+  else
+    ivlen = 16;
+
+  /* Little-endian fill. */
+  for (i = 0; i < 4; i++)
+    iv[i + 0] = (ctr_low_bits >> (i * 8)) & 0xff;
+  for (i = 0; i < 4; i++)
+    iv[i + 4] = (ctr_high_bits >> (i * 8)) & 0xff;
+
+  if (be_ctr)
+    {
+      /* Swap to big-endian. */
+      for (i = 0; i < ivlen / 2; i++)
+ {
+  swap = iv[i];
+  iv[i] = iv[ivlen - (i + 1)];
+  iv[ivlen - (i + 1)] = swap;
+ }
+    }
+
+  clutter_vector_registers();
+  if (mode == GCRY_CIPHER_MODE_CTR)
+    err = gcry_cipher_setctr (hd, iv, ivlen);
+  else
+    err = gcry_cipher_setiv (hd, iv, ivlen);
+
+  if (err)
+    {
+      fail ("pass %d, algo %d, mode %d, gcry_cipher_setiv failed: %s\n",
+    pass, algo, mode, gpg_strerror (err));
+      gcry_cipher_close (hd);
+      return -1;
+    }
+
+  return 0;
+}
+
+static int
+check_one_cipher_ctr_overflow (int algo, int mode, int flags,
+       const char *key, size_t nkey,
+       const unsigned char *plain, size_t nplain,
+       unsigned long ctr_high_bits, int be_ctr,
+       int pass)
+{
+  gcry_cipher_hd_t hd;
+  unsigned char *out;
+  unsigned char *enc_result;
+  int keylen;
+  gcry_error_t err = 0;
+  unsigned int firstlen;
+  unsigned int leftlen;
+  unsigned int blklen;
+  unsigned int pos;
+  unsigned int i;
+
+  out = malloc (nplain);
+  enc_result = malloc (nplain);
+  if (!out || !enc_result)
+    {
+      fail ("pass %d, algo %d, mode %d, malloc failed\n",
+    pass, algo, mode);
+      goto err_out_free;
+    }
+
+  assert (nkey == 64);
+  assert (nplain > 0);
+  assert ((nplain % 16) == 0);
+
+  keylen = gcry_cipher_get_algo_keylen (algo);
+  if (!keylen)
+    {
+      fail ("pass %d, algo %d, mode %d, gcry_cipher_get_algo_keylen failed\n",
+    pass, algo, mode);
+      goto err_out_free;
+    }
+
+  if (keylen < 40 / 8 || keylen > 32)
+    {
+      fail ("pass %d, algo %d, mode %d, keylength problem (%d)\n",
+    pass, algo, mode, keylen);
+      goto err_out_free;
+    }
+
+  err = gcry_cipher_open (&hd, algo, mode, flags);
+  if (err)
+    {
+      fail ("pass %d, algo %d, mode %d, gcry_cipher_open failed: %s\n",
+    pass, algo, mode, gpg_strerror (err));
+      goto err_out_free;
+    }
+
+  clutter_vector_registers();
+  err = gcry_cipher_setkey (hd, key, keylen);
+  if (err)
+    {
+      fail ("pass %d, algo %d, mode %d, gcry_cipher_setkey failed: %s\n",
+    pass, algo, mode, gpg_strerror (err));
+      gcry_cipher_close (hd);
+      goto err_out_free;
+    }
+
+  if (check_one_cipher_ctr_reset (hd, algo, mode, ctr_high_bits, be_ctr,
+  pass) < 0)
+    goto err_out_free;
+
+  /* Non-bulk processing. */
+  for (i = 0; i < nplain; i += 16)
+    {
+      clutter_vector_registers();
+      err = gcry_cipher_encrypt (hd, out + i, 16, plain + i, 16);
+      if (err)
+ {
+  fail ("pass %d, algo %d, mode %d, gcry_cipher_encrypt failed: %s\n",
+ pass, algo, mode, gpg_strerror (err));
+  gcry_cipher_close (hd);
+  goto err_out_free;
+ }
+    }
+
+  memcpy (enc_result, out, nplain);
+
+  /* Test with different bulk processing sizes. */
+  for (blklen = 2 * 16; blklen <= 32 * 16; blklen *= 2)
+    {
+      /* Move bulk processing start offset, test at different spots to
+       * test bulk counter calculation thoroughly. */
+      for (firstlen = 16; firstlen < 8 * 64; firstlen += 16)
+ {
+  if (check_one_cipher_ctr_reset (hd, algo, mode, ctr_high_bits, be_ctr,
+  pass) < 0)
+    goto err_out_free;
+
+  clutter_vector_registers();
+  err = gcry_cipher_encrypt (hd, out, firstlen, plain, firstlen);
+  if (err)
+    {
+      fail ("pass %d, algo %d, mode %d, gcry_cipher_encrypt "
+    "failed: %s\n", pass, algo, mode, gpg_strerror (err));
+      gcry_cipher_close (hd);
+      goto err_out_free;
+    }
+
+  leftlen = nplain - firstlen;
+  pos = firstlen;
+  while (leftlen)
+    {
+      unsigned int currlen = leftlen > blklen ? blklen : leftlen;
+
+      clutter_vector_registers();
+      err = gcry_cipher_encrypt (hd, out + pos, currlen, plain + pos,
+ currlen);
+      if (err)
+ {
+  fail ("pass %d, algo %d, mode %d, block len %d, first len %d,"
+ "gcry_cipher_encrypt failed: %s\n", pass, algo, mode,
+ blklen, firstlen, gpg_strerror (err));
+  gcry_cipher_close (hd);
+  goto err_out_free;
+ }
+
+      pos += currlen;
+      leftlen -= currlen;
+    }
+
+  if (memcmp (enc_result, out, nplain))
+    fail ("pass %d, algo %d, mode %d, block len %d, first len %d, "
+          "encrypt mismatch\n", pass, algo, mode, blklen, firstlen);
+ }
+    }
+
+  gcry_cipher_close (hd);
+
+  free (enc_result);
+  free (out);
+  return 0;
+
+err_out_free:
+  free (enc_result);
+  free (out);
+  return -1;
+}
+
+
 static void
 check_one_cipher (int algo, int mode, int flags)
 {
@@ -9491,6 +9695,34 @@ check_one_cipher (int algo, int mode, int flags)
      50))
     goto out;
 
+  /* Pass 6: Counter overflow tests for ChaCha20 and CTR mode. */
+  if (mode == GCRY_CIPHER_MODE_STREAM && algo == GCRY_CIPHER_CHACHA20)
+    {
+      /* 32bit overflow test (little-endian counter) */
+      if (check_one_cipher_ctr_overflow (algo, mode, flags, key, 64, plain,
+  medium_buffer_size, 0UL,
+  0, 60))
+ goto out;
+      /* 64bit overflow test (little-endian counter) */
+      if (check_one_cipher_ctr_overflow (algo, mode, flags, key, 64, plain,
+  medium_buffer_size, 0xffffffffUL,
+  0, 61))
+ goto out;
+    }
+   else if (mode == GCRY_CIPHER_MODE_CTR)
+    {
+      /* 32bit overflow test (big-endian counter) */
+      if (check_one_cipher_ctr_overflow (algo, mode, flags, key, 64, plain,
+  medium_buffer_size, 0UL,
+  1, 62))
+ goto out;
+      /* 64bit overflow test (big-endian counter) */
+      if (check_one_cipher_ctr_overflow (algo, mode, flags, key, 64, plain,
+  medium_buffer_size, 0xffffffffUL,
+  1, 63))
+ goto out;
+    }
+
 out:
   free (plain);
 }
--
2.27.0


_______________________________________________
Gcrypt-devel mailing list
[hidden email]
http://lists.gnupg.org/mailman/listinfo/gcrypt-devel
Reply | Threaded
Open this post in threaded view
|

[PATCH 3/3] Prevent link-time optimization from inlining __gcry_burn_stack

Jussi Kivilinna-2
In reply to this post by Jussi Kivilinna-2
* src/g10lib.h (NOINLINE_FUNC): New attribute macro.
* src/misc.c (__gcry_burn_stack): Add NOINLINE_FUNC attribute.
--

LTO can cause inlining of __gcry_burn_stack and result in a tail-call
to _gcry_fast_wipememory, defeating the tail-call prevention in the
_gcry_burn_stack macro. Mark __gcry_burn_stack with the 'noinline'
attribute to prevent unwanted inlining of this function in
LTO builds.

Signed-off-by: Jussi Kivilinna <[hidden email]>
---
 src/g10lib.h | 6 ++++++
 src/misc.c   | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/g10lib.h b/src/g10lib.h
index c85e6649..ffd71018 100644
--- a/src/g10lib.h
+++ b/src/g10lib.h
@@ -75,6 +75,12 @@
 #define GCC_ATTR_UNUSED
 #endif
 
+#if __GNUC__ > 3
+#define NOINLINE_FUNC     __attribute__((noinline))
+#else
+#define NOINLINE_FUNC
+#endif
+
 #if __GNUC__ >= 3
 #define LIKELY(expr)      __builtin_expect( !!(expr), 1 )
 #define UNLIKELY(expr)    __builtin_expect( !!(expr), 0 )
diff --git a/src/misc.c b/src/misc.c
index 283e3a72..4db2d9a4 100644
--- a/src/misc.c
+++ b/src/misc.c
@@ -545,7 +545,7 @@ _gcry_fast_wipememory2 (void *ptr, int set, size_t len)
 }
 
 
-void
+void NOINLINE_FUNC
 __gcry_burn_stack (unsigned int bytes)
 {
 #ifdef HAVE_VLA
--
2.27.0


_______________________________________________
Gcrypt-devel mailing list
[hidden email]
http://lists.gnupg.org/mailman/listinfo/gcrypt-devel