From 141d345cfe2d882fb15f036a56052ec8fd5128e4 Mon Sep 17 00:00:00 2001 From: NIIBE Yutaka Date: Tue, 24 Dec 2013 09:56:30 +0900 Subject: [PATCH] more improvement of mpi_montsqr and multiply. --- ChangeLog | 8 ++ polarssl/include/polarssl/bn_mul.h | 42 +++++------ polarssl/library/bignum.c | 116 ++++++++++++++--------------- 3 files changed, 84 insertions(+), 82 deletions(-) diff --git a/ChangeLog b/ChangeLog index dda577c..6c2397a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2013-12-24 Niibe Yutaka + + * polarssl/include/polarssl/bn_mul.h (MULADDC_1024_CORE) + (MULADDC_1024_LOOP): Use lower-numbered registers more for shorter + instructions and better performance. + * polarssl/library/bignum.c (mpi_montsqr): Likewise. Change loop + structure and conditional branch for better performance. + 2013-12-23 Niibe Yutaka * polarssl/library/bignum.c (mpi_montmul): Computation diff --git a/polarssl/include/polarssl/bn_mul.h b/polarssl/include/polarssl/bn_mul.h index 9dd5bf6..504c70a 100644 --- a/polarssl/include/polarssl/bn_mul.h +++ b/polarssl/include/polarssl/bn_mul.h @@ -496,21 +496,21 @@ #if defined(__arm__) #define MULADDC_1024_CORE \ - "ldmia %0!, { r5, r6, r7 } \n\t" \ - "ldmia %1, { r8, r9, r10 } \n\t" \ - "adcs r8, r8, %2 \n\t" \ - "umull r11, r12, r5, %4 \n\t" \ + "ldmia %0!, { r8, r9, r10 } \n\t" \ + "ldmia %1, { r5, r6, r7 } \n\t" \ + "adcs r5, r5, %2 \n\t" \ + "umull r11, r12, r8, %4 \n\t" \ "adc %2, r12, #0 \n\t" \ - "adds r8, r8, r11 \n\t" \ - "adcs r9, r9, %2 \n\t" \ - "umull r11, r12, r6, %4 \n\t" \ + "adds r5, r5, r11 \n\t" \ + "adcs r6, r6, %2 \n\t" \ + "umull r11, r12, r9, %4 \n\t" \ "adc %2, r12, #0 \n\t" \ - "adds r9, r9, r11 \n\t" \ - "adcs r10, r10, %2 \n\t" \ - "umull r11, r12, r7, %4 \n\t" \ + "adds r6, r6, r11 \n\t" \ + "adcs r7, r7, %2 \n\t" \ + "umull r11, r12, r10, %4 \n\t" \ "adc %2, r12, #0 \n\t" \ - "adds r10, r10, r11 \n\t" \ - "stmia %1!, { r8, r9, r10 } \n\t" + "adds r7, r7, r11 \n\t" \ + "stmia %1!, { r5, r6, r7 } \n\t" 
#define MULADDC_1024_LOOP \ asm( "tst %3, #0xfe0 \n\t" \ @@ -536,17 +536,17 @@ MULADDC_1024_CORE MULADDC_1024_CORE \ MULADDC_1024_CORE MULADDC_1024_CORE \ MULADDC_1024_CORE \ - "ldmia %0!, { r5, r6 } \n\t" \ - "ldmia %1, { r8, r9 } \n\t" \ - "adcs r8, r8, %2 \n\t" \ - "umull r11, r12, %4, r5 \n\t" \ + "ldmia %0!, { r8, r9 } \n\t" \ + "ldmia %1, { r5, r6 } \n\t" \ + "adcs r5, r5, %2 \n\t" \ + "umull r11, r12, %4, r8 \n\t" \ "adc %2, r12, #0 \n\t" \ - "adds r8, r8, r11 \n\t" \ - "adcs r9, r9, %2 \n\t" \ - "umull r11, r12, %4, r6 \n\t" \ + "adds r5, r5, r11 \n\t" \ + "adcs r6, r6, %2 \n\t" \ + "umull r11, r12, %4, r9 \n\t" \ "adc %2, r12, #0 \n\t" \ - "adds r9, r9, r11 \n\t" \ - "stmia %1!, { r8, r9 } \n\t" \ + "adds r6, r6, r11 \n\t" \ + "stmia %1!, { r5, r6 } \n\t" \ "adc %2, %2, #0 \n\t" \ "tst %3, #0xfe0 \n\t" \ "bne 1b \n" \ diff --git a/polarssl/library/bignum.c b/polarssl/library/bignum.c index 250be59..684445c 100644 --- a/polarssl/library/bignum.c +++ b/polarssl/library/bignum.c @@ -1454,86 +1454,80 @@ static void mpi_montred( size_t n, const t_uint *np, t_uint mm, t_uint *d ) static void mpi_montsqr( size_t n, const t_uint *np, t_uint mm, t_uint *d ) { size_t i; - t_uint c = 0; + register t_uint c = 0; for (i = 0; i < n; i++) { t_uint *wij = &d[i*2]; t_uint *xj = &d[i+n]; - t_uint u, x_i; + t_uint x_i; x_i = *xj; *xj++ = c; asm ("mov r8, #0\n\t" /* R8 := 0, the constant ZERO from here. 
*/ - /* (C,U,R9) := w_i_i + x_i*x_i; w_i_i := R9; */ - "ldr r9, [%[wij]]\n\t" /* R9 := w_i_i; */ + /* (C,R4,R5) := w_i_i + x_i*x_i; w_i_i := R5; */ + "ldr r5, [%[wij]]\n\t" /* R5 := w_i_i; */ "mov %[c], r8\n\t" - "umull r11, r12, %[x_i], %[x_i]\n\t" - "adds r9, r9, r11\n\t" - "adc %[u], r8, r12\n\t" - "str r9, [%[wij]], #4\n\t" - /**/ - "subs r9, %[xj_max], %[xj]\n\t" - "bls 1f\n\t" - /**/ - "tst r9, #4\n\t" - "beq 0f\n\t" - /* (C,U,R9) := (C,U) + w_i_j + 2*x_i*x_j; */ - "ldr r10, [%[xj]], #4\n\t" - "ldr r9, [%[wij]]\n\t" - "adds r9, r9, %[u]\n\t" - "adc %[u], %[c], r8\n\t" - "umull r11, r12, %[x_i], r10\n\t" - "adds r9, r9, r11\n\t" - "adcs %[u], %[u], r12\n\t" + "umull r6, r11, %[x_i], %[x_i]\n\t" + "adds r5, r5, r6\n\t" + "adc r4, r8, r11\n\t" + "subs r11, %[xj], %[x_max1]\n\t" /* could use "CMP" but slower */ + "str r5, [%[wij]], #4\n\t" + "beq 1f\n\t" + "bhi 0f\n" + "2:\n\t" + "ldmia %[xj]!, { r9, r10 }\n\t" + "ldmia %[wij], { r5, r7 }\n\t" + /* (C,R4,R5) := (C,R4) + w_i_j + 2*x_i*x_j; */ + "umull r6, r11, %[x_i], r9\n\t" + "adds r5, r5, r4\n\t" + "adc r4, %[c], r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r4, r4, r11\n\t" "adc %[c], r8, r8\n\t" - "adds r9, r9, r11\n\t" - "adcs %[u], %[u], r12\n\t" + "adds r5, r5, r6\n\t" + "adcs r4, r4, r11\n\t" "adc %[c], %[c], r8\n\t" - "str r9, [%[wij]], #4\n\t" - /**/ - "subs r9, %[xj_max], %[xj]\n\t" - "bls 1f\n" - "0:\n\t" - "ldmia %[xj]!, { r6, r7 }\n\t" - "ldmia %[wij], { r9, r10 }\n\t" - /* (C,U,R9) := (C,U) + w_i_j + 2*x_i*x_j; */ - "umull r11, r12, %[x_i], r6\n\t" - "adds r9, r9, %[u]\n\t" - "adc %[u], %[c], r8\n\t" - "adds r9, r9, r11\n\t" - "adcs %[u], %[u], r12\n\t" + /* (C,R4,R7) := (C,R4) + w_i_j + 2*x_i*x_j; */ + "adds r7, r7, r4\n\t" + "adc r4, %[c], r8\n\t" + "umull r6, r11, %[x_i], r10\n\t" + "adds r7, r7, r6\n\t" + "adcs r4, r4, r11\n\t" "adc %[c], r8, r8\n\t" - "adds r9, r9, r11\n\t" - "adcs %[u], %[u], r12\n\t" - "adc %[c], %[c], r8\n\t" - /* (C,U,R10) := (C,U) + w_i_j + 2*x_i*x_j; */ - "adds r10, r10, 
%[u]\n\t" - "adc %[u], %[c], r8\n\t" - "umull r11, r12, %[x_i], r7\n\t" - "adds r10, r10, r11\n\t" - "adcs %[u], %[u], r12\n\t" - "adc %[c], r8, r8\n\t" - "adds r10, r10, r11\n\t" - "adcs %[u], %[u], r12\n\t" + "adds r7, r7, r6\n\t" + "adcs r4, r4, r11\n\t" "adc %[c], %[c], r8\n\t" /**/ - "stmia %[wij]!, { r9, r10 }\n\t" - /**/ - "cmp %[xj], %[xj_max]\n\t" - "bcc 0b\n" + "cmp %[xj], %[x_max1]\n\t" + "stmia %[wij]!, { r5, r7 }\n\t" + "bcc 2b\n\t" + "bne 0f\n" "1:\n\t" - "ldr r9, [%[wij]]\n\t" - "adds %[u], %[u], r9\n\t" + /* (C,R4,R5) := (C,R4) + w_i_j + 2*x_i*x_j; */ + "ldr r7, [%[xj]], #4\n\t" + "ldr r5, [%[wij]]\n\t" + "adds r5, r5, r4\n\t" + "adc r4, %[c], r8\n\t" + "umull r6, r11, %[x_i], r7\n\t" + "adds r5, r5, r6\n\t" + "adcs r4, r4, r11\n\t" + "adc %[c], r8, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r4, r4, r11\n\t" "adc %[c], %[c], r8\n\t" - "str %[u], [%[wij]]" - : [c] "=&r" (c), [u] "=&r" (u), [wij] "=r" (wij), [xj] "=r" (xj) - : [x_i] "r" (x_i), [xj_max] "r" (&d[n*2]), + "str r5, [%[wij]], #4\n" + "0:\n\t" + "ldr r5, [%[wij]]\n\t" + "adds r4, r4, r5\n\t" + "adc %[c], %[c], r8\n\t" + "str r4, [%[wij]]" + : [c] "=&r" (c), [wij] "=r" (wij), [xj] "=r" (xj) + : [x_i] "r" (x_i), [x_max1] "r" (&d[n*2-1]), "[wij]" (wij), "[xj]" (xj) - : "r6", "r7", "r8", "r9", "r10", "r11", "r12", "memory", "cc" ); + : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "memory", "cc" ); - u = d[i] * mm; - c += mpi_mul_hlp( n, np, &d[i], u ); + c += mpi_mul_hlp( n, np, &d[i], d[i] * mm ); } d += n;