diff --git a/ChangeLog b/ChangeLog index cca307a..b9efa95 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2014-04-15 Niibe Yutaka + + * src/ecc-mont.c: New. + + * src/mod25638.c (p25519): Move from ecc-edwards.c. + (mod25519_reduce, add19): Likewise. + (mod25638_reduce) [!ASM_IMPLEMENTATION]: Use bn256_add_uint. + 2014-04-14 Niibe Yutaka * src/jpc.c (jpc_to_ac): Error check before mod_inv. diff --git a/src/ecc-edwards.c b/src/ecc-edwards.c index 9345897..78a37e5 100644 --- a/src/ecc-edwards.c +++ b/src/ecc-edwards.c @@ -93,10 +93,6 @@ * Gy: 0x6666666666666666666666666666666666666666666666666666666666666658 */ -static const bn256 p25519[1] = { - {{ 0xffffffed, 0xffffffff, 0xffffffff, 0xffffffff, - 0xffffffff, 0xffffffff, 0xffffffff, 0x7fffffff }} }; - /* d + 2^255 - 19 */ static const bn256 coefficient_d[1] = { {{ 0x135978a3, 0x75eb4dca, 0x4141d8ab, 0x00700a4d, @@ -240,84 +236,6 @@ point_add (ptc *X, const ptc *A, const ac *B) } -static void -add19 (bn256 *r, bn256 *x) -{ - uint32_t v; - int i; - - v = 19; - for (i = 0; i < BN256_WORDS; i++) - { - r->word[i] = x->word[i] + v; - v = (r->word[i] < v); - } -} - -/* - * @brief X = A mod 2^255-19 - * - * It's precisely modulo 2^255-19 (unlike mod25638_reduce). 
- */ -static void -mod25519_reduce (bn256 *X) -{ - uint32_t q; - bn256 r0[1], r1[1]; - int flag; - - memcpy (r0, X, sizeof (bn256)); - q = (r0->word[7] >> 31); - r0->word[7] &= 0x7fffffff; - if (q) - { - add19 (r0, r0); - q = (r0->word[7] >> 31); - r0->word[7] &= 0x7fffffff; - if (q) - { - add19 (r1, r0); - q = (r1->word[7] >> 31); - r1->word[7] &= 0x7fffffff; - flag = 0; - } - else - flag = 1; - } - else - { - add19 (r1, r0); /* dummy */ - q = (r1->word[7] >> 31); /* dummy */ - r1->word[7] &= 0x7fffffff; /* dummy */ - if (q) - flag = 2; - else - flag = 3; - } - - if (flag) - { - add19 (r1, r0); - q = (r1->word[7] >> 31); - r1->word[7] &= 0x7fffffff; - if (q) - memcpy (X, r1, sizeof (bn256)); - else - memcpy (X, r0, sizeof (bn256)); - } - else - { - if (q) - { - asm volatile ("" : : "r" (q) : "memory"); - memcpy (X, r1, sizeof (bn256)); - asm volatile ("" : : "r" (q) : "memory"); - } - else - memcpy (X, r1, sizeof (bn256)); - } -} - /** * @brief X = convert A * diff --git a/src/ecc-mont.c b/src/ecc-mont.c new file mode 100644 index 0000000..707521c --- /dev/null +++ b/src/ecc-mont.c @@ -0,0 +1,179 @@ +/* -*- coding: utf-8 -*- + * ecc-mont.c - Elliptic curve computation for + * the Montgomery curve: y^2 = x^3 + 486662*x^2 + x. + * + * Copyright (C) 2014 Free Software Initiative of Japan + * Author: NIIBE Yutaka + * + * This file is a part of Gnuk, a GnuPG USB Token implementation. + * + * Gnuk is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Gnuk is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + * License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + */ + +#include <stdint.h> +#include <string.h> +#include "bn.h" +#include "mod25638.h" +#include "mod.h" + +/* + * References: + * + * [1] D. J. Bernstein. Curve25519: new Diffie-Hellman speed records. + * Proceedings of PKC 2006, to appear. + * http://cr.yp.to/papers.html#curve25519. Date: 2006.02.09. + * + * [2] D. J. Bernstein. Can we avoid tests for zero in fast + * elliptic-curve arithmetic? + * http://cr.yp.to/papers.html#curvezero. Date: 2006.07.26. + * + */ + +/* + * IMPLEMENTATION NOTE + * + * (0) We assume that the processor has no cache, nor branch target + * prediction. Thus, we don't avoid indexing by secret value. + * We don't avoid conditional jump if both cases have same timing, + * either. + * + * (1) We use Radix-32 field arithmetic. It's a representation like + * 2^256-38, but it's more redundant. For example, "1" can be + * represented in three ways in 256-bit: 1, 2^255-18, and + * 2^256-37. + * + * (2) We use Montgomery double-and-add. 
+ * + */ + +/* + * + * 121665 = 0x1db41 + * 1 1101 1011 0100 0001 + */ +static void +mod25638_mul_121665 (bn256 *x, const bn256 *a) +{ + uint32_t c; + bn256 m[1]; + + c = 0; + memcpy (x, a, sizeof (bn256)); /* X = A */ + c += bn256_shift (m, a, 6); c += bn256_add (x, x, m); /* X += A << 6 */ + c += bn256_shift (m, a, 8); c += bn256_add (x, x, m); /* X += A << 8 */ + c += bn256_shift (m, a, 9); c += bn256_add (x, x, m); /* X += A << 9 */ + c += bn256_shift (m, a, 11); c += bn256_add (x, x, m); /* X += A << 11 */ + c += bn256_shift (m, a, 12); c += bn256_add (x, x, m); /* X += A << 12 */ + c += bn256_shift (m, a, 14); c += bn256_add (x, x, m); /* X += A << 14 */ + c += bn256_shift (m, a, 15); c += bn256_add (x, x, m); /* X += A << 15 */ + c += bn256_shift (m, a, 16); c += bn256_add (x, x, m); /* X += A << 16 */ + + c *= 38; + c = bn256_add_uint (x, x, c); + x->word[0] += c * 38; +} + + +typedef struct +{ + bn256 x[1]; + bn256 z[1]; +} pt; + + +/** + * @brief Process Montgomery double-and-add + * + * With Q0, Q1, DIF (= Q0 - Q1), compute PRD = 2Q0, SUM = Q0 + Q1 + * Q0 and Q1 are clobbered. 
+ * + */ +static void +mont_d_and_a (pt *prd, pt *sum, pt *q0, pt *q1, const bn256 *dif_x) +{ + mod25638_add (sum->x, q1->x, q1->z); + mod25638_sub (q1->z, q1->x, q1->z); + mod25638_add (prd->x, q0->x, q0->z); + mod25638_sub (q0->z, q0->x, q0->z); + mod25638_mul (q1->x, q0->z, sum->x); + mod25638_mul (q1->z, prd->x, q1->z); + mod25638_sqr (q0->x, prd->x); + mod25638_sqr (q0->z, q0->z); + mod25638_add (sum->x, q1->x, q1->z); + mod25638_sub (q1->z, q1->x, q1->z); + mod25638_mul (prd->x, q0->x, q0->z); + mod25638_sub (q0->z, q0->x, q0->z); + mod25638_sqr (sum->x, sum->x); + mod25638_sqr (sum->z, q1->z); + mod25638_mul_121665 (prd->z, q0->z); + mod25638_mul (sum->z, sum->z, dif_x); + mod25638_add (prd->z, q0->x, prd->z); + mod25638_mul (prd->z, prd->z, q0->z); +} + + +/** + * @brief RES = x-coordinate of [n]Q + * + * @param N Scalar N (three least significant bits are 000) + * @param Q_X x-coordinate of Q + * + */ +void +compute_nQ (bn256 *res, const bn256 *n, const bn256 *q_x) +{ + int i, j; + pt p0[1], p1[1], p0_[1], p1_[1]; + + /* P0 = O = (1:0) */ + memset (p0->x, 0, sizeof (bn256)); + p0->x->word[0] = 1; + memset (p0->z, 0, sizeof (bn256)); + + /* P1 = (X:1) */ + memcpy (p1->x, q_x, sizeof (bn256)); + memset (p1->z, 0, sizeof (bn256)); + p1->z->word[0] = 1; + + for (i = 0; i < 8; i++) + { + uint32_t u = n->word[7-i]; + + for (j = 0; j < 16; j++) + { + pt *q0, *q1; + pt *sum_n, *prd_n; + + if ((u & 0x80000000)) + q0 = p1, q1 = p0, sum_n = p0_, prd_n = p1_; + else + q0 = p0, q1 = p1, sum_n = p1_, prd_n = p0_; + mont_d_and_a (prd_n, sum_n, q0, q1, q_x); + + if ((u & 0x40000000)) + q0 = p1_, q1 = p0_, sum_n = p0, prd_n = p1; + else + q0 = p0_, q1 = p1_, sum_n = p1, prd_n = p0; + mont_d_and_a (prd_n, sum_n, q0, q1, q_x); + + u <<= 2; + } + } + + /* We know the LSB of N is always 0. Thus, result is always in P0. 
*/ + mod_inv (res, p0->z, p25519); + mod25638_mul (res, res, p0->x); + mod25519_reduce (res); +} diff --git a/src/mod25638.c b/src/mod25638.c index dde9fa6..be4fb6e 100644 --- a/src/mod25638.c +++ b/src/mod25638.c @@ -26,10 +26,10 @@ * * We use radix-32. During computation, it's not reduced to 2^255-19, * but it is represented in 256-bit (it is redundant representation), - * that is, 2^256-38. + * that is, something like 2^256-38. * - * The idea is, to keep 2^256-38 until it will be converted to affine - * coordinates. + * The idea is, keeping within 256-bit until it will be converted to + * affine coordinates. */ #include @@ -79,6 +79,10 @@ const bn256 n25638[1] = { {{0xffffffda, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }} }; +const bn256 p25519[1] = { + {{ 0xffffffed, 0xffffffff, 0xffffffff, 0xffffffff, + 0xffffffff, 0xffffffff, 0xffffffff, 0x7fffffff }} }; + /* * Implementation Note. @@ -152,7 +156,6 @@ mod25638_reduce (bn256 *X, bn512 *A) { int i; uint64_t r; - uint32_t r0; uint32_t carry; r = 0; @@ -172,16 +175,8 @@ mod25638_reduce (bn256 *X, bn512 *A) } d[i] = (uint32_t)r; - r0 = A->word[8] * 38; - d = &A->word[0]; - for (i = 0; i < BN256_WORDS; i++) - { - r0 += d[i]; - carry = (r0 < d[i]); - d[i] = r0; - r0 = carry; - } - A->word[0] += r0 * 38; + carry = bn256_add_uint ((bn256 *)A, (bn256 *)A, A->word[8] * 38); + A->word[0] += carry * 38; } memcpy (X, A, sizeof (bn256)); @@ -237,3 +232,81 @@ mod25638_shift (bn256 *X, const bn256 *A, int shift) mod25638_add (X, X, tmp); } + +static void +add19 (bn256 *r, bn256 *x) +{ + uint32_t v; + int i; + + v = 19; + for (i = 0; i < BN256_WORDS; i++) + { + r->word[i] = x->word[i] + v; + v = (r->word[i] < v); + } +} + +/* + * @brief X = A mod 2^255-19 + * + * It's precisely modulo 2^255-19 (unlike mod25638_reduce). 
+ */ +void +mod25519_reduce (bn256 *X) +{ + uint32_t q; + bn256 r0[1], r1[1]; + int flag; + + memcpy (r0, X, sizeof (bn256)); + q = (r0->word[7] >> 31); + r0->word[7] &= 0x7fffffff; + if (q) + { + add19 (r0, r0); + q = (r0->word[7] >> 31); + r0->word[7] &= 0x7fffffff; + if (q) + { + add19 (r1, r0); + q = (r1->word[7] >> 31); + r1->word[7] &= 0x7fffffff; + flag = 0; + } + else + flag = 1; + } + else + { + add19 (r1, r0); /* dummy */ + q = (r1->word[7] >> 31); /* dummy */ + r1->word[7] &= 0x7fffffff; /* dummy */ + if (q) + flag = 2; + else + flag = 3; + } + + if (flag) + { + add19 (r1, r0); + q = (r1->word[7] >> 31); + r1->word[7] &= 0x7fffffff; + if (q) + memcpy (X, r1, sizeof (bn256)); + else + memcpy (X, r0, sizeof (bn256)); + } + else + { + if (q) + { + asm volatile ("" : : "r" (q) : "memory"); + memcpy (X, r1, sizeof (bn256)); + asm volatile ("" : : "r" (q) : "memory"); + } + else + memcpy (X, r1, sizeof (bn256)); + } +} diff --git a/src/mod25638.h b/src/mod25638.h index 4dd95c0..281f98a 100644 --- a/src/mod25638.h +++ b/src/mod25638.h @@ -1,7 +1,8 @@ extern const bn256 n25638[1]; +extern const bn256 p25519[1]; void mod25638_add (bn256 *X, const bn256 *A, const bn256 *B); void mod25638_sub (bn256 *X, const bn256 *A, const bn256 *B); void mod25638_mul (bn256 *X, const bn256 *A, const bn256 *B); void mod25638_sqr (bn256 *X, const bn256 *A); -void mod25638_shift (bn256 *X, const bn256 *A, int shift); +void mod25519_reduce (bn256 *X);