diff --git a/ChangeLog b/ChangeLog
index cca307a..b9efa95 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+2014-04-15  Niibe Yutaka  <gniibe@fsij.org>
+
+	* src/ecc-mont.c: New.
+
+	* src/mod25638.c (p25519): Move from ecc-edwards.c.
+	(mod25519_reduce, add19): Likewise.
+	(mod25638_reduce) [!ASM_IMPLEMENTATION]: Use bn256_add_uint.
+
 2014-04-14  Niibe Yutaka  <gniibe@fsij.org>
 
 	* src/jpc.c (jpc_to_ac): Error check before mod_inv.
diff --git a/src/ecc-edwards.c b/src/ecc-edwards.c
index 9345897..78a37e5 100644
--- a/src/ecc-edwards.c
+++ b/src/ecc-edwards.c
@@ -93,10 +93,6 @@
  * Gy: 0x6666666666666666666666666666666666666666666666666666666666666658
  */
 
-static const bn256 p25519[1] = {
-  {{ 0xffffffed, 0xffffffff, 0xffffffff, 0xffffffff,
-     0xffffffff, 0xffffffff, 0xffffffff, 0x7fffffff }} };
-
 /* d + 2^255 - 19 */
 static const bn256 coefficient_d[1] = {
   {{ 0x135978a3, 0x75eb4dca, 0x4141d8ab, 0x00700a4d,
@@ -240,84 +236,6 @@ point_add (ptc *X, const ptc *A, const ac *B)
 }
 
 
-static void
-add19 (bn256 *r, bn256 *x)
-{
-  uint32_t v;
-  int i;
-
-  v = 19;
-  for (i = 0; i < BN256_WORDS; i++)
-    {
-      r->word[i] = x->word[i] + v;
-      v = (r->word[i] < v);
-    }
-}
-
-/*
- * @brief  X = A mod 2^255-19
- *
- * It's precisely modulo 2^255-19 (unlike mod25638_reduce).
- */
-static void
-mod25519_reduce (bn256 *X)
-{
-  uint32_t q;
-  bn256 r0[1], r1[1];
-  int flag;
-
-  memcpy (r0, X, sizeof (bn256));
-  q = (r0->word[7] >> 31);
-  r0->word[7] &= 0x7fffffff;
-  if (q)
-    {
-      add19 (r0, r0);
-      q = (r0->word[7] >> 31);
-      r0->word[7] &= 0x7fffffff;
-      if (q)
-	{
-	  add19 (r1, r0);
-	  q = (r1->word[7] >> 31);
-	  r1->word[7] &= 0x7fffffff;
-	  flag = 0;
-	}
-      else
-	flag = 1;
-    }
-  else
-    {
-      add19 (r1, r0);		 /* dummy */
-      q = (r1->word[7] >> 31);	 /* dummy */
-      r1->word[7] &= 0x7fffffff; /* dummy */
-      if (q)
-	flag = 2;
-      else
-	flag = 3;
-    }
-
-  if (flag)
-    {
-      add19 (r1, r0);
-      q = (r1->word[7] >> 31);
-      r1->word[7] &= 0x7fffffff;
-      if (q)
-	memcpy (X, r1, sizeof (bn256));
-      else
-	memcpy (X, r0, sizeof (bn256));
-    }
-  else
-    {
-      if (q)
-	{
-	  asm volatile ("" : : "r" (q) : "memory");
-	  memcpy (X, r1, sizeof (bn256));
-	  asm volatile ("" : : "r" (q) : "memory");
-	}
-      else
-	memcpy (X, r1, sizeof (bn256));
-    }
-}
-
 /**
  * @brief	X = convert A
  *
diff --git a/src/ecc-mont.c b/src/ecc-mont.c
new file mode 100644
index 0000000..707521c
--- /dev/null
+++ b/src/ecc-mont.c
@@ -0,0 +1,179 @@
+/*                                                    -*- coding: utf-8 -*-
+ * ecc-mont.c - Elliptic curve computation for
+ *              the Montgomery curve: y^2 = x^3 + 486662*x^2 + x.
+ *
+ * Copyright (C) 2014 Free Software Initiative of Japan
+ * Author: NIIBE Yutaka <gniibe@fsij.org>
+ *
+ * This file is a part of Gnuk, a GnuPG USB Token implementation.
+ *
+ * Gnuk is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Gnuk is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include "bn.h"
+#include "mod25638.h"
+#include "mod.h"
+
+/*
+ * References:
+ *
+ * [1] D. J. Bernstein. Curve25519: new Diffie-Hellman speed records.
+ *     Proceedings of PKC 2006, to appear.
+ *     http://cr.yp.to/papers.html#curve25519. Date: 2006.02.09.
+ *
+ * [2] D. J. Bernstein. Can we avoid tests for zero in fast
+ *     elliptic-curve arithmetic?
+ *     http://cr.yp.to/papers.html#curvezero. Date: 2006.07.26.
+ *
+ */
+
+/*
+ * IMPLEMENTATION NOTE
+ *
+ * (0) We assume that the processor has neither a cache nor branch
+ *     target prediction.  Thus, we don't avoid indexing by a secret
+ *     value.  We don't avoid a conditional jump either, when both
+ *     cases have the same timing.
+ *
+ * (1) We use Radix-32 field arithmetic.  It's a representation like
+ *     2^256-38, but it's more redundant.  For example, "1" can be
+ *     represented in three ways in 256-bit: 1, 2^255-18, and
+ *     2^256-37.
+ *
+ * (2) We use Montgomery double-and-add.
+ *
+ */
+
+/*
+ * X = A * 121665, computed by shift-and-add over the set bits below.
+ * 121665 = 0x1db41
+ *            1 1101 1011 0100 0001
+ */
+static void
+mod25638_mul_121665 (bn256 *x, const bn256 *a)
+{
+  uint32_t c;
+  bn256 m[1];
+
+  c = 0;	/* sums the helpers' return values (assumed: carries) */
+  memcpy (x, a, sizeof (bn256));                           /* X = A        */
+  c += bn256_shift (m, a, 6);   c += bn256_add (x, x, m);  /* X += A << 6  */
+  c += bn256_shift (m, a, 8);   c += bn256_add (x, x, m);  /* X += A << 8  */
+  c += bn256_shift (m, a, 9);   c += bn256_add (x, x, m);  /* X += A << 9  */
+  c += bn256_shift (m, a, 11);  c += bn256_add (x, x, m);  /* X += A << 11 */
+  c += bn256_shift (m, a, 12);  c += bn256_add (x, x, m);  /* X += A << 12 */
+  c += bn256_shift (m, a, 14);  c += bn256_add (x, x, m);  /* X += A << 14 */
+  c += bn256_shift (m, a, 15);  c += bn256_add (x, x, m);  /* X += A << 15 */
+  c += bn256_shift (m, a, 16);  c += bn256_add (x, x, m);  /* X += A << 16 */
+
+  c *= 38;			/* 2^256 = 38 (mod 2^255-19) */
+  c = bn256_add_uint (x, x, c);
+  x->word[0] += c * 38;		/* fold the returned value in the same way */
+}
+
+
+typedef struct
+{
+  bn256 x[1];			/* X of the (X:Z) pair */
+  bn256 z[1];			/* Z of the (X:Z) pair */
+} pt;
+
+
+/**
+ * @brief  One Montgomery double-and-add step: PRD = 2Q0, SUM = Q0 + Q1
+ *
+ * DIF_X multiplies SUM->z at the end; the caller passes the x of
+ * DIF (= Q0 - Q1).  Q0 and Q1 serve as scratch and are clobbered.
+ * Left-column statements write Q0/PRD, indented ones write Q1/SUM.
+ */
+static void
+mont_d_and_a (pt *prd, pt *sum, pt *q0, pt *q1, const bn256 *dif_x)
+{
+                                        mod25638_add (sum->x, q1->x, q1->z);
+                                        mod25638_sub (q1->z, q1->x, q1->z);
+  mod25638_add (prd->x, q0->x, q0->z);
+  mod25638_sub (q0->z, q0->x, q0->z);
+                                        mod25638_mul (q1->x, q0->z, sum->x);
+                                        mod25638_mul (q1->z, prd->x, q1->z);
+  mod25638_sqr (q0->x, prd->x);
+  mod25638_sqr (q0->z, q0->z);
+                                        mod25638_add (sum->x, q1->x, q1->z);
+                                        mod25638_sub (q1->z, q1->x, q1->z);
+  mod25638_mul (prd->x, q0->x, q0->z);
+  mod25638_sub (q0->z, q0->x, q0->z);
+                                        mod25638_sqr (sum->x, sum->x);
+                                        mod25638_sqr (sum->z, q1->z);
+  mod25638_mul_121665 (prd->z, q0->z);
+                                        mod25638_mul (sum->z, sum->z, dif_x);
+  mod25638_add (prd->z, q0->x, prd->z);
+  mod25638_mul (prd->z, prd->z, q0->z);
+}
+
+
+/**
+ * @brief	RES  = x-coordinate of [n]Q
+ *
+ * @param RES	Output; passed through mod25519_reduce before returning
+ * @param N	Scalar N (three least significant bits are 000)
+ * @param Q_X	x-coordinate of Q
+ */
+void
+compute_nQ (bn256 *res, const bn256 *n, const bn256 *q_x)
+{
+  int i, j;
+  pt p0[1], p1[1], p0_[1], p1_[1];
+
+  /* P0 = O = (1:0)  */
+  memset (p0->x, 0, sizeof (bn256));
+  p0->x->word[0] = 1;
+  memset (p0->z, 0, sizeof (bn256));
+
+  /* P1 = (X:1) */
+  memcpy (p1->x, q_x, sizeof (bn256));
+  memset (p1->z, 0, sizeof (bn256));
+  p1->z->word[0] = 1;
+
+  for (i = 0; i < 8; i++)	/* words of N, most significant first */
+    {
+      uint32_t u = n->word[7-i];
+
+      for (j = 0; j < 16; j++)	/* two bits of U per pass, top bit first */
+	{
+	  pt *q0, *q1;
+	  pt *sum_n, *prd_n;
+
+	  if ((u & 0x80000000))	/* step from (P0,P1) into (P0_,P1_) */
+	    q0 = p1,  q1 = p0,  sum_n = p0_, prd_n = p1_;
+	  else
+	    q0 = p0,  q1 = p1,  sum_n = p1_, prd_n = p0_;
+	  mont_d_and_a (prd_n, sum_n, q0, q1, q_x);
+
+	  if ((u & 0x40000000))	/* step back from (P0_,P1_) into (P0,P1) */
+	    q0 = p1_, q1 = p0_, sum_n = p0,  prd_n = p1;
+	  else
+	    q0 = p0_, q1 = p1_, sum_n = p1,  prd_n = p0;
+	  mont_d_and_a (prd_n, sum_n, q0, q1, q_x);
+
+	  u <<= 2;
+	}
+    }
+
+  /* We know the LSB of N is always 0.  Thus, result is always in P0.  */
+  mod_inv (res, p0->z, p25519);	/* presumably RES = 1/Z mod p25519 */
+  mod25638_mul (res, res, p0->x);
+  mod25519_reduce (res);
+}
diff --git a/src/mod25638.c b/src/mod25638.c
index dde9fa6..be4fb6e 100644
--- a/src/mod25638.c
+++ b/src/mod25638.c
@@ -26,10 +26,10 @@
  *
  * We use radix-32.  During computation, it's not reduced to 2^255-19,
  * but it is represented in 256-bit (it is redundant representation),
- * that is, 2^256-38.
+ * that is, something like 2^256-38.
  *
- * The idea is, to keep 2^256-38 until it will be converted to affine
- * coordinates.
+ * The idea is to keep values within 256 bits until they are converted
+ * to affine coordinates.
  */
 
 #include <stdint.h>
@@ -79,6 +79,10 @@ const bn256 n25638[1] = {
   {{0xffffffda, 0xffffffff, 0xffffffff, 0xffffffff,
     0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }} };
 
+const bn256 p25519[1] = {	/* 2^255 - 19, least significant word first */
+  {{ 0xffffffed, 0xffffffff, 0xffffffff, 0xffffffff,
+     0xffffffff, 0xffffffff, 0xffffffff, 0x7fffffff }} };
+
 
 /*
  * Implementation Note.
@@ -152,7 +156,6 @@ mod25638_reduce (bn256 *X, bn512 *A)
   {
     int i;
     uint64_t r;
-    uint32_t r0;
     uint32_t carry;
 
     r = 0;
@@ -172,16 +175,8 @@ mod25638_reduce (bn256 *X, bn512 *A)
       }
     d[i] = (uint32_t)r;
 
-    r0 = A->word[8] * 38;
-    d = &A->word[0];
-    for (i = 0; i < BN256_WORDS; i++)
-      {
-	r0 += d[i];
-	carry = (r0 < d[i]);
-	d[i] = r0;
-	r0 = carry;
-      }
-    A->word[0] += r0 * 38;
+    carry = bn256_add_uint ((bn256 *)A, (bn256 *)A, A->word[8] * 38);
+    A->word[0] += carry * 38;
   }
 
   memcpy (X, A, sizeof (bn256));
@@ -237,3 +232,81 @@ mod25638_shift (bn256 *X, const bn256 *A, int shift)
 
   mod25638_add (X, X, tmp);
 }
+
+static void
+add19 (bn256 *r, bn256 *x)	/* R = X + 19; the final carry is dropped */
+{
+  uint32_t v;
+  int i;
+
+  v = 19;			/* addend for word 0, then the carry */
+  for (i = 0; i < BN256_WORDS; i++)
+    {
+      r->word[i] = x->word[i] + v;
+      v = (r->word[i] < v);	/* 1 if the addition wrapped */
+    }
+}
+
+/*
+ * @brief  X = X mod 2^255-19, computed in place
+ * It's precisely modulo 2^255-19 (unlike mod25638_reduce).  Every path
+ * calls add19 twice (some calls are dummies) and does one memcpy to X.
+ */
+void
+mod25519_reduce (bn256 *X)
+{
+  uint32_t q;
+  bn256 r0[1], r1[1];
+  int flag;
+
+  memcpy (r0, X, sizeof (bn256));
+  q = (r0->word[7] >> 31);	/* bit 255 of the value */
+  r0->word[7] &= 0x7fffffff;
+  if (q)
+    {
+      add19 (r0, r0);
+      q = (r0->word[7] >> 31);
+      r0->word[7] &= 0x7fffffff;
+      if (q)
+	{
+	  add19 (r1, r0);
+	  q = (r1->word[7] >> 31);
+	  r1->word[7] &= 0x7fffffff;
+	  flag = 0;
+	}
+      else
+	flag = 1;
+    }
+  else
+    {
+      add19 (r1, r0);		 /* dummy */
+      q = (r1->word[7] >> 31);	 /* dummy */
+      r1->word[7] &= 0x7fffffff; /* dummy */
+      if (q)
+	flag = 2;
+      else
+	flag = 3;
+    }
+
+  if (flag)
+    {
+      add19 (r1, r0);
+      q = (r1->word[7] >> 31);
+      r1->word[7] &= 0x7fffffff;
+      if (q)
+	memcpy (X, r1, sizeof (bn256));
+      else
+	memcpy (X, r0, sizeof (bn256));
+    }
+  else
+    {
+      if (q)
+	{			/* empty asm = compiler barrier around the copy */
+	  asm volatile ("" : : "r" (q) : "memory");
+	  memcpy (X, r1, sizeof (bn256));
+	  asm volatile ("" : : "r" (q) : "memory");
+	}
+      else
+	memcpy (X, r1, sizeof (bn256));
+    }
+}
diff --git a/src/mod25638.h b/src/mod25638.h
index 4dd95c0..281f98a 100644
--- a/src/mod25638.h
+++ b/src/mod25638.h
@@ -1,7 +1,8 @@
 extern const bn256 n25638[1];
+extern const bn256 p25519[1];
 
 void mod25638_add (bn256 *X, const bn256 *A, const bn256 *B);
 void mod25638_sub (bn256 *X, const bn256 *A, const bn256 *B);
 void mod25638_mul (bn256 *X, const bn256 *A, const bn256 *B);
 void mod25638_sqr (bn256 *X, const bn256 *A);
-void mod25638_shift (bn256 *X, const bn256 *A, int shift);
+void mod25519_reduce (bn256 *X);