changeset 190:d8254fc979e9 libtommath-orig LTM_0.35

Initial import of libtommath 0.35
author Matt Johnston <matt@ucc.asn.au>
date Fri, 06 May 2005 08:59:30 +0000
parents d29b64170cf0
children c5c969ed76f3
files TODO bn.pdf bn.tex bn_fast_mp_invmod.c bn_fast_mp_montgomery_reduce.c bn_fast_s_mp_mul_digs.c bn_fast_s_mp_mul_high_digs.c bn_fast_s_mp_sqr.c bn_mp_div.c bn_mp_dr_reduce.c bn_mp_exptmod.c bn_mp_exptmod_fast.c bn_mp_exteuclid.c bn_mp_gcd.c bn_mp_invmod_slow.c bn_mp_jacobi.c bn_mp_lcm.c bn_mp_mod_2d.c bn_mp_montgomery_calc_normalization.c bn_mp_mul_d.c bn_mp_n_root.c bn_mp_neg.c bn_mp_prime_fermat.c bn_mp_prime_is_divisible.c bn_mp_prime_is_prime.c bn_mp_prime_miller_rabin.c bn_mp_prime_next_prime.c bn_mp_prime_random_ex.c bn_mp_radix_size.c bn_mp_rand.c bn_mp_read_radix.c bn_mp_reduce.c bn_mp_reduce_2k.c bn_mp_reduce_2k_l.c bn_mp_reduce_2k_setup.c bn_mp_reduce_2k_setup_l.c bn_mp_reduce_is_2k.c bn_mp_reduce_is_2k_l.c bn_mp_to_signed_bin.c bn_mp_to_signed_bin_n.c bn_mp_to_unsigned_bin.c bn_mp_to_unsigned_bin_n.c bn_mp_toom_mul.c bn_mp_unsigned_bin_size.c bn_mp_xor.c bn_mp_zero.c bn_prime_tab.c bn_s_mp_exptmod.c bn_s_mp_mul_digs.c bn_s_mp_sqr.c bncore.c callgraph.txt changes.txt demo/demo.c demo/timing.c dep.pl etc/mersenne.c etc/pprime.c etc/tune.c logs/add.log logs/expt.log logs/expt_2k.log logs/expt_2kl.log logs/expt_dr.log logs/mult.log logs/mult_kara.log logs/sqr.log logs/sqr_kara.log logs/sub.log makefile makefile.bcc makefile.cygwin_dll makefile.icc makefile.msvc makefile.shared mtest/mtest.c poster.pdf pre_gen/mpi.c tombc/grammar.txt tommath.h tommath.pdf tommath.src tommath.tex tommath_class.h
diffstat 84 files changed, 8248 insertions(+), 5516 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TODO	Fri May 06 08:59:30 2005 +0000
@@ -0,0 +1,16 @@
+things for book in order of importance...
+
+- Fix up pseudo-code [only] for combas that are not consistent with source
+- Start in chapter 3 [basics] and work up...
+   - re-write to prose [less abrupt]
+   - clean up pseudo code [spacing]
+   - more examples where appropriate and figures
+
+Goal:
+   - Get sync done by mid January [roughly 8-12 hours work]
+   - Finish ch3-6 by end of January [roughly 12-16 hours of work]
+   - Finish ch7-end by mid Feb [roughly 20-24 hours of work].
+
+Goal isn't "first edition" but merely cleaner to read.
+
+
Binary file bn.pdf has changed
--- a/bn.tex	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn.tex	Fri May 06 08:59:30 2005 +0000
@@ -49,7 +49,7 @@
 \begin{document}
 \frontmatter
 \pagestyle{empty}
-\title{LibTomMath User Manual \\ v0.32}
+\title{LibTomMath User Manual \\ v0.35}
 \author{Tom St Denis \\ [email protected]}
 \maketitle
 This text, the library and the accompanying textbook are all hereby placed in the public domain.  This book has been 
@@ -263,12 +263,12 @@
 \begin{center}
 \begin{tabular}{|l|c|c|l|}
 \hline \textbf{Criteria} & \textbf{Pro} & \textbf{Con} & \textbf{Notes} \\
-\hline Few lines of code per file & X & & GnuPG $ = 300.9$, LibTomMath  $ = 76.04$ \\
+\hline Few lines of code per file & X & & GnuPG $ = 300.9$, LibTomMath  $ = 71.97$ \\
 \hline Commented function prototypes & X && GnuPG function names are cryptic. \\
 \hline Speed && X & LibTomMath is slower.  \\
 \hline Totally free & X & & GPL has unfavourable restrictions.\\
 \hline Large function base & X & & GnuPG is barebones. \\
-\hline Four modular reduction algorithms & X & & Faster modular exponentiation. \\
+\hline Five modular reduction algorithms & X & & Faster modular exponentiation for a variety of moduli. \\
 \hline Portable & X & & GnuPG requires configuration to build. \\
 \hline
 \end{tabular}
@@ -284,9 +284,12 @@
 So it may feel tempting to just rip the math code out of GnuPG (or GnuMP where it was taken from originally) in your
 own application but I think there are reasons not to.  While LibTomMath is slower than libraries such as GnuMP it is
 not normally significantly slower.  On x86 machines the difference is normally a factor of two when performing modular
-exponentiations.
+exponentiations.  It depends largely on the processor, compiler and the moduli being used.
 
-Essentially the only time you wouldn't use LibTomMath is when blazing speed is the primary concern.
+Essentially the only time you wouldn't use LibTomMath is when blazing speed is the primary concern.  However,
+on the other side of the coin LibTomMath offers you a totally free (public domain) well structured math library
+that is very flexible, complete and performs well in resource contrained environments.  Fast RSA for example can
+be performed with as little as 8KB of ram for data (again depending on build options).  
 
 \chapter{Getting Started with LibTomMath}
 \section{Building Programs}
@@ -809,7 +812,7 @@
 
 \index{mp\_cmp\_mag}
 \begin{alltt}
-int mp_cmp(mp_int * a, mp_int * b);
+int mp_cmp_mag(mp_int * a, mp_int * b);
 \end{alltt}
 This will compare $a$ to $b$ placing $a$ to the left of $b$.  This function cannot fail and will return one of the
 three compare codes listed in figure \ref{fig:CMP}.
@@ -1220,12 +1223,13 @@
 \end{alltt}
 
 Will square $a$ and store it in $b$.  Like the case of multiplication there are four different squaring
-algorithms all which can be called from mp\_sqr().  It is ideal to use mp\_sqr over mp\_mul when squaring terms.
+algorithms all which can be called from mp\_sqr().  It is ideal to use mp\_sqr over mp\_mul when squaring terms because
+of the speed difference.  
 
 \section{Tuning Polynomial Basis Routines}
 
 Both of the Toom-Cook and Karatsuba multiplication algorithms are faster than the traditional $O(n^2)$ approach that
-the Comba and baseline algorithms use.  At $O(n^{1.464973})$ and $O(n^{1.584962})$ running times respectfully they require 
+the Comba and baseline algorithms use.  At $O(n^{1.464973})$ and $O(n^{1.584962})$ running times respectively they require 
 considerably less work.  For example, a 10000-digit multiplication would take roughly 724,000 single precision
 multiplications with Toom-Cook or 100,000,000 single precision multiplications with the standard Comba (a factor
 of 138).
@@ -1297,14 +1301,14 @@
 \section{Barrett Reduction}
 
 Barrett reduction is a generic optimized reduction algorithm that requires pre--computation to achieve
-a decent speedup over straight division.  First a $mu$ value must be precomputed with the following function.
+a decent speedup over straight division.  First a $\mu$ value must be precomputed with the following function.
 
 \index{mp\_reduce\_setup}
 \begin{alltt}
 int mp_reduce_setup(mp_int *a, mp_int *b);
 \end{alltt}
 
-Given a modulus in $b$ this produces the required $mu$ value in $a$.  For any given modulus this only has to
+Given a modulus in $b$ this produces the required $\mu$ value in $a$.  For any given modulus this only has to
 be computed once.  Modular reduction can now be performed with the following.
 
 \index{mp\_reduce}
@@ -1312,7 +1316,7 @@
 int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
 \end{alltt}
 
-This will reduce $a$ in place modulo $b$ with the precomputed $mu$ value in $c$.  $a$ must be in the range
+This will reduce $a$ in place modulo $b$ with the precomputed $\mu$ value in $c$.  $a$ must be in the range
 $0 \le a < b^2$.
 
 \begin{alltt}
@@ -1578,7 +1582,8 @@
 This algorithm uses the ``Newton Approximation'' method and will converge on the correct root fairly quickly.  Since
 the algorithm requires raising $a$ to the power of $b$ it is not ideal to attempt to find roots for large
 values of $b$.  If particularly large roots are required then a factor method could be used instead.  For example,
-$a^{1/16}$ is equivalent to $\left (a^{1/4} \right)^{1/4}$.
+$a^{1/16}$ is equivalent to $\left (a^{1/4} \right)^{1/4}$ or simply 
+$\left ( \left ( \left ( a^{1/2} \right )^{1/2} \right )^{1/2} \right )^{1/2}$
 
 \chapter{Prime Numbers}
 \section{Trial Division}
--- a/bn_fast_mp_invmod.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_fast_mp_invmod.c	Fri May 06 08:59:30 2005 +0000
@@ -21,8 +21,7 @@
  * Based on slow invmod except this is optimized for the case where b is 
  * odd as per HAC Note 14.64 on pp. 610
  */
-int
-fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
+int fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
 {
   mp_int  x, y, u, v, B, D;
   int     res, neg;
@@ -39,20 +38,20 @@
 
   /* x == modulus, y == value to invert */
   if ((res = mp_copy (b, &x)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
   }
 
   /* we need y = |a| */
-  if ((res = mp_abs (a, &y)) != MP_OKAY) {
-    goto __ERR;
+  if ((res = mp_mod (a, b, &y)) != MP_OKAY) {
+    goto LBL_ERR;
   }
 
   /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
   if ((res = mp_copy (&x, &u)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
   }
   if ((res = mp_copy (&y, &v)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
   }
   mp_set (&D, 1);
 
@@ -61,17 +60,17 @@
   while (mp_iseven (&u) == 1) {
     /* 4.1 u = u/2 */
     if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
     /* 4.2 if B is odd then */
     if (mp_isodd (&B) == 1) {
       if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
-        goto __ERR;
+        goto LBL_ERR;
       }
     }
     /* B = B/2 */
     if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
 
@@ -79,18 +78,18 @@
   while (mp_iseven (&v) == 1) {
     /* 5.1 v = v/2 */
     if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
     /* 5.2 if D is odd then */
     if (mp_isodd (&D) == 1) {
       /* D = (D-x)/2 */
       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
-        goto __ERR;
+        goto LBL_ERR;
       }
     }
     /* D = D/2 */
     if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
 
@@ -98,20 +97,20 @@
   if (mp_cmp (&u, &v) != MP_LT) {
     /* u = u - v, B = B - D */
     if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
 
     if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   } else {
     /* v - v - u, D = D - B */
     if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
 
     if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
 
@@ -125,21 +124,21 @@
   /* if v != 1 then there is no inverse */
   if (mp_cmp_d (&v, 1) != MP_EQ) {
     res = MP_VAL;
-    goto __ERR;
+    goto LBL_ERR;
   }
 
   /* b is now the inverse */
   neg = a->sign;
   while (D.sign == MP_NEG) {
     if ((res = mp_add (&D, b, &D)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
   mp_exch (&D, c);
   c->sign = neg;
   res = MP_OKAY;
 
-__ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
+LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
   return res;
 }
 #endif
--- a/bn_fast_mp_montgomery_reduce.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_fast_mp_montgomery_reduce.c	Fri May 06 08:59:30 2005 +0000
@@ -23,8 +23,7 @@
  *
  * Based on Algorithm 14.32 on pp.601 of HAC.
 */
-int
-fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
+int fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
 {
   int     ix, res, olduse;
   mp_word W[MP_WARRAY];
--- a/bn_fast_s_mp_mul_digs.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_fast_s_mp_mul_digs.c	Fri May 06 08:59:30 2005 +0000
@@ -31,8 +31,7 @@
  * Based on Algorithm 14.12 on pp.595 of HAC.
  *
  */
-int
-fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
   int     olduse, res, pa, ix, iz;
   mp_digit W[MP_WARRAY];
@@ -50,7 +49,7 @@
 
   /* clear the carry */
   _W = 0;
-  for (ix = 0; ix <= pa; ix++) { 
+  for (ix = 0; ix < pa; ix++) { 
       int      tx, ty;
       int      iy;
       mp_digit *tmpx, *tmpy;
@@ -63,7 +62,7 @@
       tmpx = a->dp + tx;
       tmpy = b->dp + ty;
 
-      /* this is the number of times the loop will iterrate, essentially its 
+      /* this is the number of times the loop will iterrate, essentially 
          while (tx++ < a->used && ty-- >= 0) { ... }
        */
       iy = MIN(a->used-tx, ty+1);
@@ -80,14 +79,17 @@
       _W = _W >> ((mp_word)DIGIT_BIT);
   }
 
+  /* store final carry */
+  W[ix] = (mp_digit)(_W & MP_MASK);
+
   /* setup dest */
   olduse  = c->used;
-  c->used = digs;
+  c->used = pa;
 
   {
     register mp_digit *tmpc;
     tmpc = c->dp;
-    for (ix = 0; ix < digs; ix++) {
+    for (ix = 0; ix < pa+1; ix++) {
       /* now extract the previous digit [below the carry] */
       *tmpc++ = W[ix];
     }
--- a/bn_fast_s_mp_mul_high_digs.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_fast_s_mp_mul_high_digs.c	Fri May 06 08:59:30 2005 +0000
@@ -24,8 +24,7 @@
  *
  * Based on Algorithm 14.12 on pp.595 of HAC.
  */
-int
-fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+int fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
   int     olduse, res, pa, ix, iz;
   mp_digit W[MP_WARRAY];
@@ -42,7 +41,7 @@
   /* number of output digits to produce */
   pa = a->used + b->used;
   _W = 0;
-  for (ix = digs; ix <= pa; ix++) { 
+  for (ix = digs; ix < pa; ix++) { 
       int      tx, ty, iy;
       mp_digit *tmpx, *tmpy;
 
@@ -70,6 +69,9 @@
       /* make next carry */
       _W = _W >> ((mp_word)DIGIT_BIT);
   }
+  
+  /* store final carry */
+  W[ix] = (mp_digit)(_W & MP_MASK);
 
   /* setup dest */
   olduse  = c->used;
--- a/bn_fast_s_mp_sqr.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_fast_s_mp_sqr.c	Fri May 06 08:59:30 2005 +0000
@@ -15,33 +15,14 @@
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
 
-/* fast squaring
- *
- * This is the comba method where the columns of the product
- * are computed first then the carries are computed.  This
- * has the effect of making a very simple inner loop that
- * is executed the most
- *
- * W2 represents the outer products and W the inner.
- *
- * A further optimizations is made because the inner
- * products are of the form "A * B * 2".  The *2 part does
- * not need to be computed until the end which is good
- * because 64-bit shifts are slow!
- *
- * Based on Algorithm 14.16 on pp.597 of HAC.
- *
- */
 /* the jist of squaring...
-
-you do like mult except the offset of the tmpx [one that starts closer to zero]
-can't equal the offset of tmpy.  So basically you set up iy like before then you min it with
-(ty-tx) so that it never happens.  You double all those you add in the inner loop
+ * you do like mult except the offset of the tmpx [one that 
+ * starts closer to zero] can't equal the offset of tmpy.  
+ * So basically you set up iy like before then you min it with
+ * (ty-tx) so that it never happens.  You double all those 
+ * you add in the inner loop
 
 After that loop you do the squares and add them in.
-
-Remove W2 and don't memset W
-
 */
 
 int fast_s_mp_sqr (mp_int * a, mp_int * b)
@@ -60,7 +41,7 @@
 
   /* number of output digits to produce */
   W1 = 0;
-  for (ix = 0; ix <= pa; ix++) { 
+  for (ix = 0; ix < pa; ix++) { 
       int      tx, ty, iy;
       mp_word  _W;
       mp_digit *tmpy;
@@ -76,7 +57,7 @@
       tmpx = a->dp + tx;
       tmpy = a->dp + ty;
 
-      /* this is the number of times the loop will iterrate, essentially its 
+      /* this is the number of times the loop will iterrate, essentially
          while (tx++ < a->used && ty-- >= 0) { ... }
        */
       iy = MIN(a->used-tx, ty+1);
@@ -101,7 +82,7 @@
       }
 
       /* store it */
-      W[ix] = _W;
+      W[ix] = (mp_digit)(_W & MP_MASK);
 
       /* make next carry */
       W1 = _W >> ((mp_word)DIGIT_BIT);
--- a/bn_mp_div.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_div.c	Fri May 06 08:59:30 2005 +0000
@@ -49,23 +49,23 @@
 
   mp_set(&tq, 1);
   n = mp_count_bits(a) - mp_count_bits(b);
-  if (((res = mp_copy(a, &ta)) != MP_OKAY) ||
-      ((res = mp_copy(b, &tb)) != MP_OKAY) || 
+  if (((res = mp_abs(a, &ta)) != MP_OKAY) ||
+      ((res = mp_abs(b, &tb)) != MP_OKAY) || 
       ((res = mp_mul_2d(&tb, n, &tb)) != MP_OKAY) ||
       ((res = mp_mul_2d(&tq, n, &tq)) != MP_OKAY)) {
-      goto __ERR;
+      goto LBL_ERR;
   }
 
   while (n-- >= 0) {
      if (mp_cmp(&tb, &ta) != MP_GT) {
         if (((res = mp_sub(&ta, &tb, &ta)) != MP_OKAY) ||
             ((res = mp_add(&q, &tq, &q)) != MP_OKAY)) {
-           goto __ERR;
+           goto LBL_ERR;
         }
      }
      if (((res = mp_div_2d(&tb, 1, &tb, NULL)) != MP_OKAY) ||
          ((res = mp_div_2d(&tq, 1, &tq, NULL)) != MP_OKAY)) {
-           goto __ERR;
+           goto LBL_ERR;
      }
   }
 
@@ -74,13 +74,13 @@
   n2 = (a->sign == b->sign ? MP_ZPOS : MP_NEG);
   if (c != NULL) {
      mp_exch(c, &q);
-     c->sign  = n2;
+     c->sign  = (mp_iszero(c) == MP_YES) ? MP_ZPOS : n2;
   }
   if (d != NULL) {
      mp_exch(d, &ta);
-     d->sign = n;
+     d->sign = (mp_iszero(d) == MP_YES) ? MP_ZPOS : n;
   }
-__ERR:
+LBL_ERR:
    mp_clear_multi(&ta, &tb, &tq, &q, NULL);
    return res;
 }
@@ -129,19 +129,19 @@
   q.used = a->used + 2;
 
   if ((res = mp_init (&t1)) != MP_OKAY) {
-    goto __Q;
+    goto LBL_Q;
   }
 
   if ((res = mp_init (&t2)) != MP_OKAY) {
-    goto __T1;
+    goto LBL_T1;
   }
 
   if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
-    goto __T2;
+    goto LBL_T2;
   }
 
   if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
-    goto __X;
+    goto LBL_X;
   }
 
   /* fix the sign */
@@ -153,10 +153,10 @@
   if (norm < (int)(DIGIT_BIT-1)) {
      norm = (DIGIT_BIT-1) - norm;
      if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
-       goto __Y;
+       goto LBL_Y;
      }
      if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
-       goto __Y;
+       goto LBL_Y;
      }
   } else {
      norm = 0;
@@ -168,13 +168,13 @@
 
   /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
   if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b**{n-t} */
-    goto __Y;
+    goto LBL_Y;
   }
 
   while (mp_cmp (&x, &y) != MP_LT) {
     ++(q.dp[n - t]);
     if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
-      goto __Y;
+      goto LBL_Y;
     }
   }
 
@@ -216,7 +216,7 @@
       t1.dp[1] = y.dp[t];
       t1.used = 2;
       if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
-        goto __Y;
+        goto LBL_Y;
       }
 
       /* find right hand */
@@ -228,27 +228,27 @@
 
     /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
     if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
-      goto __Y;
+      goto LBL_Y;
     }
 
     if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
-      goto __Y;
+      goto LBL_Y;
     }
 
     if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
-      goto __Y;
+      goto LBL_Y;
     }
 
     /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
     if (x.sign == MP_NEG) {
       if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
-        goto __Y;
+        goto LBL_Y;
       }
       if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
-        goto __Y;
+        goto LBL_Y;
       }
       if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
-        goto __Y;
+        goto LBL_Y;
       }
 
       q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
@@ -275,11 +275,11 @@
 
   res = MP_OKAY;
 
-__Y:mp_clear (&y);
-__X:mp_clear (&x);
-__T2:mp_clear (&t2);
-__T1:mp_clear (&t1);
-__Q:mp_clear (&q);
+LBL_Y:mp_clear (&y);
+LBL_X:mp_clear (&x);
+LBL_T2:mp_clear (&t2);
+LBL_T1:mp_clear (&t1);
+LBL_Q:mp_clear (&q);
   return res;
 }
 
--- a/bn_mp_dr_reduce.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_dr_reduce.c	Fri May 06 08:59:30 2005 +0000
@@ -20,7 +20,7 @@
  * Based on algorithm from the paper
  *
  * "Generating Efficient Primes for Discrete Log Cryptosystems"
- *                 Chae Hoon Lim, Pil Loong Lee,
+ *                 Chae Hoon Lim, Pil Joong Lee,
  *          POSTECH Information Research Laboratories
  *
  * The modulus must be of a special format [see manual]
--- a/bn_mp_exptmod.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_exptmod.c	Fri May 06 08:59:30 2005 +0000
@@ -61,25 +61,33 @@
      return err;
 #else 
      /* no invmod */
-     return MP_VAL
+     return MP_VAL;
 #endif
   }
 
+/* modified diminished radix reduction */
+#if defined(BN_MP_REDUCE_IS_2K_L_C) && defined(BN_MP_REDUCE_2K_L_C)
+  if (mp_reduce_is_2k_l(P) == MP_YES) {
+     return s_mp_exptmod(G, X, P, Y, 1);
+  }
+#endif
+
 #ifdef BN_MP_DR_IS_MODULUS_C
   /* is it a DR modulus? */
   dr = mp_dr_is_modulus(P);
 #else
+  /* default to no */
   dr = 0;
 #endif
 
 #ifdef BN_MP_REDUCE_IS_2K_C
-  /* if not, is it a uDR modulus? */
+  /* if not, is it a unrestricted DR modulus? */
   if (dr == 0) {
      dr = mp_reduce_is_2k(P) << 1;
   }
 #endif
     
-  /* if the modulus is odd or dr != 0 use the fast method */
+  /* if the modulus is odd or dr != 0 use the montgomery method */
 #ifdef BN_MP_EXPTMOD_FAST_C
   if (mp_isodd (P) == 1 || dr !=  0) {
     return mp_exptmod_fast (G, X, P, Y, dr);
@@ -87,7 +95,7 @@
 #endif
 #ifdef BN_S_MP_EXPTMOD_C
     /* otherwise use the generic Barrett reduction technique */
-    return s_mp_exptmod (G, X, P, Y);
+    return s_mp_exptmod (G, X, P, Y, 0);
 #else
     /* no exptmod for evens */
     return MP_VAL;
--- a/bn_mp_exptmod_fast.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_exptmod_fast.c	Fri May 06 08:59:30 2005 +0000
@@ -29,8 +29,7 @@
    #define TAB_SIZE 256
 #endif
 
-int
-mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
+int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
 {
   mp_int  M[TAB_SIZE], res;
   mp_digit buf, mp;
@@ -88,11 +87,11 @@
 #ifdef BN_MP_MONTGOMERY_SETUP_C     
      /* now setup montgomery  */
      if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
-        goto __M;
+        goto LBL_M;
      }
 #else
      err = MP_VAL;
-     goto __M;
+     goto LBL_M;
 #endif
 
      /* automatically pick the comba one if available (saves quite a few calls/ifs) */
@@ -108,7 +107,7 @@
         redux = mp_montgomery_reduce;
 #else
         err = MP_VAL;
-        goto __M;
+        goto LBL_M;
 #endif
      }
   } else if (redmode == 1) {
@@ -118,24 +117,24 @@
      redux = mp_dr_reduce;
 #else
      err = MP_VAL;
-     goto __M;
+     goto LBL_M;
 #endif
   } else {
 #if defined(BN_MP_REDUCE_2K_SETUP_C) && defined(BN_MP_REDUCE_2K_C)
      /* setup DR reduction for moduli of the form 2**k - b */
      if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
-        goto __M;
+        goto LBL_M;
      }
      redux = mp_reduce_2k;
 #else
      err = MP_VAL;
-     goto __M;
+     goto LBL_M;
 #endif
   }
 
   /* setup result */
   if ((err = mp_init (&res)) != MP_OKAY) {
-    goto __M;
+    goto LBL_M;
   }
 
   /* create M table
@@ -149,45 +148,45 @@
 #ifdef BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
      /* now we need R mod m */
      if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
-       goto __RES;
+       goto LBL_RES;
      }
 #else 
      err = MP_VAL;
-     goto __RES;
+     goto LBL_RES;
 #endif
 
      /* now set M[1] to G * R mod m */
      if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
-       goto __RES;
+       goto LBL_RES;
      }
   } else {
      mp_set(&res, 1);
      if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
      }
   }
 
   /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
   if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
-    goto __RES;
+    goto LBL_RES;
   }
 
   for (x = 0; x < (winsize - 1); x++) {
     if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
-      goto __RES;
+      goto LBL_RES;
     }
     if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
-      goto __RES;
+      goto LBL_RES;
     }
   }
 
   /* create upper table */
   for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
     if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
-      goto __RES;
+      goto LBL_RES;
     }
     if ((err = redux (&M[x], P, mp)) != MP_OKAY) {
-      goto __RES;
+      goto LBL_RES;
     }
   }
 
@@ -227,10 +226,10 @@
     /* if the bit is zero and mode == 1 then we square */
     if (mode == 1 && y == 0) {
       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
       if ((err = redux (&res, P, mp)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
       continue;
     }
@@ -244,19 +243,19 @@
       /* square first */
       for (x = 0; x < winsize; x++) {
         if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-          goto __RES;
+          goto LBL_RES;
         }
         if ((err = redux (&res, P, mp)) != MP_OKAY) {
-          goto __RES;
+          goto LBL_RES;
         }
       }
 
       /* then multiply */
       if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
       if ((err = redux (&res, P, mp)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
 
       /* empty window and reset */
@@ -271,10 +270,10 @@
     /* square then multiply if the bit is set */
     for (x = 0; x < bitcpy; x++) {
       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
       if ((err = redux (&res, P, mp)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
 
       /* get next bit of the window */
@@ -282,10 +281,10 @@
       if ((bitbuf & (1 << winsize)) != 0) {
         /* then multiply */
         if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
-          goto __RES;
+          goto LBL_RES;
         }
         if ((err = redux (&res, P, mp)) != MP_OKAY) {
-          goto __RES;
+          goto LBL_RES;
         }
       }
     }
@@ -299,15 +298,15 @@
       * of R.
       */
      if ((err = redux(&res, P, mp)) != MP_OKAY) {
-       goto __RES;
+       goto LBL_RES;
      }
   }
 
   /* swap res with Y */
   mp_exch (&res, Y);
   err = MP_OKAY;
-__RES:mp_clear (&res);
-__M:
+LBL_RES:mp_clear (&res);
+LBL_M:
   mp_clear(&M[1]);
   for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
     mp_clear (&M[x]);
--- a/bn_mp_exteuclid.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_exteuclid.c	Fri May 06 08:59:30 2005 +0000
@@ -59,6 +59,13 @@
        if ((err = mp_copy(&t3, &v3)) != MP_OKAY)                                  { goto _ERR; }
    }
 
+   /* make sure U3 >= 0 */
+   if (u3.sign == MP_NEG) {
+      mp_neg(&u1, &u1);
+      mp_neg(&u2, &u2);
+      mp_neg(&u3, &u3);
+   }
+
    /* copy result out */
    if (U1 != NULL) { mp_exch(U1, &u1); }
    if (U2 != NULL) { mp_exch(U2, &u2); }
--- a/bn_mp_gcd.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_gcd.c	Fri May 06 08:59:30 2005 +0000
@@ -43,7 +43,7 @@
   }
 
   if ((res = mp_init_copy (&v, b)) != MP_OKAY) {
-    goto __U;
+    goto LBL_U;
   }
 
   /* must be positive for the remainder of the algorithm */
@@ -57,24 +57,24 @@
   if (k > 0) {
      /* divide the power of two out */
      if ((res = mp_div_2d(&u, k, &u, NULL)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
      }
 
      if ((res = mp_div_2d(&v, k, &v, NULL)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
      }
   }
 
   /* divide any remaining factors of two out */
   if (u_lsb != k) {
      if ((res = mp_div_2d(&u, u_lsb - k, &u, NULL)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
      }
   }
 
   if (v_lsb != k) {
      if ((res = mp_div_2d(&v, v_lsb - k, &v, NULL)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
      }
   }
 
@@ -87,23 +87,23 @@
      
      /* subtract smallest from largest */
      if ((res = s_mp_sub(&v, &u, &v)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
      }
      
      /* Divide out all factors of two */
      if ((res = mp_div_2d(&v, mp_cnt_lsb(&v), &v, NULL)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
      } 
   } 
 
   /* multiply by 2**k which we divided out at the beginning */
   if ((res = mp_mul_2d (&u, k, c)) != MP_OKAY) {
-     goto __V;
+     goto LBL_V;
   }
   c->sign = MP_ZPOS;
   res = MP_OKAY;
-__V:mp_clear (&u);
-__U:mp_clear (&v);
+LBL_V:mp_clear (&u);
+LBL_U:mp_clear (&v);
   return res;
 }
 #endif
--- a/bn_mp_invmod_slow.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_invmod_slow.c	Fri May 06 08:59:30 2005 +0000
@@ -33,25 +33,25 @@
   }
 
   /* x = a, y = b */
-  if ((res = mp_copy (a, &x)) != MP_OKAY) {
-    goto __ERR;
+  if ((res = mp_mod(a, b, &x)) != MP_OKAY) {
+      goto LBL_ERR;
   }
   if ((res = mp_copy (b, &y)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
   }
 
   /* 2. [modified] if x,y are both even then return an error! */
   if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
     res = MP_VAL;
-    goto __ERR;
+    goto LBL_ERR;
   }
 
   /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
   if ((res = mp_copy (&x, &u)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
   }
   if ((res = mp_copy (&y, &v)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
   }
   mp_set (&A, 1);
   mp_set (&D, 1);
@@ -61,24 +61,24 @@
   while (mp_iseven (&u) == 1) {
     /* 4.1 u = u/2 */
     if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
     /* 4.2 if A or B is odd then */
     if (mp_isodd (&A) == 1 || mp_isodd (&B) == 1) {
       /* A = (A+y)/2, B = (B-x)/2 */
       if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
       if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
     }
     /* A = A/2, B = B/2 */
     if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
     if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
 
@@ -86,24 +86,24 @@
   while (mp_iseven (&v) == 1) {
     /* 5.1 v = v/2 */
     if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
     /* 5.2 if C or D is odd then */
     if (mp_isodd (&C) == 1 || mp_isodd (&D) == 1) {
       /* C = (C+y)/2, D = (D-x)/2 */
       if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
     }
     /* C = C/2, D = D/2 */
     if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
     if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
 
@@ -111,28 +111,28 @@
   if (mp_cmp (&u, &v) != MP_LT) {
     /* u = u - v, A = A - C, B = B - D */
     if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
 
     if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
 
     if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   } else {
     /* v - v - u, C = C - A, D = D - B */
     if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
 
     if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
 
     if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
 
@@ -145,27 +145,27 @@
   /* if v != 1 then there is no inverse */
   if (mp_cmp_d (&v, 1) != MP_EQ) {
     res = MP_VAL;
-    goto __ERR;
+    goto LBL_ERR;
   }
 
   /* if its too low */
   while (mp_cmp_d(&C, 0) == MP_LT) {
       if ((res = mp_add(&C, b, &C)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
   }
   
   /* too big */
   while (mp_cmp_mag(&C, b) != MP_LT) {
       if ((res = mp_sub(&C, b, &C)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
   }
   
   /* C is now the inverse */
   mp_exch (&C, c);
   res = MP_OKAY;
-__ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
+LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
   return res;
 }
 #endif
--- a/bn_mp_jacobi.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_jacobi.c	Fri May 06 08:59:30 2005 +0000
@@ -50,13 +50,13 @@
   }
 
   if ((res = mp_init (&p1)) != MP_OKAY) {
-    goto __A1;
+    goto LBL_A1;
   }
 
   /* divide out larger power of two */
   k = mp_cnt_lsb(&a1);
   if ((res = mp_div_2d(&a1, k, &a1, NULL)) != MP_OKAY) {
-     goto __P1;
+     goto LBL_P1;
   }
 
   /* step 4.  if e is even set s=1 */