diff pre_gen/mpi.c @ 190:d8254fc979e9 libtommath-orig LTM_0.35

Initial import of libtommath 0.35
author Matt Johnston <matt@ucc.asn.au>
date Fri, 06 May 2005 08:59:30 +0000
parents d29b64170cf0
children
line wrap: on
line diff
--- a/pre_gen/mpi.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/pre_gen/mpi.c	Fri May 06 08:59:30 2005 +0000
@@ -69,8 +69,7 @@
  * Based on slow invmod except this is optimized for the case where b is 
  * odd as per HAC Note 14.64 on pp. 610
  */
-int
-fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
+int fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
 {
   mp_int  x, y, u, v, B, D;
   int     res, neg;
@@ -87,20 +86,20 @@
 
   /* x == modulus, y == value to invert */
   if ((res = mp_copy (b, &x)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
   }
 
   /* we need y = |a| */
-  if ((res = mp_abs (a, &y)) != MP_OKAY) {
-    goto __ERR;
+  if ((res = mp_mod (a, b, &y)) != MP_OKAY) {
+    goto LBL_ERR;
   }
 
   /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
   if ((res = mp_copy (&x, &u)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
   }
   if ((res = mp_copy (&y, &v)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
   }
   mp_set (&D, 1);
 
@@ -109,17 +108,17 @@
   while (mp_iseven (&u) == 1) {
     /* 4.1 u = u/2 */
     if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
     /* 4.2 if B is odd then */
     if (mp_isodd (&B) == 1) {
       if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
-        goto __ERR;
+        goto LBL_ERR;
       }
     }
     /* B = B/2 */
     if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
 
@@ -127,18 +126,18 @@
   while (mp_iseven (&v) == 1) {
     /* 5.1 v = v/2 */
     if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
     /* 5.2 if D is odd then */
     if (mp_isodd (&D) == 1) {
       /* D = (D-x)/2 */
       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
-        goto __ERR;
+        goto LBL_ERR;
       }
     }
     /* D = D/2 */
     if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
 
@@ -146,20 +145,20 @@
   if (mp_cmp (&u, &v) != MP_LT) {
     /* u = u - v, B = B - D */
     if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
 
     if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   } else {
     /* v - v - u, D = D - B */
     if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
 
     if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
 
@@ -173,21 +172,21 @@
   /* if v != 1 then there is no inverse */
   if (mp_cmp_d (&v, 1) != MP_EQ) {
     res = MP_VAL;
-    goto __ERR;
+    goto LBL_ERR;
   }
 
   /* b is now the inverse */
   neg = a->sign;
   while (D.sign == MP_NEG) {
     if ((res = mp_add (&D, b, &D)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
   mp_exch (&D, c);
   c->sign = neg;
   res = MP_OKAY;
 
-__ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
+LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
   return res;
 }
 #endif
@@ -220,8 +219,7 @@
  *
  * Based on Algorithm 14.32 on pp.601 of HAC.
 */
-int
-fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
+int fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
 {
   int     ix, res, olduse;
   mp_word W[MP_WARRAY];
@@ -401,8 +399,7 @@
  * Based on Algorithm 14.12 on pp.595 of HAC.
  *
  */
-int
-fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
   int     olduse, res, pa, ix, iz;
   mp_digit W[MP_WARRAY];
@@ -420,7 +417,7 @@
 
   /* clear the carry */
   _W = 0;
-  for (ix = 0; ix <= pa; ix++) { 
+  for (ix = 0; ix < pa; ix++) { 
       int      tx, ty;
       int      iy;
       mp_digit *tmpx, *tmpy;
@@ -433,7 +430,7 @@
       tmpx = a->dp + tx;
       tmpy = b->dp + ty;
 
-      /* this is the number of times the loop will iterrate, essentially its 
+      /* this is the number of times the loop will iterrate, essentially 
          while (tx++ < a->used && ty-- >= 0) { ... }
        */
       iy = MIN(a->used-tx, ty+1);
@@ -450,14 +447,17 @@
       _W = _W >> ((mp_word)DIGIT_BIT);
   }
 
+  /* store final carry */
+  W[ix] = (mp_digit)(_W & MP_MASK);
+
   /* setup dest */
   olduse  = c->used;
-  c->used = digs;
+  c->used = pa;
 
   {
     register mp_digit *tmpc;
     tmpc = c->dp;
-    for (ix = 0; ix < digs; ix++) {
+    for (ix = 0; ix < pa+1; ix++) {
       /* now extract the previous digit [below the carry] */
       *tmpc++ = W[ix];
     }
@@ -501,8 +501,7 @@
  *
  * Based on Algorithm 14.12 on pp.595 of HAC.
  */
-int
-fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+int fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
   int     olduse, res, pa, ix, iz;
   mp_digit W[MP_WARRAY];
@@ -519,7 +518,7 @@
   /* number of output digits to produce */
   pa = a->used + b->used;
   _W = 0;
-  for (ix = digs; ix <= pa; ix++) { 
+  for (ix = digs; ix < pa; ix++) { 
       int      tx, ty, iy;
       mp_digit *tmpx, *tmpy;
 
@@ -547,6 +546,9 @@
       /* make next carry */
       _W = _W >> ((mp_word)DIGIT_BIT);
   }
+  
+  /* store final carry */
+  W[ix] = (mp_digit)(_W & MP_MASK);
 
   /* setup dest */
   olduse  = c->used;
@@ -591,33 +593,14 @@
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
 
-/* fast squaring
- *
- * This is the comba method where the columns of the product
- * are computed first then the carries are computed.  This
- * has the effect of making a very simple inner loop that
- * is executed the most
- *
- * W2 represents the outer products and W the inner.
- *
- * A further optimizations is made because the inner
- * products are of the form "A * B * 2".  The *2 part does
- * not need to be computed until the end which is good
- * because 64-bit shifts are slow!
- *
- * Based on Algorithm 14.16 on pp.597 of HAC.
- *
- */
 /* the jist of squaring...
-
-you do like mult except the offset of the tmpx [one that starts closer to zero]
-can't equal the offset of tmpy.  So basically you set up iy like before then you min it with
-(ty-tx) so that it never happens.  You double all those you add in the inner loop
+ * you do like mult except the offset of the tmpx [one that 
+ * starts closer to zero] can't equal the offset of tmpy.  
+ * So basically you set up iy like before then you min it with
+ * (ty-tx) so that it never happens.  You double all those 
+ * you add in the inner loop
 
 After that loop you do the squares and add them in.
-
-Remove W2 and don't memset W
-
 */
 
 int fast_s_mp_sqr (mp_int * a, mp_int * b)
@@ -636,7 +619,7 @@
 
   /* number of output digits to produce */
   W1 = 0;
-  for (ix = 0; ix <= pa; ix++) { 
+  for (ix = 0; ix < pa; ix++) { 
       int      tx, ty, iy;
       mp_word  _W;
       mp_digit *tmpy;
@@ -652,7 +635,7 @@
       tmpx = a->dp + tx;
       tmpy = a->dp + ty;
 
-      /* this is the number of times the loop will iterrate, essentially its 
+      /* this is the number of times the loop will iterrate, essentially
          while (tx++ < a->used && ty-- >= 0) { ... }
        */
       iy = MIN(a->used-tx, ty+1);
@@ -677,7 +660,7 @@
       }
 
       /* store it */
-      W[ix] = _W;
+      W[ix] = (mp_digit)(_W & MP_MASK);
 
       /* make next carry */
       W1 = _W >> ((mp_word)DIGIT_BIT);
@@ -1539,23 +1522,23 @@
 
   mp_set(&tq, 1);
   n = mp_count_bits(a) - mp_count_bits(b);
-  if (((res = mp_copy(a, &ta)) != MP_OKAY) ||
-      ((res = mp_copy(b, &tb)) != MP_OKAY) || 
+  if (((res = mp_abs(a, &ta)) != MP_OKAY) ||
+      ((res = mp_abs(b, &tb)) != MP_OKAY) || 
       ((res = mp_mul_2d(&tb, n, &tb)) != MP_OKAY) ||
       ((res = mp_mul_2d(&tq, n, &tq)) != MP_OKAY)) {
-      goto __ERR;
+      goto LBL_ERR;
   }
 
   while (n-- >= 0) {
      if (mp_cmp(&tb, &ta) != MP_GT) {
         if (((res = mp_sub(&ta, &tb, &ta)) != MP_OKAY) ||
             ((res = mp_add(&q, &tq, &q)) != MP_OKAY)) {
-           goto __ERR;
+           goto LBL_ERR;
         }
      }
      if (((res = mp_div_2d(&tb, 1, &tb, NULL)) != MP_OKAY) ||
          ((res = mp_div_2d(&tq, 1, &tq, NULL)) != MP_OKAY)) {
-           goto __ERR;
+           goto LBL_ERR;
      }
   }
 
@@ -1564,13 +1547,13 @@
   n2 = (a->sign == b->sign ? MP_ZPOS : MP_NEG);
   if (c != NULL) {
      mp_exch(c, &q);
-     c->sign  = n2;
+     c->sign  = (mp_iszero(c) == MP_YES) ? MP_ZPOS : n2;
   }
   if (d != NULL) {
      mp_exch(d, &ta);
-     d->sign = n;
-  }
-__ERR:
+     d->sign = (mp_iszero(d) == MP_YES) ? MP_ZPOS : n;
+  }
+LBL_ERR:
    mp_clear_multi(&ta, &tb, &tq, &q, NULL);
    return res;
 }
@@ -1619,19 +1602,19 @@
   q.used = a->used + 2;
 
   if ((res = mp_init (&t1)) != MP_OKAY) {
-    goto __Q;
+    goto LBL_Q;
   }
 
   if ((res = mp_init (&t2)) != MP_OKAY) {
-    goto __T1;
+    goto LBL_T1;
   }
 
   if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
-    goto __T2;
+    goto LBL_T2;
   }
 
   if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
-    goto __X;
+    goto LBL_X;
   }
 
   /* fix the sign */
@@ -1643,10 +1626,10 @@
   if (norm < (int)(DIGIT_BIT-1)) {
      norm = (DIGIT_BIT-1) - norm;
      if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
-       goto __Y;
+       goto LBL_Y;
      }
      if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
-       goto __Y;
+       goto LBL_Y;
      }
   } else {
      norm = 0;
@@ -1658,13 +1641,13 @@
 
   /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
   if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b**{n-t} */
-    goto __Y;
+    goto LBL_Y;
   }
 
   while (mp_cmp (&x, &y) != MP_LT) {
     ++(q.dp[n - t]);
     if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
-      goto __Y;
+      goto LBL_Y;
     }
   }
 
@@ -1706,7 +1689,7 @@
       t1.dp[1] = y.dp[t];
       t1.used = 2;
       if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
-        goto __Y;
+        goto LBL_Y;
       }
 
       /* find right hand */
@@ -1718,27 +1701,27 @@
 
     /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
     if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
-      goto __Y;
+      goto LBL_Y;
     }
 
     if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
-      goto __Y;
+      goto LBL_Y;
     }
 
     if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
-      goto __Y;
+      goto LBL_Y;
     }
 
     /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
     if (x.sign == MP_NEG) {
       if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
-        goto __Y;
+        goto LBL_Y;
       }
       if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
-        goto __Y;
+        goto LBL_Y;
       }
       if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
-        goto __Y;
+        goto LBL_Y;
       }
 
       q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
@@ -1765,11 +1748,11 @@
 
   res = MP_OKAY;
 
-__Y:mp_clear (&y);
-__X:mp_clear (&x);
-__T2:mp_clear (&t2);
-__T1:mp_clear (&t1);
-__Q:mp_clear (&q);
+LBL_Y:mp_clear (&y);
+LBL_X:mp_clear (&x);
+LBL_T2:mp_clear (&t2);
+LBL_T1:mp_clear (&t1);
+LBL_Q:mp_clear (&q);
   return res;
 }
 
@@ -2199,7 +2182,7 @@
  * Based on algorithm from the paper
  *
  * "Generating Efficient Primes for Discrete Log Cryptosystems"
- *                 Chae Hoon Lim, Pil Loong Lee,
+ *                 Chae Hoon Lim, Pil Joong Lee,
  *          POSTECH Information Research Laboratories
  *
  * The modulus must be of a special format [see manual]
@@ -2457,25 +2440,33 @@
      return err;
 #else 
      /* no invmod */
-     return MP_VAL
-#endif
-  }
+     return MP_VAL;
+#endif
+  }
+
+/* modified diminished radix reduction */
+#if defined(BN_MP_REDUCE_IS_2K_L_C) && defined(BN_MP_REDUCE_2K_L_C)
+  if (mp_reduce_is_2k_l(P) == MP_YES) {
+     return s_mp_exptmod(G, X, P, Y, 1);
+  }
+#endif
 
 #ifdef BN_MP_DR_IS_MODULUS_C
   /* is it a DR modulus? */
   dr = mp_dr_is_modulus(P);
 #else
+  /* default to no */
   dr = 0;
 #endif
 
 #ifdef BN_MP_REDUCE_IS_2K_C
-  /* if not, is it a uDR modulus? */
+  /* if not, is it a unrestricted DR modulus? */
   if (dr == 0) {
      dr = mp_reduce_is_2k(P) << 1;
   }
 #endif
     
-  /* if the modulus is odd or dr != 0 use the fast method */
+  /* if the modulus is odd or dr != 0 use the montgomery method */
 #ifdef BN_MP_EXPTMOD_FAST_C
   if (mp_isodd (P) == 1 || dr !=  0) {
     return mp_exptmod_fast (G, X, P, Y, dr);
@@ -2483,7 +2474,7 @@
 #endif
 #ifdef BN_S_MP_EXPTMOD_C
     /* otherwise use the generic Barrett reduction technique */
-    return s_mp_exptmod (G, X, P, Y);
+    return s_mp_exptmod (G, X, P, Y, 0);
 #else
     /* no exptmod for evens */
     return MP_VAL;
@@ -2529,8 +2520,7 @@
    #define TAB_SIZE 256
 #endif
 
-int
-mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
+int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
 {
   mp_int  M[TAB_SIZE], res;
   mp_digit buf, mp;
@@ -2588,11 +2578,11 @@
 #ifdef BN_MP_MONTGOMERY_SETUP_C     
      /* now setup montgomery  */
      if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
-        goto __M;
+        goto LBL_M;
      }
 #else
      err = MP_VAL;
-     goto __M;
+     goto LBL_M;
 #endif
 
      /* automatically pick the comba one if available (saves quite a few calls/ifs) */
@@ -2608,7 +2598,7 @@
         redux = mp_montgomery_reduce;
 #else
         err = MP_VAL;
-        goto __M;
+        goto LBL_M;
 #endif
      }
   } else if (redmode == 1) {
@@ -2618,24 +2608,24 @@
      redux = mp_dr_reduce;
 #else
      err = MP_VAL;
-     goto __M;
+     goto LBL_M;
 #endif
   } else {
 #if defined(BN_MP_REDUCE_2K_SETUP_C) && defined(BN_MP_REDUCE_2K_C)
      /* setup DR reduction for moduli of the form 2**k - b */
      if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
-        goto __M;
+        goto LBL_M;
      }
      redux = mp_reduce_2k;
 #else
      err = MP_VAL;
-     goto __M;
+     goto LBL_M;
 #endif
   }
 
   /* setup result */
   if ((err = mp_init (&res)) != MP_OKAY) {
-    goto __M;
+    goto LBL_M;
   }
 
   /* create M table
@@ -2649,45 +2639,45 @@
 #ifdef BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
      /* now we need R mod m */
      if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
-       goto __RES;
+       goto LBL_RES;
      }
 #else 
      err = MP_VAL;
-     goto __RES;
+     goto LBL_RES;
 #endif
 
      /* now set M[1] to G * R mod m */
      if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
-       goto __RES;
+       goto LBL_RES;
      }
   } else {
      mp_set(&res, 1);
      if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
      }
   }
 
   /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
   if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
-    goto __RES;
+    goto LBL_RES;
   }
 
   for (x = 0; x < (winsize - 1); x++) {
     if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
-      goto __RES;
+      goto LBL_RES;
     }
     if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
-      goto __RES;
+      goto LBL_RES;
     }
   }
 
   /* create upper table */
   for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
     if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
-      goto __RES;
+      goto LBL_RES;
     }
     if ((err = redux (&M[x], P, mp)) != MP_OKAY) {
-      goto __RES;
+      goto LBL_RES;
     }
   }
 
@@ -2727,10 +2717,10 @@
     /* if the bit is zero and mode == 1 then we square */
     if (mode == 1 && y == 0) {
       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
       if ((err = redux (&res, P, mp)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
       continue;
     }
@@ -2744,19 +2734,19 @@
       /* square first */
       for (x = 0; x < winsize; x++) {
         if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-          goto __RES;
+          goto LBL_RES;
         }
         if ((err = redux (&res, P, mp)) != MP_OKAY) {
-          goto __RES;
+          goto LBL_RES;
         }
       }
 
       /* then multiply */
       if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
       if ((err = redux (&res, P, mp)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
 
       /* empty window and reset */
@@ -2771,10 +2761,10 @@
     /* square then multiply if the bit is set */
     for (x = 0; x < bitcpy; x++) {
       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
       if ((err = redux (&res, P, mp)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
 
       /* get next bit of the window */
@@ -2782,10 +2772,10 @@
       if ((bitbuf & (1 << winsize)) != 0) {
         /* then multiply */
         if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
-          goto __RES;
+          goto LBL_RES;
         }
         if ((err = redux (&res, P, mp)) != MP_OKAY) {
-          goto __RES;
+          goto LBL_RES;
         }
       }
     }
@@ -2799,15 +2789,15 @@
       * of R.
       */
      if ((err = redux(&res, P, mp)) != MP_OKAY) {
-       goto __RES;
+       goto LBL_RES;
      }
   }
 
   /* swap res with Y */
   mp_exch (&res, Y);
   err = MP_OKAY;
-__RES:mp_clear (&res);
-__M:
+LBL_RES:mp_clear (&res);
+LBL_M:
   mp_clear(&M[1]);
   for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
     mp_clear (&M[x]);
@@ -2881,6 +2871,13 @@
        if ((err = mp_copy(&t3, &v3)) != MP_OKAY)                                  { goto _ERR; }
    }
 
+   /* make sure U3 >= 0 */
+   if (u3.sign == MP_NEG) {
+      mp_neg(&u1, &u1);
+      mp_neg(&u2, &u2);
+      mp_neg(&u3, &u3);
+   }
+
    /* copy result out */
    if (U1 != NULL) { mp_exch(U1, &u1); }
    if (U2 != NULL) { mp_exch(U2, &u2); }
@@ -3059,7 +3056,7 @@
   }
 
   if ((res = mp_init_copy (&v, b)) != MP_OKAY) {
-    goto __U;
+    goto LBL_U;
   }
 
   /* must be positive for the remainder of the algorithm */
@@ -3073,24 +3070,24 @@
   if (k > 0) {
      /* divide the power of two out */
      if ((res = mp_div_2d(&u, k, &u, NULL)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
      }
 
      if ((res = mp_div_2d(&v, k, &v, NULL)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
      }
   }
 
   /* divide any remaining factors of two out */
   if (u_lsb != k) {
      if ((res = mp_div_2d(&u, u_lsb - k, &u, NULL)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
      }
   }
 
   if (v_lsb != k) {
      if ((res = mp_div_2d(&v, v_lsb - k, &v, NULL)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
      }
   }
 
@@ -3103,23 +3100,23 @@
      
      /* subtract smallest from largest */
      if ((res = s_mp_sub(&v, &u, &v)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
      }
      
      /* Divide out all factors of two */
      if ((res = mp_div_2d(&v, mp_cnt_lsb(&v), &v, NULL)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
      } 
   } 
 
   /* multiply by 2**k which we divided out at the beginning */
   if ((res = mp_mul_2d (&u, k, c)) != MP_OKAY) {
-     goto __V;
+     goto LBL_V;
   }
   c->sign = MP_ZPOS;
   res = MP_OKAY;
-__V:mp_clear (&u);
-__U:mp_clear (&v);
+LBL_V:mp_clear (&u);
+LBL_U:mp_clear (&v);
   return res;
 }
 #endif
@@ -3555,25 +3552,25 @@
   }
 
   /* x = a, y = b */
-  if ((res = mp_copy (a, &x)) != MP_OKAY) {
-    goto __ERR;
+  if ((res = mp_mod(a, b, &x)) != MP_OKAY) {
+      goto LBL_ERR;
   }
   if ((res = mp_copy (b, &y)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
   }
 
   /* 2. [modified] if x,y are both even then return an error! */
   if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
     res = MP_VAL;
-    goto __ERR;
+    goto LBL_ERR;
   }
 
   /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
   if ((res = mp_copy (&x, &u)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
   }
   if ((res = mp_copy (&y, &v)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
   }
   mp_set (&A, 1);
   mp_set (&D, 1);
@@ -3583,24 +3580,24 @@
   while (mp_iseven (&u) == 1) {
     /* 4.1 u = u/2 */
     if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
     /* 4.2 if A or B is odd then */
     if (mp_isodd (&A) == 1 || mp_isodd (&B) == 1) {
       /* A = (A+y)/2, B = (B-x)/2 */
       if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
       if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
     }
     /* A = A/2, B = B/2 */
     if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
     if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
 
@@ -3608,24 +3605,24 @@
   while (mp_iseven (&v) == 1) {
     /* 5.1 v = v/2 */
     if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
     /* 5.2 if C or D is odd then */
     if (mp_isodd (&C) == 1 || mp_isodd (&D) == 1) {
       /* C = (C+y)/2, D = (D-x)/2 */
       if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
     }
     /* C = C/2, D = D/2 */
     if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
     if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
 
@@ -3633,28 +3630,28 @@
   if (mp_cmp (&u, &v) != MP_LT) {
     /* u = u - v, A = A - C, B = B - D */
     if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
 
     if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
 
     if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   } else {
     /* v - v - u, C = C - A, D = D - B */
     if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
 
     if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
 
     if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
 
@@ -3667,27 +3664,27 @@
   /* if v != 1 then there is no inverse */
   if (mp_cmp_d (&v, 1) != MP_EQ) {
     res = MP_VAL;
-    goto __ERR;
+    goto LBL_ERR;
   }
 
   /* if its too low */
   while (mp_cmp_d(&C, 0) == MP_LT) {
       if ((res = mp_add(&C, b, &C)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
   }
   
   /* too big */
   while (mp_cmp_mag(&C, b) != MP_LT) {
       if ((res = mp_sub(&C, b, &C)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
   }
   
   /* C is now the inverse */
   mp_exch (&C, c);
   res = MP_OKAY;
-__ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
+LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
   return res;
 }
 #endif
@@ -3856,13 +3853,13 @@
   }
 
   if ((res = mp_init (&p1)) != MP_OKAY) {
-    goto __A1;
+    goto LBL_A1;
   }
 
   /* divide out larger power of two */
   k = mp_cnt_lsb(&a1);
   if ((res = mp_div_2d(&a1, k, &a1, NULL)) != MP_OKAY) {
-     goto __P1;
+     goto LBL_P1;
   }
 
   /* step 4.  if e is even set s=1 */
@@ -3890,18 +3887,18 @@
   } else {
     /* n1 = n mod a1 */
     if ((res = mp_mod (p, &a1, &p1)) != MP_OKAY) {
-      goto __P1;
+      goto LBL_P1;
     }
     if ((res = mp_jacobi (&p1, &a1, &r)) != MP_OKAY) {
-      goto __P1;
+      goto LBL_P1;
     }
     *c = s * r;
   }
 
   /* done */
   res = MP_OKAY;
-__P1:mp_clear (&p1);
-__A1:mp_clear (&a1);
+LBL_P1:mp_clear (&p1);
+LBL_A1:mp_clear (&a1);
   return res;
 }
 #endif
@@ -4227,20 +4224,20 @@
 
   /* t1 = get the GCD of the two inputs */
   if ((res = mp_gcd (a, b, &t1)) != MP_OKAY) {
-    goto __T;
+    goto LBL_T;
   }
 
   /* divide the smallest by the GCD */
   if (mp_cmp_mag(a, b) == MP_LT) {
      /* store quotient in t2 such that t2 * b is the LCM */
      if ((res = mp_div(a, &t1, &t2, NULL)) != MP_OKAY) {
-        goto __T;
+        goto LBL_T;
      }
      res = mp_mul(b, &t2, c);
   } else {
      /* store quotient in t2 such that t2 * a is the LCM */
      if ((res = mp_div(b, &t1, &t2, NULL)) != MP_OKAY) {
-        goto __T;
+        goto LBL_T;
      }
      res = mp_mul(a, &t2, c);
   }
@@ -4248,7 +4245,7 @@
   /* fix the sign to positive */
   c->sign = MP_ZPOS;
 
-__T:
+LBL_T:
   mp_clear_multi (&t1, &t2, NULL);
   return res;
 }
@@ -4402,7 +4399,7 @@
   }
 
   /* if the modulus is larger than the value than return */
-  if (b > (int) (a->used * DIGIT_BIT)) {
+  if (b >= (int) (a->used * DIGIT_BIT)) {
     res = mp_copy (a, c);
     return res;
   }
@@ -4484,7 +4481,6 @@
   /* how many bits of last digit does b use */
   bits = mp_count_bits (b) % DIGIT_BIT;
 
-
   if (b->used > 1) {
      if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
         return res;
@@ -4983,8 +4979,9 @@
     u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
   }
 
-  /* store final carry [if any] */
+  /* store final carry [if any] and increment ix offset  */
   *tmpc++ = u;
+  ++ix;
 
   /* now zero digits above the top */
   while (ix++ < olduse) {
@@ -5085,11 +5082,11 @@
   }
 
   if ((res = mp_init (&t2)) != MP_OKAY) {
-    goto __T1;
+    goto LBL_T1;
   }
 
   if ((res = mp_init (&t3)) != MP_OKAY) {
-    goto __T2;
+    goto LBL_T2;
   }
 
   /* if a is negative fudge the sign but keep track */
@@ -5102,52 +5099,52 @@
   do {
     /* t1 = t2 */
     if ((res = mp_copy (&t2, &t1)) != MP_OKAY) {
-      goto __T3;
+      goto LBL_T3;
     }
 
     /* t2 = t1 - ((t1**b - a) / (b * t1**(b-1))) */
     
     /* t3 = t1**(b-1) */
     if ((res = mp_expt_d (&t1, b - 1, &t3)) != MP_OKAY) {   
-      goto __T3;
+      goto LBL_T3;
     }
 
     /* numerator */
     /* t2 = t1**b */
     if ((res = mp_mul (&t3, &t1, &t2)) != MP_OKAY) {    
-      goto __T3;
+      goto LBL_T3;
     }
 
     /* t2 = t1**b - a */
     if ((res = mp_sub (&t2, a, &t2)) != MP_OKAY) {  
-      goto __T3;
+      goto LBL_T3;
     }
 
     /* denominator */
     /* t3 = t1**(b-1) * b  */
     if ((res = mp_mul_d (&t3, b, &t3)) != MP_OKAY) {    
-      goto __T3;
+      goto LBL_T3;
     }
 
     /* t3 = (t1**b - a)/(b * t1**(b-1)) */
     if ((res = mp_div (&t2, &t3, &t3, NULL)) != MP_OKAY) {  
-      goto __T3;
+      goto LBL_T3;
     }
 
     if ((res = mp_sub (&t1, &t3, &t2)) != MP_OKAY) {
-      goto __T3;
+      goto LBL_T3;
     }
   }  while (mp_cmp (&t1, &t2) != MP_EQ);
 
   /* result can be off by a few so check */
   for (;;) {
     if ((res = mp_expt_d (&t1, b, &t2)) != MP_OKAY) {
-      goto __T3;
+      goto LBL_T3;
     }
 
     if (mp_cmp (&t2, a) == MP_GT) {
       if ((res = mp_sub_d (&t1, 1, &t1)) != MP_OKAY) {
-         goto __T3;
+         goto LBL_T3;
       }
     } else {
       break;
@@ -5165,9 +5162,9 @@
 
   res = MP_OKAY;
 
-__T3:mp_clear (&t3);
-__T2:mp_clear (&t2);
-__T1:mp_clear (&t1);
+LBL_T3:mp_clear (&t3);
+LBL_T2:mp_clear (&t2);
+LBL_T1:mp_clear (&t1);
   return res;
 }
 #endif
@@ -5196,12 +5193,18 @@
 int mp_neg (mp_int * a, mp_int * b)
 {
   int     res;
-  if ((res = mp_copy (a, b)) != MP_OKAY) {
-    return res;
-  }
+  if (a != b) {
+     if ((res = mp_copy (a, b)) != MP_OKAY) {
+        return res;
+     }
+  }
+
   if (mp_iszero(b) != MP_YES) {
      b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
-  }
+  } else {
+     b->sign = MP_ZPOS;
+  }
+
   return MP_OKAY;
 }
 #endif
@@ -5304,7 +5307,7 @@
 
   /* compute t = b**a mod a */
   if ((err = mp_exptmod (b, a, a, &t)) != MP_OKAY) {
-    goto __T;
+    goto LBL_T;
   }
 
   /* is it equal to b? */
@@ -5313,7 +5316,7 @@
   }
 
   err = MP_OKAY;
-__T:mp_clear (&t);
+LBL_T:mp_clear (&t);
   return err;
 }
 #endif
@@ -5352,8 +5355,8 @@
   *result = MP_NO;
 
   for (ix = 0; ix < PRIME_SIZE; ix++) {
-    /* what is a mod __prime_tab[ix] */
-    if ((err = mp_mod_d (a, __prime_tab[ix], &res)) != MP_OKAY) {
+    /* what is a mod LBL_prime_tab[ix] */
+    if ((err = mp_mod_d (a, ltm_prime_tab[ix], &res)) != MP_OKAY) {
       return err;
     }
 
@@ -5410,7 +5413,7 @@
 
   /* is the input equal to one of the primes in the table? */
   for (ix = 0; ix < PRIME_SIZE; ix++) {
-      if (mp_cmp_d(a, __prime_tab[ix]) == MP_EQ) {
+      if (mp_cmp_d(a, ltm_prime_tab[ix]) == MP_EQ) {
          *result = 1;
          return MP_OKAY;
       }
@@ -5433,20 +5436,20 @@
 
   for (ix = 0; ix < t; ix++) {
     /* set the prime */
-    mp_set (&b, __prime_tab[ix]);
+    mp_set (&b, ltm_prime_tab[ix]);
 
     if ((err = mp_prime_miller_rabin (a, &b, &res)) != MP_OKAY) {
-      goto __B;
+      goto LBL_B;
     }
 
     if (res == MP_NO) {
-      goto __B;
+      goto LBL_B;
     }
   }
 
   /* passed the test */
   *result = MP_YES;
-__B:mp_clear (&b);
+LBL_B:mp_clear (&b);
   return err;
 }
 #endif
@@ -5496,12 +5499,12 @@
     return err;
   }
   if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) {
-    goto __N1;
+    goto LBL_N1;
   }
 
   /* set 2**s * r = n1 */
   if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) {
-    goto __N1;
+    goto LBL_N1;
   }
 
   /* count the number of least significant bits
@@ -5511,15 +5514,15 @@
 
   /* now divide n - 1 by 2**s */
   if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) {
-    goto __R;
+    goto LBL_R;
   }
 
   /* compute y = b**r mod a */
   if ((err = mp_init (&y)) != MP_OKAY) {
-    goto __R;
+    goto LBL_R;
   }
   if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) {
-    goto __Y;
+    goto LBL_Y;
   }
 
   /* if y != 1 and y != n1 do */
@@ -5528,12 +5531,12 @@
     /* while j <= s-1 and y != n1 */
     while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) {
       if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) {
-         goto __Y;
+         goto LBL_Y;
       }
 
       /* if y == 1 then composite */
       if (mp_cmp_d (&y, 1) == MP_EQ) {
-         goto __Y;
+         goto LBL_Y;
       }
 
       ++j;
@@ -5541,15 +5544,15 @@
 
     /* if y != n1 then composite */
     if (mp_cmp (&y, &n1) != MP_EQ) {
-      goto __Y;
+      goto LBL_Y;
     }
   }
 
   /* probably prime now */
   *result = MP_YES;
-__Y:mp_clear (&y);
-__R:mp_clear (&r);
-__N1:mp_clear (&n1);
+LBL_Y:mp_clear (&y);
+LBL_R:mp_clear (&r);
+LBL_N1:mp_clear (&n1);
   return err;
 }
 #endif
@@ -5594,10 +5597,10 @@
    a->sign = MP_ZPOS;
 
    /* simple algo if a is less than the largest prime in the table */
-   if (mp_cmp_d(a, __prime_tab[PRIME_SIZE-1]) == MP_LT) {
+   if (mp_cmp_d(a, ltm_prime_tab[PRIME_SIZE-1]) == MP_LT) {
       /* find which prime it is bigger than */
       for (x = PRIME_SIZE - 2; x >= 0; x--) {
-          if (mp_cmp_d(a, __prime_tab[x]) != MP_LT) {
+          if (mp_cmp_d(a, ltm_prime_tab[x]) != MP_LT) {
              if (bbs_style == 1) {
                 /* ok we found a prime smaller or
                  * equal [so the next is larger]
@@ -5605,17 +5608,17 @@
                  * however, the prime must be
                  * congruent to 3 mod 4
                  */
-                if ((__prime_tab[x + 1] & 3) != 3) {
+                if ((ltm_prime_tab[x + 1] & 3) != 3) {
                    /* scan upwards for a prime congruent to 3 mod 4 */
                    for (y = x + 1; y < PRIME_SIZE; y++) {
-                       if ((__prime_tab[y] & 3) == 3) {
-                          mp_set(a, __prime_tab[y]);
+                       if ((ltm_prime_tab[y] & 3) == 3) {
+                          mp_set(a, ltm_prime_tab[y]);
                           return MP_OKAY;
                        }
                    }
                 }
              } else {
-                mp_set(a, __prime_tab[x + 1]);
+                mp_set(a, ltm_prime_tab[x + 1]);
                 return MP_OKAY;
              }
           }
@@ -5653,7 +5656,7 @@
 
    /* generate the restable */
    for (x = 1; x < PRIME_SIZE; x++) {
-      if ((err = mp_mod_d(a, __prime_tab[x], res_tab + x)) != MP_OKAY) {
+      if ((err = mp_mod_d(a, ltm_prime_tab[x], res_tab + x)) != MP_OKAY) {
          return err;
       }
    }
@@ -5679,8 +5682,8 @@
              res_tab[x] += kstep;
 
              /* subtract the modulus [instead of using division] */
-             if (res_tab[x] >= __prime_tab[x]) {
-                res_tab[x]  -= __prime_tab[x];
+             if (res_tab[x] >= ltm_prime_tab[x]) {
+                res_tab[x]  -= ltm_prime_tab[x];
              }
 
              /* set flag if zero */
@@ -5692,7 +5695,7 @@
 
       /* add the step */
       if ((err = mp_add_d(a, step, a)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
 
       /* if didn't pass sieve and step == MAX then skip test */
@@ -5702,9 +5705,9 @@
 
       /* is this prime? */
       for (x = 0; x < t; x++) {
-          mp_set(&b, __prime_tab[t]);
+          mp_set(&b, ltm_prime_tab[t]);
           if ((err = mp_prime_miller_rabin(a, &b, &res)) != MP_OKAY) {
-             goto __ERR;
+             goto LBL_ERR;
           }
           if (res == MP_NO) {
              break;
@@ -5717,7 +5720,7 @@
    }
 
    err = MP_OKAY;
-__ERR:
+LBL_ERR:
    mp_clear(&b);
    return err;
 }
@@ -5828,7 +5831,7 @@
    }
 
    /* calc the byte size */
-   bsize = (size>>3)+(size&7?1:0);
+   bsize = (size>>3) + ((size&7)?1:0);
 
    /* we need a buffer of bsize bytes */
    tmp = OPT_CAST(unsigned char) XMALLOC(bsize);
@@ -5837,19 +5840,19 @@
    }
 
    /* calc the maskAND value for the MSbyte*/
-   maskAND = 0xFF >> (8 - (size & 7));
+   maskAND = ((size&7) == 0) ? 0xFF : (0xFF >> (8 - (size & 7)));
 
    /* calc the maskOR_msb */
    maskOR_msb        = 0;
-   maskOR_msb_offset = (size - 2) >> 3;
+   maskOR_msb_offset = ((size & 7) == 1) ? 1 : 0;
    if (flags & LTM_PRIME_2MSB_ON) {
       maskOR_msb     |= 1 << ((size - 2) & 7);
    } else if (flags & LTM_PRIME_2MSB_OFF) {
       maskAND        &= ~(1 << ((size - 2) & 7));
-   }
+   } 
 
    /* get the maskOR_lsb */
-   maskOR_lsb         = 0;
+   maskOR_lsb         = 1;
    if (flags & LTM_PRIME_BBS) {
       maskOR_lsb     |= 3;
    }
@@ -5943,22 +5946,29 @@
     return MP_VAL;
   }
 
-  /* init a copy of the input */
-  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-    return res;
+  if (mp_iszero(a) == MP_YES) {
+     *size = 2;
+    return MP_OKAY;
   }
 
   /* digs is the digit count */
   digs = 0;
 
   /* if it's negative add one for the sign */
-  if (t.sign == MP_NEG) {
+  if (a->sign == MP_NEG) {
     ++digs;
-    t.sign = MP_ZPOS;
-  }
+  }
+
+  /* init a copy of the input */
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return res;
+  }
+
+  /* force temp to positive */
+  t.sign = MP_ZPOS; 
 
   /* fetch out all of the digits */
-  while (mp_iszero (&t) == 0) {
+  while (mp_iszero (&t) == MP_NO) {
     if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
       mp_clear (&t);
       return res;
@@ -6032,14 +6042,14 @@
 
   /* first place a random non-zero digit */
   do {
-    d = ((mp_digit) abs (rand ()));
+    d = ((mp_digit) abs (rand ())) & MP_MASK;
   } while (d == 0);
 
   if ((res = mp_add_d (a, d, a)) != MP_OKAY) {
     return res;
   }
 
-  while (digits-- > 0) {
+  while (--digits > 0) {
     if ((res = mp_lshd (a, 1)) != MP_OKAY) {
       return res;
     }
@@ -6074,7 +6084,7 @@
  */
 
 /* read a string [ASCII] in a given radix */
-int mp_read_radix (mp_int * a, char *str, int radix)
+int mp_read_radix (mp_int * a, const char *str, int radix)
 {
   int     y, res, neg;
   char    ch;
@@ -6257,8 +6267,7 @@
  * precomputed via mp_reduce_setup.
  * From HAC pp.604 Algorithm 14.42
  */
-int
-mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
+int mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
 {
   mp_int  q;
   int     res, um = m->used;
@@ -6278,11 +6287,11 @@
     }
   } else {
 #ifdef BN_S_MP_MUL_HIGH_DIGS_C
-    if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
+    if ((res = s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
       goto CLEANUP;
     }
 #elif defined(BN_FAST_S_MP_MUL_HIGH_DIGS_C)
-    if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
+    if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
       goto CLEANUP;
     }
 #else 
@@ -6355,8 +6364,7 @@
  */
 
 /* reduces a modulo n where n is of the form 2**p - d */
-int
-mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d)
+int mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d)
 {
    mp_int q;
    int    p, res;
@@ -6398,6 +6406,68 @@
 
 /* End: bn_mp_reduce_2k.c */
 
+/* Start: bn_mp_reduce_2k_l.c */
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_2K_L_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
+
+/* reduces a modulo n where n is of the form 2**p - d 
+   This differs from reduce_2k since "d" can be larger
+   than a single digit.
+*/
+int mp_reduce_2k_l(mp_int *a, mp_int *n, mp_int *d)
+{
+   mp_int q;
+   int    p, res;
+   
+   if ((res = mp_init(&q)) != MP_OKAY) {
+      return res;
+   }
+   
+   p = mp_count_bits(n);    
+top:
+   /* q = a/2**p, a = a mod 2**p */
+   if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
+      goto ERR;
+   }
+   
+   /* q = q * d */
+   if ((res = mp_mul(&q, d, &q)) != MP_OKAY) { 
+      goto ERR;
+   }
+   
+   /* a = a + q */
+   if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
+      goto ERR;
+   }
+   
+   if (mp_cmp_mag(a, n) != MP_LT) {
+      s_mp_sub(a, n, a);
+      goto top;
+   }
+   
+ERR:
+   mp_clear(&q);
+   return res;
+}
+
+#endif
+
+/* End: bn_mp_reduce_2k_l.c */
+
 /* Start: bn_mp_reduce_2k_setup.c */
 #include <tommath.h>
 #ifdef BN_MP_REDUCE_2K_SETUP_C
@@ -6417,8 +6487,7 @@
  */
 
 /* determines the setup value */
-int 
-mp_reduce_2k_setup(mp_int *a, mp_digit *d)
+int mp_reduce_2k_setup(mp_int *a, mp_digit *d)
 {
    int res, p;
    mp_int tmp;
@@ -6446,6 +6515,50 @@
 
 /* End: bn_mp_reduce_2k_setup.c */
 
+/* Start: bn_mp_reduce_2k_setup_l.c */
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_2K_SETUP_L_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
+
+/* determines the setup value */
+int mp_reduce_2k_setup_l(mp_int *a, mp_int *d)
+{
+   int    res;
+   mp_int tmp;
+   
+   if ((res = mp_init(&tmp)) != MP_OKAY) {
+      return res;
+   }
+   
+   if ((res = mp_2expt(&tmp, mp_count_bits(a))) != MP_OKAY) {
+      goto ERR;
+   }
+   
+   if ((res = s_mp_sub(&tmp, a, d)) != MP_OKAY) {
+      goto ERR;
+   }
+   
+ERR:
+   mp_clear(&tmp);
+   return res;
+}
+#endif
+
+/* End: bn_mp_reduce_2k_setup_l.c */
+
 /* Start: bn_mp_reduce_is_2k.c */
 #include <tommath.h>
 #ifdef BN_MP_REDUCE_IS_2K_C
@@ -6471,9 +6584,9 @@
    mp_digit iz;
    
    if (a->used == 0) {
-      return 0;
+      return MP_NO;
    } else if (a->used == 1) {
-      return 1;
+      return MP_YES;
    } else if (a->used > 1) {
       iy = mp_count_bits(a);
       iz = 1;
@@ -6482,7 +6595,7 @@
       /* Test every bit from the second digit up, must be 1 */
       for (ix = DIGIT_BIT; ix < iy; ix++) {
           if ((a->dp[iw] & iz) == 0) {
-             return 0;
+             return MP_NO;
           }
           iz <<= 1;
           if (iz > (mp_digit)MP_MASK) {
@@ -6491,13 +6604,57 @@
           }
       }
    }
-   return 1;
+   return MP_YES;
 }
 
 #endif
 
 /* End: bn_mp_reduce_is_2k.c */
 
+/* Start: bn_mp_reduce_is_2k_l.c */
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_IS_2K_L_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
+
+/* determines if reduce_2k_l can be used */
+int mp_reduce_is_2k_l(mp_int *a)
+{
+   int ix, iy;
+   
+   if (a->used == 0) {
+      return MP_NO;
+   } else if (a->used == 1) {
+      return MP_YES;
+   } else if (a->used > 1) {
+      /* if more than half of the digits are -1 we're sold */
+      for (iy = ix = 0; ix < a->used; ix++) {
+          if (a->dp[ix] == MP_MASK) {
+              ++iy;
+          }
+      }
+      return (iy >= (a->used/2)) ? MP_YES : MP_NO;
+      
+   }
+   return MP_NO;
+}
+
+#endif
+
+/* End: bn_mp_reduce_is_2k_l.c */
+
 /* Start: bn_mp_reduce_setup.c */
 #include <tommath.h>
 #ifdef BN_MP_REDUCE_SETUP_C
@@ -7132,8 +7289,7 @@
  */
 
 /* store in signed [big endian] format */
-int
-mp_to_signed_bin (mp_int * a, unsigned char *b)
+int mp_to_signed_bin (mp_int * a, unsigned char *b)
 {
   int     res;
 
@@ -7147,6 +7303,37 @@
 
 /* End: bn_mp_to_signed_bin.c */
 
+/* Start: bn_mp_to_signed_bin_n.c */
+#include <tommath.h>
+#ifdef BN_MP_TO_SIGNED_BIN_N_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
+
+/* store in signed [big endian] format */
+int mp_to_signed_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen)
+{
+   if (*outlen < (unsigned long)mp_signed_bin_size(a)) {
+      return MP_VAL;
+   }
+   *outlen = mp_signed_bin_size(a);
+   return mp_to_signed_bin(a, b);
+}
+#endif
+
+/* End: bn_mp_to_signed_bin_n.c */
+
 /* Start: bn_mp_to_unsigned_bin.c */
 #include <tommath.h>
 #ifdef BN_MP_TO_UNSIGNED_BIN_C
@@ -7166,8 +7353,7 @@
  */
 
 /* store in unsigned [big endian] format */
-int
-mp_to_unsigned_bin (mp_int * a, unsigned char *b)
+int mp_to_unsigned_bin (mp_int * a, unsigned char *b)
 {
   int     x, res;
   mp_int  t;
@@ -7196,6 +7382,37 @@
 
 /* End: bn_mp_to_unsigned_bin.c */
 
+/* Start: bn_mp_to_unsigned_bin_n.c */
+#include <tommath.h>
+#ifdef BN_MP_TO_UNSIGNED_BIN_N_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
+
+/* store in unsigned [big endian] format */
+int mp_to_unsigned_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen)
+{
+   if (*outlen < (unsigned long)mp_unsigned_bin_size(a)) {
+      return MP_VAL;
+   }
+   *outlen = mp_unsigned_bin_size(a);
+   return mp_to_unsigned_bin(a, b);
+}
+#endif
+
+/* End: bn_mp_to_unsigned_bin_n.c */
+
 /* Start: bn_mp_toom_mul.c */
 #include <tommath.h>
 #ifdef BN_MP_TOOM_MUL_C
@@ -7216,9 +7433,10 @@
 
 /* multiplication using the Toom-Cook 3-way algorithm 
  *
- * Much more complicated than Karatsuba but has a lower asymptotic running time of 
- * O(N**1.464).  This algorithm is only particularly useful on VERY large
- * inputs (we're talking 1000s of digits here...).
+ * Much more complicated than Karatsuba but has a lower 
+ * asymptotic running time of O(N**1.464).  This algorithm is 
+ * only particularly useful on VERY large inputs 
+ * (we're talking 1000s of digits here...).
 */
 int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
 {
@@ -7888,8 +8106,7 @@
  */
 
 /* get the size for an unsigned equivalent */
-int
-mp_unsigned_bin_size (mp_int * a)
+int mp_unsigned_bin_size (mp_int * a)
 {
   int     size = mp_count_bits (a);
   return (size / 8 + ((size & 7) != 0 ? 1 : 0));
@@ -7938,7 +8155,7 @@
   }
 
   for (ix = 0; ix < px; ix++) {
-
+     t.dp[ix] ^= x->dp[ix];
   }
   mp_clamp (&t);
   mp_exch (c, &t);
@@ -7968,12 +8185,18 @@
  */
 
 /* set to zero */
-void
-mp_zero (mp_int * a)
-{
+void mp_zero (mp_int * a)
+{
+  int       n;
+  mp_digit *tmp;
+
   a->sign = MP_ZPOS;
   a->used = 0;
-  memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
+
+  tmp = a->dp;
+  for (n = 0; n < a->alloc; n++) {
+     *tmp++ = 0;
+  }
 }
 #endif
 
@@ -7996,7 +8219,7 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-const mp_digit __prime_tab[] = {
+const mp_digit ltm_prime_tab[] = {
   0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
   0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
   0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
@@ -8212,11 +8435,12 @@
    #define TAB_SIZE 256
 #endif
 
-int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
 {
   mp_int  M[TAB_SIZE], res, mu;
   mp_digit buf;
   int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+  int (*redux)(mp_int*,mp_int*,mp_int*);
 
   /* find window size */
   x = mp_count_bits (X);
@@ -8261,11 +8485,20 @@
 
   /* create mu, used for Barrett reduction */
   if ((err = mp_init (&mu)) != MP_OKAY) {
-    goto __M;
-  }
-  if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
-    goto __MU;
-  }
+    goto LBL_M;
+  }
+  
+  if (redmode == 0) {
+     if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
+        goto LBL_MU;
+     }
+     redux = mp_reduce;
+  } else {
+     if ((err = mp_reduce_2k_setup_l (P, &mu)) != MP_OKAY) {
+        goto LBL_MU;
+     }
+     redux = mp_reduce_2k_l;
+  }    
 
   /* create M table
    *
@@ -8276,23 +8509,26 @@
    * computed though accept for M[0] and M[1]
    */
   if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
-    goto __MU;
+    goto LBL_MU;
   }
 
   /* compute the value at M[1<<(winsize-1)] by squaring 
    * M[1] (winsize-1) times 
    */
   if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
-    goto __MU;
+    goto LBL_MU;
   }
 
   for (x = 0; x < (winsize - 1); x++) {
+    /* square it */
     if ((err = mp_sqr (&M[1 << (winsize - 1)], 
                        &M[1 << (winsize - 1)])) != MP_OKAY) {
-      goto __MU;
-    }
-    if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
-      goto __MU;
+      goto LBL_MU;
+    }
+
+    /* reduce modulo P */
+    if ((err = redux (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
+      goto LBL_MU;
     }
   }
 
@@ -8301,16 +8537,16 @@
    */
   for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
     if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
-      goto __MU;
-    }
-    if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) {
-      goto __MU;
+      goto LBL_MU;
+    }
+    if ((err = redux (&M[x], P, &mu)) != MP_OKAY) {
+      goto LBL_MU;
     }
   }
 
   /* setup result */
   if ((err = mp_init (&res)) != MP_OKAY) {
-    goto __MU;
+    goto LBL_MU;
   }
   mp_set (&res, 1);
 
@@ -8350,10 +8586,10 @@
     /* if the bit is zero and mode == 1 then we square */
     if (mode == 1 && y == 0) {
       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-        goto __RES;
+      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
+        goto LBL_RES;
       }
       continue;
     }
@@ -8367,19 +8603,19 @@
       /* square first */
       for (x = 0; x < winsize; x++) {
         if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-          goto __RES;
+          goto LBL_RES;
         }
-        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-          goto __RES;
+        if ((err = redux (&res, P, &mu)) != MP_OKAY) {
+          goto LBL_RES;
         }
       }
 
       /* then multiply */
       if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-        goto __RES;
+      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
+        goto LBL_RES;
       }
 
       /* empty window and reset */
@@ -8394,20 +8630,20 @@
     /* square then multiply if the bit is set */
     for (x = 0; x < bitcpy; x++) {
       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-        goto __RES;
+      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
+        goto LBL_RES;
       }
 
       bitbuf <<= 1;
       if ((bitbuf & (1 << winsize)) != 0) {
         /* then multiply */
         if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
-          goto __RES;
+          goto LBL_RES;
         }
-        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-          goto __RES;
+        if ((err = redux (&res, P, &mu)) != MP_OKAY) {
+          goto LBL_RES;
         }
       }
     }
@@ -8415,9 +8651,9 @@
 
   mp_exch (&res, Y);
   err = MP_OKAY;
-__RES:mp_clear (&res);
-__MU:mp_clear (&mu);
-__M:
+LBL_RES:mp_clear (&res);
+LBL_MU:mp_clear (&mu);
+LBL_M:
   mp_clear(&M[1]);
   for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
     mp_clear (&M[x]);
@@ -8450,8 +8686,7 @@
  * HAC pp. 595, Algorithm 14.12  Modified so you can control how 
  * many digits of output are created.
  */
-int
-s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
   mp_int  t;
   int     res, pa, pb, ix, iy;
@@ -8619,8 +8854,7 @@
  */
 
 /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
-int
-s_mp_sqr (mp_int * a, mp_int * b)
+int s_mp_sqr (mp_int * a, mp_int * b)
 {
   mp_int  t;
   int     res, ix, iy, pa;
@@ -8797,11 +9031,12 @@
  CPU                    /Compiler     /MUL CUTOFF/SQR CUTOFF
 -------------------------------------------------------------
  Intel P4 Northwood     /GCC v3.4.1   /        88/       128/LTM 0.32 ;-)
+ AMD Athlon64           /GCC v3.4.4   /        74/       124/LTM 0.34
  
 */
 
-int     KARATSUBA_MUL_CUTOFF = 88,      /* Min. number of digits before Karatsuba multiplication is used. */
-        KARATSUBA_SQR_CUTOFF = 128,     /* Min. number of digits before Karatsuba squaring is used. */
+int     KARATSUBA_MUL_CUTOFF = 74,      /* Min. number of digits before Karatsuba multiplication is used. */
+        KARATSUBA_SQR_CUTOFF = 124,     /* Min. number of digits before Karatsuba squaring is used. */
         
         TOOM_MUL_CUTOFF      = 350,      /* no optimal values of these are known yet so set em high */
         TOOM_SQR_CUTOFF      = 400;