changeset 190:d8254fc979e9 libtommath-orig LTM_0.35

Initial import of libtommath 0.35
author Matt Johnston <matt@ucc.asn.au>
date Fri, 06 May 2005 08:59:30 +0000
parents d29b64170cf0
children c5c969ed76f3
files TODO bn.pdf bn.tex bn_fast_mp_invmod.c bn_fast_mp_montgomery_reduce.c bn_fast_s_mp_mul_digs.c bn_fast_s_mp_mul_high_digs.c bn_fast_s_mp_sqr.c bn_mp_div.c bn_mp_dr_reduce.c bn_mp_exptmod.c bn_mp_exptmod_fast.c bn_mp_exteuclid.c bn_mp_gcd.c bn_mp_invmod_slow.c bn_mp_jacobi.c bn_mp_lcm.c bn_mp_mod_2d.c bn_mp_montgomery_calc_normalization.c bn_mp_mul_d.c bn_mp_n_root.c bn_mp_neg.c bn_mp_prime_fermat.c bn_mp_prime_is_divisible.c bn_mp_prime_is_prime.c bn_mp_prime_miller_rabin.c bn_mp_prime_next_prime.c bn_mp_prime_random_ex.c bn_mp_radix_size.c bn_mp_rand.c bn_mp_read_radix.c bn_mp_reduce.c bn_mp_reduce_2k.c bn_mp_reduce_2k_l.c bn_mp_reduce_2k_setup.c bn_mp_reduce_2k_setup_l.c bn_mp_reduce_is_2k.c bn_mp_reduce_is_2k_l.c bn_mp_to_signed_bin.c bn_mp_to_signed_bin_n.c bn_mp_to_unsigned_bin.c bn_mp_to_unsigned_bin_n.c bn_mp_toom_mul.c bn_mp_unsigned_bin_size.c bn_mp_xor.c bn_mp_zero.c bn_prime_tab.c bn_s_mp_exptmod.c bn_s_mp_mul_digs.c bn_s_mp_sqr.c bncore.c callgraph.txt changes.txt demo/demo.c demo/timing.c dep.pl etc/mersenne.c etc/pprime.c etc/tune.c logs/add.log logs/expt.log logs/expt_2k.log logs/expt_2kl.log logs/expt_dr.log logs/mult.log logs/mult_kara.log logs/sqr.log logs/sqr_kara.log logs/sub.log makefile makefile.bcc makefile.cygwin_dll makefile.icc makefile.msvc makefile.shared mtest/mtest.c poster.pdf pre_gen/mpi.c tombc/grammar.txt tommath.h tommath.pdf tommath.src tommath.tex tommath_class.h
diffstat 84 files changed, 8248 insertions(+), 5516 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TODO	Fri May 06 08:59:30 2005 +0000
@@ -0,0 +1,16 @@
+things for book in order of importance...
+
+- Fix up pseudo-code [only] for combas that are not consistent with source
+- Start in chapter 3 [basics] and work up...
+   - re-write to prose [less abrupt]
+   - clean up pseudo code [spacing]
+   - more examples where appropriate and figures
+
+Goal:
+   - Get sync done by mid January [roughly 8-12 hours work]
+   - Finish ch3-6 by end of January [roughly 12-16 hours of work]
+   - Finish ch7-end by mid Feb [roughly 20-24 hours of work].
+
+Goal isn't "first edition" but merely cleaner to read.
+
+
Binary file bn.pdf has changed
--- a/bn.tex	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn.tex	Fri May 06 08:59:30 2005 +0000
@@ -49,7 +49,7 @@
 \begin{document}
 \frontmatter
 \pagestyle{empty}
-\title{LibTomMath User Manual \\ v0.32}
+\title{LibTomMath User Manual \\ v0.35}
 \author{Tom St Denis \\ [email protected]}
 \maketitle
 This text, the library and the accompanying textbook are all hereby placed in the public domain.  This book has been 
@@ -263,12 +263,12 @@
 \begin{center}
 \begin{tabular}{|l|c|c|l|}
 \hline \textbf{Criteria} & \textbf{Pro} & \textbf{Con} & \textbf{Notes} \\
-\hline Few lines of code per file & X & & GnuPG $ = 300.9$, LibTomMath  $ = 76.04$ \\
+\hline Few lines of code per file & X & & GnuPG $ = 300.9$, LibTomMath  $ = 71.97$ \\
 \hline Commented function prototypes & X && GnuPG function names are cryptic. \\
 \hline Speed && X & LibTomMath is slower.  \\
 \hline Totally free & X & & GPL has unfavourable restrictions.\\
 \hline Large function base & X & & GnuPG is barebones. \\
-\hline Four modular reduction algorithms & X & & Faster modular exponentiation. \\
+\hline Five modular reduction algorithms & X & & Faster modular exponentiation for a variety of moduli. \\
 \hline Portable & X & & GnuPG requires configuration to build. \\
 \hline
 \end{tabular}
@@ -284,9 +284,12 @@
 So it may feel tempting to just rip the math code out of GnuPG (or GnuMP where it was taken from originally) in your
 own application but I think there are reasons not to.  While LibTomMath is slower than libraries such as GnuMP it is
 not normally significantly slower.  On x86 machines the difference is normally a factor of two when performing modular
-exponentiations.
+exponentiations.  It depends largely on the processor, compiler and the moduli being used.
 
-Essentially the only time you wouldn't use LibTomMath is when blazing speed is the primary concern.
+Essentially the only time you wouldn't use LibTomMath is when blazing speed is the primary concern.  However,
+on the other side of the coin LibTomMath offers you a totally free (public domain) well structured math library
+that is very flexible, complete and performs well in resource constrained environments.  Fast RSA for example can
+be performed with as little as 8KB of ram for data (again depending on build options).  
 
 \chapter{Getting Started with LibTomMath}
 \section{Building Programs}
@@ -809,7 +812,7 @@
 
 \index{mp\_cmp\_mag}
 \begin{alltt}
-int mp_cmp(mp_int * a, mp_int * b);
+int mp_cmp_mag(mp_int * a, mp_int * b);
 \end{alltt}
 This will compare $a$ to $b$ placing $a$ to the left of $b$.  This function cannot fail and will return one of the
 three compare codes listed in figure \ref{fig:CMP}.
@@ -1220,12 +1223,13 @@
 \end{alltt}
 
 Will square $a$ and store it in $b$.  Like the case of multiplication there are four different squaring
-algorithms all which can be called from mp\_sqr().  It is ideal to use mp\_sqr over mp\_mul when squaring terms.
+algorithms all which can be called from mp\_sqr().  It is ideal to use mp\_sqr over mp\_mul when squaring terms because
+of the speed difference.  
 
 \section{Tuning Polynomial Basis Routines}
 
 Both of the Toom-Cook and Karatsuba multiplication algorithms are faster than the traditional $O(n^2)$ approach that
-the Comba and baseline algorithms use.  At $O(n^{1.464973})$ and $O(n^{1.584962})$ running times respectfully they require 
+the Comba and baseline algorithms use.  At $O(n^{1.464973})$ and $O(n^{1.584962})$ running times respectively they require 
 considerably less work.  For example, a 10000-digit multiplication would take roughly 724,000 single precision
 multiplications with Toom-Cook or 100,000,000 single precision multiplications with the standard Comba (a factor
 of 138).
@@ -1297,14 +1301,14 @@
 \section{Barrett Reduction}
 
 Barrett reduction is a generic optimized reduction algorithm that requires pre--computation to achieve
-a decent speedup over straight division.  First a $mu$ value must be precomputed with the following function.
+a decent speedup over straight division.  First a $\mu$ value must be precomputed with the following function.
 
 \index{mp\_reduce\_setup}
 \begin{alltt}
 int mp_reduce_setup(mp_int *a, mp_int *b);
 \end{alltt}
 
-Given a modulus in $b$ this produces the required $mu$ value in $a$.  For any given modulus this only has to
+Given a modulus in $b$ this produces the required $\mu$ value in $a$.  For any given modulus this only has to
 be computed once.  Modular reduction can now be performed with the following.
 
 \index{mp\_reduce}
@@ -1312,7 +1316,7 @@
 int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
 \end{alltt}
 
-This will reduce $a$ in place modulo $b$ with the precomputed $mu$ value in $c$.  $a$ must be in the range
+This will reduce $a$ in place modulo $b$ with the precomputed $\mu$ value in $c$.  $a$ must be in the range
 $0 \le a < b^2$.
 
 \begin{alltt}
@@ -1578,7 +1582,8 @@
 This algorithm uses the ``Newton Approximation'' method and will converge on the correct root fairly quickly.  Since
 the algorithm requires raising $a$ to the power of $b$ it is not ideal to attempt to find roots for large
 values of $b$.  If particularly large roots are required then a factor method could be used instead.  For example,
-$a^{1/16}$ is equivalent to $\left (a^{1/4} \right)^{1/4}$.
+$a^{1/16}$ is equivalent to $\left (a^{1/4} \right)^{1/4}$ or simply 
+$\left ( \left ( \left ( a^{1/2} \right )^{1/2} \right )^{1/2} \right )^{1/2}$.
 
 \chapter{Prime Numbers}
 \section{Trial Division}
--- a/bn_fast_mp_invmod.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_fast_mp_invmod.c	Fri May 06 08:59:30 2005 +0000
@@ -21,8 +21,7 @@
  * Based on slow invmod except this is optimized for the case where b is 
  * odd as per HAC Note 14.64 on pp. 610
  */
-int
-fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
+int fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
 {
   mp_int  x, y, u, v, B, D;
   int     res, neg;
@@ -39,20 +38,20 @@
 
   /* x == modulus, y == value to invert */
   if ((res = mp_copy (b, &x)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
   }
 
   /* we need y = |a| */
-  if ((res = mp_abs (a, &y)) != MP_OKAY) {
-    goto __ERR;
+  if ((res = mp_mod (a, b, &y)) != MP_OKAY) {
+    goto LBL_ERR;
   }
 
   /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
   if ((res = mp_copy (&x, &u)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
   }
   if ((res = mp_copy (&y, &v)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
   }
   mp_set (&D, 1);
 
@@ -61,17 +60,17 @@
   while (mp_iseven (&u) == 1) {
     /* 4.1 u = u/2 */
     if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
     /* 4.2 if B is odd then */
     if (mp_isodd (&B) == 1) {
       if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
-        goto __ERR;
+        goto LBL_ERR;
       }
     }
     /* B = B/2 */
     if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
 
@@ -79,18 +78,18 @@
   while (mp_iseven (&v) == 1) {
     /* 5.1 v = v/2 */
     if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
     /* 5.2 if D is odd then */
     if (mp_isodd (&D) == 1) {
       /* D = (D-x)/2 */
       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
-        goto __ERR;
+        goto LBL_ERR;
       }
     }
     /* D = D/2 */
     if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
 
@@ -98,20 +97,20 @@
   if (mp_cmp (&u, &v) != MP_LT) {
     /* u = u - v, B = B - D */
     if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
 
     if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   } else {
     /* v - v - u, D = D - B */
     if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
 
     if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
 
@@ -125,21 +124,21 @@
   /* if v != 1 then there is no inverse */
   if (mp_cmp_d (&v, 1) != MP_EQ) {
     res = MP_VAL;
-    goto __ERR;
+    goto LBL_ERR;
   }
 
   /* b is now the inverse */
   neg = a->sign;
   while (D.sign == MP_NEG) {
     if ((res = mp_add (&D, b, &D)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
   mp_exch (&D, c);
   c->sign = neg;
   res = MP_OKAY;
 
-__ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
+LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
   return res;
 }
 #endif
--- a/bn_fast_mp_montgomery_reduce.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_fast_mp_montgomery_reduce.c	Fri May 06 08:59:30 2005 +0000
@@ -23,8 +23,7 @@
  *
  * Based on Algorithm 14.32 on pp.601 of HAC.
 */
-int
-fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
+int fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
 {
   int     ix, res, olduse;
   mp_word W[MP_WARRAY];
--- a/bn_fast_s_mp_mul_digs.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_fast_s_mp_mul_digs.c	Fri May 06 08:59:30 2005 +0000
@@ -31,8 +31,7 @@
  * Based on Algorithm 14.12 on pp.595 of HAC.
  *
  */
-int
-fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
   int     olduse, res, pa, ix, iz;
   mp_digit W[MP_WARRAY];
@@ -50,7 +49,7 @@
 
   /* clear the carry */
   _W = 0;
-  for (ix = 0; ix <= pa; ix++) { 
+  for (ix = 0; ix < pa; ix++) { 
       int      tx, ty;
       int      iy;
       mp_digit *tmpx, *tmpy;
@@ -63,7 +62,7 @@
       tmpx = a->dp + tx;
       tmpy = b->dp + ty;
 
-      /* this is the number of times the loop will iterrate, essentially its 
+      /* this is the number of times the loop will iterate, essentially 
          while (tx++ < a->used && ty-- >= 0) { ... }
        */
       iy = MIN(a->used-tx, ty+1);
@@ -80,14 +79,17 @@
       _W = _W >> ((mp_word)DIGIT_BIT);
   }
 
+  /* store final carry */
+  W[ix] = (mp_digit)(_W & MP_MASK);
+
   /* setup dest */
   olduse  = c->used;
-  c->used = digs;
+  c->used = pa;
 
   {
     register mp_digit *tmpc;
     tmpc = c->dp;
-    for (ix = 0; ix < digs; ix++) {
+    for (ix = 0; ix < pa+1; ix++) {
       /* now extract the previous digit [below the carry] */
       *tmpc++ = W[ix];
     }
--- a/bn_fast_s_mp_mul_high_digs.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_fast_s_mp_mul_high_digs.c	Fri May 06 08:59:30 2005 +0000
@@ -24,8 +24,7 @@
  *
  * Based on Algorithm 14.12 on pp.595 of HAC.
  */
-int
-fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+int fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
   int     olduse, res, pa, ix, iz;
   mp_digit W[MP_WARRAY];
@@ -42,7 +41,7 @@
   /* number of output digits to produce */
   pa = a->used + b->used;
   _W = 0;
-  for (ix = digs; ix <= pa; ix++) { 
+  for (ix = digs; ix < pa; ix++) { 
       int      tx, ty, iy;
       mp_digit *tmpx, *tmpy;
 
@@ -70,6 +69,9 @@
       /* make next carry */
       _W = _W >> ((mp_word)DIGIT_BIT);
   }
+  
+  /* store final carry */
+  W[ix] = (mp_digit)(_W & MP_MASK);
 
   /* setup dest */
   olduse  = c->used;
--- a/bn_fast_s_mp_sqr.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_fast_s_mp_sqr.c	Fri May 06 08:59:30 2005 +0000
@@ -15,33 +15,14 @@
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
 
-/* fast squaring
- *
- * This is the comba method where the columns of the product
- * are computed first then the carries are computed.  This
- * has the effect of making a very simple inner loop that
- * is executed the most
- *
- * W2 represents the outer products and W the inner.
- *
- * A further optimizations is made because the inner
- * products are of the form "A * B * 2".  The *2 part does
- * not need to be computed until the end which is good
- * because 64-bit shifts are slow!
- *
- * Based on Algorithm 14.16 on pp.597 of HAC.
- *
- */
 /* the jist of squaring...
-
-you do like mult except the offset of the tmpx [one that starts closer to zero]
-can't equal the offset of tmpy.  So basically you set up iy like before then you min it with
-(ty-tx) so that it never happens.  You double all those you add in the inner loop
+ * you do like mult except the offset of the tmpx [one that 
+ * starts closer to zero] can't equal the offset of tmpy.  
+ * So basically you set up iy like before then you min it with
+ * (ty-tx) so that it never happens.  You double all those 
+ * you add in the inner loop
 
 After that loop you do the squares and add them in.
-
-Remove W2 and don't memset W
-
 */
 
 int fast_s_mp_sqr (mp_int * a, mp_int * b)
@@ -60,7 +41,7 @@
 
   /* number of output digits to produce */
   W1 = 0;
-  for (ix = 0; ix <= pa; ix++) { 
+  for (ix = 0; ix < pa; ix++) { 
       int      tx, ty, iy;
       mp_word  _W;
       mp_digit *tmpy;
@@ -76,7 +57,7 @@
       tmpx = a->dp + tx;
       tmpy = a->dp + ty;
 
-      /* this is the number of times the loop will iterrate, essentially its 
+      /* this is the number of times the loop will iterate, essentially
          while (tx++ < a->used && ty-- >= 0) { ... }
        */
       iy = MIN(a->used-tx, ty+1);
@@ -101,7 +82,7 @@
       }
 
       /* store it */
-      W[ix] = _W;
+      W[ix] = (mp_digit)(_W & MP_MASK);
 
       /* make next carry */
       W1 = _W >> ((mp_word)DIGIT_BIT);
--- a/bn_mp_div.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_div.c	Fri May 06 08:59:30 2005 +0000
@@ -49,23 +49,23 @@
 
   mp_set(&tq, 1);
   n = mp_count_bits(a) - mp_count_bits(b);
-  if (((res = mp_copy(a, &ta)) != MP_OKAY) ||
-      ((res = mp_copy(b, &tb)) != MP_OKAY) || 
+  if (((res = mp_abs(a, &ta)) != MP_OKAY) ||
+      ((res = mp_abs(b, &tb)) != MP_OKAY) || 
       ((res = mp_mul_2d(&tb, n, &tb)) != MP_OKAY) ||
       ((res = mp_mul_2d(&tq, n, &tq)) != MP_OKAY)) {
-      goto __ERR;
+      goto LBL_ERR;
   }
 
   while (n-- >= 0) {
      if (mp_cmp(&tb, &ta) != MP_GT) {
         if (((res = mp_sub(&ta, &tb, &ta)) != MP_OKAY) ||
             ((res = mp_add(&q, &tq, &q)) != MP_OKAY)) {
-           goto __ERR;
+           goto LBL_ERR;
         }
      }
      if (((res = mp_div_2d(&tb, 1, &tb, NULL)) != MP_OKAY) ||
          ((res = mp_div_2d(&tq, 1, &tq, NULL)) != MP_OKAY)) {
-           goto __ERR;
+           goto LBL_ERR;
      }
   }
 
@@ -74,13 +74,13 @@
   n2 = (a->sign == b->sign ? MP_ZPOS : MP_NEG);
   if (c != NULL) {
      mp_exch(c, &q);
-     c->sign  = n2;
+     c->sign  = (mp_iszero(c) == MP_YES) ? MP_ZPOS : n2;
   }
   if (d != NULL) {
      mp_exch(d, &ta);
-     d->sign = n;
+     d->sign = (mp_iszero(d) == MP_YES) ? MP_ZPOS : n;
   }
-__ERR:
+LBL_ERR:
    mp_clear_multi(&ta, &tb, &tq, &q, NULL);
    return res;
 }
@@ -129,19 +129,19 @@
   q.used = a->used + 2;
 
   if ((res = mp_init (&t1)) != MP_OKAY) {
-    goto __Q;
+    goto LBL_Q;
   }
 
   if ((res = mp_init (&t2)) != MP_OKAY) {
-    goto __T1;
+    goto LBL_T1;
   }
 
   if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
-    goto __T2;
+    goto LBL_T2;
   }
 
   if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
-    goto __X;
+    goto LBL_X;
   }
 
   /* fix the sign */
@@ -153,10 +153,10 @@
   if (norm < (int)(DIGIT_BIT-1)) {
      norm = (DIGIT_BIT-1) - norm;
      if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
-       goto __Y;
+       goto LBL_Y;
      }
      if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
-       goto __Y;
+       goto LBL_Y;
      }
   } else {
      norm = 0;
@@ -168,13 +168,13 @@
 
   /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
   if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b**{n-t} */
-    goto __Y;
+    goto LBL_Y;
   }
 
   while (mp_cmp (&x, &y) != MP_LT) {
     ++(q.dp[n - t]);
     if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
-      goto __Y;
+      goto LBL_Y;
     }
   }
 
@@ -216,7 +216,7 @@
       t1.dp[1] = y.dp[t];
       t1.used = 2;
       if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
-        goto __Y;
+        goto LBL_Y;
       }
 
       /* find right hand */
@@ -228,27 +228,27 @@
 
     /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
     if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
-      goto __Y;
+      goto LBL_Y;
     }
 
     if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
-      goto __Y;
+      goto LBL_Y;
     }
 
     if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
-      goto __Y;
+      goto LBL_Y;
     }
 
     /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
     if (x.sign == MP_NEG) {
       if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
-        goto __Y;
+        goto LBL_Y;
       }
       if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
-        goto __Y;
+        goto LBL_Y;
       }
       if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
-        goto __Y;
+        goto LBL_Y;
       }
 
       q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
@@ -275,11 +275,11 @@
 
   res = MP_OKAY;
 
-__Y:mp_clear (&y);
-__X:mp_clear (&x);
-__T2:mp_clear (&t2);
-__T1:mp_clear (&t1);
-__Q:mp_clear (&q);
+LBL_Y:mp_clear (&y);
+LBL_X:mp_clear (&x);
+LBL_T2:mp_clear (&t2);
+LBL_T1:mp_clear (&t1);
+LBL_Q:mp_clear (&q);
   return res;
 }
 
--- a/bn_mp_dr_reduce.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_dr_reduce.c	Fri May 06 08:59:30 2005 +0000
@@ -20,7 +20,7 @@
  * Based on algorithm from the paper
  *
  * "Generating Efficient Primes for Discrete Log Cryptosystems"
- *                 Chae Hoon Lim, Pil Loong Lee,
+ *                 Chae Hoon Lim, Pil Joong Lee,
  *          POSTECH Information Research Laboratories
  *
  * The modulus must be of a special format [see manual]
--- a/bn_mp_exptmod.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_exptmod.c	Fri May 06 08:59:30 2005 +0000
@@ -61,25 +61,33 @@
      return err;
 #else 
      /* no invmod */
-     return MP_VAL
+     return MP_VAL;
 #endif
   }
 
+/* modified diminished radix reduction */
+#if defined(BN_MP_REDUCE_IS_2K_L_C) && defined(BN_MP_REDUCE_2K_L_C)
+  if (mp_reduce_is_2k_l(P) == MP_YES) {
+     return s_mp_exptmod(G, X, P, Y, 1);
+  }
+#endif
+
 #ifdef BN_MP_DR_IS_MODULUS_C
   /* is it a DR modulus? */
   dr = mp_dr_is_modulus(P);
 #else
+  /* default to no */
   dr = 0;
 #endif
 
 #ifdef BN_MP_REDUCE_IS_2K_C
-  /* if not, is it a uDR modulus? */
+  /* if not, is it an unrestricted DR modulus? */
   if (dr == 0) {
      dr = mp_reduce_is_2k(P) << 1;
   }
 #endif
     
-  /* if the modulus is odd or dr != 0 use the fast method */
+  /* if the modulus is odd or dr != 0 use the montgomery method */
 #ifdef BN_MP_EXPTMOD_FAST_C
   if (mp_isodd (P) == 1 || dr !=  0) {
     return mp_exptmod_fast (G, X, P, Y, dr);
@@ -87,7 +95,7 @@
 #endif
 #ifdef BN_S_MP_EXPTMOD_C
     /* otherwise use the generic Barrett reduction technique */
-    return s_mp_exptmod (G, X, P, Y);
+    return s_mp_exptmod (G, X, P, Y, 0);
 #else
     /* no exptmod for evens */
     return MP_VAL;
--- a/bn_mp_exptmod_fast.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_exptmod_fast.c	Fri May 06 08:59:30 2005 +0000
@@ -29,8 +29,7 @@
    #define TAB_SIZE 256
 #endif
 
-int
-mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
+int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
 {
   mp_int  M[TAB_SIZE], res;
   mp_digit buf, mp;
@@ -88,11 +87,11 @@
 #ifdef BN_MP_MONTGOMERY_SETUP_C     
      /* now setup montgomery  */
      if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
-        goto __M;
+        goto LBL_M;
      }
 #else
      err = MP_VAL;
-     goto __M;
+     goto LBL_M;
 #endif
 
      /* automatically pick the comba one if available (saves quite a few calls/ifs) */
@@ -108,7 +107,7 @@
         redux = mp_montgomery_reduce;
 #else
         err = MP_VAL;
-        goto __M;
+        goto LBL_M;
 #endif
      }
   } else if (redmode == 1) {
@@ -118,24 +117,24 @@
      redux = mp_dr_reduce;
 #else
      err = MP_VAL;
-     goto __M;
+     goto LBL_M;
 #endif
   } else {
 #if defined(BN_MP_REDUCE_2K_SETUP_C) && defined(BN_MP_REDUCE_2K_C)
      /* setup DR reduction for moduli of the form 2**k - b */
      if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
-        goto __M;
+        goto LBL_M;
      }
      redux = mp_reduce_2k;
 #else
      err = MP_VAL;
-     goto __M;
+     goto LBL_M;
 #endif
   }
 
   /* setup result */
   if ((err = mp_init (&res)) != MP_OKAY) {
-    goto __M;
+    goto LBL_M;
   }
 
   /* create M table
@@ -149,45 +148,45 @@
 #ifdef BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
      /* now we need R mod m */
      if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
-       goto __RES;
+       goto LBL_RES;
      }
 #else 
      err = MP_VAL;
-     goto __RES;
+     goto LBL_RES;
 #endif
 
      /* now set M[1] to G * R mod m */
      if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
-       goto __RES;
+       goto LBL_RES;
      }
   } else {
      mp_set(&res, 1);
      if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
      }
   }
 
   /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
   if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
-    goto __RES;
+    goto LBL_RES;
   }
 
   for (x = 0; x < (winsize - 1); x++) {
     if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
-      goto __RES;
+      goto LBL_RES;
     }
     if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
-      goto __RES;
+      goto LBL_RES;
     }
   }
 
   /* create upper table */
   for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
     if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
-      goto __RES;
+      goto LBL_RES;
     }
     if ((err = redux (&M[x], P, mp)) != MP_OKAY) {
-      goto __RES;
+      goto LBL_RES;
     }
   }
 
@@ -227,10 +226,10 @@
     /* if the bit is zero and mode == 1 then we square */
     if (mode == 1 && y == 0) {
       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
       if ((err = redux (&res, P, mp)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
       continue;
     }
@@ -244,19 +243,19 @@
       /* square first */
       for (x = 0; x < winsize; x++) {
         if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-          goto __RES;
+          goto LBL_RES;
         }
         if ((err = redux (&res, P, mp)) != MP_OKAY) {
-          goto __RES;
+          goto LBL_RES;
         }
       }
 
       /* then multiply */
       if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
       if ((err = redux (&res, P, mp)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
 
       /* empty window and reset */
@@ -271,10 +270,10 @@
     /* square then multiply if the bit is set */
     for (x = 0; x < bitcpy; x++) {
       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
       if ((err = redux (&res, P, mp)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
 
       /* get next bit of the window */
@@ -282,10 +281,10 @@
       if ((bitbuf & (1 << winsize)) != 0) {
         /* then multiply */
         if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
-          goto __RES;
+          goto LBL_RES;
         }
         if ((err = redux (&res, P, mp)) != MP_OKAY) {
-          goto __RES;
+          goto LBL_RES;
         }
       }
     }
@@ -299,15 +298,15 @@
       * of R.
       */
      if ((err = redux(&res, P, mp)) != MP_OKAY) {
-       goto __RES;
+       goto LBL_RES;
      }
   }
 
   /* swap res with Y */
   mp_exch (&res, Y);
   err = MP_OKAY;
-__RES:mp_clear (&res);
-__M:
+LBL_RES:mp_clear (&res);
+LBL_M:
   mp_clear(&M[1]);
   for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
     mp_clear (&M[x]);
--- a/bn_mp_exteuclid.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_exteuclid.c	Fri May 06 08:59:30 2005 +0000
@@ -59,6 +59,13 @@
        if ((err = mp_copy(&t3, &v3)) != MP_OKAY)                                  { goto _ERR; }
    }
 
+   /* make sure U3 >= 0 */
+   if (u3.sign == MP_NEG) {
+      mp_neg(&u1, &u1);
+      mp_neg(&u2, &u2);
+      mp_neg(&u3, &u3);
+   }
+
    /* copy result out */
    if (U1 != NULL) { mp_exch(U1, &u1); }
    if (U2 != NULL) { mp_exch(U2, &u2); }
--- a/bn_mp_gcd.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_gcd.c	Fri May 06 08:59:30 2005 +0000
@@ -43,7 +43,7 @@
   }
 
   if ((res = mp_init_copy (&v, b)) != MP_OKAY) {
-    goto __U;
+    goto LBL_U;
   }
 
   /* must be positive for the remainder of the algorithm */
@@ -57,24 +57,24 @@
   if (k > 0) {
      /* divide the power of two out */
      if ((res = mp_div_2d(&u, k, &u, NULL)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
      }
 
      if ((res = mp_div_2d(&v, k, &v, NULL)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
      }
   }
 
   /* divide any remaining factors of two out */
   if (u_lsb != k) {
      if ((res = mp_div_2d(&u, u_lsb - k, &u, NULL)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
      }
   }
 
   if (v_lsb != k) {
      if ((res = mp_div_2d(&v, v_lsb - k, &v, NULL)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
      }
   }
 
@@ -87,23 +87,23 @@
      
      /* subtract smallest from largest */
      if ((res = s_mp_sub(&v, &u, &v)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
      }
      
      /* Divide out all factors of two */
      if ((res = mp_div_2d(&v, mp_cnt_lsb(&v), &v, NULL)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
      } 
   } 
 
   /* multiply by 2**k which we divided out at the beginning */
   if ((res = mp_mul_2d (&u, k, c)) != MP_OKAY) {
-     goto __V;
+     goto LBL_V;
   }
   c->sign = MP_ZPOS;
   res = MP_OKAY;
-__V:mp_clear (&u);
-__U:mp_clear (&v);
+LBL_V:mp_clear (&u);
+LBL_U:mp_clear (&v);
   return res;
 }
 #endif
--- a/bn_mp_invmod_slow.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_invmod_slow.c	Fri May 06 08:59:30 2005 +0000
@@ -33,25 +33,25 @@
   }
 
   /* x = a, y = b */
-  if ((res = mp_copy (a, &x)) != MP_OKAY) {
-    goto __ERR;
+  if ((res = mp_mod(a, b, &x)) != MP_OKAY) {
+      goto LBL_ERR;
   }
   if ((res = mp_copy (b, &y)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
   }
 
   /* 2. [modified] if x,y are both even then return an error! */
   if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
     res = MP_VAL;
-    goto __ERR;
+    goto LBL_ERR;
   }
 
   /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
   if ((res = mp_copy (&x, &u)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
   }
   if ((res = mp_copy (&y, &v)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
   }
   mp_set (&A, 1);
   mp_set (&D, 1);
@@ -61,24 +61,24 @@
   while (mp_iseven (&u) == 1) {
     /* 4.1 u = u/2 */
     if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
     /* 4.2 if A or B is odd then */
     if (mp_isodd (&A) == 1 || mp_isodd (&B) == 1) {
       /* A = (A+y)/2, B = (B-x)/2 */
       if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
       if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
     }
     /* A = A/2, B = B/2 */
     if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
     if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
 
@@ -86,24 +86,24 @@
   while (mp_iseven (&v) == 1) {
     /* 5.1 v = v/2 */
     if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
     /* 5.2 if C or D is odd then */
     if (mp_isodd (&C) == 1 || mp_isodd (&D) == 1) {
       /* C = (C+y)/2, D = (D-x)/2 */
       if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
     }
     /* C = C/2, D = D/2 */
     if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
     if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
 
@@ -111,28 +111,28 @@
   if (mp_cmp (&u, &v) != MP_LT) {
     /* u = u - v, A = A - C, B = B - D */
     if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
 
     if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
 
     if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   } else {
     /* v - v - u, C = C - A, D = D - B */
     if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
 
     if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
 
     if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
 
@@ -145,27 +145,27 @@
   /* if v != 1 then there is no inverse */
   if (mp_cmp_d (&v, 1) != MP_EQ) {
     res = MP_VAL;
-    goto __ERR;
+    goto LBL_ERR;
   }
 
   /* if its too low */
   while (mp_cmp_d(&C, 0) == MP_LT) {
       if ((res = mp_add(&C, b, &C)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
   }
   
   /* too big */
   while (mp_cmp_mag(&C, b) != MP_LT) {
       if ((res = mp_sub(&C, b, &C)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
   }
   
   /* C is now the inverse */
   mp_exch (&C, c);
   res = MP_OKAY;
-__ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
+LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
   return res;
 }
 #endif
--- a/bn_mp_jacobi.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_jacobi.c	Fri May 06 08:59:30 2005 +0000
@@ -50,13 +50,13 @@
   }
 
   if ((res = mp_init (&p1)) != MP_OKAY) {
-    goto __A1;
+    goto LBL_A1;
   }
 
   /* divide out larger power of two */
   k = mp_cnt_lsb(&a1);
   if ((res = mp_div_2d(&a1, k, &a1, NULL)) != MP_OKAY) {
-     goto __P1;
+     goto LBL_P1;
   }
 
   /* step 4.  if e is even set s=1 */
@@ -84,18 +84,18 @@
   } else {
     /* n1 = n mod a1 */
     if ((res = mp_mod (p, &a1, &p1)) != MP_OKAY) {
-      goto __P1;
+      goto LBL_P1;
     }
     if ((res = mp_jacobi (&p1, &a1, &r)) != MP_OKAY) {
-      goto __P1;
+      goto LBL_P1;
     }
     *c = s * r;
   }
 
   /* done */
   res = MP_OKAY;
-__P1:mp_clear (&p1);
-__A1:mp_clear (&a1);
+LBL_P1:mp_clear (&p1);
+LBL_A1:mp_clear (&a1);
   return res;
 }
 #endif
--- a/bn_mp_lcm.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_lcm.c	Fri May 06 08:59:30 2005 +0000
@@ -28,20 +28,20 @@
 
   /* t1 = get the GCD of the two inputs */
   if ((res = mp_gcd (a, b, &t1)) != MP_OKAY) {
-    goto __T;
+    goto LBL_T;
   }
 
   /* divide the smallest by the GCD */
   if (mp_cmp_mag(a, b) == MP_LT) {
      /* store quotient in t2 such that t2 * b is the LCM */
      if ((res = mp_div(a, &t1, &t2, NULL)) != MP_OKAY) {
-        goto __T;
+        goto LBL_T;
      }
      res = mp_mul(b, &t2, c);
   } else {
      /* store quotient in t2 such that t2 * a is the LCM */
      if ((res = mp_div(b, &t1, &t2, NULL)) != MP_OKAY) {
-        goto __T;
+        goto LBL_T;
      }
      res = mp_mul(a, &t2, c);
   }
@@ -49,7 +49,7 @@
   /* fix the sign to positive */
   c->sign = MP_ZPOS;
 
-__T:
+LBL_T:
   mp_clear_multi (&t1, &t2, NULL);
   return res;
 }
--- a/bn_mp_mod_2d.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_mod_2d.c	Fri May 06 08:59:30 2005 +0000
@@ -28,7 +28,7 @@
   }
 
   /* if the modulus is larger than the value than return */
-  if (b > (int) (a->used * DIGIT_BIT)) {
+  if (b >= (int) (a->used * DIGIT_BIT)) {
     res = mp_copy (a, c);
     return res;
   }
--- a/bn_mp_montgomery_calc_normalization.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_montgomery_calc_normalization.c	Fri May 06 08:59:30 2005 +0000
@@ -28,7 +28,6 @@
   /* how many bits of last digit does b use */
   bits = mp_count_bits (b) % DIGIT_BIT;
 
-
   if (b->used > 1) {
      if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
         return res;
--- a/bn_mp_mul_d.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_mul_d.c	Fri May 06 08:59:30 2005 +0000
@@ -57,8 +57,9 @@
     u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
   }
 
-  /* store final carry [if any] */
+  /* store final carry [if any] and increment ix offset  */
   *tmpc++ = u;
+  ++ix;
 
   /* now zero digits above the top */
   while (ix++ < olduse) {
--- a/bn_mp_n_root.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_n_root.c	Fri May 06 08:59:30 2005 +0000
@@ -40,11 +40,11 @@
   }
 
   if ((res = mp_init (&t2)) != MP_OKAY) {
-    goto __T1;
+    goto LBL_T1;
   }
 
   if ((res = mp_init (&t3)) != MP_OKAY) {
-    goto __T2;
+    goto LBL_T2;
   }
 
   /* if a is negative fudge the sign but keep track */
@@ -57,52 +57,52 @@
   do {
     /* t1 = t2 */
     if ((res = mp_copy (&t2, &t1)) != MP_OKAY) {
-      goto __T3;
+      goto LBL_T3;
     }
 
     /* t2 = t1 - ((t1**b - a) / (b * t1**(b-1))) */
     
     /* t3 = t1**(b-1) */
     if ((res = mp_expt_d (&t1, b - 1, &t3)) != MP_OKAY) {   
-      goto __T3;
+      goto LBL_T3;
     }
 
     /* numerator */
     /* t2 = t1**b */
     if ((res = mp_mul (&t3, &t1, &t2)) != MP_OKAY) {    
-      goto __T3;
+      goto LBL_T3;
     }
 
     /* t2 = t1**b - a */
     if ((res = mp_sub (&t2, a, &t2)) != MP_OKAY) {  
-      goto __T3;
+      goto LBL_T3;
     }
 
     /* denominator */
     /* t3 = t1**(b-1) * b  */
     if ((res = mp_mul_d (&t3, b, &t3)) != MP_OKAY) {    
-      goto __T3;
+      goto LBL_T3;
     }
 
     /* t3 = (t1**b - a)/(b * t1**(b-1)) */
     if ((res = mp_div (&t2, &t3, &t3, NULL)) != MP_OKAY) {  
-      goto __T3;
+      goto LBL_T3;
     }
 
     if ((res = mp_sub (&t1, &t3, &t2)) != MP_OKAY) {
-      goto __T3;
+      goto LBL_T3;
     }
   }  while (mp_cmp (&t1, &t2) != MP_EQ);
 
   /* result can be off by a few so check */
   for (;;) {
     if ((res = mp_expt_d (&t1, b, &t2)) != MP_OKAY) {
-      goto __T3;
+      goto LBL_T3;
     }
 
     if (mp_cmp (&t2, a) == MP_GT) {
       if ((res = mp_sub_d (&t1, 1, &t1)) != MP_OKAY) {
-         goto __T3;
+         goto LBL_T3;
       }
     } else {
       break;
@@ -120,9 +120,9 @@
 
   res = MP_OKAY;
 
-__T3:mp_clear (&t3);
-__T2:mp_clear (&t2);
-__T1:mp_clear (&t1);
+LBL_T3:mp_clear (&t3);
+LBL_T2:mp_clear (&t2);
+LBL_T1:mp_clear (&t1);
   return res;
 }
 #endif
--- a/bn_mp_neg.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_neg.c	Fri May 06 08:59:30 2005 +0000
@@ -19,12 +19,18 @@
 int mp_neg (mp_int * a, mp_int * b)
 {
   int     res;
-  if ((res = mp_copy (a, b)) != MP_OKAY) {
-    return res;
+  if (a != b) {   /* negating in place (a == b) needs no copy */
+     if ((res = mp_copy (a, b)) != MP_OKAY) {
+        return res;
+     }
   }
+
   if (mp_iszero(b) != MP_YES) {
      b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+  } else {
+     b->sign = MP_ZPOS;   /* a zero result is forced to carry the positive sign */
   }
+
   return MP_OKAY;
 }
 #endif
--- a/bn_mp_prime_fermat.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_prime_fermat.c	Fri May 06 08:59:30 2005 +0000
@@ -43,7 +43,7 @@
 
   /* compute t = b**a mod a */
   if ((err = mp_exptmod (b, a, a, &t)) != MP_OKAY) {
-    goto __T;
+    goto LBL_T;
   }
 
   /* is it equal to b? */
@@ -52,7 +52,7 @@
   }
 
   err = MP_OKAY;
-__T:mp_clear (&t);
+LBL_T:mp_clear (&t);
   return err;
 }
 #endif
--- a/bn_mp_prime_is_divisible.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_prime_is_divisible.c	Fri May 06 08:59:30 2005 +0000
@@ -29,8 +29,8 @@
   *result = MP_NO;
 
   for (ix = 0; ix < PRIME_SIZE; ix++) {
-    /* what is a mod __prime_tab[ix] */
-    if ((err = mp_mod_d (a, __prime_tab[ix], &res)) != MP_OKAY) {
+    /* what is a mod ltm_prime_tab[ix] */
+    if ((err = mp_mod_d (a, ltm_prime_tab[ix], &res)) != MP_OKAY) {
       return err;
     }
 
--- a/bn_mp_prime_is_prime.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_prime_is_prime.c	Fri May 06 08:59:30 2005 +0000
@@ -37,7 +37,7 @@
 
   /* is the input equal to one of the primes in the table? */
   for (ix = 0; ix < PRIME_SIZE; ix++) {
-      if (mp_cmp_d(a, __prime_tab[ix]) == MP_EQ) {
+      if (mp_cmp_d(a, ltm_prime_tab[ix]) == MP_EQ) {
          *result = 1;
          return MP_OKAY;
       }
@@ -60,20 +60,20 @@
 
   for (ix = 0; ix < t; ix++) {
     /* set the prime */
-    mp_set (&b, __prime_tab[ix]);
+    mp_set (&b, ltm_prime_tab[ix]);
 
     if ((err = mp_prime_miller_rabin (a, &b, &res)) != MP_OKAY) {
-      goto __B;
+      goto LBL_B;
     }
 
     if (res == MP_NO) {
-      goto __B;
+      goto LBL_B;
     }
   }
 
   /* passed the test */
   *result = MP_YES;
-__B:mp_clear (&b);
+LBL_B:mp_clear (&b);
   return err;
 }
 #endif
--- a/bn_mp_prime_miller_rabin.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_prime_miller_rabin.c	Fri May 06 08:59:30 2005 +0000
@@ -40,12 +40,12 @@
     return err;
   }
   if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) {
-    goto __N1;
+    goto LBL_N1;
   }
 
   /* set 2**s * r = n1 */
   if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) {
-    goto __N1;
+    goto LBL_N1;
   }
 
   /* count the number of least significant bits
@@ -55,15 +55,15 @@
 
   /* now divide n - 1 by 2**s */
   if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) {
-    goto __R;
+    goto LBL_R;
   }
 
   /* compute y = b**r mod a */
   if ((err = mp_init (&y)) != MP_OKAY) {
-    goto __R;
+    goto LBL_R;
   }
   if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) {
-    goto __Y;
+    goto LBL_Y;
   }
 
   /* if y != 1 and y != n1 do */
@@ -72,12 +72,12 @@
     /* while j <= s-1 and y != n1 */
     while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) {
       if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) {
-         goto __Y;
+         goto LBL_Y;
       }
 
       /* if y == 1 then composite */
       if (mp_cmp_d (&y, 1) == MP_EQ) {
-         goto __Y;
+         goto LBL_Y;
       }
 
       ++j;
@@ -85,15 +85,15 @@
 
     /* if y != n1 then composite */
     if (mp_cmp (&y, &n1) != MP_EQ) {
-      goto __Y;
+      goto LBL_Y;
     }
   }
 
   /* probably prime now */
   *result = MP_YES;
-__Y:mp_clear (&y);
-__R:mp_clear (&r);
-__N1:mp_clear (&n1);
+LBL_Y:mp_clear (&y);
+LBL_R:mp_clear (&r);
+LBL_N1:mp_clear (&n1);
   return err;
 }
 #endif
--- a/bn_mp_prime_next_prime.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_prime_next_prime.c	Fri May 06 08:59:30 2005 +0000
@@ -35,10 +35,10 @@
    a->sign = MP_ZPOS;
 
    /* simple algo if a is less than the largest prime in the table */
-   if (mp_cmp_d(a, __prime_tab[PRIME_SIZE-1]) == MP_LT) {
+   if (mp_cmp_d(a, ltm_prime_tab[PRIME_SIZE-1]) == MP_LT) {
       /* find which prime it is bigger than */
       for (x = PRIME_SIZE - 2; x >= 0; x--) {
-          if (mp_cmp_d(a, __prime_tab[x]) != MP_LT) {
+          if (mp_cmp_d(a, ltm_prime_tab[x]) != MP_LT) {
              if (bbs_style == 1) {
                 /* ok we found a prime smaller or
                  * equal [so the next is larger]
@@ -46,17 +46,17 @@
                  * however, the prime must be
                  * congruent to 3 mod 4
                  */
-                if ((__prime_tab[x + 1] & 3) != 3) {
+                if ((ltm_prime_tab[x + 1] & 3) != 3) {
                    /* scan upwards for a prime congruent to 3 mod 4 */
                    for (y = x + 1; y < PRIME_SIZE; y++) {
-                       if ((__prime_tab[y] & 3) == 3) {
-                          mp_set(a, __prime_tab[y]);
+                       if ((ltm_prime_tab[y] & 3) == 3) {
+                          mp_set(a, ltm_prime_tab[y]);
                           return MP_OKAY;
                        }
                    }
                 }
              } else {
-                mp_set(a, __prime_tab[x + 1]);
+                mp_set(a, ltm_prime_tab[x + 1]);
                 return MP_OKAY;
              }
           }
@@ -94,7 +94,7 @@
 
    /* generate the restable */
    for (x = 1; x < PRIME_SIZE; x++) {
-      if ((err = mp_mod_d(a, __prime_tab[x], res_tab + x)) != MP_OKAY) {
+      if ((err = mp_mod_d(a, ltm_prime_tab[x], res_tab + x)) != MP_OKAY) {
          return err;
       }
    }
@@ -120,8 +120,8 @@
              res_tab[x] += kstep;
 
              /* subtract the modulus [instead of using division] */
-             if (res_tab[x] >= __prime_tab[x]) {
-                res_tab[x]  -= __prime_tab[x];
+             if (res_tab[x] >= ltm_prime_tab[x]) {
+                res_tab[x]  -= ltm_prime_tab[x];
              }
 
              /* set flag if zero */
@@ -133,7 +133,7 @@
 
       /* add the step */
       if ((err = mp_add_d(a, step, a)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
 
       /* if didn't pass sieve and step == MAX then skip test */
@@ -143,9 +143,9 @@
 
       /* is this prime? */
       for (x = 0; x < t; x++) {
-          mp_set(&b, __prime_tab[t]);
+          mp_set(&b, ltm_prime_tab[x]); /* each of the t rounds uses its own base */
           if ((err = mp_prime_miller_rabin(a, &b, &res)) != MP_OKAY) {
-             goto __ERR;
+             goto LBL_ERR;
           }
           if (res == MP_NO) {
              break;
@@ -158,7 +158,7 @@
    }
 
    err = MP_OKAY;
-__ERR:
+LBL_ERR:
    mp_clear(&b);
    return err;
 }
--- a/bn_mp_prime_random_ex.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_prime_random_ex.c	Fri May 06 08:59:30 2005 +0000
@@ -47,7 +47,7 @@
    }
 
    /* calc the byte size */
-   bsize = (size>>3)+(size&7?1:0);
+   bsize = (size>>3) + ((size&7)?1:0);
 
    /* we need a buffer of bsize bytes */
    tmp = OPT_CAST(unsigned char) XMALLOC(bsize);
@@ -56,19 +56,19 @@
    }
 
    /* calc the maskAND value for the MSbyte*/
-   maskAND = 0xFF >> (8 - (size & 7));
+   maskAND = ((size&7) == 0) ? 0xFF : (0xFF >> (8 - (size & 7)));
 
    /* calc the maskOR_msb */
    maskOR_msb        = 0;
-   maskOR_msb_offset = (size - 2) >> 3;
+   maskOR_msb_offset = ((size & 7) == 1) ? 1 : 0;
    if (flags & LTM_PRIME_2MSB_ON) {
       maskOR_msb     |= 1 << ((size - 2) & 7);
    } else if (flags & LTM_PRIME_2MSB_OFF) {
       maskAND        &= ~(1 << ((size - 2) & 7));
-   }
+   } 
 
    /* get the maskOR_lsb */
-   maskOR_lsb         = 0;
+   maskOR_lsb         = 1;
    if (flags & LTM_PRIME_BBS) {
       maskOR_lsb     |= 3;
    }
--- a/bn_mp_radix_size.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_radix_size.c	Fri May 06 08:59:30 2005 +0000
@@ -35,22 +35,29 @@
     return MP_VAL;
   }
 
-  /* init a copy of the input */
-  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-    return res;
+  if (mp_iszero(a) == MP_YES) {
+     *size = 2;
+    return MP_OKAY;
   }
 
   /* digs is the digit count */
   digs = 0;
 
   /* if it's negative add one for the sign */
-  if (t.sign == MP_NEG) {
+  if (a->sign == MP_NEG) {
     ++digs;
-    t.sign = MP_ZPOS;
   }
 
+  /* init a copy of the input */
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return res;
+  }
+
+  /* force temp to positive */
+  t.sign = MP_ZPOS; 
+
   /* fetch out all of the digits */
-  while (mp_iszero (&t) == 0) {
+  while (mp_iszero (&t) == MP_NO) {
     if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
       mp_clear (&t);
       return res;
--- a/bn_mp_rand.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_rand.c	Fri May 06 08:59:30 2005 +0000
@@ -29,14 +29,14 @@
 
   /* first place a random non-zero digit */
   do {
-    d = ((mp_digit) abs (rand ()));
+    d = ((mp_digit) abs (rand ())) & MP_MASK;
   } while (d == 0);
 
   if ((res = mp_add_d (a, d, a)) != MP_OKAY) {
     return res;
   }
 
-  while (digits-- > 0) {
+  while (--digits > 0) {
     if ((res = mp_lshd (a, 1)) != MP_OKAY) {
       return res;
     }
--- a/bn_mp_read_radix.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_read_radix.c	Fri May 06 08:59:30 2005 +0000
@@ -16,7 +16,7 @@
  */
 
 /* read a string [ASCII] in a given radix */
-int mp_read_radix (mp_int * a, char *str, int radix)
+int mp_read_radix (mp_int * a, const char *str, int radix)
 {
   int     y, res, neg;
   char    ch;
--- a/bn_mp_reduce.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_reduce.c	Fri May 06 08:59:30 2005 +0000
@@ -19,8 +19,7 @@
  * precomputed via mp_reduce_setup.
  * From HAC pp.604 Algorithm 14.42
  */
-int
-mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
+int mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
 {
   mp_int  q;
   int     res, um = m->used;
@@ -40,11 +39,11 @@
     }
   } else {
 #ifdef BN_S_MP_MUL_HIGH_DIGS_C
-    if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
+    if ((res = s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
       goto CLEANUP;
     }
 #elif defined(BN_FAST_S_MP_MUL_HIGH_DIGS_C)
-    if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
+    if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
       goto CLEANUP;
     }
 #else 
--- a/bn_mp_reduce_2k.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_reduce_2k.c	Fri May 06 08:59:30 2005 +0000
@@ -16,8 +16,7 @@
  */
 
 /* reduces a modulo n where n is of the form 2**p - d */
-int
-mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d)
+int mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d)
 {
    mp_int q;
    int    p, res;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bn_mp_reduce_2k_l.c	Fri May 06 08:59:30 2005 +0000
@@ -0,0 +1,58 @@
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_2K_L_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
+
+/* reduces a modulo n where n is of the form 2**p - d.
+   This differs from reduce_2k since "d" can be larger than a single
+   digit.  Returns MP_OKAY, or the error code of the first step that fails.
+*/
+int mp_reduce_2k_l(mp_int *a, mp_int *n, mp_int *d)
+{
+   mp_int q;
+   int    p, res;
+   
+   if ((res = mp_init(&q)) != MP_OKAY) {
+      return res;
+   }
+   
+   p = mp_count_bits(n);    
+top:
+   /* q = a/2**p, a = a mod 2**p */
+   if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
+      goto ERR;
+   }
+   
+   /* q = q * d */
+   if ((res = mp_mul(&q, d, &q)) != MP_OKAY) { 
+      goto ERR;
+   }
+   
+   /* a = a + q */
+   if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
+      goto ERR;
+   }
+   
+   if (mp_cmp_mag(a, n) != MP_LT) {   /* a >= n: subtract n, then fold again */
+      if ((res = s_mp_sub(a, n, a)) != MP_OKAY) { goto ERR; }
+      goto top;
+   }
+   
+ERR:
+   mp_clear(&q);
+   return res;
+}
+
+#endif
--- a/bn_mp_reduce_2k_setup.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_reduce_2k_setup.c	Fri May 06 08:59:30 2005 +0000
@@ -16,8 +16,7 @@
  */
 
 /* determines the setup value */
-int 
-mp_reduce_2k_setup(mp_int *a, mp_digit *d)
+int mp_reduce_2k_setup(mp_int *a, mp_digit *d)
 {
    int res, p;
    mp_int tmp;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bn_mp_reduce_2k_setup_l.c	Fri May 06 08:59:30 2005 +0000
@@ -0,0 +1,40 @@
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_2K_SETUP_L_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
+
+/* determines the setup value d for mp_reduce_2k_l from the modulus a; returns MP_OKAY or an error code */
+int mp_reduce_2k_setup_l(mp_int *a, mp_int *d)
+{
+   int    res;
+   mp_int tmp;
+   
+   if ((res = mp_init(&tmp)) != MP_OKAY) {
+      return res;
+   }
+   
+   if ((res = mp_2expt(&tmp, mp_count_bits(a))) != MP_OKAY) {   /* tmp = 2**p, p = bit length of a (assuming mp_2expt yields a power of two) */
+      goto ERR;
+   }
+   
+   if ((res = s_mp_sub(&tmp, a, d)) != MP_OKAY) {   /* d = tmp - a via the low-level subtract */
+      goto ERR;
+   }
+   
+ERR:   /* shared exit for success and failure: release tmp, return the last status */
+   mp_clear(&tmp);
+   return res;
+}
+#endif
--- a/bn_mp_reduce_is_2k.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_reduce_is_2k.c	Fri May 06 08:59:30 2005 +0000
@@ -22,9 +22,9 @@
    mp_digit iz;
    
    if (a->used == 0) {
-      return 0;
+      return MP_NO;
    } else if (a->used == 1) {
-      return 1;
+      return MP_YES;
    } else if (a->used > 1) {
       iy = mp_count_bits(a);
       iz = 1;
@@ -33,7 +33,7 @@
       /* Test every bit from the second digit up, must be 1 */
       for (ix = DIGIT_BIT; ix < iy; ix++) {
           if ((a->dp[iw] & iz) == 0) {
-             return 0;
+             return MP_NO;
           }
           iz <<= 1;
           if (iz > (mp_digit)MP_MASK) {
@@ -42,7 +42,7 @@
           }
       }
    }
-   return 1;
+   return MP_YES;
 }
 
 #endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bn_mp_reduce_is_2k_l.c	Fri May 06 08:59:30 2005 +0000
@@ -0,0 +1,40 @@
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_IS_2K_L_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
+
+/* determines if reduce_2k_l can be used; returns MP_YES or MP_NO for the candidate modulus a */
+int mp_reduce_is_2k_l(mp_int *a)
+{
+   int ix, iy;
+   
+   if (a->used == 0) {   /* no digits in use: not usable */
+      return MP_NO;
+   } else if (a->used == 1) {   /* a single digit always qualifies */
+      return MP_YES;
+   } else if (a->used > 1) {
+      /* count digits equal to MP_MASK (all bits set); at least used/2 of them (integer division) and we're sold */
+      for (iy = ix = 0; ix < a->used; ix++) {
+          if (a->dp[ix] == MP_MASK) {
+              ++iy;
+          }
+      }
+      return (iy >= (a->used/2)) ? MP_YES : MP_NO;
+      
+   }
+   return MP_NO;   /* only reached if a->used is negative */
+}
+
+#endif
--- a/bn_mp_to_signed_bin.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_to_signed_bin.c	Fri May 06 08:59:30 2005 +0000
@@ -16,8 +16,7 @@
  */
 
 /* store in signed [big endian] format */
-int
-mp_to_signed_bin (mp_int * a, unsigned char *b)
+int mp_to_signed_bin (mp_int * a, unsigned char *b)
 {
   int     res;
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bn_mp_to_signed_bin_n.c	Fri May 06 08:59:30 2005 +0000
@@ -0,0 +1,27 @@
+#include <tommath.h>
+#ifdef BN_MP_TO_SIGNED_BIN_N_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
+
+/* store in signed [big endian] format; returns MP_VAL when *outlen is smaller than mp_signed_bin_size(a) */
+int mp_to_signed_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen)
+{
+   if (*outlen < (unsigned long)mp_signed_bin_size(a)) {   /* caller-supplied size is below what a needs */
+      return MP_VAL;
+   }
+   *outlen = mp_signed_bin_size(a);   /* report the required size back to the caller */
+   return mp_to_signed_bin(a, b);
+}
+#endif
--- a/bn_mp_to_unsigned_bin.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_to_unsigned_bin.c	Fri May 06 08:59:30 2005 +0000
@@ -16,8 +16,7 @@
  */
 
 /* store in unsigned [big endian] format */
-int
-mp_to_unsigned_bin (mp_int * a, unsigned char *b)
+int mp_to_unsigned_bin (mp_int * a, unsigned char *b)
 {
   int     x, res;
   mp_int  t;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bn_mp_to_unsigned_bin_n.c	Fri May 06 08:59:30 2005 +0000
@@ -0,0 +1,27 @@
+#include <tommath.h>
+#ifdef BN_MP_TO_UNSIGNED_BIN_N_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
+
+/* store in unsigned [big endian] format; returns MP_VAL when *outlen is smaller than mp_unsigned_bin_size(a) */
+int mp_to_unsigned_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen)
+{
+   if (*outlen < (unsigned long)mp_unsigned_bin_size(a)) {   /* caller-supplied size is below what a needs */
+      return MP_VAL;
+   }
+   *outlen = mp_unsigned_bin_size(a);   /* report the required size back to the caller */
+   return mp_to_unsigned_bin(a, b);
+}
+#endif
--- a/bn_mp_toom_mul.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_toom_mul.c	Fri May 06 08:59:30 2005 +0000
@@ -17,9 +17,10 @@
 
 /* multiplication using the Toom-Cook 3-way algorithm 
  *
- * Much more complicated than Karatsuba but has a lower asymptotic running time of 
- * O(N**1.464).  This algorithm is only particularly useful on VERY large
- * inputs (we're talking 1000s of digits here...).
+ * Much more complicated than Karatsuba but has a lower 
+ * asymptotic running time of O(N**1.464).  This algorithm is 
+ * only particularly useful on VERY large inputs 
+ * (we're talking 1000s of digits here...).
 */
 int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
 {
--- a/bn_mp_unsigned_bin_size.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_unsigned_bin_size.c	Fri May 06 08:59:30 2005 +0000
@@ -16,8 +16,7 @@
  */
 
 /* get the size for an unsigned equivalent */
-int
-mp_unsigned_bin_size (mp_int * a)
+int mp_unsigned_bin_size (mp_int * a)
 {
   int     size = mp_count_bits (a);
   return (size / 8 + ((size & 7) != 0 ? 1 : 0));
--- a/bn_mp_xor.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_xor.c	Fri May 06 08:59:30 2005 +0000
@@ -37,7 +37,7 @@
   }
 
   for (ix = 0; ix < px; ix++) {
-
+     t.dp[ix] ^= x->dp[ix];
   }
   mp_clamp (&t);
   mp_exch (c, &t);
--- a/bn_mp_zero.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_mp_zero.c	Fri May 06 08:59:30 2005 +0000
@@ -16,11 +16,17 @@
  */
 
 /* set to zero */
-void
-mp_zero (mp_int * a)
+void mp_zero (mp_int * a)
 {
+  int       n;
+  mp_digit *tmp;
+
   a->sign = MP_ZPOS;
   a->used = 0;
-  memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
+
+  tmp = a->dp;   /* walk the digit array by pointer */
+  for (n = 0; n < a->alloc; n++) {   /* clear every allocated digit (a->alloc of them) */
+     *tmp++ = 0;
+  }
 }
 #endif
--- a/bn_prime_tab.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_prime_tab.c	Fri May 06 08:59:30 2005 +0000
@@ -14,7 +14,7 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-const mp_digit __prime_tab[] = {
+const mp_digit ltm_prime_tab[] = {
   0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
   0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
   0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
--- a/bn_s_mp_exptmod.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_s_mp_exptmod.c	Fri May 06 08:59:30 2005 +0000
@@ -21,11 +21,12 @@
    #define TAB_SIZE 256
 #endif
 
-int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
 {
   mp_int  M[TAB_SIZE], res, mu;
   mp_digit buf;
   int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+  int (*redux)(mp_int*,mp_int*,mp_int*);
 
   /* find window size */
   x = mp_count_bits (X);
@@ -70,11 +71,20 @@
 
   /* create mu, used for Barrett reduction */
   if ((err = mp_init (&mu)) != MP_OKAY) {
-    goto __M;
+    goto LBL_M;
   }
-  if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
-    goto __MU;
-  }
+  
+  if (redmode == 0) {
+     if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
+        goto LBL_MU;
+     }
+     redux = mp_reduce;
+  } else {
+     if ((err = mp_reduce_2k_setup_l (P, &mu)) != MP_OKAY) {
+        goto LBL_MU;
+     }
+     redux = mp_reduce_2k_l;
+  }    
 
   /* create M table
    *
@@ -85,23 +95,26 @@
    * computed though accept for M[0] and M[1]
    */
   if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
-    goto __MU;
+    goto LBL_MU;
   }
 
   /* compute the value at M[1<<(winsize-1)] by squaring 
    * M[1] (winsize-1) times 
    */
   if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
-    goto __MU;
+    goto LBL_MU;
   }
 
   for (x = 0; x < (winsize - 1); x++) {
+    /* square it */
     if ((err = mp_sqr (&M[1 << (winsize - 1)], 
                        &M[1 << (winsize - 1)])) != MP_OKAY) {
-      goto __MU;
+      goto LBL_MU;
     }
-    if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
-      goto __MU;
+
+    /* reduce modulo P */
+    if ((err = redux (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
+      goto LBL_MU;
     }
   }
 
@@ -110,16 +123,16 @@
    */
   for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
     if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
-      goto __MU;
+      goto LBL_MU;
     }
-    if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) {
-      goto __MU;
+    if ((err = redux (&M[x], P, &mu)) != MP_OKAY) {
+      goto LBL_MU;
     }
   }
 
   /* setup result */
   if ((err = mp_init (&res)) != MP_OKAY) {
-    goto __MU;
+    goto LBL_MU;
   }
   mp_set (&res, 1);
 
@@ -159,10 +172,10 @@
     /* if the bit is zero and mode == 1 then we square */
     if (mode == 1 && y == 0) {
       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-        goto __RES;
+      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
+        goto LBL_RES;
       }
       continue;
     }
@@ -176,19 +189,19 @@
       /* square first */
       for (x = 0; x < winsize; x++) {
         if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-          goto __RES;
+          goto LBL_RES;
         }
-        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-          goto __RES;
+        if ((err = redux (&res, P, &mu)) != MP_OKAY) {
+          goto LBL_RES;
         }
       }
 
       /* then multiply */
       if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-        goto __RES;
+      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
+        goto LBL_RES;
       }
 
       /* empty window and reset */
@@ -203,20 +216,20 @@
     /* square then multiply if the bit is set */
     for (x = 0; x < bitcpy; x++) {
       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-        goto __RES;
+      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
+        goto LBL_RES;
       }
 
       bitbuf <<= 1;
       if ((bitbuf & (1 << winsize)) != 0) {
         /* then multiply */
         if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
-          goto __RES;
+          goto LBL_RES;
         }
-        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-          goto __RES;
+        if ((err = redux (&res, P, &mu)) != MP_OKAY) {
+          goto LBL_RES;
         }
       }
     }
@@ -224,9 +237,9 @@
 
   mp_exch (&res, Y);
   err = MP_OKAY;
-__RES:mp_clear (&res);
-__MU:mp_clear (&mu);
-__M:
+LBL_RES:mp_clear (&res);
+LBL_MU:mp_clear (&mu);
+LBL_M:
   mp_clear(&M[1]);
   for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
     mp_clear (&M[x]);
--- a/bn_s_mp_mul_digs.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_s_mp_mul_digs.c	Fri May 06 08:59:30 2005 +0000
@@ -19,8 +19,7 @@
  * HAC pp. 595, Algorithm 14.12  Modified so you can control how 
  * many digits of output are created.
  */
-int
-s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
   mp_int  t;
   int     res, pa, pb, ix, iy;
--- a/bn_s_mp_sqr.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bn_s_mp_sqr.c	Fri May 06 08:59:30 2005 +0000
@@ -16,8 +16,7 @@
  */
 
 /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
-int
-s_mp_sqr (mp_int * a, mp_int * b)
+int s_mp_sqr (mp_int * a, mp_int * b)
 {
   mp_int  t;
   int     res, ix, iy, pa;
--- a/bncore.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/bncore.c	Fri May 06 08:59:30 2005 +0000
@@ -20,11 +20,12 @@
  CPU                    /Compiler     /MUL CUTOFF/SQR CUTOFF
 -------------------------------------------------------------
  Intel P4 Northwood     /GCC v3.4.1   /        88/       128/LTM 0.32 ;-)
+ AMD Athlon64           /GCC v3.4.4   /        74/       124/LTM 0.34
  
 */
 
-int     KARATSUBA_MUL_CUTOFF = 88,      /* Min. number of digits before Karatsuba multiplication is used. */
-        KARATSUBA_SQR_CUTOFF = 128,     /* Min. number of digits before Karatsuba squaring is used. */
+int     KARATSUBA_MUL_CUTOFF = 74,      /* Min. number of digits before Karatsuba multiplication is used. */
+        KARATSUBA_SQR_CUTOFF = 124,     /* Min. number of digits before Karatsuba squaring is used. */
         
         TOOM_MUL_CUTOFF      = 350,      /* no optimal values of these are known yet so set em high */
         TOOM_SQR_CUTOFF      = 400; 
--- a/callgraph.txt	Sun Dec 19 11:33:56 2004 +0000
+++ b/callgraph.txt	Fri May 06 08:59:30 2005 +0000
@@ -245,6 +245,7 @@
 |   |   +--->BN_MP_INIT_MULTI_C
 |   |   |   +--->BN_MP_CLEAR_C
 |   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_ABS_C
 |   |   +--->BN_MP_MUL_2D_C
 |   |   |   +--->BN_MP_GROW_C
 |   |   |   +--->BN_MP_LSHD_C
@@ -298,6 +299,7 @@
 |   |   +--->BN_MP_CLEAR_C
 |   +--->BN_MP_SET_C
 |   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_ABS_C
 |   +--->BN_MP_MUL_2D_C
 |   |   +--->BN_MP_GROW_C
 |   |   +--->BN_MP_LSHD_C
@@ -404,6 +406,7 @@
 |   |   |   +--->BN_MP_CLEAR_C
 |   |   +--->BN_MP_SET_C
 |   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_ABS_C
 |   |   +--->BN_MP_MUL_2D_C
 |   |   |   +--->BN_MP_GROW_C
 |   |   |   +--->BN_MP_LSHD_C
@@ -700,6 +703,7 @@
 |   |   |   +--->BN_MP_INIT_MULTI_C
 |   |   |   |   +--->BN_MP_CLEAR_C
 |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_ABS_C
 |   |   |   +--->BN_MP_MUL_2D_C
 |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   +--->BN_MP_LSHD_C
@@ -753,6 +757,7 @@
 |   |   |   +--->BN_MP_CLEAR_C
 |   |   +--->BN_MP_SET_C
 |   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_ABS_C
 |   |   +--->BN_MP_MUL_2D_C
 |   |   |   +--->BN_MP_GROW_C
 |   |   |   +--->BN_MP_LSHD_C
@@ -902,7 +907,64 @@
 |   |   |   +--->BN_MP_CLEAR_C
 |   |   +--->BN_MP_COPY_C
 |   |   |   +--->BN_MP_GROW_C
-|   |   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_MOD_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
 |   |   +--->BN_MP_SET_C
 |   |   |   +--->BN_MP_ZERO_C
 |   |   +--->BN_MP_DIV_2_C
@@ -933,6 +995,66 @@
 |   +--->BN_MP_INVMOD_SLOW_C
 |   |   +--->BN_MP_INIT_MULTI_C
 |   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
 |   |   +--->BN_MP_COPY_C
 |   |   |   +--->BN_MP_GROW_C
 |   |   +--->BN_MP_SET_C
@@ -968,6 +1090,470 @@
 |   +--->BN_MP_COPY_C
 |   |   +--->BN_MP_GROW_C
 +--->BN_MP_CLEAR_MULTI_C
++--->BN_MP_REDUCE_IS_2K_L_C
++--->BN_S_MP_EXPTMOD_C
+|   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_REDUCE_SETUP_C
+|   |   +--->BN_MP_2EXPT_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_DIV_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_REDUCE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_C
+|   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_D_C
+|   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_REDUCE_2K_SETUP_L_C
+|   |   +--->BN_MP_2EXPT_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_REDUCE_2K_L_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_MUL_C
+|   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_MOD_C
+|   |   +--->BN_MP_DIV_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_SQR_C
+|   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SQR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_MUL_C
+|   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_SET_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_EXCH_C
 +--->BN_MP_DR_IS_MODULUS_C
 +--->BN_MP_REDUCE_IS_2K_C
 |   +--->BN_MP_REDUCE_2K_C
@@ -1375,122 +1961,190 @@
 |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   +--->BN_MP_EXCH_C
 |   +--->BN_MP_EXCH_C
-+--->BN_S_MP_EXPTMOD_C
-|   +--->BN_MP_COUNT_BITS_C
-|   +--->BN_MP_REDUCE_SETUP_C
-|   |   +--->BN_MP_2EXPT_C
-|   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   +--->BN_MP_DIV_C
-|   |   |   +--->BN_MP_CMP_MAG_C
+
+
+BN_MP_OR_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_ZERO_C
+
+
+BN_MP_GROW_C
+
+
+BN_MP_COUNT_BITS_C
+
+
+BN_MP_PRIME_FERMAT_C
++--->BN_MP_CMP_D_C
++--->BN_MP_INIT_C
++--->BN_MP_EXPTMOD_C
+|   +--->BN_MP_INVMOD_C
+|   |   +--->BN_FAST_MP_INVMOD_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
 |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   +--->BN_MP_SET_C
-|   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
 |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_CMP_C
-|   |   |   +--->BN_MP_SUB_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_ADD_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_DIV_2D_C
-|   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   +--->BN_MP_INIT_COPY_C
-|   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   +--->BN_MP_RSHD_C
-|   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   +--->BN_MP_MOD_C
-|   |   +--->BN_MP_DIV_C
-|   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   +--->BN_MP_COPY_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_INIT_MULTI_C
 |   |   |   +--->BN_MP_SET_C
-|   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   +--->BN_MP_SUB_C
 |   |   |   |   +--->BN_S_MP_ADD_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
 |   |   |   +--->BN_MP_ADD_C
 |   |   |   |   +--->BN_S_MP_ADD_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_DIV_2D_C
-|   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   +--->BN_MP_INIT_COPY_C
-|   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   +--->BN_MP_RSHD_C
-|   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_ADD_C
-|   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_EXCH_C
-|   +--->BN_MP_COPY_C
-|   |   +--->BN_MP_GROW_C
-|   +--->BN_MP_SQR_C
-|   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INVMOD_SLOW_C
 |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_SET_C
 |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   +--->BN_MP_ADD_C
 |   |   |   |   +--->BN_S_MP_ADD_C
 |   |   |   |   |   +--->BN_MP_GROW_C
@@ -1507,58 +2161,417 @@
 |   |   |   |   +--->BN_S_MP_SUB_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_REDUCE_IS_2K_L_C
+|   +--->BN_S_MP_EXPTMOD_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_REDUCE_SETUP_C
+|   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_COPY_C
 |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_REDUCE_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   +--->BN_MP_INIT_SIZE_C
 |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   +--->BN_MP_LSHD_C
 |   |   |   |   +--->BN_MP_GROW_C
-|   |   +--->BN_MP_KARATSUBA_SQR_C
-|   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_SUB_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_REDUCE_2K_SETUP_L_C
+|   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_REDUCE_2K_L_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MOD_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_SQR_C
+|   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   +--->BN_MP_RSHD_C
 |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_ADD_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   +--->BN_FAST_S_MP_SQR_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_S_MP_SQR_C
-|   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_EXCH_C
-|   +--->BN_MP_REDUCE_C
-|   |   +--->BN_MP_INIT_COPY_C
-|   |   +--->BN_MP_RSHD_C
-|   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
 |   |   +--->BN_MP_MUL_C
 |   |   |   +--->BN_MP_TOOM_MUL_C
 |   |   |   |   +--->BN_MP_INIT_MULTI_C
 |   |   |   |   +--->BN_MP_MOD_2D_C
 |   |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   +--->BN_MP_MUL_2_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   +--->BN_MP_ADD_C
@@ -1610,6 +2623,8 @@
 |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   +--->BN_MP_LSHD_C
 |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
 |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   +--->BN_MP_CLAMP_C
@@ -1617,217 +2632,9 @@
 |   |   |   |   +--->BN_MP_INIT_SIZE_C
 |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   +--->BN_MP_EXCH_C
-|   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
-|   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_EXCH_C
-|   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_MOD_2D_C
-|   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_EXCH_C
-|   |   +--->BN_MP_SUB_C
-|   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_CMP_D_C
 |   |   +--->BN_MP_SET_C
 |   |   |   +--->BN_MP_ZERO_C
-|   |   +--->BN_MP_LSHD_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   +--->BN_MP_ADD_C
-|   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_CMP_C
-|   |   |   +--->BN_MP_CMP_MAG_C
-|   |   +--->BN_S_MP_SUB_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   +--->BN_MP_MUL_C
-|   |   +--->BN_MP_TOOM_MUL_C
-|   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_ADD_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_SUB_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_DIV_3_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   +--->BN_MP_KARATSUBA_MUL_C
-|   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_SUB_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_ADD_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_EXCH_C
-|   +--->BN_MP_SET_C
-|   |   +--->BN_MP_ZERO_C
-|   +--->BN_MP_EXCH_C
-
-
-BN_MP_OR_C
-+--->BN_MP_INIT_COPY_C
-|   +--->BN_MP_COPY_C
-|   |   +--->BN_MP_GROW_C
-+--->BN_MP_CLAMP_C
-+--->BN_MP_EXCH_C
-+--->BN_MP_CLEAR_C
-
-
-BN_MP_ZERO_C
-
-
-BN_MP_GROW_C
-
-
-BN_MP_COUNT_BITS_C
-
-
-BN_MP_PRIME_FERMAT_C
-+--->BN_MP_CMP_D_C
-+--->BN_MP_INIT_C
-+--->BN_MP_EXPTMOD_C
-|   +--->BN_MP_INVMOD_C
-|   |   +--->BN_FAST_MP_INVMOD_C
-|   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   +--->BN_MP_CLEAR_C
-|   |   |   +--->BN_MP_COPY_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_ABS_C
-|   |   |   +--->BN_MP_SET_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_SUB_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_CMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   +--->BN_MP_ADD_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_MP_CLEAR_MULTI_C
-|   |   |   |   +--->BN_MP_CLEAR_C
-|   |   +--->BN_MP_INVMOD_SLOW_C
-|   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   +--->BN_MP_CLEAR_C
-|   |   |   +--->BN_MP_COPY_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_SET_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_ADD_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_SUB_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_CMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_MP_CLEAR_MULTI_C
-|   |   |   |   +--->BN_MP_CLEAR_C
-|   +--->BN_MP_CLEAR_C
-|   +--->BN_MP_ABS_C
-|   |   +--->BN_MP_COPY_C
-|   |   |   +--->BN_MP_GROW_C
-|   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_EXCH_C
 |   +--->BN_MP_DR_IS_MODULUS_C
 |   +--->BN_MP_REDUCE_IS_2K_C
 |   |   +--->BN_MP_REDUCE_2K_C
@@ -2235,364 +3042,6 @@
 |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   +--->BN_MP_EXCH_C
 |   |   +--->BN_MP_EXCH_C
-|   +--->BN_S_MP_EXPTMOD_C
-|   |   +--->BN_MP_COUNT_BITS_C
-|   |   +--->BN_MP_REDUCE_SETUP_C
-|   |   |   +--->BN_MP_2EXPT_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_DIV_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_MP_COPY_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   +--->BN_MP_SET_C
-|   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_DIV_2D_C
-|   |   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_INIT_COPY_C
-|   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_MOD_C
-|   |   |   +--->BN_MP_DIV_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_MP_COPY_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   +--->BN_MP_SET_C
-|   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_DIV_2D_C
-|   |   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_INIT_COPY_C
-|   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_ADD_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_EXCH_C
-|   |   +--->BN_MP_COPY_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   +--->BN_MP_SQR_C
-|   |   |   +--->BN_MP_TOOM_SQR_C
-|   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_DIV_3_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_KARATSUBA_SQR_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_FAST_S_MP_SQR_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_S_MP_SQR_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_EXCH_C
-|   |   +--->BN_MP_REDUCE_C
-|   |   |   +--->BN_MP_INIT_COPY_C
-|   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_MUL_C
-|   |   |   |   +--->BN_MP_TOOM_MUL_C
-|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_DIV_3_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_MP_SUB_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_SET_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_ADD_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_CMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_MUL_C
-|   |   |   +--->BN_MP_TOOM_MUL_C
-|   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_DIV_3_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_KARATSUBA_MUL_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_EXCH_C
-|   |   +--->BN_MP_SET_C
-|   |   |   +--->BN_MP_ZERO_C
-|   |   +--->BN_MP_EXCH_C
 +--->BN_MP_CMP_C
 |   +--->BN_MP_CMP_MAG_C
 +--->BN_MP_CLEAR_C
@@ -2618,6 +3067,7 @@
 |   |   +--->BN_MP_INIT_MULTI_C
 |   |   +--->BN_MP_SET_C
 |   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_ABS_C
 |   |   +--->BN_MP_MUL_2D_C
 |   |   |   +--->BN_MP_GROW_C
 |   |   |   +--->BN_MP_LSHD_C
@@ -2838,6 +3288,7 @@
 |   |   +--->BN_MP_INIT_MULTI_C
 |   |   +--->BN_MP_SET_C
 |   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_ABS_C
 |   |   +--->BN_MP_MUL_2D_C
 |   |   |   +--->BN_MP_GROW_C
 |   |   |   +--->BN_MP_LSHD_C
@@ -2894,7 +3345,65 @@
 |   |   +--->BN_MP_CLEAR_C
 |   +--->BN_MP_COPY_C
 |   |   +--->BN_MP_GROW_C
-|   +--->BN_MP_ABS_C
+|   +--->BN_MP_MOD_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_DIV_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_ABS_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
 |   +--->BN_MP_SET_C
 |   |   +--->BN_MP_ZERO_C
 |   +--->BN_MP_DIV_2_C
@@ -2926,6 +3435,67 @@
 |   +--->BN_MP_INIT_MULTI_C
 |   |   +--->BN_MP_INIT_C
 |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_MOD_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_DIV_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_ABS_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
 |   +--->BN_MP_COPY_C
 |   |   +--->BN_MP_GROW_C
 |   +--->BN_MP_SET_C
@@ -2978,7 +3548,65 @@
 |   +--->BN_MP_CLEAR_C
 +--->BN_MP_COPY_C
 |   +--->BN_MP_GROW_C
-+--->BN_MP_ABS_C
++--->BN_MP_MOD_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
 +--->BN_MP_SET_C
 |   +--->BN_MP_ZERO_C
 +--->BN_MP_DIV_2_C
@@ -3313,6 +3941,7 @@
 |   +--->BN_MP_INIT_MULTI_C
 |   |   +--->BN_MP_CLEAR_C
 |   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_ABS_C
 |   +--->BN_MP_MUL_2D_C
 |   |   +--->BN_MP_GROW_C
 |   |   +--->BN_MP_LSHD_C
@@ -3465,7 +4094,55 @@
 |   |   |   |   |   |   +--->BN_MP_CLEAR_C
 |   |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   |   |   +--->BN_MP_DIV_2_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
@@ -3493,6 +4170,57 @@
 |   |   |   |   +--->BN_MP_INVMOD_SLOW_C
 |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
 |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_DIV_2_C
@@ -3525,6 +4253,437 @@
 |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_REDUCE_IS_2K_L_C
+|   |   |   +--->BN_S_MP_EXPTMOD_C
+|   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   +--->BN_MP_REDUCE_SETUP_C
+|   |   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_REDUCE_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_REDUCE_2K_SETUP_L_C
+|   |   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_REDUCE_2K_L_C
+|   |   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   +--->BN_MP_DR_IS_MODULUS_C
 |   |   |   +--->BN_MP_REDUCE_IS_2K_C
 |   |   |   |   +--->BN_MP_REDUCE_2K_C
@@ -3895,343 +5054,6 @@
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_S_MP_EXPTMOD_C
-|   |   |   |   +--->BN_MP_COUNT_BITS_C
-|   |   |   |   +--->BN_MP_REDUCE_SETUP_C
-|   |   |   |   |   +--->BN_MP_2EXPT_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_DIV_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_MP_COPY_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_MOD_C
-|   |   |   |   |   +--->BN_MP_DIV_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_MP_COPY_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_MP_COPY_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_SQR_C
-|   |   |   |   |   +--->BN_MP_TOOM_SQR_C
-|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_DIV_3_C
-|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_KARATSUBA_SQR_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_FAST_S_MP_SQR_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_S_MP_SQR_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_MP_REDUCE_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_MUL_C
-|   |   |   |   |   |   +--->BN_MP_TOOM_MUL_C
-|   |   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_DIV_3_C
-|   |   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
-|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_MUL_C
-|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
-|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_DIV_3_C
-|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_MP_EXCH_C
 |   |   +--->BN_MP_CMP_C
 |   |   |   +--->BN_MP_CMP_MAG_C
 |   |   +--->BN_MP_SQRMOD_C
@@ -4322,6 +5144,7 @@
 |   |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
 |   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   +--->BN_MP_ABS_C
 |   |   |   |   |   +--->BN_MP_MUL_2D_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_LSHD_C
@@ -4548,6 +5371,7 @@
 |   |   +--->BN_MP_CLEAR_C
 |   +--->BN_MP_SET_C
 |   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_ABS_C
 |   +--->BN_MP_MUL_2D_C
 |   |   +--->BN_MP_GROW_C
 |   |   +--->BN_MP_LSHD_C
@@ -4743,7 +5567,55 @@
 |   |   |   |   |   +--->BN_MP_CLEAR_C
 |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   |   +--->BN_MP_DIV_2_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
@@ -4771,6 +5643,57 @@
 |   |   |   +--->BN_MP_INVMOD_SLOW_C
 |   |   |   |   +--->BN_MP_INIT_MULTI_C
 |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   +--->BN_MP_DIV_2_C
@@ -4803,6 +5726,437 @@
 |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   +--->BN_MP_GROW_C
 |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_REDUCE_IS_2K_L_C
+|   |   +--->BN_S_MP_EXPTMOD_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_REDUCE_SETUP_C
+|   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_REDUCE_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_REDUCE_2K_SETUP_L_C
+|   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_REDUCE_2K_L_C
+|   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_SQR_C
+|   |   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
 |   |   +--->BN_MP_DR_IS_MODULUS_C
 |   |   +--->BN_MP_REDUCE_IS_2K_C
 |   |   |   +--->BN_MP_REDUCE_2K_C
@@ -5173,343 +6527,6 @@
 |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   +--->BN_MP_EXCH_C
-|   |   +--->BN_S_MP_EXPTMOD_C
-|   |   |   +--->BN_MP_COUNT_BITS_C
-|   |   |   +--->BN_MP_REDUCE_SETUP_C
-|   |   |   |   +--->BN_MP_2EXPT_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_DIV_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_MP_COPY_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_MOD_C
-|   |   |   |   +--->BN_MP_DIV_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_MP_COPY_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_MP_COPY_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_SQR_C
-|   |   |   |   +--->BN_MP_TOOM_SQR_C
-|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_DIV_3_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_KARATSUBA_SQR_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_FAST_S_MP_SQR_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_S_MP_SQR_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_MP_REDUCE_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_MUL_C
-|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
-|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_DIV_3_C
-|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_MUL_C
-|   |   |   |   +--->BN_MP_TOOM_MUL_C
-|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_DIV_3_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_MP_EXCH_C
 |   +--->BN_MP_CMP_C
 |   |   +--->BN_MP_CMP_MAG_C
 |   +--->BN_MP_SQRMOD_C
@@ -5600,6 +6617,7 @@
 |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   +--->BN_MP_INIT_MULTI_C
 |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   +--->BN_MP_ABS_C
 |   |   |   |   +--->BN_MP_MUL_2D_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_LSHD_C
@@ -5809,6 +6827,7 @@
 |   |   |   +--->BN_MP_ZERO_C
 |   |   |   +--->BN_MP_INIT_MULTI_C
 |   |   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ABS_C
 |   |   |   +--->BN_MP_MUL_2D_C
 |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   +--->BN_MP_LSHD_C
@@ -5865,6 +6884,7 @@
 |   |   |   +--->BN_MP_GROW_C
 |   |   +--->BN_MP_ZERO_C
 |   |   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_ABS_C
 |   |   +--->BN_MP_MUL_2D_C
 |   |   |   +--->BN_MP_GROW_C
 |   |   |   +--->BN_MP_LSHD_C
@@ -6284,6 +7304,7 @@
 |   |   +--->BN_MP_INIT_MULTI_C
 |   |   +--->BN_MP_SET_C
 |   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_ABS_C
 |   |   +--->BN_MP_MUL_2D_C
 |   |   |   +--->BN_MP_GROW_C
 |   |   |   +--->BN_MP_LSHD_C
@@ -6482,7 +7503,55 @@
 |   |   |   |   |   +--->BN_MP_CLEAR_C
 |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   |   +--->BN_MP_DIV_2_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
@@ -6510,6 +7579,57 @@
 |   |   |   +--->BN_MP_INVMOD_SLOW_C
 |   |   |   |   +--->BN_MP_INIT_MULTI_C
 |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   +--->BN_MP_DIV_2_C
@@ -6542,6 +7662,437 @@
 |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   +--->BN_MP_GROW_C
 |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_REDUCE_IS_2K_L_C
+|   |   +--->BN_S_MP_EXPTMOD_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_REDUCE_SETUP_C
+|   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_REDUCE_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_REDUCE_2K_SETUP_L_C
+|   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_REDUCE_2K_L_C
+|   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_SQR_C
+|   |   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
 |   |   +--->BN_MP_DR_IS_MODULUS_C
 |   |   +--->BN_MP_REDUCE_IS_2K_C
 |   |   |   +--->BN_MP_REDUCE_2K_C
@@ -6912,343 +8463,6 @@
 |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   +--->BN_MP_EXCH_C
-|   |   +--->BN_S_MP_EXPTMOD_C
-|   |   |   +--->BN_MP_COUNT_BITS_C
-|   |   |   +--->BN_MP_REDUCE_SETUP_C
-|   |   |   |   +--->BN_MP_2EXPT_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_DIV_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_MP_COPY_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_MOD_C
-|   |   |   |   +--->BN_MP_DIV_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_MP_COPY_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_MP_COPY_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_SQR_C
-|   |   |   |   +--->BN_MP_TOOM_SQR_C
-|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_DIV_3_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_KARATSUBA_SQR_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_FAST_S_MP_SQR_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_S_MP_SQR_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_MP_REDUCE_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_MUL_C
-|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
-|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_DIV_3_C
-|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_MUL_C
-|   |   |   |   +--->BN_MP_TOOM_MUL_C
-|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_DIV_3_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_MP_EXCH_C
 |   +--->BN_MP_CMP_C
 |   |   +--->BN_MP_CMP_MAG_C
 |   +--->BN_MP_SQRMOD_C
@@ -7339,6 +8553,7 @@
 |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   +--->BN_MP_INIT_MULTI_C
 |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   +--->BN_MP_ABS_C
 |   |   |   |   +--->BN_MP_MUL_2D_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_LSHD_C
@@ -7391,6 +8606,67 @@
 +--->BN_MP_INIT_MULTI_C
 |   +--->BN_MP_INIT_C
 |   +--->BN_MP_CLEAR_C
++--->BN_MP_MOD_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
 +--->BN_MP_COPY_C
 |   +--->BN_MP_GROW_C
 +--->BN_MP_SET_C
@@ -7465,6 +8741,7 @@
 |   +--->BN_MP_ZERO_C
 |   +--->BN_MP_SET_C
 |   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_ABS_C
 |   +--->BN_MP_MUL_2D_C
 |   |   +--->BN_MP_GROW_C
 |   |   +--->BN_MP_LSHD_C
@@ -7588,6 +8865,107 @@
 |   +--->BN_MP_CLEAR_C
 
 
+BN_MP_REDUCE_2K_L_C
++--->BN_MP_INIT_C
++--->BN_MP_COUNT_BITS_C
++--->BN_MP_DIV_2D_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MOD_2D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_MUL_C
+|   +--->BN_MP_TOOM_MUL_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_KARATSUBA_MUL_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_S_MP_ADD_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CMP_MAG_C
++--->BN_S_MP_SUB_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CLEAR_C
+
+
 BN_REVERSE_C
 
 
@@ -7655,6 +9033,18 @@
 +--->BN_MP_CLEAR_C
 
 
+BN_MP_REDUCE_2K_SETUP_L_C
++--->BN_MP_INIT_C
++--->BN_MP_2EXPT_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_COUNT_BITS_C
++--->BN_S_MP_SUB_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CLEAR_C
+
+
 BN_MP_READ_RADIX_C
 +--->BN_MP_ZERO_C
 +--->BN_MP_MUL_D_C
@@ -7928,6 +9318,7 @@
 |   |   +--->BN_MP_ZERO_C
 |   |   +--->BN_MP_INIT_MULTI_C
 |   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_ABS_C
 |   |   +--->BN_MP_MUL_2D_C
 |   |   |   +--->BN_MP_GROW_C
 |   |   |   +--->BN_MP_LSHD_C
@@ -7966,6 +9357,226 @@
 |   |   |   +--->BN_MP_GROW_C
 |   |   |   +--->BN_MP_CLAMP_C
 |   |   +--->BN_MP_CLAMP_C
++--->BN_MP_REDUCE_C
+|   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MUL_C
+|   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_MOD_2D_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_D_C
+|   +--->BN_MP_SET_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_REDUCE_2K_SETUP_L_C
+|   +--->BN_MP_2EXPT_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_REDUCE_2K_L_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_MUL_C
+|   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
 +--->BN_MP_MOD_C
 |   +--->BN_MP_DIV_C
 |   |   +--->BN_MP_CMP_MAG_C
@@ -7974,6 +9585,7 @@
 |   |   +--->BN_MP_ZERO_C
 |   |   +--->BN_MP_INIT_MULTI_C
 |   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_ABS_C
 |   |   +--->BN_MP_MUL_2D_C
 |   |   |   +--->BN_MP_GROW_C
 |   |   |   +--->BN_MP_LSHD_C
@@ -8092,121 +9704,6 @@
 |   |   +--->BN_MP_INIT_SIZE_C
 |   |   +--->BN_MP_CLAMP_C
 |   |   +--->BN_MP_EXCH_C
-+--->BN_MP_REDUCE_C
-|   +--->BN_MP_INIT_COPY_C
-|   +--->BN_MP_RSHD_C
-|   |   +--->BN_MP_ZERO_C
-|   +--->BN_MP_MUL_C
-|   |   +--->BN_MP_TOOM_MUL_C
-|   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_ADD_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_SUB_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_DIV_3_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLEAR_MULTI_C
-|   |   +--->BN_MP_KARATSUBA_MUL_C
-|   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_SUB_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_ADD_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_EXCH_C
-|   +--->BN_S_MP_MUL_HIGH_DIGS_C
-|   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_INIT_SIZE_C
-|   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_EXCH_C
-|   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   +--->BN_MP_GROW_C
-|   |   +--->BN_MP_CLAMP_C
-|   +--->BN_MP_MOD_2D_C
-|   |   +--->BN_MP_ZERO_C
-|   |   +--->BN_MP_CLAMP_C
-|   +--->BN_S_MP_MUL_DIGS_C
-|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_INIT_SIZE_C
-|   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_EXCH_C
-|   +--->BN_MP_SUB_C
-|   |   +--->BN_S_MP_ADD_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_CMP_MAG_C
-|   |   +--->BN_S_MP_SUB_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   +--->BN_MP_CMP_D_C
-|   +--->BN_MP_SET_C
-|   |   +--->BN_MP_ZERO_C
-|   +--->BN_MP_LSHD_C
-|   |   +--->BN_MP_GROW_C
-|   +--->BN_MP_ADD_C
-|   |   +--->BN_S_MP_ADD_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_CMP_MAG_C
-|   |   +--->BN_S_MP_SUB_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   +--->BN_MP_CMP_C
-|   |   +--->BN_MP_CMP_MAG_C
-|   +--->BN_S_MP_SUB_C
-|   |   +--->BN_MP_GROW_C
-|   |   +--->BN_MP_CLAMP_C
 +--->BN_MP_MUL_C
 |   +--->BN_MP_TOOM_MUL_C
 |   |   +--->BN_MP_INIT_MULTI_C
@@ -8372,6 +9869,7 @@
 |   +--->BN_MP_CLEAR_C
 +--->BN_MP_SET_C
 +--->BN_MP_COUNT_BITS_C
++--->BN_MP_ABS_C
 +--->BN_MP_MUL_2D_C
 |   +--->BN_MP_GROW_C
 |   +--->BN_MP_LSHD_C
@@ -8465,6 +9963,7 @@
 |   |   +--->BN_MP_INIT_MULTI_C
 |   |   +--->BN_MP_SET_C
 |   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_ABS_C
 |   |   +--->BN_MP_MUL_2D_C
 |   |   |   +--->BN_MP_GROW_C
 |   |   |   +--->BN_MP_LSHD_C
@@ -8509,6 +10008,31 @@
 |   +--->BN_MP_CLAMP_C
 
 
+BN_MP_TO_SIGNED_BIN_N_C
++--->BN_MP_SIGNED_BIN_SIZE_C
+|   +--->BN_MP_UNSIGNED_BIN_SIZE_C
+|   |   +--->BN_MP_COUNT_BITS_C
++--->BN_MP_TO_SIGNED_BIN_C
+|   +--->BN_MP_TO_UNSIGNED_BIN_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
+
+
+BN_MP_REDUCE_IS_2K_L_C
+
+
 BN_MP_RAND_C
 +--->BN_MP_ZERO_C
 +--->BN_MP_ADD_D_C
@@ -8536,6 +10060,26 @@
 BN_MP_SHRINK_C
 
 
+BN_MP_TO_UNSIGNED_BIN_N_C
++--->BN_MP_UNSIGNED_BIN_SIZE_C
+|   +--->BN_MP_COUNT_BITS_C
++--->BN_MP_TO_UNSIGNED_BIN_C
+|   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
+
+
 BN_MP_REDUCE_C
 +--->BN_MP_REDUCE_SETUP_C
 |   +--->BN_MP_2EXPT_C
@@ -8551,6 +10095,7 @@
 |   |   |   +--->BN_MP_CLEAR_C
 |   |   +--->BN_MP_SET_C
 |   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_ABS_C
 |   |   +--->BN_MP_MUL_2D_C
 |   |   |   +--->BN_MP_GROW_C
 |   |   |   +--->BN_MP_LSHD_C
@@ -8766,6 +10311,7 @@
 |   |   |   +--->BN_MP_CLEAR_C
 |   |   +--->BN_MP_SET_C
 |   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_ABS_C
 |   |   +--->BN_MP_MUL_2D_C
 |   |   |   +--->BN_MP_GROW_C
 |   |   |   +--->BN_MP_LSHD_C
@@ -8912,6 +10458,7 @@
 |   +--->BN_MP_CMP_MAG_C
 |   +--->BN_MP_ZERO_C
 |   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_ABS_C
 |   +--->BN_MP_MUL_2D_C
 |   |   +--->BN_MP_GROW_C
 |   |   +--->BN_MP_LSHD_C
@@ -9039,6 +10586,7 @@
 |   +--->BN_S_MP_SUB_C
 |   |   +--->BN_MP_GROW_C
 |   |   +--->BN_MP_CLAMP_C
++--->BN_MP_NEG_C
 +--->BN_MP_EXCH_C
 +--->BN_MP_CLEAR_MULTI_C
 |   +--->BN_MP_CLEAR_C
@@ -9078,6 +10626,7 @@
 |   |   +--->BN_MP_CLEAR_C
 |   +--->BN_MP_SET_C
 |   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_ABS_C
 |   +--->BN_MP_MUL_2D_C
 |   |   +--->BN_MP_GROW_C
 |   |   +--->BN_MP_LSHD_C
@@ -9245,7 +10794,56 @@
 |   |   |   |   +--->BN_MP_CLEAR_C
 |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_ABS_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   +--->BN_MP_SET_C
 |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   +--->BN_MP_DIV_2_C
@@ -9275,6 +10873,58 @@
 |   |   +--->BN_MP_INVMOD_SLOW_C
 |   |   |   +--->BN_MP_INIT_MULTI_C
 |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   +--->BN_MP_SET_C
@@ -9309,6 +10959,443 @@
 |   |   +--->BN_MP_COPY_C
 |   |   |   +--->BN_MP_GROW_C
 |   +--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_REDUCE_IS_2K_L_C
+|   +--->BN_S_MP_EXPTMOD_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_REDUCE_SETUP_C
+|   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_REDUCE_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_REDUCE_2K_SETUP_L_C
+|   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_REDUCE_2K_L_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MOD_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_SQR_C
+|   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_MUL_C
+|   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_EXCH_C
 |   +--->BN_MP_DR_IS_MODULUS_C
 |   +--->BN_MP_REDUCE_IS_2K_C
 |   |   +--->BN_MP_REDUCE_2K_C
@@ -9684,349 +11771,6 @@
 |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   +--->BN_MP_EXCH_C
 |   |   +--->BN_MP_EXCH_C
-|   +--->BN_S_MP_EXPTMOD_C
-|   |   +--->BN_MP_COUNT_BITS_C
-|   |   +--->BN_MP_REDUCE_SETUP_C
-|   |   |   +--->BN_MP_2EXPT_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_DIV_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_MP_COPY_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   +--->BN_MP_SET_C
-|   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_MOD_C
-|   |   |   +--->BN_MP_DIV_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_MP_COPY_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   +--->BN_MP_SET_C
-|   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_ADD_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_EXCH_C
-|   |   +--->BN_MP_COPY_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   +--->BN_MP_SQR_C
-|   |   |   +--->BN_MP_TOOM_SQR_C
-|   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_DIV_3_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_KARATSUBA_SQR_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_FAST_S_MP_SQR_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_S_MP_SQR_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_EXCH_C
-|   |   +--->BN_MP_REDUCE_C
-|   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_MUL_C
-|   |   |   |   +--->BN_MP_TOOM_MUL_C
-|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_DIV_3_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_MP_SUB_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_SET_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_ADD_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_CMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_MUL_C
-|   |   |   +--->BN_MP_TOOM_MUL_C
-|   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_DIV_3_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_KARATSUBA_MUL_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_EXCH_C
-|   |   +--->BN_MP_SET_C
-|   |   |   +--->BN_MP_ZERO_C
-|   |   +--->BN_MP_EXCH_C
 +--->BN_MP_CMP_C
 |   +--->BN_MP_CMP_MAG_C
 +--->BN_MP_SQRMOD_C
@@ -10118,6 +11862,7 @@
 |   |   |   +--->BN_MP_INIT_MULTI_C
 |   |   |   +--->BN_MP_SET_C
 |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_ABS_C
 |   |   |   +--->BN_MP_MUL_2D_C
 |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   +--->BN_MP_LSHD_C
--- a/changes.txt	Sun Dec 19 11:33:56 2004 +0000
+++ b/changes.txt	Fri May 06 08:59:30 2005 +0000
@@ -1,3 +1,35 @@
+March 12th, 2005
+v0.35  -- Stupid XOR function missing line again... oops.
+       -- Fixed bug in invmod not handling negative inputs correctly [Wolfgang Ehrhardt]
+       -- Made exteuclid always give positive u3 output...[ Wolfgang Ehrhardt ]
+       -- [Wolfgang Ehrhardt] Suggested a fix for mp_reduce() which avoided underruns.  ;-)
+       -- mp_rand() would emit one too many digits and it was possible to get a 0 out of it ... oops
+       -- Added montgomery to the testing to make sure it handles 1..10 digit moduli correctly
+       -- Fixed bug in comba that would lead to possible erroneous outputs when "pa < digs" 
+       -- Fixed bug in mp_radix_size for "0" [Kevin Kenny]
+       -- Updated chapters 1-5 of the textbook ;-) It now talks about the new comba code!
+
+February 12th, 2005
+v0.34  -- Fixed two more small errors in mp_prime_random_ex()
+       -- Fixed overflow in mp_mul_d() [Kevin Kenny]
+       -- Added mp_to_(un)signed_bin_n() functions which do bounds checking for ya [and report the size]
+       -- Added "large" diminished radix support.  Speeds up things like DSA where the modulus is of the form 2^k - P for some P < 2^(k/2) or so
+          Actually is faster than Montgomery on my AMD64 (and probably much faster on a P4)
+       -- Updated the manual a bit
+       -- Ok so I haven't done the textbook work yet... My current freelance gig has landed me in France till the 
+          end of Feb/05.  Once I get back I'll have tons of free time and I plan to go to town on the book.
+          As of this release the API will freeze.  At least until the book catches up with all the changes.  I welcome
+          bug reports but new algorithms will have to wait.
+
+December 23rd, 2004
+v0.33  -- Fixed "small" variant for mp_div() which would munge with negative dividends...
+       -- Fixed bug in mp_prime_random_ex() which would set the most significant byte to zero when
+          no special flags were set
+       -- Fixed overflow [minor] bug in fast_s_mp_sqr()
+       -- Made the makefiles easier to configure the group/user that ltm will install as
+       -- Fixed "final carry" bug in comba multipliers. (Volkan Ceylan)
+       -- Matt Johnston pointed out a missing semi-colon in mp_exptmod
+
 October 29th, 2004
 v0.32  -- Added "makefile.shared" for shared object support
        -- Added more to the build options/configs in the manual
--- a/demo/demo.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/demo/demo.c	Fri May 06 08:59:30 2005 +0000
@@ -9,15 +9,16 @@
 
 #include "tommath.h"
 
-void ndraw(mp_int *a, char *name)
+void ndraw(mp_int * a, char *name)
 {
-   char buf[4096];
+   char buf[16000];
+
    printf("%s: ", name);
-   mp_toradix(a, buf, 64);
+   mp_toradix(a, buf, 10);
    printf("%s\n", buf);
 }
 
-static void draw(mp_int *a)
+static void draw(mp_int * a)
 {
    ndraw(a, "");
 }
@@ -39,20 +40,23 @@
 int myrng(unsigned char *dst, int len, void *dat)
 {
    int x;
-   for (x = 0; x < len; x++) dst[x] = rand() & 0xFF;
+
+   for (x = 0; x < len; x++)
+      dst[x] = rand() & 0xFF;
    return len;
 }
 
 
 
-   char cmd[4096], buf[4096];
+char cmd[4096], buf[4096];
 int main(void)
 {
    mp_int a, b, c, d, e, f;
-   unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n,
-                 div2_n, mul2_n, add_d_n, sub_d_n, t;
+   unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n,
+      gcd_n, lcm_n, inv_n, div2_n, mul2_n, add_d_n, sub_d_n, t;
    unsigned rr;
    int i, n, err, cnt, ix, old_kara_m, old_kara_s;
+   mp_digit mp;
 
 
    mp_init(&a);
@@ -65,108 +69,152 @@
    srand(time(NULL));
 
 #if 0
-  // test mp_get_int
-  printf("Testing: mp_get_int\n");
-  for(i=0;i<1000;++i) {
-    t = ((unsigned long)rand()*rand()+1)&0xFFFFFFFF;
-    mp_set_int(&a,t);
-    if (t!=mp_get_int(&a)) { 
+   // test montgomery 
+   printf("Testing montgomery...\n");
+   for (i = 1; i < 10; i++) {
+      printf("Testing digit size: %d\n", i);
+      for (n = 0; n < 1000; n++) {
+         mp_rand(&a, i);
+         a.dp[0] |= 1;
+
+         // let's see if R is right
+         mp_montgomery_calc_normalization(&b, &a);
+         mp_montgomery_setup(&a, &mp);
+
+         // now test a random reduction 
+         for (ix = 0; ix < 100; ix++) {
+             mp_rand(&c, 1 + abs(rand()) % (2*i));
+             mp_copy(&c, &d);
+             mp_copy(&c, &e);
+
+             mp_mod(&d, &a, &d);
+             mp_montgomery_reduce(&c, &a, mp);
+             mp_mulmod(&c, &b, &a, &c);
+
+             if (mp_cmp(&c, &d) != MP_EQ) { 
+printf("d = e mod a, c = e MOD a\n");
+mp_todecimal(&a, buf); printf("a = %s\n", buf);
+mp_todecimal(&e, buf); printf("e = %s\n", buf);
+mp_todecimal(&d, buf); printf("d = %s\n", buf);
+mp_todecimal(&c, buf); printf("c = %s\n", buf);
+printf("compare no compare!\n"); exit(EXIT_FAILURE); }
+         }
+      }
+   }
+   printf("done\n");
+
+   // test mp_get_int
+   printf("Testing: mp_get_int\n");
+   for (i = 0; i < 1000; ++i) {
+      t = ((unsigned long) rand() * rand() + 1) & 0xFFFFFFFF;
+      mp_set_int(&a, t);
+      if (t != mp_get_int(&a)) {
+	 printf("mp_get_int() bad result!\n");
+	 return 1;
+      }
+   }
+   mp_set_int(&a, 0);
+   if (mp_get_int(&a) != 0) {
+      printf("mp_get_int() bad result!\n");
+      return 1;
+   }
+   mp_set_int(&a, 0xffffffff);
+   if (mp_get_int(&a) != 0xffffffff) {
       printf("mp_get_int() bad result!\n");
       return 1;
-    }
-  }
-  mp_set_int(&a,0);
-  if (mp_get_int(&a)!=0)
-  { printf("mp_get_int() bad result!\n");
-    return 1;
-  }
-  mp_set_int(&a,0xffffffff);
-  if (mp_get_int(&a)!=0xffffffff)
-  { printf("mp_get_int() bad result!\n");
-    return 1;
-  }
+   }
+   // test mp_sqrt
+   printf("Testing: mp_sqrt\n");
+   for (i = 0; i < 1000; ++i) {
+      printf("%6d\r", i);
+      fflush(stdout);
+      n = (rand() & 15) + 1;
+      mp_rand(&a, n);
+      if (mp_sqrt(&a, &b) != MP_OKAY) {
+	 printf("mp_sqrt() error!\n");
+	 return 1;
+      }
+      mp_n_root(&a, 2, &a);
+      if (mp_cmp_mag(&b, &a) != MP_EQ) {
+	 printf("mp_sqrt() bad result!\n");
+	 return 1;
+      }
+   }
 
-  // test mp_sqrt
-  printf("Testing: mp_sqrt\n");
-  for (i=0;i<1000;++i) { 
-    printf("%6d\r", i); fflush(stdout);
-    n = (rand()&15)+1;
-    mp_rand(&a,n);
-    if (mp_sqrt(&a,&b) != MP_OKAY)
-    { printf("mp_sqrt() error!\n");
-      return 1;
-    }
-    mp_n_root(&a,2,&a);
-    if (mp_cmp_mag(&b,&a) != MP_EQ)
-    { printf("mp_sqrt() bad result!\n");
-      return 1;
-    }
-  }
+   printf("\nTesting: mp_is_square\n");
+   for (i = 0; i < 1000; ++i) {
+      printf("%6d\r", i);
+      fflush(stdout);
 
-  printf("\nTesting: mp_is_square\n");
-  for (i=0;i<1000;++i) {
-    printf("%6d\r", i); fflush(stdout);
+      /* test mp_is_square false negatives */
+      n = (rand() & 7) + 1;
+      mp_rand(&a, n);
+      mp_sqr(&a, &a);
+      if (mp_is_square(&a, &n) != MP_OKAY) {
+	 printf("fn:mp_is_square() error!\n");
+	 return 1;
+      }
+      if (n == 0) {
+	 printf("fn:mp_is_square() bad result!\n");
+	 return 1;
+      }
 
-    /* test mp_is_square false negatives */
-    n = (rand()&7)+1;
-    mp_rand(&a,n);
-    mp_sqr(&a,&a);
-    if (mp_is_square(&a,&n)!=MP_OKAY) { 
-      printf("fn:mp_is_square() error!\n");
-      return 1;
-    }
-    if (n==0) { 
-      printf("fn:mp_is_square() bad result!\n");
-      return 1;
-    }
+      /* test for false positives */
+      mp_add_d(&a, 1, &a);
+      if (mp_is_square(&a, &n) != MP_OKAY) {
+	 printf("fp:mp_is_square() error!\n");
+	 return 1;
+      }
+      if (n == 1) {
+	 printf("fp:mp_is_square() bad result!\n");
+	 return 1;
+      }
 
-    /* test for false positives */
-    mp_add_d(&a, 1, &a);
-    if (mp_is_square(&a,&n)!=MP_OKAY) { 
-      printf("fp:mp_is_square() error!\n");
-      return 1;
-    }
-    if (n==1) { 
-      printf("fp:mp_is_square() bad result!\n");
-      return 1;
-    }
-
-  }
-  printf("\n\n");
+   }
+   printf("\n\n");
 
    /* test for size */
-   for (ix = 10; ix < 256; ix++) {
-       printf("Testing (not safe-prime): %9d bits    \r", ix); fflush(stdout);
-       err = mp_prime_random_ex(&a, 8, ix, (rand()&1)?LTM_PRIME_2MSB_OFF:LTM_PRIME_2MSB_ON, myrng, NULL);
-       if (err != MP_OKAY) {
-          printf("failed with err code %d\n", err);
-          return EXIT_FAILURE;
-       }
-       if (mp_count_bits(&a) != ix) {
-          printf("Prime is %d not %d bits!!!\n", mp_count_bits(&a), ix);
-          return EXIT_FAILURE;
-       }
+   for (ix = 10; ix < 128; ix++) {
+      printf("Testing (not safe-prime): %9d bits    \r", ix);
+      fflush(stdout);
+      err =
+	 mp_prime_random_ex(&a, 8, ix,
+			    (rand() & 1) ? LTM_PRIME_2MSB_OFF :
+			    LTM_PRIME_2MSB_ON, myrng, NULL);
+      if (err != MP_OKAY) {
+	 printf("failed with err code %d\n", err);
+	 return EXIT_FAILURE;
+      }
+      if (mp_count_bits(&a) != ix) {
+	 printf("Prime is %d not %d bits!!!\n", mp_count_bits(&a), ix);
+	 return EXIT_FAILURE;
+      }
    }
 
-   for (ix = 16; ix < 256; ix++) {
-       printf("Testing (   safe-prime): %9d bits    \r", ix); fflush(stdout);
-       err = mp_prime_random_ex(&a, 8, ix, ((rand()&1)?LTM_PRIME_2MSB_OFF:LTM_PRIME_2MSB_ON)|LTM_PRIME_SAFE, myrng, NULL);
-       if (err != MP_OKAY) {
-          printf("failed with err code %d\n", err);
-          return EXIT_FAILURE;
-       }
-       if (mp_count_bits(&a) != ix) {
-          printf("Prime is %d not %d bits!!!\n", mp_count_bits(&a), ix);
-          return EXIT_FAILURE;
-       }
-       /* let's see if it's really a safe prime */
-       mp_sub_d(&a, 1, &a);
-       mp_div_2(&a, &a);
-       mp_prime_is_prime(&a, 8, &cnt);
-       if (cnt != MP_YES) {
-          printf("sub is not prime!\n");
-          return EXIT_FAILURE;
-       }
+   for (ix = 16; ix < 128; ix++) {
+      printf("Testing (   safe-prime): %9d bits    \r", ix);
+      fflush(stdout);
+      err =
+	 mp_prime_random_ex(&a, 8, ix,
+			    ((rand() & 1) ? LTM_PRIME_2MSB_OFF :
+			     LTM_PRIME_2MSB_ON) | LTM_PRIME_SAFE, myrng,
+			    NULL);
+      if (err != MP_OKAY) {
+	 printf("failed with err code %d\n", err);
+	 return EXIT_FAILURE;
+      }
+      if (mp_count_bits(&a) != ix) {
+	 printf("Prime is %d not %d bits!!!\n", mp_count_bits(&a), ix);
+	 return EXIT_FAILURE;
+      }
+      /* let's see if it's really a safe prime */
+      mp_sub_d(&a, 1, &a);
+      mp_div_2(&a, &a);
+      mp_prime_is_prime(&a, 8, &cnt);
+      if (cnt != MP_YES) {
+	 printf("sub is not prime!\n");
+	 return EXIT_FAILURE;
+      }
    }
 
    printf("\n\n");
@@ -194,51 +242,56 @@
    printf("testing mp_cnt_lsb...\n");
    mp_set(&a, 1);
    for (ix = 0; ix < 1024; ix++) {
-       if (mp_cnt_lsb(&a) != ix) {
-          printf("Failed at %d, %d\n", ix, mp_cnt_lsb(&a));
-          return 0;
-       }
-       mp_mul_2(&a, &a);
+      if (mp_cnt_lsb(&a) != ix) {
+	 printf("Failed at %d, %d\n", ix, mp_cnt_lsb(&a));
+	 return 0;
+      }
+      mp_mul_2(&a, &a);
    }
 
 /* test mp_reduce_2k */
    printf("Testing mp_reduce_2k...\n");
    for (cnt = 3; cnt <= 128; ++cnt) {
-       mp_digit tmp;
-       mp_2expt(&a, cnt);
-       mp_sub_d(&a, 2, &a);  /* a = 2**cnt - 2 */
+      mp_digit tmp;
+
+      mp_2expt(&a, cnt);
+      mp_sub_d(&a, 2, &a);	/* a = 2**cnt - 2 */
 
 
-       printf("\nTesting %4d bits", cnt);
-       printf("(%d)", mp_reduce_is_2k(&a));
-       mp_reduce_2k_setup(&a, &tmp);
-       printf("(%d)", tmp);
-       for (ix = 0; ix < 1000; ix++) {
-           if (!(ix & 127)) {printf("."); fflush(stdout); }
-           mp_rand(&b, (cnt/DIGIT_BIT  + 1) * 2);
-           mp_copy(&c, &b);
-           mp_mod(&c, &a, &c);
-           mp_reduce_2k(&b, &a, 1);
-           if (mp_cmp(&c, &b)) {
-              printf("FAILED\n");
-              exit(0);
-           }
-        }
-    }
+      printf("\nTesting %4d bits", cnt);
+      printf("(%d)", mp_reduce_is_2k(&a));
+      mp_reduce_2k_setup(&a, &tmp);
+      printf("(%d)", tmp);
+      for (ix = 0; ix < 1000; ix++) {
+	 if (!(ix & 127)) {
+	    printf(".");
+	    fflush(stdout);
+	 }
+	 mp_rand(&b, (cnt / DIGIT_BIT + 1) * 2);
+	 mp_copy(&c, &b);
+	 mp_mod(&c, &a, &c);
+	 mp_reduce_2k(&b, &a, 2);
+	 if (mp_cmp(&c, &b)) {
+	    printf("FAILED\n");
+	    exit(0);
+	 }
+      }
+   }
 
 /* test mp_div_3  */
    printf("Testing mp_div_3...\n");
    mp_set(&d, 3);
-   for (cnt = 0; cnt < 10000; ) {
+   for (cnt = 0; cnt < 10000;) {
       mp_digit r1, r2;
 
-      if (!(++cnt & 127)) printf("%9d\r", cnt);
+      if (!(++cnt & 127))
+	 printf("%9d\r", cnt);
       mp_rand(&a, abs(rand()) % 128 + 1);
       mp_div(&a, &d, &b, &e);
       mp_div_3(&a, &c, &r2);
 
       if (mp_cmp(&b, &c) || mp_cmp_d(&e, r2)) {
-         printf("\n\nmp_div_3 => Failure\n");
+	 printf("\n\nmp_div_3 => Failure\n");
       }
    }
    printf("\n\nPassed div_3 testing\n");
@@ -246,270 +299,438 @@
 /* test the DR reduction */
    printf("testing mp_dr_reduce...\n");
    for (cnt = 2; cnt < 32; cnt++) {
-       printf("%d digit modulus\n", cnt);
-       mp_grow(&a, cnt);
-       mp_zero(&a);
-       for (ix = 1; ix < cnt; ix++) {
-           a.dp[ix] = MP_MASK;
-       }
-       a.used = cnt;
-       a.dp[0] = 3;
+      printf("%d digit modulus\n", cnt);
+      mp_grow(&a, cnt);
+      mp_zero(&a);
+      for (ix = 1; ix < cnt; ix++) {
+	 a.dp[ix] = MP_MASK;
+      }
+      a.used = cnt;
+      a.dp[0] = 3;
 
-       mp_rand(&b, cnt - 1);
-       mp_copy(&b, &c);
+      mp_rand(&b, cnt - 1);
+      mp_copy(&b, &c);
 
       rr = 0;
       do {
-         if (!(rr & 127)) { printf("%9lu\r", rr); fflush(stdout); }
-         mp_sqr(&b, &b); mp_add_d(&b, 1, &b);
-         mp_copy(&b, &c);
+	 if (!(rr & 127)) {
+	    printf("%9lu\r", rr);
+	    fflush(stdout);
+	 }
+	 mp_sqr(&b, &b);
+	 mp_add_d(&b, 1, &b);
+	 mp_copy(&b, &c);
 
-         mp_mod(&b, &a, &b);
-         mp_dr_reduce(&c, &a, (((mp_digit)1)<<DIGIT_BIT)-a.dp[0]);
+	 mp_mod(&b, &a, &b);
+	 mp_dr_reduce(&c, &a, (((mp_digit) 1) << DIGIT_BIT) - a.dp[0]);
 
-         if (mp_cmp(&b, &c) != MP_EQ) {
-            printf("Failed on trial %lu\n", rr); exit(-1);
+	 if (mp_cmp(&b, &c) != MP_EQ) {
+	    printf("Failed on trial %lu\n", rr);
+	    exit(-1);
 
-         }
+	 }
       } while (++rr < 500);
       printf("Passed DR test for %d digits\n", cnt);
    }
 
 #endif
 
+/* test the mp_reduce_2k_l code */
+#if 0
+#if 0
+/* first load P with 2^1024 - 0x2A434 B9FDEC95 D8F9D550 FFFFFFFF FFFFFFFF */
+   mp_2expt(&a, 1024);
+   mp_read_radix(&b, "2A434B9FDEC95D8F9D550FFFFFFFFFFFFFFFF", 16);
+   mp_sub(&a, &b, &a);
+#elif 1
+/*  p = 2^2048 - 0x1 00000000 00000000 00000000 00000000 4945DDBF 8EA2A91D 5776399B B83E188F  */
+   mp_2expt(&a, 2048);
+   mp_read_radix(&b,
+		 "1000000000000000000000000000000004945DDBF8EA2A91D5776399BB83E188F",
+		 16);
+   mp_sub(&a, &b, &a);
+#endif
+
+   mp_todecimal(&a, buf);
+   printf("p==%s\n", buf);
+/* now mp_reduce_is_2k_l() should return 1 */
+   if (mp_reduce_is_2k_l(&a) != 1) {
+      printf("mp_reduce_is_2k_l() return 0, should be 1\n");
+      return EXIT_FAILURE;
+   }
+   mp_reduce_2k_setup_l(&a, &d);
+   /* now do a million square+1 to see if it varies */
+   mp_rand(&b, 64);
+   mp_mod(&b, &a, &b);
+   mp_copy(&b, &c);
+   printf("testing mp_reduce_2k_l...");
+   fflush(stdout);
+   for (cnt = 0; cnt < (1UL << 20); cnt++) {
+      mp_sqr(&b, &b);
+      mp_add_d(&b, 1, &b);
+      mp_reduce_2k_l(&b, &a, &d);
+      mp_sqr(&c, &c);
+      mp_add_d(&c, 1, &c);
+      mp_mod(&c, &a, &c);
+      if (mp_cmp(&b, &c) != MP_EQ) {
+	 printf("mp_reduce_2k_l() failed at step %lu\n", cnt);
+	 mp_tohex(&b, buf);
+	 printf("b == %s\n", buf);
+	 mp_tohex(&c, buf);
+	 printf("c == %s\n", buf);
+	 return EXIT_FAILURE;
+      }
+   }
+   printf("...Passed\n");
+#endif
+
    div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n =
-   sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = add_d_n = sub_d_n= 0;
+      sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = add_d_n =
+      sub_d_n = 0;
 
    /* force KARA and TOOM to enable despite cutoffs */
    KARATSUBA_SQR_CUTOFF = KARATSUBA_MUL_CUTOFF = 110;
-   TOOM_SQR_CUTOFF      = TOOM_MUL_CUTOFF      = 150;
+   TOOM_SQR_CUTOFF = TOOM_MUL_CUTOFF = 150;
 
    for (;;) {
-       /* randomly clear and re-init one variable, this has the affect of triming the alloc space */
-       switch (abs(rand()) % 7) {
-           case 0:  mp_clear(&a); mp_init(&a); break;
-           case 1:  mp_clear(&b); mp_init(&b); break;
-           case 2:  mp_clear(&c); mp_init(&c); break;
-           case 3:  mp_clear(&d); mp_init(&d); break;
-           case 4:  mp_clear(&e); mp_init(&e); break;
-           case 5:  mp_clear(&f); mp_init(&f); break;
-           case 6:  break; /* don't clear any */
-       }
+      /* randomly clear and re-init one variable, this has the effect of trimming the alloc space */
+      switch (abs(rand()) % 7) {
+      case 0:
+	 mp_clear(&a);
+	 mp_init(&a);
+	 break;
+      case 1:
+	 mp_clear(&b);
+	 mp_init(&b);
+	 break;
+      case 2:
+	 mp_clear(&c);
+	 mp_init(&c);
+	 break;
+      case 3:
+	 mp_clear(&d);
+	 mp_init(&d);
+	 break;
+      case 4:
+	 mp_clear(&e);
+	 mp_init(&e);
+	 break;
+      case 5:
+	 mp_clear(&f);
+	 mp_init(&f);
+	 break;
+      case 6:
+	 break;			/* don't clear any */
+      }
 
 
-       printf("%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu ", add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, expt_n, inv_n, div2_n, mul2_n, add_d_n, sub_d_n);
-       fgets(cmd, 4095, stdin);
-       cmd[strlen(cmd)-1] = 0;
-       printf("%s  ]\r",cmd); fflush(stdout);
-       if (!strcmp(cmd, "mul2d")) { ++mul2d_n;
-          fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64);
-          fgets(buf, 4095, stdin); sscanf(buf, "%d", &rr);
-          fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64);
+      printf
+	 ("%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu ",
+	  add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n,
+	  expt_n, inv_n, div2_n, mul2_n, add_d_n, sub_d_n);
+      fgets(cmd, 4095, stdin);
+      cmd[strlen(cmd) - 1] = 0;
+      printf("%s  ]\r", cmd);
+      fflush(stdout);
+      if (!strcmp(cmd, "mul2d")) {
+	 ++mul2d_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 sscanf(buf, "%d", &rr);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
 
-          mp_mul_2d(&a, rr, &a);
-          a.sign = b.sign;
-          if (mp_cmp(&a, &b) != MP_EQ) {
-             printf("mul2d failed, rr == %d\n",rr);
-             draw(&a);
-             draw(&b);
-             return 0;
-          }
-       } else if (!strcmp(cmd, "div2d")) { ++div2d_n;
-          fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64);
-          fgets(buf, 4095, stdin); sscanf(buf, "%d", &rr);
-          fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64);
+	 mp_mul_2d(&a, rr, &a);
+	 a.sign = b.sign;
+	 if (mp_cmp(&a, &b) != MP_EQ) {
+	    printf("mul2d failed, rr == %d\n", rr);
+	    draw(&a);
+	    draw(&b);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "div2d")) {
+	 ++div2d_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 sscanf(buf, "%d", &rr);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
 
-          mp_div_2d(&a, rr, &a, &e);
-          a.sign = b.sign;
-          if (a.used == b.used && a.used == 0) { a.sign = b.sign = MP_ZPOS; }
-          if (mp_cmp(&a, &b) != MP_EQ) {
-             printf("div2d failed, rr == %d\n",rr);
-             draw(&a);
-             draw(&b);
-             return 0;
-          }
-       } else if (!strcmp(cmd, "add")) { ++add_n;
-          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
-          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
-          fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
-          mp_copy(&a, &d);
-          mp_add(&d, &b, &d);
-          if (mp_cmp(&c, &d) != MP_EQ) {
-             printf("add %lu failure!\n", add_n);
-draw(&a);draw(&b);draw(&c);draw(&d);
-             return 0;
-          }
+	 mp_div_2d(&a, rr, &a, &e);
+	 a.sign = b.sign;
+	 if (a.used == b.used && a.used == 0) {
+	    a.sign = b.sign = MP_ZPOS;
+	 }
+	 if (mp_cmp(&a, &b) != MP_EQ) {
+	    printf("div2d failed, rr == %d\n", rr);
+	    draw(&a);
+	    draw(&b);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "add")) {
+	 ++add_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&c, buf, 64);
+	 mp_copy(&a, &d);
+	 mp_add(&d, &b, &d);
+	 if (mp_cmp(&c, &d) != MP_EQ) {
+	    printf("add %lu failure!\n", add_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    draw(&d);
+	    return 0;
+	 }
 
-          /* test the sign/unsigned storage functions */
+	 /* test the sign/unsigned storage functions */
 
-          rr = mp_signed_bin_size(&c);
-          mp_to_signed_bin(&c, (unsigned char *)cmd);
-          memset(cmd+rr, rand()&255, sizeof(cmd)-rr);
-          mp_read_signed_bin(&d, (unsigned char *)cmd, rr);
-          if (mp_cmp(&c, &d) != MP_EQ) {
-             printf("mp_signed_bin failure!\n");
-             draw(&c);
-             draw(&d);
-             return 0;
-          }
+	 rr = mp_signed_bin_size(&c);
+	 mp_to_signed_bin(&c, (unsigned char *) cmd);
+	 memset(cmd + rr, rand() & 255, sizeof(cmd) - rr);
+	 mp_read_signed_bin(&d, (unsigned char *) cmd, rr);
+	 if (mp_cmp(&c, &d) != MP_EQ) {
+	    printf("mp_signed_bin failure!\n");
+	    draw(&c);
+	    draw(&d);
+	    return 0;
+	 }
 
 
-          rr = mp_unsigned_bin_size(&c);
-          mp_to_unsigned_bin(&c, (unsigned char *)cmd);
-          memset(cmd+rr, rand()&255, sizeof(cmd)-rr);
-          mp_read_unsigned_bin(&d, (unsigned char *)cmd, rr);
-          if (mp_cmp_mag(&c, &d) != MP_EQ) {
-             printf("mp_unsigned_bin failure!\n");
-             draw(&c);
-             draw(&d);
-             return 0;
-          }
+	 rr = mp_unsigned_bin_size(&c);
+	 mp_to_unsigned_bin(&c, (unsigned char *) cmd);
+	 memset(cmd + rr, rand() & 255, sizeof(cmd) - rr);
+	 mp_read_unsigned_bin(&d, (unsigned char *) cmd, rr);
+	 if (mp_cmp_mag(&c, &d) != MP_EQ) {
+	    printf("mp_unsigned_bin failure!\n");
+	    draw(&c);
+	    draw(&d);
+	    return 0;
+	 }
 
-       } else if (!strcmp(cmd, "sub")) { ++sub_n;
-          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
-          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
-          fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
-          mp_copy(&a, &d);
-          mp_sub(&d, &b, &d);
-          if (mp_cmp(&c, &d) != MP_EQ) {
-             printf("sub %lu failure!\n", sub_n);
-draw(&a);draw(&b);draw(&c);draw(&d);
-             return 0;
-          }
-       } else if (!strcmp(cmd, "mul")) { ++mul_n;
-          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
-          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
-          fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
-          mp_copy(&a, &d);
-          mp_mul(&d, &b, &d);
-          if (mp_cmp(&c, &d) != MP_EQ) {
-             printf("mul %lu failure!\n", mul_n);
-draw(&a);draw(&b);draw(&c);draw(&d);
-             return 0;
-          }
-       } else if (!strcmp(cmd, "div")) { ++div_n;
-          fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64);
-          fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64);
-          fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 64);
-          fgets(buf, 4095, stdin); mp_read_radix(&d, buf, 64);
+      } else if (!strcmp(cmd, "sub")) {
+	 ++sub_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&c, buf, 64);
+	 mp_copy(&a, &d);
+	 mp_sub(&d, &b, &d);
+	 if (mp_cmp(&c, &d) != MP_EQ) {
+	    printf("sub %lu failure!\n", sub_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    draw(&d);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "mul")) {
+	 ++mul_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&c, buf, 64);
+	 mp_copy(&a, &d);
+	 mp_mul(&d, &b, &d);
+	 if (mp_cmp(&c, &d) != MP_EQ) {
+	    printf("mul %lu failure!\n", mul_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    draw(&d);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "div")) {
+	 ++div_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&c, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&d, buf, 64);
 
-          mp_div(&a, &b, &e, &f);
-          if (mp_cmp(&c, &e) != MP_EQ || mp_cmp(&d, &f) != MP_EQ) {
-             printf("div %lu failure!\n", div_n);
-draw(&a);draw(&b);draw(&c);draw(&d); draw(&e); draw(&f);
-             return 0;
-          }
+	 mp_div(&a, &b, &e, &f);
+	 if (mp_cmp(&c, &e) != MP_EQ || mp_cmp(&d, &f) != MP_EQ) {
+	    printf("div %lu %d, %d, failure!\n", div_n, mp_cmp(&c, &e),
+		   mp_cmp(&d, &f));
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    draw(&d);
+	    draw(&e);
+	    draw(&f);
+	    return 0;
+	 }
 
-       } else if (!strcmp(cmd, "sqr")) { ++sqr_n;
-          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
-          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
-          mp_copy(&a, &c);
-          mp_sqr(&c, &c);
-          if (mp_cmp(&b, &c) != MP_EQ) {
-             printf("sqr %lu failure!\n", sqr_n);
-draw(&a);draw(&b);draw(&c);
-             return 0;
-          }
-       } else if (!strcmp(cmd, "gcd")) { ++gcd_n;
-          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
-          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
-          fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
-          mp_copy(&a, &d);
-          mp_gcd(&d, &b, &d);
-          d.sign = c.sign;
-          if (mp_cmp(&c, &d) != MP_EQ) {
-             printf("gcd %lu failure!\n", gcd_n);
-draw(&a);draw(&b);draw(&c);draw(&d);
-             return 0;
-          }
-       } else if (!strcmp(cmd, "lcm")) { ++lcm_n;
-             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
-             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
-             fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
-             mp_copy(&a, &d);
-             mp_lcm(&d, &b, &d);
-             d.sign = c.sign;
-             if (mp_cmp(&c, &d) != MP_EQ) {
-                printf("lcm %lu failure!\n", lcm_n);
-   draw(&a);draw(&b);draw(&c);draw(&d);
-                return 0;
-             }
-       } else if (!strcmp(cmd, "expt")) {  ++expt_n;
-             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
-             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
-             fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
-             fgets(buf, 4095, stdin);  mp_read_radix(&d, buf, 64);
-             mp_copy(&a, &e);
-             mp_exptmod(&e, &b, &c, &e);
-             if (mp_cmp(&d, &e) != MP_EQ) {
-                printf("expt %lu failure!\n", expt_n);
-   draw(&a);draw(&b);draw(&c);draw(&d); draw(&e);
-                return 0;
-             }
-       } else if (!strcmp(cmd, "invmod")) {  ++inv_n;
-             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
-             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
-             fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
-             mp_invmod(&a, &b, &d);
-             mp_mulmod(&d,&a,&b,&e);
-             if (mp_cmp_d(&e, 1) != MP_EQ) {
-                printf("inv [wrong value from MPI?!] failure\n");
-                draw(&a);draw(&b);draw(&c);draw(&d);
-                mp_gcd(&a, &b, &e);
-                draw(&e);
-                return 0;
-             }
+      } else if (!strcmp(cmd, "sqr")) {
+	 ++sqr_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 mp_copy(&a, &c);
+	 mp_sqr(&c, &c);
+	 if (mp_cmp(&b, &c) != MP_EQ) {
+	    printf("sqr %lu failure!\n", sqr_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "gcd")) {
+	 ++gcd_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&c, buf, 64);
+	 mp_copy(&a, &d);
+	 mp_gcd(&d, &b, &d);
+	 d.sign = c.sign;
+	 if (mp_cmp(&c, &d) != MP_EQ) {
+	    printf("gcd %lu failure!\n", gcd_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    draw(&d);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "lcm")) {
+	 ++lcm_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&c, buf, 64);
+	 mp_copy(&a, &d);
+	 mp_lcm(&d, &b, &d);
+	 d.sign = c.sign;
+	 if (mp_cmp(&c, &d) != MP_EQ) {
+	    printf("lcm %lu failure!\n", lcm_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    draw(&d);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "expt")) {
+	 ++expt_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&c, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&d, buf, 64);
+	 mp_copy(&a, &e);
+	 mp_exptmod(&e, &b, &c, &e);
+	 if (mp_cmp(&d, &e) != MP_EQ) {
+	    printf("expt %lu failure!\n", expt_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    draw(&d);
+	    draw(&e);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "invmod")) {
+	 ++inv_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&c, buf, 64);
+	 mp_invmod(&a, &b, &d);
+	 mp_mulmod(&d, &a, &b, &e);
+	 if (mp_cmp_d(&e, 1) != MP_EQ) {
+	    printf("inv [wrong value from MPI?!] failure\n");
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    draw(&d);
+	    mp_gcd(&a, &b, &e);
+	    draw(&e);
+	    return 0;
+	 }
 
-       } else if (!strcmp(cmd, "div2")) { ++div2_n;
-             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
-             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
-             mp_div_2(&a, &c);
-             if (mp_cmp(&c, &b) != MP_EQ) {
-                 printf("div_2 %lu failure\n", div2_n);
-                 draw(&a);
-                 draw(&b);
-                 draw(&c);
-                 return 0;
-             }
-       } else if (!strcmp(cmd, "mul2")) { ++mul2_n;
-             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
-             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
-             mp_mul_2(&a, &c);
-             if (mp_cmp(&c, &b) != MP_EQ) {
-                 printf("mul_2 %lu failure\n", mul2_n);
-                 draw(&a);
-                 draw(&b);
-                 draw(&c);
-                 return 0;
-             }
-       } else if (!strcmp(cmd, "add_d")) { ++add_d_n;
-              fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64);
-              fgets(buf, 4095, stdin); sscanf(buf, "%d", &ix);
-              fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64);
-              mp_add_d(&a, ix, &c);
-              if (mp_cmp(&b, &c) != MP_EQ) {
-                 printf("add_d %lu failure\n", add_d_n);
-                 draw(&a);
-                 draw(&b);
-                 draw(&c);
-                 printf("d == %d\n", ix);
-                 return 0;
-              }
-       } else if (!strcmp(cmd, "sub_d")) { ++sub_d_n;
-              fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64);
-              fgets(buf, 4095, stdin); sscanf(buf, "%d", &ix);
-              fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64);
-              mp_sub_d(&a, ix, &c);
-              if (mp_cmp(&b, &c) != MP_EQ) {
-                 printf("sub_d %lu failure\n", sub_d_n);
-                 draw(&a);
-                 draw(&b);
-                 draw(&c);
-                 printf("d == %d\n", ix);
-                 return 0;
-              }
-       }
+      } else if (!strcmp(cmd, "div2")) {
+	 ++div2_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 mp_div_2(&a, &c);
+	 if (mp_cmp(&c, &b) != MP_EQ) {
+	    printf("div_2 %lu failure\n", div2_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "mul2")) {
+	 ++mul2_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 mp_mul_2(&a, &c);
+	 if (mp_cmp(&c, &b) != MP_EQ) {
+	    printf("mul_2 %lu failure\n", mul2_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "add_d")) {
+	 ++add_d_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 sscanf(buf, "%d", &ix);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 mp_add_d(&a, ix, &c);
+	 if (mp_cmp(&b, &c) != MP_EQ) {
+	    printf("add_d %lu failure\n", add_d_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    printf("d == %d\n", ix);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "sub_d")) {
+	 ++sub_d_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 sscanf(buf, "%d", &ix);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 mp_sub_d(&a, ix, &c);
+	 if (mp_cmp(&b, &c) != MP_EQ) {
+	    printf("sub_d %lu failure\n", sub_d_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    printf("d == %d\n", ix);
+	    return 0;
+	 }
+      }
    }
    return 0;
 }
-
--- a/demo/timing.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/demo/timing.c	Fri May 06 08:59:30 2005 +0000
@@ -11,15 +11,16 @@
 #endif
 
 
-void ndraw(mp_int *a, char *name)
+void ndraw(mp_int * a, char *name)
 {
    char buf[4096];
+
    printf("%s: ", name);
    mp_toradix(a, buf, 64);
    printf("%s\n", buf);
 }
 
-static void draw(mp_int *a)
+static void draw(mp_int * a)
 {
    ndraw(a, "");
 }
@@ -38,40 +39,39 @@
    }
 }
 
-#if defined(__i386__) || defined(_M_IX86) || defined(_M_AMD64)
 /* RDTSC from Scott Duplichan */
-static ulong64 TIMFUNC (void)
-   {
-   #if defined __GNUC__
-      #ifdef __i386__
-         ulong64 a;
-         __asm__ __volatile__ ("rdtsc ":"=A" (a));
-         return a;
-      #else /* gcc-IA64 version */
-         unsigned long result;
-         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
-         while (__builtin_expect ((int) result == -1, 0))
-         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
-         return result;
-      #endif
+static ulong64 TIMFUNC(void)
+{
+#if defined __GNUC__
+#if defined(__i386__) || defined(__x86_64__)
+   unsigned long long a;
+   __asm__ __volatile__("rdtsc\nmovl %%eax,%0\nmovl %%edx,4+%0\n"::
+			"m"(a):"%eax", "%edx");
+   return a;
+#else /* gcc-IA64 version */
+   unsigned long result;
+   __asm__ __volatile__("mov %0=ar.itc":"=r"(result)::"memory");
+
+   while (__builtin_expect((int) result == -1, 0))
+      __asm__ __volatile__("mov %0=ar.itc":"=r"(result)::"memory");
+
+   return result;
+#endif
 
    // Microsoft and Intel Windows compilers
-   #elif defined _M_IX86
-     __asm rdtsc
-   #elif defined _M_AMD64
-     return __rdtsc ();
-   #elif defined _M_IA64
-     #if defined __INTEL_COMPILER
-       #include <ia64intrin.h>
-     #endif
-      return __getReg (3116);
-   #else
-     #error need rdtsc function for this build
-   #endif
-   }
+#elif defined _M_IX86
+   __asm rdtsc
+#elif defined _M_AMD64
+   return __rdtsc();
+#elif defined _M_IA64
+#if defined __INTEL_COMPILER
+#include <ia64intrin.h>
+#endif
+   return __getReg(3116);
 #else
-#define TIMFUNC clock
+#error need rdtsc function for this build
 #endif
+}
 
 #define DO(x) x; x;
 //#define DO4(x) DO2(x); DO2(x);
@@ -81,7 +81,7 @@
 int main(void)
 {
    ulong64 tt, gg, CLK_PER_SEC;
-   FILE *log, *logb, *logc;
+   FILE *log, *logb, *logc, *logd;
    mp_int a, b, c, d, e, f;
    int n, cnt, ix, old_kara_m, old_kara_s;
    unsigned rr;
@@ -94,168 +94,191 @@
    mp_init(&f);
 
    srand(time(NULL));
- 
+
+
+   /* temp. turn off TOOM */
+   TOOM_MUL_CUTOFF = TOOM_SQR_CUTOFF = 100000;
+
+   CLK_PER_SEC = TIMFUNC();
+   sleep(1);
+   CLK_PER_SEC = TIMFUNC() - CLK_PER_SEC;
 
-      /* temp. turn off TOOM */
-      TOOM_MUL_CUTOFF = TOOM_SQR_CUTOFF = 100000;
-
-      CLK_PER_SEC = TIMFUNC();
-      sleep(1);
-      CLK_PER_SEC = TIMFUNC() - CLK_PER_SEC;
+   printf("CLK_PER_SEC == %llu\n", CLK_PER_SEC);
+   goto exptmod;
+   log = fopen("logs/add.log", "w");
+   for (cnt = 8; cnt <= 128; cnt += 8) {
+      SLEEP;
+      mp_rand(&a, cnt);
+      mp_rand(&b, cnt);
+      rr = 0;
+      tt = -1;
+      do {
+	 gg = TIMFUNC();
+	 DO(mp_add(&a, &b, &c));
+	 gg = (TIMFUNC() - gg) >> 1;
+	 if (tt > gg)
+	    tt = gg;
+      } while (++rr < 100000);
+      printf("Adding\t\t%4d-bit => %9llu/sec, %9llu cycles\n",
+	     mp_count_bits(&a), CLK_PER_SEC / tt, tt);
+      fprintf(log, "%d %9llu\n", cnt * DIGIT_BIT, tt);
+      fflush(log);
+   }
+   fclose(log);
 
-      printf("CLK_PER_SEC == %llu\n", CLK_PER_SEC);
-      
-      log = fopen("logs/add.log", "w");
-      for (cnt = 8; cnt <= 128; cnt += 8) {
-         SLEEP;
-         mp_rand(&a, cnt);
-         mp_rand(&b, cnt);
-         rr = 0;
-         tt = -1;
-         do {
-            gg = TIMFUNC();
-            DO(mp_add(&a,&b,&c));
-            gg = (TIMFUNC() - gg)>>1;
-            if (tt > gg) tt = gg;
-         } while (++rr < 100000);
-         printf("Adding\t\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/tt, tt);
-         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, tt); fflush(log);
+   log = fopen("logs/sub.log", "w");
+   for (cnt = 8; cnt <= 128; cnt += 8) {
+      SLEEP;
+      mp_rand(&a, cnt);
+      mp_rand(&b, cnt);
+      rr = 0;
+      tt = -1;
+      do {
+	 gg = TIMFUNC();
+	 DO(mp_sub(&a, &b, &c));
+	 gg = (TIMFUNC() - gg) >> 1;
+	 if (tt > gg)
+	    tt = gg;
+      } while (++rr < 100000);
+
+      printf("Subtracting\t\t%4d-bit => %9llu/sec, %9llu cycles\n",
+	     mp_count_bits(&a), CLK_PER_SEC / tt, tt);
+      fprintf(log, "%d %9llu\n", cnt * DIGIT_BIT, tt);
+      fflush(log);
+   }
+   fclose(log);
+
+   /* do mult/square twice, first without karatsuba and second with */
+ multtest:
+   old_kara_m = KARATSUBA_MUL_CUTOFF;
+   old_kara_s = KARATSUBA_SQR_CUTOFF;
+   for (ix = 0; ix < 2; ix++) {
+      printf("With%s Karatsuba\n", (ix == 0) ? "out" : "");
+
+      KARATSUBA_MUL_CUTOFF = (ix == 0) ? 9999 : old_kara_m;
+      KARATSUBA_SQR_CUTOFF = (ix == 0) ? 9999 : old_kara_s;
+
+      log = fopen((ix == 0) ? "logs/mult.log" : "logs/mult_kara.log", "w");
+      for (cnt = 4; cnt <= 10240 / DIGIT_BIT; cnt += 2) {
+	 SLEEP;
+	 mp_rand(&a, cnt);
+	 mp_rand(&b, cnt);
+	 rr = 0;
+	 tt = -1;
+	 do {
+	    gg = TIMFUNC();
+	    DO(mp_mul(&a, &b, &c));
+	    gg = (TIMFUNC() - gg) >> 1;
+	    if (tt > gg)
+	       tt = gg;
+	 } while (++rr < 100);
+	 printf("Multiplying\t%4d-bit => %9llu/sec, %9llu cycles\n",
+		mp_count_bits(&a), CLK_PER_SEC / tt, tt);
+	 fprintf(log, "%d %9llu\n", mp_count_bits(&a), tt);
+	 fflush(log);
       }
       fclose(log);
 
-      log = fopen("logs/sub.log", "w");
-      for (cnt = 8; cnt <= 128; cnt += 8) {
-         SLEEP;
-         mp_rand(&a, cnt);
-         mp_rand(&b, cnt);
-         rr = 0;
-         tt = -1;
-         do {
-            gg = TIMFUNC();
-            DO(mp_sub(&a,&b,&c));
-            gg = (TIMFUNC() - gg)>>1;
-            if (tt > gg) tt = gg;
-         } while (++rr < 100000);
-
-         printf("Subtracting\t\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/tt, tt);
-         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, tt);  fflush(log);
-      }
-      fclose(log);
-
-   /* do mult/square twice, first without karatsuba and second with */
-   old_kara_m = KARATSUBA_MUL_CUTOFF;
-   old_kara_s = KARATSUBA_SQR_CUTOFF;
-   for (ix = 0; ix < 1; ix++) {
-      printf("With%s Karatsuba\n", (ix==0)?"out":"");
-
-      KARATSUBA_MUL_CUTOFF = (ix==0)?9999:old_kara_m;
-      KARATSUBA_SQR_CUTOFF = (ix==0)?9999:old_kara_s;
-
-      log = fopen((ix==0)?"logs/mult.log":"logs/mult_kara.log", "w");
-      for (cnt = 4; cnt <= 288; cnt += 2) {
-         SLEEP;
-         mp_rand(&a, cnt);
-         mp_rand(&b, cnt);
-         rr = 0;
-         tt = -1;
-         do {
-            gg = TIMFUNC();
-            DO(mp_mul(&a, &b, &c));
-            gg = (TIMFUNC() - gg)>>1;
-            if (tt > gg) tt = gg;
-         } while (++rr < 100);
-         printf("Multiplying\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/tt, tt);
-         fprintf(log, "%d %9llu\n", mp_count_bits(&a), tt);  fflush(log);
-      }
-      fclose(log);
-
-      log = fopen((ix==0)?"logs/sqr.log":"logs/sqr_kara.log", "w");
-      for (cnt = 4; cnt <= 288; cnt += 2) {
-         SLEEP;
-         mp_rand(&a, cnt);
-         rr = 0;
-         tt = -1;
-         do {
-            gg = TIMFUNC();
-            DO(mp_sqr(&a, &b));
-            gg = (TIMFUNC() - gg)>>1;
-            if (tt > gg) tt = gg;
-         } while (++rr < 100);
-         printf("Squaring\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/tt, tt);
-         fprintf(log, "%d %9llu\n", mp_count_bits(&a), tt);  fflush(log);
+      log = fopen((ix == 0) ? "logs/sqr.log" : "logs/sqr_kara.log", "w");
+      for (cnt = 4; cnt <= 10240 / DIGIT_BIT; cnt += 2) {
+	 SLEEP;
+	 mp_rand(&a, cnt);
+	 rr = 0;
+	 tt = -1;
+	 do {
+	    gg = TIMFUNC();
+	    DO(mp_sqr(&a, &b));
+	    gg = (TIMFUNC() - gg) >> 1;
+	    if (tt > gg)
+	       tt = gg;
+	 } while (++rr < 100);
+	 printf("Squaring\t%4d-bit => %9llu/sec, %9llu cycles\n",
+		mp_count_bits(&a), CLK_PER_SEC / tt, tt);
+	 fprintf(log, "%d %9llu\n", mp_count_bits(&a), tt);
+	 fflush(log);
       }
       fclose(log);
 
    }
+ exptmod:
 
-  {
+   {
       char *primes[] = {
-         /* 2K moduli mersenne primes */
-         "6864797660130609714981900799081393217269435300143305409394463459185543183397656052122559640661454554977296311391480858037121987999716643812574028291115057151",
-         "531137992816767098689588206552468627329593117727031923199444138200403559860852242739162502265229285668889329486246501015346579337652707239409519978766587351943831270835393219031728127",
-         "10407932194664399081925240327364085538615262247266704805319112350403608059673360298012239441732324184842421613954281007791383566248323464908139906605677320762924129509389220345773183349661583550472959420547689811211693677147548478866962501384438260291732348885311160828538416585028255604666224831890918801847068222203140521026698435488732958028878050869736186900714720710555703168729087",
-         "1475979915214180235084898622737381736312066145333169775147771216478570297878078949377407337049389289382748507531496480477281264838760259191814463365330269540496961201113430156902396093989090226259326935025281409614983499388222831448598601834318536230923772641390209490231836446899608210795482963763094236630945410832793769905399982457186322944729636418890623372171723742105636440368218459649632948538696905872650486914434637457507280441823676813517852099348660847172579408422316678097670224011990280170474894487426924742108823536808485072502240519452587542875349976558572670229633962575212637477897785501552646522609988869914013540483809865681250419497686697771007",
-         "259117086013202627776246767922441530941818887553125427303974923161874019266586362086201209516800483406550695241733194177441689509238807017410377709597512042313066624082916353517952311186154862265604547691127595848775610568757931191017711408826252153849035830401185072116424747461823031471398340229288074545677907941037288235820705892351068433882986888616658650280927692080339605869308790500409503709875902119018371991620994002568935113136548829739112656797303241986517250116412703509705427773477972349821676443446668383119322540099648994051790241624056519054483690809616061625743042361721863339415852426431208737266591962061753535748892894599629195183082621860853400937932839420261866586142503251450773096274235376822938649407127700846077124211823080804139298087057504713825264571448379371125032081826126566649084251699453951887789613650248405739378594599444335231188280123660406262468609212150349937584782292237144339628858485938215738821232393687046160677362909315071",
-         "190797007524439073807468042969529173669356994749940177394741882673528979787005053706368049835514900244303495954950709725762186311224148828811920216904542206960744666169364221195289538436845390250168663932838805192055137154390912666527533007309292687539092257043362517857366624699975402375462954490293259233303137330643531556539739921926201438606439020075174723029056838272505051571967594608350063404495977660656269020823960825567012344189908927956646011998057988548630107637380993519826582389781888135705408653045219655801758081251164080554609057468028203308718724654081055323215860189611391296030471108443146745671967766308925858547271507311563765171008318248647110097614890313562856541784154881743146033909602737947385055355960331855614540900081456378659068370317267696980001187750995491090350108417050917991562167972281070161305972518044872048331306383715094854938415738549894606070722584737978176686422134354526989443028353644037187375385397838259511833166416134323695660367676897722287918773420968982326089026150031515424165462111337527431154890666327374921446276833564519776797633875503548665093914556482031482248883127023777039667707976559857333357013727342079099064400455741830654320379350833236245819348824064783585692924881021978332974949906122664421376034687815350484991",
+	 /* 2K large moduli */
+	 "179769313486231590772930519078902473361797697894230657273430081157732675805500963132708477322407536021120113879871393357658789768814416622492847430639474124377767893424865485276302219601246094119453082952085005768838150682342462881473913110540827237163350510684586239334100047359817950870678242457666208137217",
+	 "32317006071311007300714876688669951960444102669715484032130345427524655138867890893197201411522913463688717960921898019494119559150490921095088152386448283120630877367300996091750197750389652106796057638384067568276792218642619756161838094338476170470581645852036305042887575891541065808607552399123930385521914333389668342420684974786564569494856176035326322058077805659331026192708460314150258592864177116725943603718461857357598351152301645904403697613233287231227125684710820209725157101726931323469678542580656697935045997268352998638099733077152121140120031150424541696791951097529546801429027668869927491725169",
+	 "1044388881413152506691752710716624382579964249047383780384233483283953907971557456848826811934997558340890106714439262837987573438185793607263236087851365277945956976543709998340361590134383718314428070011855946226376318839397712745672334684344586617496807908705803704071284048740118609114467977783598029006686938976881787785946905630190260940599579453432823469303026696443059025015972399867714215541693835559885291486318237914434496734087811872639496475100189041349008417061675093668333850551032972088269550769983616369411933015213796825837188091833656751221318492846368125550225998300412344784862595674492194617023806505913245610825731835380087608622102834270197698202313169017678006675195485079921636419370285375124784014907159135459982790513399611551794271106831134090584272884279791554849782954323534517065223269061394905987693002122963395687782878948440616007412945674919823050571642377154816321380631045902916136926708342856440730447899971901781465763473223850267253059899795996090799469201774624817718449867455659250178329070473119433165550807568221846571746373296884912819520317457002440926616910874148385078411929804522981857338977648103126085902995208257421855249796721729039744118165938433694823325696642096892124547425283",
+	 /* 2K moduli mersenne primes */
+	 "6864797660130609714981900799081393217269435300143305409394463459185543183397656052122559640661454554977296311391480858037121987999716643812574028291115057151",
+	 "531137992816767098689588206552468627329593117727031923199444138200403559860852242739162502265229285668889329486246501015346579337652707239409519978766587351943831270835393219031728127",
+	 "10407932194664399081925240327364085538615262247266704805319112350403608059673360298012239441732324184842421613954281007791383566248323464908139906605677320762924129509389220345773183349661583550472959420547689811211693677147548478866962501384438260291732348885311160828538416585028255604666224831890918801847068222203140521026698435488732958028878050869736186900714720710555703168729087",
+	 "1475979915214180235084898622737381736312066145333169775147771216478570297878078949377407337049389289382748507531496480477281264838760259191814463365330269540496961201113430156902396093989090226259326935025281409614983499388222831448598601834318536230923772641390209490231836446899608210795482963763094236630945410832793769905399982457186322944729636418890623372171723742105636440368218459649632948538696905872650486914434637457507280441823676813517852099348660847172579408422316678097670224011990280170474894487426924742108823536808485072502240519452587542875349976558572670229633962575212637477897785501552646522609988869914013540483809865681250419497686697771007",
+	 "259117086013202627776246767922441530941818887553125427303974923161874019266586362086201209516800483406550695241733194177441689509238807017410377709597512042313066624082916353517952311186154862265604547691127595848775610568757931191017711408826252153849035830401185072116424747461823031471398340229288074545677907941037288235820705892351068433882986888616658650280927692080339605869308790500409503709875902119018371991620994002568935113136548829739112656797303241986517250116412703509705427773477972349821676443446668383119322540099648994051790241624056519054483690809616061625743042361721863339415852426431208737266591962061753535748892894599629195183082621860853400937932839420261866586142503251450773096274235376822938649407127700846077124211823080804139298087057504713825264571448379371125032081826126566649084251699453951887789613650248405739378594599444335231188280123660406262468609212150349937584782292237144339628858485938215738821232393687046160677362909315071",
+	 "190797007524439073807468042969529173669356994749940177394741882673528979787005053706368049835514900244303495954950709725762186311224148828811920216904542206960744666169364221195289538436845390250168663932838805192055137154390912666527533007309292687539092257043362517857366624699975402375462954490293259233303137330643531556539739921926201438606439020075174723029056838272505051571967594608350063404495977660656269020823960825567012344189908927956646011998057988548630107637380993519826582389781888135705408653045219655801758081251164080554609057468028203308718724654081055323215860189611391296030471108443146745671967766308925858547271507311563765171008318248647110097614890313562856541784154881743146033909602737947385055355960331855614540900081456378659068370317267696980001187750995491090350108417050917991562167972281070161305972518044872048331306383715094854938415738549894606070722584737978176686422134354526989443028353644037187375385397838259511833166416134323695660367676897722287918773420968982326089026150031515424165462111337527431154890666327374921446276833564519776797633875503548665093914556482031482248883127023777039667707976559857333357013727342079099064400455741830654320379350833236245819348824064783585692924881021978332974949906122664421376034687815350484991",
 
-         /* DR moduli */
-         "14059105607947488696282932836518693308967803494693489478439861164411992439598399594747002144074658928593502845729752797260025831423419686528151609940203368612079",
-         "101745825697019260773923519755878567461315282017759829107608914364075275235254395622580447400994175578963163918967182013639660669771108475957692810857098847138903161308502419410142185759152435680068435915159402496058513611411688900243039",
-         "736335108039604595805923406147184530889923370574768772191969612422073040099331944991573923112581267542507986451953227192970402893063850485730703075899286013451337291468249027691733891486704001513279827771740183629161065194874727962517148100775228363421083691764065477590823919364012917984605619526140821797602431",
-         "38564998830736521417281865696453025806593491967131023221754800625044118265468851210705360385717536794615180260494208076605798671660719333199513807806252394423283413430106003596332513246682903994829528690198205120921557533726473585751382193953592127439965050261476810842071573684505878854588706623484573925925903505747545471088867712185004135201289273405614415899438276535626346098904241020877974002916168099951885406379295536200413493190419727789712076165162175783",
-         "542189391331696172661670440619180536749994166415993334151601745392193484590296600979602378676624808129613777993466242203025054573692562689251250471628358318743978285860720148446448885701001277560572526947619392551574490839286458454994488665744991822837769918095117129546414124448777033941223565831420390846864429504774477949153794689948747680362212954278693335653935890352619041936727463717926744868338358149568368643403037768649616778526013610493696186055899318268339432671541328195724261329606699831016666359440874843103020666106568222401047720269951530296879490444224546654729111504346660859907296364097126834834235287147",
-         "1487259134814709264092032648525971038895865645148901180585340454985524155135260217788758027400478312256339496385275012465661575576202252063145698732079880294664220579764848767704076761853197216563262660046602703973050798218246170835962005598561669706844469447435461092542265792444947706769615695252256130901271870341005768912974433684521436211263358097522726462083917939091760026658925757076733484173202927141441492573799914240222628795405623953109131594523623353044898339481494120112723445689647986475279242446083151413667587008191682564376412347964146113898565886683139407005941383669325997475076910488086663256335689181157957571445067490187939553165903773554290260531009121879044170766615232300936675369451260747671432073394867530820527479172464106442450727640226503746586340279816318821395210726268291535648506190714616083163403189943334431056876038286530365757187367147446004855912033137386225053275419626102417236133948503",
-         "1095121115716677802856811290392395128588168592409109494900178008967955253005183831872715423151551999734857184538199864469605657805519106717529655044054833197687459782636297255219742994736751541815269727940751860670268774903340296040006114013971309257028332849679096824800250742691718610670812374272414086863715763724622797509437062518082383056050144624962776302147890521249477060215148275163688301275847155316042279405557632639366066847442861422164832655874655824221577849928863023018366835675399949740429332468186340518172487073360822220449055340582568461568645259954873303616953776393853174845132081121976327462740354930744487429617202585015510744298530101547706821590188733515880733527449780963163909830077616357506845523215289297624086914545378511082534229620116563260168494523906566709418166011112754529766183554579321224940951177394088465596712620076240067370589036924024728375076210477267488679008016579588696191194060127319035195370137160936882402244399699172017835144537488486396906144217720028992863941288217185353914991583400421682751000603596655790990815525126154394344641336397793791497068253936771017031980867706707490224041075826337383538651825493679503771934836094655802776331664261631740148281763487765852746577808019633679",
+	 /* DR moduli */
+	 "14059105607947488696282932836518693308967803494693489478439861164411992439598399594747002144074658928593502845729752797260025831423419686528151609940203368612079",
+	 "101745825697019260773923519755878567461315282017759829107608914364075275235254395622580447400994175578963163918967182013639660669771108475957692810857098847138903161308502419410142185759152435680068435915159402496058513611411688900243039",
+	 "736335108039604595805923406147184530889923370574768772191969612422073040099331944991573923112581267542507986451953227192970402893063850485730703075899286013451337291468249027691733891486704001513279827771740183629161065194874727962517148100775228363421083691764065477590823919364012917984605619526140821797602431",
+	 "38564998830736521417281865696453025806593491967131023221754800625044118265468851210705360385717536794615180260494208076605798671660719333199513807806252394423283413430106003596332513246682903994829528690198205120921557533726473585751382193953592127439965050261476810842071573684505878854588706623484573925925903505747545471088867712185004135201289273405614415899438276535626346098904241020877974002916168099951885406379295536200413493190419727789712076165162175783",
+	 "542189391331696172661670440619180536749994166415993334151601745392193484590296600979602378676624808129613777993466242203025054573692562689251250471628358318743978285860720148446448885701001277560572526947619392551574490839286458454994488665744991822837769918095117129546414124448777033941223565831420390846864429504774477949153794689948747680362212954278693335653935890352619041936727463717926744868338358149568368643403037768649616778526013610493696186055899318268339432671541328195724261329606699831016666359440874843103020666106568222401047720269951530296879490444224546654729111504346660859907296364097126834834235287147",
+	 "1487259134814709264092032648525971038895865645148901180585340454985524155135260217788758027400478312256339496385275012465661575576202252063145698732079880294664220579764848767704076761853197216563262660046602703973050798218246170835962005598561669706844469447435461092542265792444947706769615695252256130901271870341005768912974433684521436211263358097522726462083917939091760026658925757076733484173202927141441492573799914240222628795405623953109131594523623353044898339481494120112723445689647986475279242446083151413667587008191682564376412347964146113898565886683139407005941383669325997475076910488086663256335689181157957571445067490187939553165903773554290260531009121879044170766615232300936675369451260747671432073394867530820527479172464106442450727640226503746586340279816318821395210726268291535648506190714616083163403189943334431056876038286530365757187367147446004855912033137386225053275419626102417236133948503",
+	 "1095121115716677802856811290392395128588168592409109494900178008967955253005183831872715423151551999734857184538199864469605657805519106717529655044054833197687459782636297255219742994736751541815269727940751860670268774903340296040006114013971309257028332849679096824800250742691718610670812374272414086863715763724622797509437062518082383056050144624962776302147890521249477060215148275163688301275847155316042279405557632639366066847442861422164832655874655824221577849928863023018366835675399949740429332468186340518172487073360822220449055340582568461568645259954873303616953776393853174845132081121976327462740354930744487429617202585015510744298530101547706821590188733515880733527449780963163909830077616357506845523215289297624086914545378511082534229620116563260168494523906566709418166011112754529766183554579321224940951177394088465596712620076240067370589036924024728375076210477267488679008016579588696191194060127319035195370137160936882402244399699172017835144537488486396906144217720028992863941288217185353914991583400421682751000603596655790990815525126154394344641336397793791497068253936771017031980867706707490224041075826337383538651825493679503771934836094655802776331664261631740148281763487765852746577808019633679",
 
-         /* generic unrestricted moduli */
-         "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
-         "2893527720709661239493896562339544088620375736490408468011883030469939904368086092336458298221245707898933583190713188177399401852627749210994595974791782790253946539043962213027074922559572312141181787434278708783207966459019479487",
-         "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319",
-         "47266428956356393164697365098120418976400602706072312735924071745438532218237979333351774907308168340693326687317443721193266215155735814510792148768576498491199122744351399489453533553203833318691678263241941706256996197460424029012419012634671862283532342656309677173602509498417976091509154360039893165037637034737020327399910409885798185771003505320583967737293415979917317338985837385734747478364242020380416892056650841470869294527543597349250299539682430605173321029026555546832473048600327036845781970289288898317888427517364945316709081173840186150794397479045034008257793436817683392375274635794835245695887",
-         "436463808505957768574894870394349739623346440601945961161254440072143298152040105676491048248110146278752857839930515766167441407021501229924721335644557342265864606569000117714935185566842453630868849121480179691838399545644365571106757731317371758557990781880691336695584799313313687287468894148823761785582982549586183756806449017542622267874275103877481475534991201849912222670102069951687572917937634467778042874315463238062009202992087620963771759666448266532858079402669920025224220613419441069718482837399612644978839925207109870840278194042158748845445131729137117098529028886770063736487420613144045836803985635654192482395882603511950547826439092832800532152534003936926017612446606135655146445620623395788978726744728503058670046885876251527122350275750995227",
-         "11424167473351836398078306042624362277956429440521137061889702611766348760692206243140413411077394583180726863277012016602279290144126785129569474909173584789822341986742719230331946072730319555984484911716797058875905400999504305877245849119687509023232790273637466821052576859232452982061831009770786031785669030271542286603956118755585683996118896215213488875253101894663403069677745948305893849505434201763745232895780711972432011344857521691017896316861403206449421332243658855453435784006517202894181640562433575390821384210960117518650374602256601091379644034244332285065935413233557998331562749140202965844219336298970011513882564935538704289446968322281451907487362046511461221329799897350993370560697505809686438782036235372137015731304779072430260986460269894522159103008260495503005267165927542949439526272736586626709581721032189532726389643625590680105784844246152702670169304203783072275089194754889511973916207",
-         "1214855636816562637502584060163403830270705000634713483015101384881871978446801224798536155406895823305035467591632531067547890948695117172076954220727075688048751022421198712032848890056357845974246560748347918630050853933697792254955890439720297560693579400297062396904306270145886830719309296352765295712183040773146419022875165382778007040109957609739589875590885701126197906063620133954893216612678838507540777138437797705602453719559017633986486649523611975865005712371194067612263330335590526176087004421363598470302731349138773205901447704682181517904064735636518462452242791676541725292378925568296858010151852326316777511935037531017413910506921922450666933202278489024521263798482237150056835746454842662048692127173834433089016107854491097456725016327709663199738238442164843147132789153725513257167915555162094970853584447993125488607696008169807374736711297007473812256272245489405898470297178738029484459690836250560495461579533254473316340608217876781986188705928270735695752830825527963838355419762516246028680280988020401914551825487349990306976304093109384451438813251211051597392127491464898797406789175453067960072008590614886532333015881171367104445044718144312416815712216611576221546455968770801413440778423979",
-         NULL
+	 /* generic unrestricted moduli */
+	 "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
+	 "2893527720709661239493896562339544088620375736490408468011883030469939904368086092336458298221245707898933583190713188177399401852627749210994595974791782790253946539043962213027074922559572312141181787434278708783207966459019479487",
+	 "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319",
+	 "47266428956356393164697365098120418976400602706072312735924071745438532218237979333351774907308168340693326687317443721193266215155735814510792148768576498491199122744351399489453533553203833318691678263241941706256996197460424029012419012634671862283532342656309677173602509498417976091509154360039893165037637034737020327399910409885798185771003505320583967737293415979917317338985837385734747478364242020380416892056650841470869294527543597349250299539682430605173321029026555546832473048600327036845781970289288898317888427517364945316709081173840186150794397479045034008257793436817683392375274635794835245695887",
+	 "436463808505957768574894870394349739623346440601945961161254440072143298152040105676491048248110146278752857839930515766167441407021501229924721335644557342265864606569000117714935185566842453630868849121480179691838399545644365571106757731317371758557990781880691336695584799313313687287468894148823761785582982549586183756806449017542622267874275103877481475534991201849912222670102069951687572917937634467778042874315463238062009202992087620963771759666448266532858079402669920025224220613419441069718482837399612644978839925207109870840278194042158748845445131729137117098529028886770063736487420613144045836803985635654192482395882603511950547826439092832800532152534003936926017612446606135655146445620623395788978726744728503058670046885876251527122350275750995227",
+	 "11424167473351836398078306042624362277956429440521137061889702611766348760692206243140413411077394583180726863277012016602279290144126785129569474909173584789822341986742719230331946072730319555984484911716797058875905400999504305877245849119687509023232790273637466821052576859232452982061831009770786031785669030271542286603956118755585683996118896215213488875253101894663403069677745948305893849505434201763745232895780711972432011344857521691017896316861403206449421332243658855453435784006517202894181640562433575390821384210960117518650374602256601091379644034244332285065935413233557998331562749140202965844219336298970011513882564935538704289446968322281451907487362046511461221329799897350993370560697505809686438782036235372137015731304779072430260986460269894522159103008260495503005267165927542949439526272736586626709581721032189532726389643625590680105784844246152702670169304203783072275089194754889511973916207",
+	 "1214855636816562637502584060163403830270705000634713483015101384881871978446801224798536155406895823305035467591632531067547890948695117172076954220727075688048751022421198712032848890056357845974246560748347918630050853933697792254955890439720297560693579400297062396904306270145886830719309296352765295712183040773146419022875165382778007040109957609739589875590885701126197906063620133954893216612678838507540777138437797705602453719559017633986486649523611975865005712371194067612263330335590526176087004421363598470302731349138773205901447704682181517904064735636518462452242791676541725292378925568296858010151852326316777511935037531017413910506921922450666933202278489024521263798482237150056835746454842662048692127173834433089016107854491097456725016327709663199738238442164843147132789153725513257167915555162094970853584447993125488607696008169807374736711297007473812256272245489405898470297178738029484459690836250560495461579533254473316340608217876781986188705928270735695752830825527963838355419762516246028680280988020401914551825487349990306976304093109384451438813251211051597392127491464898797406789175453067960072008590614886532333015881171367104445044718144312416815712216611576221546455968770801413440778423979",
+	 NULL
       };
-   log = fopen("logs/expt.log", "w");
-   logb = fopen("logs/expt_dr.log", "w");
-   logc = fopen("logs/expt_2k.log", "w");
-   for (n = 0; primes[n]; n++) {
-      SLEEP;
-      mp_read_radix(&a, primes[n], 10);
-      mp_zero(&b);
-      for (rr = 0; rr < (unsigned)mp_count_bits(&a); rr++) {
-         mp_mul_2(&b, &b);
-         b.dp[0] |= lbit();
-         b.used  += 1;
+      log = fopen("logs/expt.log", "w");
+      logb = fopen("logs/expt_dr.log", "w");
+      logc = fopen("logs/expt_2k.log", "w");
+      logd = fopen("logs/expt_2kl.log", "w");
+      for (n = 0; primes[n]; n++) {
+	 SLEEP;
+	 mp_read_radix(&a, primes[n], 10);
+	 mp_zero(&b);
+	 for (rr = 0; rr < (unsigned) mp_count_bits(&a); rr++) {
+	    mp_mul_2(&b, &b);
+	    b.dp[0] |= lbit();
+	    b.used += 1;
+	 }
+	 mp_sub_d(&a, 1, &c);
+	 mp_mod(&b, &c, &b);
+	 mp_set(&c, 3);
+	 rr = 0;
+	 tt = -1;
+	 do {
+	    gg = TIMFUNC();
+	    DO(mp_exptmod(&c, &b, &a, &d));
+	    gg = (TIMFUNC() - gg) >> 1;
+	    if (tt > gg)
+	       tt = gg;
+	 } while (++rr < 10);
+	 mp_sub_d(&a, 1, &e);
+	 mp_sub(&e, &b, &b);
+	 mp_exptmod(&c, &b, &a, &e);	/* c^(p-1-b) mod a */
+	 mp_mulmod(&e, &d, &a, &d);	/* c^b * c^(p-1-b) == c^p-1 == 1 */
+	 if (mp_cmp_d(&d, 1)) {
+	    printf("Different (%d)!!!\n", mp_count_bits(&a));
+	    draw(&d);
+	    exit(0);
+	 }
+	 printf("Exponentiating\t%4d-bit => %9llu/sec, %9llu cycles\n",
+		mp_count_bits(&a), CLK_PER_SEC / tt, tt);
+	 fprintf(n < 4 ? logd : (n < 9) ? logc : (n < 16) ? logb : log,
+		 "%d %9llu\n", mp_count_bits(&a), tt);
       }
-      mp_sub_d(&a, 1, &c);
-      mp_mod(&b, &c, &b);
-      mp_set(&c, 3);
-         rr = 0;
-         tt = -1;
-         do {
-            gg = TIMFUNC();
-            DO(mp_exptmod(&c, &b, &a, &d));
-            gg = (TIMFUNC() - gg)>>1;
-            if (tt > gg) tt = gg;
-         } while (++rr < 10);
-      mp_sub_d(&a, 1, &e);
-      mp_sub(&e, &b, &b);
-      mp_exptmod(&c, &b, &a, &e);  /* c^(p-1-b) mod a */
-      mp_mulmod(&e, &d, &a, &d);   /* c^b * c^(p-1-b) == c^p-1 == 1 */
-      if (mp_cmp_d(&d, 1)) {
-         printf("Different (%d)!!!\n", mp_count_bits(&a));
-         draw(&d);
-         exit(0);
-      }
-      printf("Exponentiating\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/tt, tt);
-      fprintf((n < 6) ? logc : (n < 13) ? logb : log, "%d %9llu\n", mp_count_bits(&a), tt);
-   }
    }
    fclose(log);
    fclose(logb);
    fclose(logc);
+   fclose(logd);
 
    log = fopen("logs/invmod.log", "w");
    for (cnt = 4; cnt <= 128; cnt += 4) {
@@ -264,28 +287,29 @@
       mp_rand(&b, cnt);
 
       do {
-         mp_add_d(&b, 1, &b);
-         mp_gcd(&a, &b, &c);
+	 mp_add_d(&b, 1, &b);
+	 mp_gcd(&a, &b, &c);
       } while (mp_cmp_d(&c, 1) != MP_EQ);
 
-         rr = 0;
-         tt = -1;
+      rr = 0;
+      tt = -1;
       do {
-         gg = TIMFUNC();
-         DO(mp_invmod(&b, &a, &c));
-         gg = (TIMFUNC() - gg)>>1;
-         if (tt > gg) tt = gg;
+	 gg = TIMFUNC();
+	 DO(mp_invmod(&b, &a, &c));
+	 gg = (TIMFUNC() - gg) >> 1;
+	 if (tt > gg)
+	    tt = gg;
       } while (++rr < 1000);
       mp_mulmod(&b, &c, &a, &d);
       if (mp_cmp_d(&d, 1) != MP_EQ) {
-         printf("Failed to invert\n");
-         return 0;
+	 printf("Failed to invert\n");
+	 return 0;
       }
-      printf("Inverting mod\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/tt, tt);
-      fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, tt);
+      printf("Inverting mod\t%4d-bit => %9llu/sec, %9llu cycles\n",
+	     mp_count_bits(&a), CLK_PER_SEC / tt, tt);
+      fprintf(log, "%d %9llu\n", cnt * DIGIT_BIT, tt);
    }
    fclose(log);
 
    return 0;
 }
-
--- a/dep.pl	Sun Dec 19 11:33:56 2004 +0000
+++ b/dep.pl	Fri May 06 08:59:30 2005 +0000
@@ -13,6 +13,8 @@
 foreach my $filename (glob "bn*.c") {
    my $define = $filename;
 
+print "Processing $filename\n";
+
    # convert filename to upper case so we can use it as a define 
    $define =~ tr/[a-z]/[A-Z]/;
    $define =~ tr/\./_/;
--- a/etc/mersenne.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/etc/mersenne.c	Fri May 06 08:59:30 2005 +0000
@@ -18,15 +18,15 @@
   }
 
   if ((res = mp_init (&u)) != MP_OKAY) {
-    goto __N;
+    goto LBL_N;
   }
 
   /* n = 2^s - 1 */
   if ((res = mp_2expt(&n, s)) != MP_OKAY) {
-     goto __MU;
+     goto LBL_MU;
   }
   if ((res = mp_sub_d (&n, 1, &n)) != MP_OKAY) {
-    goto __MU;
+    goto LBL_MU;
   }
 
   /* set u=4 */
@@ -36,22 +36,22 @@
   for (k = 1; k <= s - 2; k++) {
     /* u = u^2 - 2 mod n */
     if ((res = mp_sqr (&u, &u)) != MP_OKAY) {
-      goto __MU;
+      goto LBL_MU;
     }
     if ((res = mp_sub_d (&u, 2, &u)) != MP_OKAY) {
-      goto __MU;
+      goto LBL_MU;
     }
 
     /* make sure u is positive */
     while (u.sign == MP_NEG) {
       if ((res = mp_add (&u, &n, &u)) != MP_OKAY) {
-         goto __MU;
+         goto LBL_MU;
       }
     }
 
     /* reduce */
     if ((res = mp_reduce_2k (&u, &n, 1)) != MP_OKAY) {
-      goto __MU;
+      goto LBL_MU;
     }
   }
 
@@ -62,8 +62,8 @@
   }
 
   res = MP_OKAY;
-__MU:mp_clear (&u);
-__N:mp_clear (&n);
+LBL_MU:mp_clear (&u);
+LBL_N:mp_clear (&n);
   return res;
 }
 
--- a/etc/pprime.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/etc/pprime.c	Fri May 06 08:59:30 2005 +0000
@@ -189,7 +189,7 @@
   }
 
   if ((res = mp_init (&v)) != MP_OKAY) {
-    goto __C;
+    goto LBL_C;
   }
 
   /* product of first 50 primes */
@@ -197,34 +197,34 @@
        mp_read_radix (&v,
 		      "19078266889580195013601891820992757757219839668357012055907516904309700014933909014729740190",
 		      10)) != MP_OKAY) {
-    goto __V;
+    goto LBL_V;
   }
 
   if ((res = mp_init (&a)) != MP_OKAY) {
-    goto __V;
+    goto LBL_V;
   }
 
   /* set the prime */
   mp_set (&a, prime_digit ());
 
   if ((res = mp_init (&b)) != MP_OKAY) {
-    goto __A;
+    goto LBL_A;
   }
 
   if ((res = mp_init (&n)) != MP_OKAY) {
-    goto __B;
+    goto LBL_B;
   }
 
   if ((res = mp_init (&x)) != MP_OKAY) {
-    goto __N;
+    goto LBL_N;
   }
 
   if ((res = mp_init (&y)) != MP_OKAY) {
-    goto __X;
+    goto LBL_X;
   }
 
   if ((res = mp_init (&z)) != MP_OKAY) {
-    goto __Y;
+    goto LBL_Y;
   }
 
   /* now loop making the single digit */
@@ -236,25 +236,25 @@
 
     /* now compute z = a * b * 2 */
     if ((res = mp_mul (&a, &b, &z)) != MP_OKAY) {	/* z = a * b */
-      goto __Z;
+      goto LBL_Z;
     }
 
     if ((res = mp_copy (&z, &c)) != MP_OKAY) {	/* c = a * b */
-      goto __Z;
+      goto LBL_Z;
     }
 
     if ((res = mp_mul_2 (&z, &z)) != MP_OKAY) {	/* z = 2 * a * b */
-      goto __Z;
+      goto LBL_Z;
     }
 
     /* n = z + 1 */
     if ((res = mp_add_d (&z, 1, &n)) != MP_OKAY) {	/* n = z + 1 */
-      goto __Z;
+      goto LBL_Z;
     }
 
     /* check (n, v) == 1 */
     if ((res = mp_gcd (&n, &v, &y)) != MP_OKAY) {	/* y = (n, v) */
-      goto __Z;
+      goto LBL_Z;
     }
 
     if (mp_cmp_d (&y, 1) != MP_EQ)
@@ -266,7 +266,7 @@
 
       /* compute x^a mod n */
       if ((res = mp_exptmod (&x, &a, &n, &y)) != MP_OKAY) {	/* y = x^a mod n */
-	goto __Z;
+	goto LBL_Z;
       }
 
       /* if y == 1 loop */
@@ -275,7 +275,7 @@
 
       /* now x^2a mod n */
       if ((res = mp_sqrmod (&y, &n, &y)) != MP_OKAY) {	/* y = x^2a mod n */
-	goto __Z;
+	goto LBL_Z;
       }
 
       if (mp_cmp_d (&y, 1) == MP_EQ)
@@ -283,7 +283,7 @@
 
       /* compute x^b mod n */
       if ((res = mp_exptmod (&x, &b, &n, &y)) != MP_OKAY) {	/* y = x^b mod n */
-	goto __Z;
+	goto LBL_Z;
       }
 
       /* if y == 1 loop */
@@ -292,7 +292,7 @@
 
       /* now x^2b mod n */
       if ((res = mp_sqrmod (&y, &n, &y)) != MP_OKAY) {	/* y = x^2b mod n */
-	goto __Z;
+	goto LBL_Z;
       }
 
       if (mp_cmp_d (&y, 1) == MP_EQ)
@@ -300,7 +300,7 @@
 
       /* compute x^c mod n == x^ab mod n */
       if ((res = mp_exptmod (&x, &c, &n, &y)) != MP_OKAY) {	/* y = x^ab mod n */
-	goto __Z;
+	goto LBL_Z;
       }
 
       /* if y == 1 loop */
@@ -309,7 +309,7 @@
 
       /* now compute (x^c mod n)^2 */
       if ((res = mp_sqrmod (&y, &n, &y)) != MP_OKAY) {	/* y = x^2ab mod n */
-	goto __Z;
+	goto LBL_Z;
       }
 
       /* y should be 1 */
@@ -346,14 +346,14 @@
   mp_exch (&n, p);
 
   res = MP_OKAY;
-__Z:mp_clear (&z);
-__Y:mp_clear (&y);
-__X:mp_clear (&x);
-__N:mp_clear (&n);
-__B:mp_clear (&b);
-__A:mp_clear (&a);
-__V:mp_clear (&v);
-__C:mp_clear (&c);
+LBL_Z:mp_clear (&z);
+LBL_Y:mp_clear (&y);
+LBL_X:mp_clear (&x);
+LBL_N:mp_clear (&n);
+LBL_B:mp_clear (&b);
+LBL_A:mp_clear (&a);
+LBL_V:mp_clear (&v);
+LBL_C:mp_clear (&c);
   return res;
 }
 
--- a/etc/tune.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/etc/tune.c	Fri May 06 08:59:30 2005 +0000
@@ -10,13 +10,44 @@
  */
 #define TIMES (1UL<<14UL)
 
+/* RDTSC from Scott Duplichan */
+static ulong64 TIMFUNC (void)
+   {
+   #if defined __GNUC__
+      #if defined(__i386__) || defined(__x86_64__)
+         unsigned long long a;
+         __asm__ __volatile__ ("rdtsc\nmovl %%eax,%0\nmovl %%edx,4+%0\n"::"m"(a):"%eax","%edx");
+         return a;
+      #else /* gcc-IA64 version */
+         unsigned long result;
+         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
+         while (__builtin_expect ((int) result == -1, 0))
+         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
+         return result;
+      #endif
+
+   // Microsoft and Intel Windows compilers
+   #elif defined _M_IX86
+     __asm rdtsc
+   #elif defined _M_AMD64
+     return __rdtsc ();
+   #elif defined _M_IA64
+     #if defined __INTEL_COMPILER
+       #include <ia64intrin.h>
+     #endif
+      return __getReg (3116);
+   #else
+     #error need rdtsc function for this build
+   #endif
+   }
+
 
 #ifndef X86_TIMER
 
 /* generic ISO C timer */
-ulong64 __T;
-void t_start(void) { __T = clock(); }
-ulong64 t_read(void) { return clock() - __T; }
+ulong64 LBL_T;
+void t_start(void) { LBL_T = TIMFUNC(); }
+ulong64 t_read(void) { return TIMFUNC() - LBL_T; }
 
 #else
 extern void t_start(void);
--- a/logs/add.log	Sun Dec 19 11:33:56 2004 +0000
+++ b/logs/add.log	Fri May 06 08:59:30 2005 +0000
@@ -1,16 +1,16 @@
-224       222
-448       330
-672       436
-896       520
-1120       612
-1344       696
-1568       810
-1792       912
-2016      1006
-2240      1116
-2464      1152
-2688      1284
-2912      1348
-3136      1486
-3360      1580
-3584      1636
+480        87
+960       111
+1440       135
+1920       159
+2400       200
+2880       224
+3360       248
+3840       272
+4320       296
+4800       320
+5280       344
+5760       368
+6240       392
+6720       416
+7200       440
+7680       464
--- a/logs/expt.log	Sun Dec 19 11:33:56 2004 +0000
+++ b/logs/expt.log	Fri May 06 08:59:30 2005 +0000
@@ -0,0 +1,7 @@
+513   1489160
+769   3688476
+1025   8162061
+2049  49260015
+2561  89579052
+3073 148797060
+4097 324449263
--- a/logs/expt_2k.log	Sun Dec 19 11:33:56 2004 +0000
+++ b/logs/expt_2k.log	Fri May 06 08:59:30 2005 +0000
@@ -0,0 +1,5 @@
+607   2272809
+1279   9557382
+2203  36250309
+3217  87666486
+4253 174168369
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/logs/expt_2kl.log	Fri May 06 08:59:30 2005 +0000
@@ -0,0 +1,4 @@
+1024   6954080
+2048  35993987
+4096 176068521
+521   1683720
--- a/logs/expt_dr.log	Sun Dec 19 11:33:56 2004 +0000
+++ b/logs/expt_dr.log	Fri May 06 08:59:30 2005 +0000
@@ -0,0 +1,7 @@
+532   1989592
+784   3898697
+1036   6519700
+1540  15676650
+2072  33128187
+3080  82963362
+4116 168358337
--- a/logs/mult.log	Sun Dec 19 11:33:56 2004 +0000
+++ b/logs/mult.log	Fri May 06 08:59:30 2005 +0000
@@ -1,143 +1,84 @@
-140      1272
-195      1428
-252      1996
-307      2586
-364      3464
-420      4420
-476      5260
-532      6430
-588      7692
-644      8704
-699     10226
-755     11670
-812     13190
-865     14834
-924     16738
-979     18362
-1036     20660
-1092     22776
-1148     24848
-1204     27168
-1260     29930
-1316     32258
-1370     35172
-1422     37534
-1482     40390
-1537     43990
-1589     46946
-1652     50438
-1703     52902
-1764     56646
-1820     59892
-1876     63248
-1932     66872
-1988     72596
-2042     74662
-2100     78512
-2156     82944
-2211     87444
-2268     92170
-2324     95534
-2380    100484
-2435    105024
-2491    109460
-2546    114154
-2603    118946
-2660    124110
-2716    129300
-2771    134274
-2828    139594
-2883    145234
-2939    150332
-2996    155750
-3048    161718
-3108    167492
-3162    173882
-3219    179766
-3276    185560
-3330    191826
-3388    197822
-3442    204176
-3500    210682
-3556    217236
-3612    223484
-3666    230714
-3724    237744
-3779    244080
-3835    250970
-3890    257914
-3947    265162
-4001    272128
-4060    279108
-4116    287606
-4171    294716
-4227    302806
-4284    310260
-4340    318564
-4395    326164
-4443    334034
-4508    342108
-4561    351810
-4618    358828
-4675    367332
-4732    376140
-4787    384172
-4841    393308
-4899    402036
-4955    411286
-5010    420290
-5067    429688
-5124    438810
-5180    448130
-5235    457264
-5290    467390
-5348    476586
-5404    486120
-5459    496512
-5516    506624
-5569    516346
-5628    526604
-5684    536544
-5740    546936
-5796    557284
-5852    568106
-5907    578824
-5963    589204
-6019    600176
-6076    610564
-6127    621972
-6188    633564
-6244    644730
-6300    655288
-6354    667402
-6412    678824
-6467    690594
-6522    702718
-6580    714148
-6636    725608
-6690    737834
-6747    750100
-6804    762202
-6860    774184
-6916    787298
-6971    798734
-7028    811162
-7083    824570
-7139    837738
-7196   2579488
-7245   2626714
-7308   2643582
-7364   2698746
-7416   2734106
-7476   2773372
-7530   2816738
-7588   2859204
-7643   2938596
-7698   2919716
-7754   2988542
-7812   3026520
-7867   3058304
-7924   3115790
-7977   3161450
-8035   3203138
-8092   3244056
+271       555
+390       855
+508      1161
+631      1605
+749      2117
+871      2687
+991      3329
+1108      4084
+1231      4786
+1351      5624
+1470      6392
+1586      7364
+1710      8218
+1830      9255
+1951     10217
+2067     11461
+2191     12463
+2308     13677
+2430     14800
+2551     16232
+2671     17460
+2791     18899
+2902     20247
+3028     21902
+3151     23240
+3267     24927
+3391     26441
+3511     28277
+3631     29838
+3749     31751
+3869     33673
+3989     35431
+4111     37518
+4231     39426
+4349     41504
+4471     43567
+4591     45786
+4711     47876
+4831     50299
+4951     52427
+5071     54785
+5189     57241
+5307     59730
+5431     62194
+5551     64761
+5670     67322
+5789     70073
+5907     72663
+6030     75437
+6151     78242
+6268     81202
+6389     83948
+6509     86985
+6631     89903
+6747     93184
+6869     96044
+6991     99286
+7109    102395
+7229    105917
+7351    108940
+7470    112490
+7589    115702
+7711    119508
+7831    122632
+7951    126410
+8071    129808
+8190    133895
+8311    137146
+8431    141218
+8549    144732
+8667    149131
+8790    152462
+8911    156754
+9030    160479
+9149    165138
+9271    168601
+9391    173185
+9511    176988
+9627    181976
+9751    185539
+9870    190388
+9991    194335
+10110    199605
+10228    203298
--- a/logs/mult_kara.log	Sun Dec 19 11:33:56 2004 +0000
+++ b/logs/mult_kara.log	Fri May 06 08:59:30 2005 +0000
@@ -1,33 +1,84 @@
-924     16686
-1146     25334
-1371     35304
-1591     47122
-1820     61500
-2044     75254
-2266     91732
-2492    111656
-2716    129428
-2937    147508
-3164    167758
-3388    188248
-3612    210826
-3836    233814
-4059    256898
-4284    280210
-4508    310372
-4731    333902
-4955    376502
-5179    402854
-5404    432004
-5626    459010
-5849    491868
-6076    520550
-6300    547400
-6524    575968
-6747    608482
-6971    642850
-7196    673670
-7419    710680
-7644    743942
-7868    780394
-8092    817342
+271       560
+391       870
+511      1159
+631      1605
+750      2111
+871      2737
+991      3361
+1111      4054
+1231      4778
+1351      5600
+1471      6404
+1591      7323
+1710      8255
+1831      9239
+1948     10257
+2070     11397
+2190     12531
+2308     13665
+2429     14870
+2550     16175
+2671     17539
+2787     18879
+2911     20350
+3031     21807
+3150     23415
+3270     24897
+3388     26567
+3511     28205
+3627     30076
+3751     31744
+3869     33657
+3991     35425
+4111     37522
+4229     39363
+4351     41503
+4470     43491
+4590     45827
+4711     47795
+4828     50166
+4951     52318
+5070     54911
+5191     57036
+5308     58237
+5431     60248
+5551     62678
+5671     64786
+5791     67294
+5908     69343
+6031     71607
+6151     74166
+6271     76590
+6391     78734
+6511     81175
+6631     83742
+6750     86403
+6868     88873
+6990     91150
+7110     94211
+7228     96922
+7351     99445
+7469    102216
+7589    104968
+7711    108113
+7827    110758
+7950    113714
+8071    116511
+8186    119643
+8310    122679
+8425    125581
+8551    128715
+8669    131778
+8788    135116
+8910    138138
+9031    141628
+9148    144754
+9268    148367
+9391    151551
+9511    155033
+9631    158652
+9751    162125
+9871    165248
+9988    168627
+10111    172427
+10231    176412
--- a/logs/sqr.log	Sun Dec 19 11:33:56 2004 +0000
+++ b/logs/sqr.log	Fri May 06 08:59:30 2005 +0000
@@ -1,143 +1,84 @@
-139       806
-195      1212
-252      1604
-307      2260
-364      2892
-420      3308
-476      4152
-532      4814
-588      5754
-644      6684
-700      7226
-756      8324
-808      9092
-866     10068
-924     11204
-976     12918
-1036     13656
-1092     15248
-1148     15956
-1204     17270
-1260     19894
-1316     20516
-1370     21864
-1428     25554
-1483     26138
-1540     27086
-1596     29246
-1652     32210
-1707     32704
-1764     35142
-1820     39050
-1876     39256
-1931     41574
-1985     45070
-2044     46352
-2099     48114
-2155     51332
-2212     53268
-2267     55890
-2324     59054
-2380     60206
-2434     63540
-2491     66084
-2547     68590
-2604     74332
-2660     74784
-2715     77974
-2772     79924
-2826     82914
-2884     87210
-2929     89076
-2996     92480
-3052     96814
-3108     99990
-3162    102550
-3219    105396
-3276    109284
-3332    113752
-3387    116628
-3444    120782
-3500    122938
-3556    127940
-3612    303656
-3667    312212
-3724    324376
-3779    329204
-3833    340910
-3892    353850
-3943    362348
-4003    367780
-4056    380448
-4114    393616
-4172    404104
-4227    415148
-4284    409770
-4339    436648
-4394    442970
-4451    463096
-4507    472056
-4564    485780
-4616    496286
-4675    507612
-4732    519524
-4788    536768
-4843    542754
-4899    553090
-4956    571986
-5012    586340
-5068    599606
-5124    613670
-5179    624256
-5235    636266
-5292    655518
-5348    668142
-5403    677266
-5460    696040
-5516    712772
-5570    723942
-5628    739052
-5684    755350
-5739    769962
-5790    775258
-5851    790128
-5908    814536
-5962    827278
-6018    844510
-6076    851606
-6130    865748
-6188    894752
-6244    900474
-6300    928174
-6356    928440
-6410    957758
-6468    981134
-6524    994088
-6580   1011124
-6636   1027178
-6692   1045466
-6747   1056910
-6804   1083784
-6860   1104706
-6915   1116450
-6972   1137894
-7028   1154670
-7084   1158064
-7138   1188734
-7196   1214218
-7249   1226822
-7307   1247528
-7363   1255338
-7420   1291104
-7475   1297940
-7532   1324994
-7587   1340274
-7644   1342596
-7698   1381418
-7756   1382904
-7812   1432588
-7867   1443632
-7922   1465092
-7979   1496804
-8036   1520142
-8092   1539566
+265       562
+389       882
+509      1207
+631      1572
+750      1990
+859      2433
+991      2894
+1109      3555
+1230      4228
+1350      5018
+1471      5805
+1591      6579
+1709      7415
+1829      8329
+1949      9225
+2071     10139
+2188     11239
+2309     12178
+2431     13212
+2551     14294
+2671     15551
+2791     16512
+2911     17718
+3030     18876
+3150     20259
+3270     21374
+3391     22650
+3511     23948
+3631     25493
+3750     26756
+3870     28225
+3989     29705
+4110     31409
+4230     32834
+4351     34327
+4471     35818
+4591     37636
+4711     39228
+4830     40868
+4949     42393
+5070     44541
+5191     46269
+5310     48162
+5429     49728
+5548     51985
+5671     53948
+5791     55885
+5910     57584
+6031     60082
+6150     62239
+6270     64309
+6390     66014
+6511     68766
+6631     71012
+6750     73172
+6871     74952
+6991     77909
+7111     80371
+7231     82666
+7351     84531
+7469     87698
+7589     90318
+7711    225384
+7830    232428
+7950    240009
+8070    246522
+8190    253662
+8310    260961
+8431    269253
+8549    275743
+8671    283769
+8789    290811
+8911    300034
+9030    306873
+9149    315085
+9270    323944
+9390    332390
+9508    337519
+9631    348986
+9749    356904
+9871    367013
+9989    373831
+10108    381033
+10230    393475
--- a/logs/sqr_kara.log	Sun Dec 19 11:33:56 2004 +0000
+++ b/logs/sqr_kara.log	Fri May 06 08:59:30 2005 +0000
@@ -1,33 +1,84 @@
-922     11272
-1148     16004
-1370     21958
-1596     28684
-1817     37832
-2044     46386
-2262     56218
-2492     66388
-2716     77478
-2940     89380
-3163    103680
-3385    116274
-3612    135334
-3836    151332
-4057    164938
-4284    183178
-4508    198864
-4731    215222
-4954    231986
-5180    251660
-5404    269414
-5626    288454
-5850    307806
-6076    329458
-6299    347726
-6523    369864
-6748    387832
-6971    413010
-7194    453310
-7415    476936
-7643    497118
-7867    521394
-8091    540224
+271       560
+388       878
+511      1179
+629      1625
+751      1988
+871      2423
+989      2896
+1111      3561
+1231      4209
+1350      5015
+1470      5804
+1591      6556
+1709      7420
+1831      8263
+1951      9173
+2070     10153
+2191     11229
+2310     12167
+2431     13211
+2550     14309
+2671     15524
+2788     16525
+2910     17712
+3028     18822
+3148     20220
+3271     21343
+3391     22652
+3511     23944
+3630     25485
+3750     26778
+3868     28201
+3990     29653
+4111     31393
+4225     32841
+4350     34328
+4471     35786
+4590     37652
+4711     39245
+4830     40876
+4951     42433
+5068     44547
+5191     46321
+5311     48140
+5430     49727
+5550     52034
+5671     53954
+5791     55921
+5908     57597
+6031     60084
+6148     62226
+6270     64295
+6390     66045
+6511     68779
+6629     71003
+6751     73169
+6871     74992
+6991     77895
+7110     80376
+7231     82628
+7351     84468
+7470     87664
+7591     90284
+7711     91352
+7828     93995
+7950     96276
+8071     98691
+8190    101256
+8308    103631
+8431    105222
+8550    108343
+8671    110281
+8787    112764
+8911    115397
+9031    117690
+9151    120266
+9271    122715
+9391    124624
+9510    127937
+9630    130313
+9750    132914
+9871    136129
+9991    138517
+10108    141525
+10231    144225
--- a/logs/sub.log	Sun Dec 19 11:33:56 2004 +0000
+++ b/logs/sub.log	Fri May 06 08:59:30 2005 +0000
@@ -1,16 +1,16 @@
-224       216
-448       324
-672       428
-896       532
-1120       648
-1344       766
-1568       862
-1792       928
-2016      1070
-2240      1128
-2464      1250
-2688      1344
-2912      1436
-3136      1542
-3360      1628
-3584      1696
+480        94
+960       116
+1440       140
+1920       164
+2400       205
+2880       229
+3360       253
+3840       277
+4320       299
+4800       321
+5280       345
+5760       371
+6240       395
+6720       419
+7200       441
+7680       465
--- a/makefile	Sun Dec 19 11:33:56 2004 +0000
+++ b/makefile	Fri May 06 08:59:30 2005 +0000
@@ -1,10 +1,14 @@
 #Makefile for GCC
 #
 #Tom St Denis
+
+#version of library 
+VERSION=0.35
+
 CFLAGS  +=  -I./ -Wall -W -Wshadow -Wsign-compare
 
 #for speed 
-CFLAGS += -O3 -funroll-loops
+CFLAGS += -O3 -funroll-all-loops
 
 #for size 
 #CFLAGS += -Os
@@ -15,13 +19,15 @@
 #debug
 #CFLAGS += -g3
 
-VERSION=0.32
+#install as this user
+USER=root
+GROUP=root
 
 default: libtommath.a
 
 #default files to install
 LIBNAME=libtommath.a
-HEADERS=tommath.h
+HEADERS=tommath.h tommath_class.h tommath_superclass.h
 
 #LIBPATH-The directory for libtommath to be installed to.
 #INCPATH-The directory to install the header files for libtommath.
@@ -51,17 +57,18 @@
 bn_mp_dr_is_modulus.o bn_mp_dr_setup.o bn_mp_reduce_setup.o \
 bn_mp_toom_mul.o bn_mp_toom_sqr.o bn_mp_div_3.o bn_s_mp_exptmod.o \
 bn_mp_reduce_2k.o bn_mp_reduce_is_2k.o bn_mp_reduce_2k_setup.o \
+bn_mp_reduce_2k_l.o bn_mp_reduce_is_2k_l.o bn_mp_reduce_2k_setup_l.o \
 bn_mp_radix_smap.o bn_mp_read_radix.o bn_mp_toradix.o bn_mp_radix_size.o \
 bn_mp_fread.o bn_mp_fwrite.o bn_mp_cnt_lsb.o bn_error.o \
 bn_mp_init_multi.o bn_mp_clear_multi.o bn_mp_exteuclid.o bn_mp_toradix_n.o \
 bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o bn_mp_init_set.o \
-bn_mp_init_set_int.o bn_mp_invmod_slow.o bn_mp_prime_rabin_miller_trials.o
+bn_mp_init_set_int.o bn_mp_invmod_slow.o bn_mp_prime_rabin_miller_trials.o \
+bn_mp_to_signed_bin_n.o bn_mp_to_unsigned_bin_n.o
 
 libtommath.a:  $(OBJECTS)
 	$(AR) $(ARFLAGS) libtommath.a $(OBJECTS)
 	ranlib libtommath.a
 
-
 #make a profiled library (takes a while!!!)
 #
 # This will build the library with profile generation
@@ -86,19 +93,19 @@
 	ranlib libtommath.a	
 
 install: libtommath.a
-	install -d -g root -o root $(DESTDIR)$(LIBPATH)
-	install -d -g root -o root $(DESTDIR)$(INCPATH)
-	install -g root -o root $(LIBNAME) $(DESTDIR)$(LIBPATH)
-	install -g root -o root $(HEADERS) $(DESTDIR)$(INCPATH)
+	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(LIBPATH)
+	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH)
+	install -g $(GROUP) -o $(USER) $(LIBNAME) $(DESTDIR)$(LIBPATH)
+	install -g $(GROUP) -o $(USER) $(HEADERS) $(DESTDIR)$(INCPATH)
 
 test: libtommath.a demo/demo.o
-	$(CC) demo/demo.o libtommath.a -o test
+	$(CC) $(CFLAGS) demo/demo.o libtommath.a -o test
 	
 mtest: test	
-	cd mtest ; $(CC) $(CFLAGS) mtest.c -o mtest -s
+	cd mtest ; $(CC) $(CFLAGS) mtest.c -o mtest
         
 timing: libtommath.a
-	$(CC) $(CFLAGS) -DTIMER demo/timing.c libtommath.a -o ltmtest -s
+	$(CC) $(CFLAGS) -DTIMER demo/timing.c libtommath.a -o ltmtest
 
 # makes the LTM book DVI file, requires tetex, perl and makeindex [part of tetex I think]
 docdvi: tommath.src
--- a/makefile.bcc	Sun Dec 19 11:33:56 2004 +0000
+++ b/makefile.bcc	Fri May 06 08:59:30 2005 +0000
@@ -27,11 +27,13 @@
 bn_mp_dr_is_modulus.obj bn_mp_dr_setup.obj bn_mp_reduce_setup.obj \
 bn_mp_toom_mul.obj bn_mp_toom_sqr.obj bn_mp_div_3.obj bn_s_mp_exptmod.obj \
 bn_mp_reduce_2k.obj bn_mp_reduce_is_2k.obj bn_mp_reduce_2k_setup.obj \
+bn_mp_reduce_2k_l.obj bn_mp_reduce_is_2k_l.obj bn_mp_reduce_2k_setup_l.obj \
 bn_mp_radix_smap.obj bn_mp_read_radix.obj bn_mp_toradix.obj bn_mp_radix_size.obj \
 bn_mp_fread.obj bn_mp_fwrite.obj bn_mp_cnt_lsb.obj bn_error.obj \
 bn_mp_init_multi.obj bn_mp_clear_multi.obj bn_mp_exteuclid.obj bn_mp_toradix_n.obj \
 bn_mp_prime_random_ex.obj bn_mp_get_int.obj bn_mp_sqrt.obj bn_mp_is_square.obj \
-bn_mp_init_set.obj bn_mp_init_set_int.obj bn_mp_invmod_slow.obj bn_mp_prime_rabin_miller_trials.obj
+bn_mp_init_set.obj bn_mp_init_set_int.obj bn_mp_invmod_slow.obj bn_mp_prime_rabin_miller_trials.obj \
+bn_mp_to_signed_bin_n.obj bn_mp_to_unsigned_bin_n.obj
 
 TARGET = libtommath.lib
 
--- a/makefile.cygwin_dll	Sun Dec 19 11:33:56 2004 +0000
+++ b/makefile.cygwin_dll	Fri May 06 08:59:30 2005 +0000
@@ -32,11 +32,13 @@
 bn_mp_dr_is_modulus.o bn_mp_dr_setup.o bn_mp_reduce_setup.o \
 bn_mp_toom_mul.o bn_mp_toom_sqr.o bn_mp_div_3.o bn_s_mp_exptmod.o \
 bn_mp_reduce_2k.o bn_mp_reduce_is_2k.o bn_mp_reduce_2k_setup.o \
+bn_mp_reduce_2k_l.o bn_mp_reduce_is_2k_l.o bn_mp_reduce_2k_setup_l.o \
 bn_mp_radix_smap.o bn_mp_read_radix.o bn_mp_toradix.o bn_mp_radix_size.o \
 bn_mp_fread.o bn_mp_fwrite.o bn_mp_cnt_lsb.o bn_error.o \
 bn_mp_init_multi.o bn_mp_clear_multi.o bn_mp_exteuclid.o bn_mp_toradix_n.o \
 bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o bn_mp_init_set.o \
-bn_mp_init_set_int.o bn_mp_invmod_slow.o bn_mp_prime_rabin_miller_trials.o
+bn_mp_init_set_int.o bn_mp_invmod_slow.o bn_mp_prime_rabin_miller_trials.o \
+bn_mp_to_signed_bin_n.o bn_mp_to_unsigned_bin_n.o
 
 # make a Windows DLL via Cygwin
 windll:  $(OBJECTS)
--- a/makefile.icc	Sun Dec 19 11:33:56 2004 +0000
+++ b/makefile.icc	Fri May 06 08:59:30 2005 +0000
@@ -21,6 +21,10 @@
 # Default to just generic max opts
 CFLAGS += -O3 -xN
 
+#install as this user
+USER=root
+GROUP=root
+
 default: libtommath.a
 
 #default files to install
@@ -55,11 +59,13 @@
 bn_mp_dr_is_modulus.o bn_mp_dr_setup.o bn_mp_reduce_setup.o \
 bn_mp_toom_mul.o bn_mp_toom_sqr.o bn_mp_div_3.o bn_s_mp_exptmod.o \
 bn_mp_reduce_2k.o bn_mp_reduce_is_2k.o bn_mp_reduce_2k_setup.o \
+bn_mp_reduce_2k_l.o bn_mp_reduce_is_2k_l.o bn_mp_reduce_2k_setup_l.o \
 bn_mp_radix_smap.o bn_mp_read_radix.o bn_mp_toradix.o bn_mp_radix_size.o \
 bn_mp_fread.o bn_mp_fwrite.o bn_mp_cnt_lsb.o bn_error.o \
 bn_mp_init_multi.o bn_mp_clear_multi.o bn_mp_exteuclid.o bn_mp_toradix_n.o \
 bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o bn_mp_init_set.o \
-bn_mp_init_set_int.o bn_mp_invmod_slow.o bn_mp_prime_rabin_miller_trials.o
+bn_mp_init_set_int.o bn_mp_invmod_slow.o bn_mp_prime_rabin_miller_trials.o \
+bn_mp_to_signed_bin_n.o bn_mp_to_unsigned_bin_n.o
 
 libtommath.a:  $(OBJECTS)
 	$(AR) $(ARFLAGS) libtommath.a $(OBJECTS)
@@ -89,10 +95,10 @@
 	ranlib libtommath.a	
 
 install: libtommath.a
-	install -d -g root -o root $(DESTDIR)$(LIBPATH)
-	install -d -g root -o root $(DESTDIR)$(INCPATH)
-	install -g root -o root $(LIBNAME) $(DESTDIR)$(LIBPATH)
-	install -g root -o root $(HEADERS) $(DESTDIR)$(INCPATH)
+	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(LIBPATH)
+	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH)
+	install -g $(GROUP) -o $(USER) $(LIBNAME) $(DESTDIR)$(LIBPATH)
+	install -g $(GROUP) -o $(USER) $(HEADERS) $(DESTDIR)$(INCPATH)
 
 test: libtommath.a demo/demo.o
 	$(CC) demo/demo.o libtommath.a -o test
--- a/makefile.msvc	Sun Dec 19 11:33:56 2004 +0000
+++ b/makefile.msvc	Fri May 06 08:59:30 2005 +0000
@@ -26,11 +26,13 @@
 bn_mp_dr_is_modulus.obj bn_mp_dr_setup.obj bn_mp_reduce_setup.obj \
 bn_mp_toom_mul.obj bn_mp_toom_sqr.obj bn_mp_div_3.obj bn_s_mp_exptmod.obj \
 bn_mp_reduce_2k.obj bn_mp_reduce_is_2k.obj bn_mp_reduce_2k_setup.obj \
+bn_mp_reduce_2k_l.obj bn_mp_reduce_is_2k_l.obj bn_mp_reduce_2k_setup_l.obj \
 bn_mp_radix_smap.obj bn_mp_read_radix.obj bn_mp_toradix.obj bn_mp_radix_size.obj \
 bn_mp_fread.obj bn_mp_fwrite.obj bn_mp_cnt_lsb.obj bn_error.obj \
 bn_mp_init_multi.obj bn_mp_clear_multi.obj bn_mp_exteuclid.obj bn_mp_toradix_n.obj \
 bn_mp_prime_random_ex.obj bn_mp_get_int.obj bn_mp_sqrt.obj bn_mp_is_square.obj \
-bn_mp_init_set.obj bn_mp_init_set_int.obj bn_mp_invmod_slow.obj bn_mp_prime_rabin_miller_trials.obj
+bn_mp_init_set.obj bn_mp_init_set_int.obj bn_mp_invmod_slow.obj bn_mp_prime_rabin_miller_trials.obj \
+bn_mp_to_signed_bin_n.obj bn_mp_to_unsigned_bin_n.obj
 
 library: $(OBJECTS)
 	lib /out:tommath.lib $(OBJECTS)
--- a/makefile.shared	Sun Dec 19 11:33:56 2004 +0000
+++ b/makefile.shared	Fri May 06 08:59:30 2005 +0000
@@ -1,10 +1,9 @@
 #Makefile for GCC
 #
 #Tom St Denis
-VERSION=0:32
+VERSION=0:35
 
 CC = libtool --mode=compile gcc
-
 CFLAGS  +=  -I./ -Wall -W -Wshadow -Wsign-compare
 
 #for speed 
@@ -16,11 +15,15 @@
 #x86 optimizations [should be valid for any GCC install though]
 CFLAGS  += -fomit-frame-pointer
 
+#install as this user
+USER=root
+GROUP=root
+
 default: libtommath.la
 
 #default files to install
 LIBNAME=libtommath.la
-HEADERS=tommath.h
+HEADERS=tommath.h tommath_class.h tommath_superclass.h
 
 #LIBPATH-The directory for libtommath to be installed to.
 #INCPATH-The directory to install the header files for libtommath.
@@ -50,18 +53,21 @@
 bn_mp_dr_is_modulus.o bn_mp_dr_setup.o bn_mp_reduce_setup.o \
 bn_mp_toom_mul.o bn_mp_toom_sqr.o bn_mp_div_3.o bn_s_mp_exptmod.o \
 bn_mp_reduce_2k.o bn_mp_reduce_is_2k.o bn_mp_reduce_2k_setup.o \
+bn_mp_reduce_2k_l.o bn_mp_reduce_is_2k_l.o bn_mp_reduce_2k_setup_l.o \
 bn_mp_radix_smap.o bn_mp_read_radix.o bn_mp_toradix.o bn_mp_radix_size.o \
 bn_mp_fread.o bn_mp_fwrite.o bn_mp_cnt_lsb.o bn_error.o \
 bn_mp_init_multi.o bn_mp_clear_multi.o bn_mp_exteuclid.o bn_mp_toradix_n.o \
 bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o bn_mp_init_set.o \
-bn_mp_init_set_int.o bn_mp_invmod_slow.o bn_mp_prime_rabin_miller_trials.o
+bn_mp_init_set_int.o bn_mp_invmod_slow.o bn_mp_prime_rabin_miller_trials.o \
+bn_mp_to_signed_bin_n.o bn_mp_to_unsigned_bin_n.o
+
 
 libtommath.la:  $(OBJECTS)
 	libtool --mode=link gcc *.lo -o libtommath.la -rpath $(LIBPATH) -version-info $(VERSION)
 	libtool --mode=link gcc *.o -o libtommath.a 
 	libtool --mode=install install -c libtommath.la $(LIBPATH)/libtommath.la
-	install -d -g root -o root $(DESTDIR)$(INCPATH)
-	install -g root -o root $(HEADERS) $(DESTDIR)$(INCPATH)
+	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH)
+	install -g $(GROUP) -o $(USER) $(HEADERS) $(DESTDIR)$(INCPATH)
 
 test: libtommath.a demo/demo.o
 	gcc $(CFLAGS) -c demo/demo.c -o demo/demo.o
--- a/mtest/mtest.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/mtest/mtest.c	Fri May 06 08:59:30 2005 +0000
@@ -46,7 +46,7 @@
    int n, size;
    unsigned char buf[2048];
 
-   size = 1 + ((fgetc(rng)<<8) + fgetc(rng)) % 1031;
+   size = 1 + ((fgetc(rng)<<8) + fgetc(rng)) % 101;
    buf[0] = (fgetc(rng)&1)?1:0;
    fread(buf+1, 1, size, rng);
    while (buf[1] == 0) buf[1] = fgetc(rng);
@@ -58,7 +58,7 @@
    int n, size;
    unsigned char buf[2048];
 
-   size = 10 + ((fgetc(rng)<<8) + fgetc(rng)) % 97;
+   size = 10 + ((fgetc(rng)<<8) + fgetc(rng)) % 101;
    buf[0] = (fgetc(rng)&1)?1:0;
    fread(buf+1, 1, size, rng);
    while (buf[1] == 0) buf[1] = fgetc(rng);
Binary file poster.pdf has changed
--- a/pre_gen/mpi.c	Sun Dec 19 11:33:56 2004 +0000
+++ b/pre_gen/mpi.c	Fri May 06 08:59:30 2005 +0000
@@ -69,8 +69,7 @@
  * Based on slow invmod except this is optimized for the case where b is 
  * odd as per HAC Note 14.64 on pp. 610
  */
-int
-fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
+int fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
 {
   mp_int  x, y, u, v, B, D;
   int     res, neg;
@@ -87,20 +86,20 @@
 
   /* x == modulus, y == value to invert */
   if ((res = mp_copy (b, &x)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
   }
 
   /* we need y = |a| */
-  if ((res = mp_abs (a, &y)) != MP_OKAY) {
-    goto __ERR;
+  if ((res = mp_mod (a, b, &y)) != MP_OKAY) {
+    goto LBL_ERR;
   }
 
   /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
   if ((res = mp_copy (&x, &u)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
   }
   if ((res = mp_copy (&y, &v)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
   }
   mp_set (&D, 1);
 
@@ -109,17 +108,17 @@
   while (mp_iseven (&u) == 1) {
     /* 4.1 u = u/2 */
     if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
     /* 4.2 if B is odd then */
     if (mp_isodd (&B) == 1) {
       if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
-        goto __ERR;
+        goto LBL_ERR;
       }
     }
     /* B = B/2 */
     if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
 
@@ -127,18 +126,18 @@
   while (mp_iseven (&v) == 1) {
     /* 5.1 v = v/2 */
     if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
     /* 5.2 if D is odd then */
     if (mp_isodd (&D) == 1) {
       /* D = (D-x)/2 */
       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
-        goto __ERR;
+        goto LBL_ERR;
       }
     }
     /* D = D/2 */
     if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
 
@@ -146,20 +145,20 @@
   if (mp_cmp (&u, &v) != MP_LT) {
     /* u = u - v, B = B - D */
     if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
 
     if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   } else {
     /* v - v - u, D = D - B */
     if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
 
     if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
 
@@ -173,21 +172,21 @@
   /* if v != 1 then there is no inverse */
   if (mp_cmp_d (&v, 1) != MP_EQ) {
     res = MP_VAL;
-    goto __ERR;
+    goto LBL_ERR;
   }
 
   /* b is now the inverse */
   neg = a->sign;
   while (D.sign == MP_NEG) {
     if ((res = mp_add (&D, b, &D)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
   mp_exch (&D, c);
   c->sign = neg;
   res = MP_OKAY;
 
-__ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
+LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
   return res;
 }
 #endif
@@ -220,8 +219,7 @@
  *
  * Based on Algorithm 14.32 on pp.601 of HAC.
 */
-int
-fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
+int fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
 {
   int     ix, res, olduse;
   mp_word W[MP_WARRAY];
@@ -401,8 +399,7 @@
  * Based on Algorithm 14.12 on pp.595 of HAC.
  *
  */
-int
-fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
   int     olduse, res, pa, ix, iz;
   mp_digit W[MP_WARRAY];
@@ -420,7 +417,7 @@
 
   /* clear the carry */
   _W = 0;
-  for (ix = 0; ix <= pa; ix++) { 
+  for (ix = 0; ix < pa; ix++) { 
       int      tx, ty;
       int      iy;
       mp_digit *tmpx, *tmpy;
@@ -433,7 +430,7 @@
       tmpx = a->dp + tx;
       tmpy = b->dp + ty;
 
-      /* this is the number of times the loop will iterrate, essentially its 
+      /* this is the number of times the loop will iterate, essentially 
          while (tx++ < a->used && ty-- >= 0) { ... }
        */
       iy = MIN(a->used-tx, ty+1);
@@ -450,14 +447,17 @@
       _W = _W >> ((mp_word)DIGIT_BIT);
   }
 
+  /* store final carry */
+  W[ix] = (mp_digit)(_W & MP_MASK);
+
   /* setup dest */
   olduse  = c->used;
-  c->used = digs;
+  c->used = pa;
 
   {
     register mp_digit *tmpc;
     tmpc = c->dp;
-    for (ix = 0; ix < digs; ix++) {
+    for (ix = 0; ix < pa+1; ix++) {
       /* now extract the previous digit [below the carry] */
       *tmpc++ = W[ix];
     }
@@ -501,8 +501,7 @@
  *
  * Based on Algorithm 14.12 on pp.595 of HAC.
  */
-int
-fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+int fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
   int     olduse, res, pa, ix, iz;
   mp_digit W[MP_WARRAY];
@@ -519,7 +518,7 @@
   /* number of output digits to produce */
   pa = a->used + b->used;
   _W = 0;
-  for (ix = digs; ix <= pa; ix++) { 
+  for (ix = digs; ix < pa; ix++) { 
       int      tx, ty, iy;
       mp_digit *tmpx, *tmpy;
 
@@ -547,6 +546,9 @@
       /* make next carry */
       _W = _W >> ((mp_word)DIGIT_BIT);
   }
+  
+  /* store final carry */
+  W[ix] = (mp_digit)(_W & MP_MASK);
 
   /* setup dest */
   olduse  = c->used;
@@ -591,33 +593,14 @@
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
 
-/* fast squaring
- *
- * This is the comba method where the columns of the product
- * are computed first then the carries are computed.  This
- * has the effect of making a very simple inner loop that
- * is executed the most
- *
- * W2 represents the outer products and W the inner.
- *
- * A further optimizations is made because the inner
- * products are of the form "A * B * 2".  The *2 part does
- * not need to be computed until the end which is good
- * because 64-bit shifts are slow!
- *
- * Based on Algorithm 14.16 on pp.597 of HAC.
- *
- */
 /* the jist of squaring...
-
-you do like mult except the offset of the tmpx [one that starts closer to zero]
-can't equal the offset of tmpy.  So basically you set up iy like before then you min it with
-(ty-tx) so that it never happens.  You double all those you add in the inner loop
+ * you do like mult except the offset of the tmpx [one that 
+ * starts closer to zero] can't equal the offset of tmpy.  
+ * So basically you set up iy like before then you min it with
+ * (ty-tx) so that it never happens.  You double all those 
+ * you add in the inner loop
 
 After that loop you do the squares and add them in.
-
-Remove W2 and don't memset W
-
 */
 
 int fast_s_mp_sqr (mp_int * a, mp_int * b)
@@ -636,7 +619,7 @@
 
   /* number of output digits to produce */
   W1 = 0;
-  for (ix = 0; ix <= pa; ix++) { 
+  for (ix = 0; ix < pa; ix++) { 
       int      tx, ty, iy;
       mp_word  _W;
       mp_digit *tmpy;
@@ -652,7 +635,7 @@
       tmpx = a->dp + tx;
       tmpy = a->dp + ty;
 
-      /* this is the number of times the loop will iterrate, essentially its 
+      /* this is the number of times the loop will iterate, essentially
          while (tx++ < a->used && ty-- >= 0) { ... }
        */
       iy = MIN(a->used-tx, ty+1);
@@ -677,7 +660,7 @@
       }
 
       /* store it */
-      W[ix] = _W;
+      W[ix] = (mp_digit)(_W & MP_MASK);
 
       /* make next carry */
       W1 = _W >> ((mp_word)DIGIT_BIT);
@@ -1539,23 +1522,23 @@
 
   mp_set(&tq, 1);
   n = mp_count_bits(a) - mp_count_bits(b);
-  if (((res = mp_copy(a, &ta)) != MP_OKAY) ||
-      ((res = mp_copy(b, &tb)) != MP_OKAY) || 
+  if (((res = mp_abs(a, &ta)) != MP_OKAY) ||
+      ((res = mp_abs(b, &tb)) != MP_OKAY) || 
       ((res = mp_mul_2d(&tb, n, &tb)) != MP_OKAY) ||
       ((res = mp_mul_2d(&tq, n, &tq)) != MP_OKAY)) {
-      goto __ERR;
+      goto LBL_ERR;
   }
 
   while (n-- >= 0) {
      if (mp_cmp(&tb, &ta) != MP_GT) {
         if (((res = mp_sub(&ta, &tb, &ta)) != MP_OKAY) ||
             ((res = mp_add(&q, &tq, &q)) != MP_OKAY)) {
-           goto __ERR;
+           goto LBL_ERR;
         }
      }
      if (((res = mp_div_2d(&tb, 1, &tb, NULL)) != MP_OKAY) ||
          ((res = mp_div_2d(&tq, 1, &tq, NULL)) != MP_OKAY)) {
-           goto __ERR;
+           goto LBL_ERR;
      }
   }
 
@@ -1564,13 +1547,13 @@
   n2 = (a->sign == b->sign ? MP_ZPOS : MP_NEG);
   if (c != NULL) {
      mp_exch(c, &q);
-     c->sign  = n2;
+     c->sign  = (mp_iszero(c) == MP_YES) ? MP_ZPOS : n2;
   }
   if (d != NULL) {
      mp_exch(d, &ta);
-     d->sign = n;
-  }
-__ERR:
+     d->sign = (mp_iszero(d) == MP_YES) ? MP_ZPOS : n;
+  }
+LBL_ERR:
    mp_clear_multi(&ta, &tb, &tq, &q, NULL);
    return res;
 }
@@ -1619,19 +1602,19 @@
   q.used = a->used + 2;
 
   if ((res = mp_init (&t1)) != MP_OKAY) {
-    goto __Q;
+    goto LBL_Q;
   }
 
   if ((res = mp_init (&t2)) != MP_OKAY) {
-    goto __T1;
+    goto LBL_T1;
   }
 
   if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
-    goto __T2;
+    goto LBL_T2;
   }
 
   if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
-    goto __X;
+    goto LBL_X;
   }
 
   /* fix the sign */
@@ -1643,10 +1626,10 @@
   if (norm < (int)(DIGIT_BIT-1)) {
      norm = (DIGIT_BIT-1) - norm;
      if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
-       goto __Y;
+       goto LBL_Y;
      }
      if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
-       goto __Y;
+       goto LBL_Y;
      }
   } else {
      norm = 0;
@@ -1658,13 +1641,13 @@
 
   /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
   if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b**{n-t} */
-    goto __Y;
+    goto LBL_Y;
   }
 
   while (mp_cmp (&x, &y) != MP_LT) {
     ++(q.dp[n - t]);
     if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
-      goto __Y;
+      goto LBL_Y;
     }
   }
 
@@ -1706,7 +1689,7 @@
       t1.dp[1] = y.dp[t];
       t1.used = 2;
       if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
-        goto __Y;
+        goto LBL_Y;
       }
 
       /* find right hand */
@@ -1718,27 +1701,27 @@
 
     /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
     if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
-      goto __Y;
+      goto LBL_Y;
     }
 
     if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
-      goto __Y;
+      goto LBL_Y;
     }
 
     if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
-      goto __Y;
+      goto LBL_Y;
     }
 
     /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
     if (x.sign == MP_NEG) {
       if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
-        goto __Y;
+        goto LBL_Y;
       }
       if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
-        goto __Y;
+        goto LBL_Y;
       }
       if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
-        goto __Y;
+        goto LBL_Y;
       }
 
       q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
@@ -1765,11 +1748,11 @@
 
   res = MP_OKAY;
 
-__Y:mp_clear (&y);
-__X:mp_clear (&x);
-__T2:mp_clear (&t2);
-__T1:mp_clear (&t1);
-__Q:mp_clear (&q);
+LBL_Y:mp_clear (&y);
+LBL_X:mp_clear (&x);
+LBL_T2:mp_clear (&t2);
+LBL_T1:mp_clear (&t1);
+LBL_Q:mp_clear (&q);
   return res;
 }
 
@@ -2199,7 +2182,7 @@
  * Based on algorithm from the paper
  *
  * "Generating Efficient Primes for Discrete Log Cryptosystems"
- *                 Chae Hoon Lim, Pil Loong Lee,
+ *                 Chae Hoon Lim, Pil Joong Lee,
  *          POSTECH Information Research Laboratories
  *
  * The modulus must be of a special format [see manual]
@@ -2457,25 +2440,33 @@
      return err;
 #else 
      /* no invmod */
-     return MP_VAL
-#endif
-  }
+     return MP_VAL;
+#endif
+  }
+
+/* modified diminished radix reduction */
+#if defined(BN_MP_REDUCE_IS_2K_L_C) && defined(BN_MP_REDUCE_2K_L_C)
+  if (mp_reduce_is_2k_l(P) == MP_YES) {
+     return s_mp_exptmod(G, X, P, Y, 1);
+  }
+#endif
 
 #ifdef BN_MP_DR_IS_MODULUS_C
   /* is it a DR modulus? */
   dr = mp_dr_is_modulus(P);
 #else
+  /* default to no */
   dr = 0;
 #endif
 
 #ifdef BN_MP_REDUCE_IS_2K_C
-  /* if not, is it a uDR modulus? */
+  /* if not, is it an unrestricted DR modulus? */
   if (dr == 0) {
      dr = mp_reduce_is_2k(P) << 1;
   }
 #endif
     
-  /* if the modulus is odd or dr != 0 use the fast method */
+  /* if the modulus is odd or dr != 0 use the montgomery method */
 #ifdef BN_MP_EXPTMOD_FAST_C
   if (mp_isodd (P) == 1 || dr !=  0) {
     return mp_exptmod_fast (G, X, P, Y, dr);
@@ -2483,7 +2474,7 @@
 #endif
 #ifdef BN_S_MP_EXPTMOD_C
     /* otherwise use the generic Barrett reduction technique */
-    return s_mp_exptmod (G, X, P, Y);
+    return s_mp_exptmod (G, X, P, Y, 0);
 #else
     /* no exptmod for evens */
     return MP_VAL;
@@ -2529,8 +2520,7 @@
    #define TAB_SIZE 256
 #endif
 
-int
-mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
+int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
 {
   mp_int  M[TAB_SIZE], res;
   mp_digit buf, mp;
@@ -2588,11 +2578,11 @@
 #ifdef BN_MP_MONTGOMERY_SETUP_C     
      /* now setup montgomery  */
      if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
-        goto __M;
+        goto LBL_M;
      }
 #else
      err = MP_VAL;
-     goto __M;
+     goto LBL_M;
 #endif
 
      /* automatically pick the comba one if available (saves quite a few calls/ifs) */
@@ -2608,7 +2598,7 @@
         redux = mp_montgomery_reduce;
 #else
         err = MP_VAL;
-        goto __M;
+        goto LBL_M;
 #endif
      }
   } else if (redmode == 1) {
@@ -2618,24 +2608,24 @@
      redux = mp_dr_reduce;
 #else
      err = MP_VAL;
-     goto __M;
+     goto LBL_M;
 #endif
   } else {
 #if defined(BN_MP_REDUCE_2K_SETUP_C) && defined(BN_MP_REDUCE_2K_C)
      /* setup DR reduction for moduli of the form 2**k - b */
      if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
-        goto __M;
+        goto LBL_M;
      }
      redux = mp_reduce_2k;
 #else
      err = MP_VAL;
-     goto __M;
+     goto LBL_M;
 #endif
   }
 
   /* setup result */
   if ((err = mp_init (&res)) != MP_OKAY) {
-    goto __M;
+    goto LBL_M;
   }
 
   /* create M table
@@ -2649,45 +2639,45 @@
 #ifdef BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
      /* now we need R mod m */
      if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
-       goto __RES;
+       goto LBL_RES;
      }
 #else 
      err = MP_VAL;
-     goto __RES;
+     goto LBL_RES;
 #endif
 
      /* now set M[1] to G * R mod m */
      if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
-       goto __RES;
+       goto LBL_RES;
      }
   } else {
      mp_set(&res, 1);
      if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
      }
   }
 
   /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
   if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
-    goto __RES;
+    goto LBL_RES;
   }
 
   for (x = 0; x < (winsize - 1); x++) {
     if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
-      goto __RES;
+      goto LBL_RES;
     }
     if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
-      goto __RES;
+      goto LBL_RES;
     }
   }
 
   /* create upper table */
   for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
     if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
-      goto __RES;
+      goto LBL_RES;
     }
     if ((err = redux (&M[x], P, mp)) != MP_OKAY) {
-      goto __RES;
+      goto LBL_RES;
     }
   }
 
@@ -2727,10 +2717,10 @@
     /* if the bit is zero and mode == 1 then we square */
     if (mode == 1 && y == 0) {
       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
       if ((err = redux (&res, P, mp)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
       continue;
     }
@@ -2744,19 +2734,19 @@
       /* square first */
       for (x = 0; x < winsize; x++) {
         if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-          goto __RES;
+          goto LBL_RES;
         }
         if ((err = redux (&res, P, mp)) != MP_OKAY) {
-          goto __RES;
+          goto LBL_RES;
         }
       }
 
       /* then multiply */
       if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
       if ((err = redux (&res, P, mp)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
 
       /* empty window and reset */
@@ -2771,10 +2761,10 @@
     /* square then multiply if the bit is set */
     for (x = 0; x < bitcpy; x++) {
       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
       if ((err = redux (&res, P, mp)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
 
       /* get next bit of the window */
@@ -2782,10 +2772,10 @@
       if ((bitbuf & (1 << winsize)) != 0) {
         /* then multiply */
         if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
-          goto __RES;
+          goto LBL_RES;
         }
         if ((err = redux (&res, P, mp)) != MP_OKAY) {
-          goto __RES;
+          goto LBL_RES;
         }
       }
     }
@@ -2799,15 +2789,15 @@
       * of R.
       */
      if ((err = redux(&res, P, mp)) != MP_OKAY) {
-       goto __RES;
+       goto LBL_RES;
      }
   }
 
   /* swap res with Y */
   mp_exch (&res, Y);
   err = MP_OKAY;
-__RES:mp_clear (&res);
-__M:
+LBL_RES:mp_clear (&res);
+LBL_M:
   mp_clear(&M[1]);
   for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
     mp_clear (&M[x]);
@@ -2881,6 +2871,13 @@
        if ((err = mp_copy(&t3, &v3)) != MP_OKAY)                                  { goto _ERR; }
    }
 
+   /* make sure U3 >= 0 */
+   if (u3.sign == MP_NEG) {
+      mp_neg(&u1, &u1);
+      mp_neg(&u2, &u2);
+      mp_neg(&u3, &u3);
+   }
+
    /* copy result out */
    if (U1 != NULL) { mp_exch(U1, &u1); }
    if (U2 != NULL) { mp_exch(U2, &u2); }
@@ -3059,7 +3056,7 @@
   }
 
   if ((res = mp_init_copy (&v, b)) != MP_OKAY) {
-    goto __U;
+    goto LBL_U;
   }
 
   /* must be positive for the remainder of the algorithm */
@@ -3073,24 +3070,24 @@
   if (k > 0) {
      /* divide the power of two out */
      if ((res = mp_div_2d(&u, k, &u, NULL)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
      }
 
      if ((res = mp_div_2d(&v, k, &v, NULL)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
      }
   }
 
   /* divide any remaining factors of two out */
   if (u_lsb != k) {
      if ((res = mp_div_2d(&u, u_lsb - k, &u, NULL)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
      }
   }
 
   if (v_lsb != k) {
      if ((res = mp_div_2d(&v, v_lsb - k, &v, NULL)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
      }
   }
 
@@ -3103,23 +3100,23 @@
      
      /* subtract smallest from largest */
      if ((res = s_mp_sub(&v, &u, &v)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
      }
      
      /* Divide out all factors of two */
      if ((res = mp_div_2d(&v, mp_cnt_lsb(&v), &v, NULL)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
      } 
   } 
 
   /* multiply by 2**k which we divided out at the beginning */
   if ((res = mp_mul_2d (&u, k, c)) != MP_OKAY) {
-     goto __V;
+     goto LBL_V;
   }
   c->sign = MP_ZPOS;
   res = MP_OKAY;
-__V:mp_clear (&u);
-__U:mp_clear (&v);
+LBL_V:mp_clear (&u);
+LBL_U:mp_clear (&v);
   return res;
 }
 #endif
@@ -3555,25 +3552,25 @@
   }
 
   /* x = a, y = b */
-  if ((res = mp_copy (a, &x)) != MP_OKAY) {
-    goto __ERR;
+  if ((res = mp_mod(a, b, &x)) != MP_OKAY) {
+      goto LBL_ERR;
   }
   if ((res = mp_copy (b, &y)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
   }
 
   /* 2. [modified] if x,y are both even then return an error! */
   if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
     res = MP_VAL;
-    goto __ERR;
+    goto LBL_ERR;
   }
 
   /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
   if ((res = mp_copy (&x, &u)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
   }
   if ((res = mp_copy (&y, &v)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
   }
   mp_set (&A, 1);
   mp_set (&D, 1);
@@ -3583,24 +3580,24 @@
   while (mp_iseven (&u) == 1) {
     /* 4.1 u = u/2 */
     if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
     /* 4.2 if A or B is odd then */
     if (mp_isodd (&A) == 1 || mp_isodd (&B) == 1) {
       /* A = (A+y)/2, B = (B-x)/2 */
       if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
       if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
     }
     /* A = A/2, B = B/2 */
     if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
     if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
 
@@ -3608,24 +3605,24 @@
   while (mp_iseven (&v) == 1) {
     /* 5.1 v = v/2 */
     if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
     /* 5.2 if C or D is odd then */
     if (mp_isodd (&C) == 1 || mp_isodd (&D) == 1) {
       /* C = (C+y)/2, D = (D-x)/2 */
       if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
     }
     /* C = C/2, D = D/2 */
     if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
     if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
 
@@ -3633,28 +3630,28 @@
   if (mp_cmp (&u, &v) != MP_LT) {
     /* u = u - v, A = A - C, B = B - D */
     if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
 
     if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
 
     if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   } else {
     /* v - v - u, C = C - A, D = D - B */
     if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
 
     if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
 
     if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
     }
   }
 
@@ -3667,27 +3664,27 @@
   /* if v != 1 then there is no inverse */
   if (mp_cmp_d (&v, 1) != MP_EQ) {
     res = MP_VAL;
-    goto __ERR;
+    goto LBL_ERR;
   }
 
   /* if its too low */
   while (mp_cmp_d(&C, 0) == MP_LT) {
       if ((res = mp_add(&C, b, &C)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
   }
   
   /* too big */
   while (mp_cmp_mag(&C, b) != MP_LT) {
       if ((res = mp_sub(&C, b, &C)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
   }
   
   /* C is now the inverse */
   mp_exch (&C, c);
   res = MP_OKAY;
-__ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
+LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
   return res;
 }
 #endif
@@ -3856,13 +3853,13 @@
   }
 
   if ((res = mp_init (&p1)) != MP_OKAY) {
-    goto __A1;
+    goto LBL_A1;
   }
 
   /* divide out larger power of two */
   k = mp_cnt_lsb(&a1);
   if ((res = mp_div_2d(&a1, k, &a1, NULL)) != MP_OKAY) {
-     goto __P1;
+     goto LBL_P1;
   }
 
   /* step 4.  if e is even set s=1 */
@@ -3890,18 +3887,18 @@
   } else {
     /* n1 = n mod a1 */
     if ((res = mp_mod (p, &a1, &p1)) != MP_OKAY) {
-      goto __P1;
+      goto LBL_P1;
     }
     if ((res = mp_jacobi (&p1, &a1, &r)) != MP_OKAY) {
-      goto __P1;
+      goto LBL_P1;
     }
     *c = s * r;
   }
 
   /* done */
   res = MP_OKAY;
-__P1:mp_clear (&p1);
-__A1:mp_clear (&a1);
+LBL_P1:mp_clear (&p1);
+LBL_A1:mp_clear (&a1);
   return res;
 }
 #endif
@@ -4227,20 +4224,20 @@
 
   /* t1 = get the GCD of the two inputs */
   if ((res = mp_gcd (a, b, &t1)) != MP_OKAY) {
-    goto __T;
+    goto LBL_T;
   }
 
   /* divide the smallest by the GCD */
   if (mp_cmp_mag(a, b) == MP_LT) {
      /* store quotient in t2 such that t2 * b is the LCM */
      if ((res = mp_div(a, &t1, &t2, NULL)) != MP_OKAY) {
-        goto __T;
+        goto LBL_T;
      }
      res = mp_mul(b, &t2, c);
   } else {
      /* store quotient in t2 such that t2 * a is the LCM */
      if ((res = mp_div(b, &t1, &t2, NULL)) != MP_OKAY) {
-        goto __T;
+        goto LBL_T;
      }
      res = mp_mul(a, &t2, c);
   }
@@ -4248,7 +4245,7 @@
   /* fix the sign to positive */
   c->sign = MP_ZPOS;
 
-__T:
+LBL_T:
   mp_clear_multi (&t1, &t2, NULL);
   return res;
 }
@@ -4402,7 +4399,7 @@
   }
 
   /* if the modulus is larger than the value than return */
-  if (b > (int) (a->used * DIGIT_BIT)) {
+  if (b >= (int) (a->used * DIGIT_BIT)) {
     res = mp_copy (a, c);
     return res;
   }
@@ -4484,7 +4481,6 @@
   /* how many bits of last digit does b use */
   bits = mp_count_bits (b) % DIGIT_BIT;
 
-
   if (b->used > 1) {
      if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
         return res;
@@ -4983,8 +4979,9 @@
     u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
   }
 
-  /* store final carry [if any] */
+  /* store final carry [if any] and increment ix offset  */
   *tmpc++ = u;
+  ++ix;
 
   /* now zero digits above the top */
   while (ix++ < olduse) {
@@ -5085,11 +5082,11 @@
   }
 
   if ((res = mp_init (&t2)) != MP_OKAY) {
-    goto __T1;
+    goto LBL_T1;
   }
 
   if ((res = mp_init (&t3)) != MP_OKAY) {
-    goto __T2;
+    goto LBL_T2;
   }
 
   /* if a is negative fudge the sign but keep track */
@@ -5102,52 +5099,52 @@
   do {
     /* t1 = t2 */
     if ((res = mp_copy (&t2, &t1)) != MP_OKAY) {
-      goto __T3;
+      goto LBL_T3;
     }
 
     /* t2 = t1 - ((t1**b - a) / (b * t1**(b-1))) */
     
     /* t3 = t1**(b-1) */
     if ((res = mp_expt_d (&t1, b - 1, &t3)) != MP_OKAY) {   
-      goto __T3;
+      goto LBL_T3;
     }
 
     /* numerator */
     /* t2 = t1**b */
     if ((res = mp_mul (&t3, &t1, &t2)) != MP_OKAY) {    
-      goto __T3;
+      goto LBL_T3;
     }
 
     /* t2 = t1**b - a */
     if ((res = mp_sub (&t2, a, &t2)) != MP_OKAY) {  
-      goto __T3;
+      goto LBL_T3;
     }
 
     /* denominator */
     /* t3 = t1**(b-1) * b  */
     if ((res = mp_mul_d (&t3, b, &t3)) != MP_OKAY) {    
-      goto __T3;
+      goto LBL_T3;
     }
 
     /* t3 = (t1**b - a)/(b * t1**(b-1)) */
     if ((res = mp_div (&t2, &t3, &t3, NULL)) != MP_OKAY) {  
-      goto __T3;
+      goto LBL_T3;
     }
 
     if ((res = mp_sub (&t1, &t3, &t2)) != MP_OKAY) {
-      goto __T3;
+      goto LBL_T3;
     }
   }  while (mp_cmp (&t1, &t2) != MP_EQ);
 
   /* result can be off by a few so check */
   for (;;) {
     if ((res = mp_expt_d (&t1, b, &t2)) != MP_OKAY) {
-      goto __T3;
+      goto LBL_T3;
     }
 
     if (mp_cmp (&t2, a) == MP_GT) {
       if ((res = mp_sub_d (&t1, 1, &t1)) != MP_OKAY) {
-         goto __T3;
+         goto LBL_T3;
       }
     } else {
       break;
@@ -5165,9 +5162,9 @@
 
   res = MP_OKAY;
 
-__T3:mp_clear (&t3);
-__T2:mp_clear (&t2);
-__T1:mp_clear (&t1);
+LBL_T3:mp_clear (&t3);
+LBL_T2:mp_clear (&t2);
+LBL_T1:mp_clear (&t1);
   return res;
 }
 #endif
@@ -5196,12 +5193,18 @@
 int mp_neg (mp_int * a, mp_int * b)
 {
   int     res;
-  if ((res = mp_copy (a, b)) != MP_OKAY) {
-    return res;
-  }
+  if (a != b) {   /* negating in place: source and destination coincide, no copy needed */
+     if ((res = mp_copy (a, b)) != MP_OKAY) {
+        return res;
+     }
+  }
+
   if (mp_iszero(b) != MP_YES) {
      b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
-  }
+  } else {   /* a zero result is always stored with the positive sign */
+     b->sign = MP_ZPOS;
+  }
+
   return MP_OKAY;
 }
 #endif
@@ -5304,7 +5307,7 @@
 
   /* compute t = b**a mod a */
   if ((err = mp_exptmod (b, a, a, &t)) != MP_OKAY) {
-    goto __T;
+    goto LBL_T;
   }
 
   /* is it equal to b? */
@@ -5313,7 +5316,7 @@
   }
 
   err = MP_OKAY;
-__T:mp_clear (&t);
+LBL_T:mp_clear (&t);
   return err;
 }
 #endif
@@ -5352,8 +5355,8 @@
   *result = MP_NO;
 
   for (ix = 0; ix < PRIME_SIZE; ix++) {
-    /* what is a mod __prime_tab[ix] */
-    if ((err = mp_mod_d (a, __prime_tab[ix], &res)) != MP_OKAY) {
+    /* what is a mod ltm_prime_tab[ix] */
+    if ((err = mp_mod_d (a, ltm_prime_tab[ix], &res)) != MP_OKAY) {
       return err;
     }
 
@@ -5410,7 +5413,7 @@
 
   /* is the input equal to one of the primes in the table? */
   for (ix = 0; ix < PRIME_SIZE; ix++) {
-      if (mp_cmp_d(a, __prime_tab[ix]) == MP_EQ) {
+      if (mp_cmp_d(a, ltm_prime_tab[ix]) == MP_EQ) {
          *result = 1;
          return MP_OKAY;
       }
@@ -5433,20 +5436,20 @@
 
   for (ix = 0; ix < t; ix++) {
     /* set the prime */
-    mp_set (&b, __prime_tab[ix]);
+    mp_set (&b, ltm_prime_tab[ix]);
 
     if ((err = mp_prime_miller_rabin (a, &b, &res)) != MP_OKAY) {
-      goto __B;
+      goto LBL_B;
     }
 
     if (res == MP_NO) {
-      goto __B;
+      goto LBL_B;
     }
   }
 
   /* passed the test */
   *result = MP_YES;
-__B:mp_clear (&b);
+LBL_B:mp_clear (&b);
   return err;
 }
 #endif
@@ -5496,12 +5499,12 @@
     return err;
   }
   if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) {
-    goto __N1;
+    goto LBL_N1;
   }
 
   /* set 2**s * r = n1 */
   if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) {
-    goto __N1;
+    goto LBL_N1;
   }
 
   /* count the number of least significant bits
@@ -5511,15 +5514,15 @@
 
   /* now divide n - 1 by 2**s */
   if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) {
-    goto __R;
+    goto LBL_R;
   }
 
   /* compute y = b**r mod a */
   if ((err = mp_init (&y)) != MP_OKAY) {
-    goto __R;
+    goto LBL_R;
   }
   if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) {
-    goto __Y;
+    goto LBL_Y;
   }
 
   /* if y != 1 and y != n1 do */
@@ -5528,12 +5531,12 @@
     /* while j <= s-1 and y != n1 */
     while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) {
       if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) {
-         goto __Y;
+         goto LBL_Y;
       }
 
       /* if y == 1 then composite */
       if (mp_cmp_d (&y, 1) == MP_EQ) {
-         goto __Y;
+         goto LBL_Y;
       }
 
       ++j;
@@ -5541,15 +5544,15 @@
 
     /* if y != n1 then composite */
     if (mp_cmp (&y, &n1) != MP_EQ) {
-      goto __Y;
+      goto LBL_Y;
     }
   }
 
   /* probably prime now */
   *result = MP_YES;
-__Y:mp_clear (&y);
-__R:mp_clear (&r);
-__N1:mp_clear (&n1);
+LBL_Y:mp_clear (&y);
+LBL_R:mp_clear (&r);
+LBL_N1:mp_clear (&n1);
   return err;
 }
 #endif
@@ -5594,10 +5597,10 @@
    a->sign = MP_ZPOS;
 
    /* simple algo if a is less than the largest prime in the table */
-   if (mp_cmp_d(a, __prime_tab[PRIME_SIZE-1]) == MP_LT) {
+   if (mp_cmp_d(a, ltm_prime_tab[PRIME_SIZE-1]) == MP_LT) {
       /* find which prime it is bigger than */
       for (x = PRIME_SIZE - 2; x >= 0; x--) {
-          if (mp_cmp_d(a, __prime_tab[x]) != MP_LT) {
+          if (mp_cmp_d(a, ltm_prime_tab[x]) != MP_LT) {
              if (bbs_style == 1) {
                 /* ok we found a prime smaller or
                  * equal [so the next is larger]
@@ -5605,17 +5608,17 @@
                  * however, the prime must be
                  * congruent to 3 mod 4
                  */
-                if ((__prime_tab[x + 1] & 3) != 3) {
+                if ((ltm_prime_tab[x + 1] & 3) != 3) {
                    /* scan upwards for a prime congruent to 3 mod 4 */
                    for (y = x + 1; y < PRIME_SIZE; y++) {
-                       if ((__prime_tab[y] & 3) == 3) {
-                          mp_set(a, __prime_tab[y]);
+                       if ((ltm_prime_tab[y] & 3) == 3) {
+                          mp_set(a, ltm_prime_tab[y]);
                           return MP_OKAY;
                        }
                    }
                 }
              } else {
-                mp_set(a, __prime_tab[x + 1]);
+                mp_set(a, ltm_prime_tab[x + 1]);
                 return MP_OKAY;
              }
           }
@@ -5653,7 +5656,7 @@
 
    /* generate the restable */
    for (x = 1; x < PRIME_SIZE; x++) {
-      if ((err = mp_mod_d(a, __prime_tab[x], res_tab + x)) != MP_OKAY) {
+      if ((err = mp_mod_d(a, ltm_prime_tab[x], res_tab + x)) != MP_OKAY) {
          return err;
       }
    }
@@ -5679,8 +5682,8 @@
              res_tab[x] += kstep;
 
              /* subtract the modulus [instead of using division] */
-             if (res_tab[x] >= __prime_tab[x]) {
-                res_tab[x]  -= __prime_tab[x];
+             if (res_tab[x] >= ltm_prime_tab[x]) {
+                res_tab[x]  -= ltm_prime_tab[x];
              }
 
              /* set flag if zero */
@@ -5692,7 +5695,7 @@
 
       /* add the step */
       if ((err = mp_add_d(a, step, a)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
       }
 
       /* if didn't pass sieve and step == MAX then skip test */
@@ -5702,9 +5705,9 @@
 
       /* is this prime? */
       for (x = 0; x < t; x++) {
-          mp_set(&b, __prime_tab[t]);
+          mp_set(&b, ltm_prime_tab[t]);
           if ((err = mp_prime_miller_rabin(a, &b, &res)) != MP_OKAY) {
-             goto __ERR;
+             goto LBL_ERR;
           }
           if (res == MP_NO) {
              break;
@@ -5717,7 +5720,7 @@
    }
 
    err = MP_OKAY;
-__ERR:
+LBL_ERR:
    mp_clear(&b);
    return err;
 }
@@ -5828,7 +5831,7 @@
    }
 
    /* calc the byte size */
-   bsize = (size>>3)+(size&7?1:0);
+   bsize = (size>>3) + ((size&7)?1:0);
 
    /* we need a buffer of bsize bytes */
    tmp = OPT_CAST(unsigned char) XMALLOC(bsize);
@@ -5837,19 +5840,19 @@
    }
 
    /* calc the maskAND value for the MSbyte*/
-   maskAND = 0xFF >> (8 - (size & 7));
+   maskAND = ((size&7) == 0) ? 0xFF : (0xFF >> (8 - (size & 7)));
 
    /* calc the maskOR_msb */
    maskOR_msb        = 0;
-   maskOR_msb_offset = (size - 2) >> 3;
+   maskOR_msb_offset = ((size & 7) == 1) ? 1 : 0;
    if (flags & LTM_PRIME_2MSB_ON) {
       maskOR_msb     |= 1 << ((size - 2) & 7);
    } else if (flags & LTM_PRIME_2MSB_OFF) {
       maskAND        &= ~(1 << ((size - 2) & 7));
-   }
+   } 
 
    /* get the maskOR_lsb */
-   maskOR_lsb         = 0;
+   maskOR_lsb         = 1;
    if (flags & LTM_PRIME_BBS) {
       maskOR_lsb     |= 3;
    }
@@ -5943,22 +5946,29 @@
     return MP_VAL;
   }
 
-  /* init a copy of the input */
-  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-    return res;
+  if (mp_iszero(a) == MP_YES) {
+     *size = 2;
+    return MP_OKAY;
   }
 
   /* digs is the digit count */
   digs = 0;
 
   /* if it's negative add one for the sign */
-  if (t.sign == MP_NEG) {
+  if (a->sign == MP_NEG) {
     ++digs;
-    t.sign = MP_ZPOS;
-  }
+  }
+
+  /* init a copy of the input */
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return res;
+  }
+
+  /* force temp to positive */
+  t.sign = MP_ZPOS; 
 
   /* fetch out all of the digits */
-  while (mp_iszero (&t) == 0) {
+  while (mp_iszero (&t) == MP_NO) {
     if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
       mp_clear (&t);
       return res;
@@ -6032,14 +6042,14 @@
 
   /* first place a random non-zero digit */
   do {
-    d = ((mp_digit) abs (rand ()));
+    d = ((mp_digit) abs (rand ())) & MP_MASK;
   } while (d == 0);
 
   if ((res = mp_add_d (a, d, a)) != MP_OKAY) {
     return res;
   }
 
-  while (digits-- > 0) {
+  while (--digits > 0) {
     if ((res = mp_lshd (a, 1)) != MP_OKAY) {
       return res;
     }
@@ -6074,7 +6084,7 @@
  */
 
 /* read a string [ASCII] in a given radix */
-int mp_read_radix (mp_int * a, char *str, int radix)
+int mp_read_radix (mp_int * a, const char *str, int radix)
 {
   int     y, res, neg;
   char    ch;
@@ -6257,8 +6267,7 @@
  * precomputed via mp_reduce_setup.
  * From HAC pp.604 Algorithm 14.42
  */
-int
-mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
+int mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
 {
   mp_int  q;
   int     res, um = m->used;
@@ -6278,11 +6287,11 @@
     }
   } else {
 #ifdef BN_S_MP_MUL_HIGH_DIGS_C
-    if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
+    if ((res = s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
       goto CLEANUP;
     }
 #elif defined(BN_FAST_S_MP_MUL_HIGH_DIGS_C)
-    if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
+    if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
       goto CLEANUP;
     }
 #else 
@@ -6355,8 +6364,7 @@
  */
 
 /* reduces a modulo n where n is of the form 2**p - d */
-int
-mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d)
+int mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d)
 {
    mp_int q;
    int    p, res;
@@ -6398,6 +6406,68 @@
 
 /* End: bn_mp_reduce_2k.c */
 
+/* Start: bn_mp_reduce_2k_l.c */
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_2K_L_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
+
+/* reduces a modulo n where n is of the form 2**p - d
+ * This differs from reduce_2k since "d" can be larger than a single
+ * digit.  "a" is reduced in place; "d" is the value produced by
+ * mp_reduce_2k_setup_l for "n".  Returns MP_OKAY or the first error. */
+int mp_reduce_2k_l(mp_int *a, mp_int *n, mp_int *d)
+{
+   mp_int q;
+   int    p, res;
+
+   if ((res = mp_init(&q)) != MP_OKAY) {
+      return res;
+   }
+
+   p = mp_count_bits(n);
+top:
+   /* q = a/2**p, a = a mod 2**p */
+   if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
+      goto ERR;
+   }
+
+   /* q = q * d */
+   if ((res = mp_mul(&q, d, &q)) != MP_OKAY) {
+      goto ERR;
+   }
+
+   /* a = a + q */
+   if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
+      goto ERR;
+   }
+
+   /* still >= n: subtract n (reporting a failure) and fold again */
+   if (mp_cmp_mag(a, n) != MP_LT) {
+      if ((res = s_mp_sub(a, n, a)) != MP_OKAY) { goto ERR; }
+      goto top;
+   }
+ERR:
+   /* q is released on both the success and the error path */
+   mp_clear(&q);
+   return res;
+}
+
+#endif
+
+/* End: bn_mp_reduce_2k_l.c */
+
 /* Start: bn_mp_reduce_2k_setup.c */
 #include <tommath.h>
 #ifdef BN_MP_REDUCE_2K_SETUP_C
@@ -6417,8 +6487,7 @@
  */
 
 /* determines the setup value */
-int 
-mp_reduce_2k_setup(mp_int *a, mp_digit *d)
+int mp_reduce_2k_setup(mp_int *a, mp_digit *d)
 {
    int res, p;
    mp_int tmp;
@@ -6446,6 +6515,50 @@
 
 /* End: bn_mp_reduce_2k_setup.c */
 
+/* Start: bn_mp_reduce_2k_setup_l.c */
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_2K_SETUP_L_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
+
+/* determines the setup value d = 2**p - a, with p = mp_count_bits(a) */
+int mp_reduce_2k_setup_l(mp_int *a, mp_int *d)
+{
+   mp_int pow2;
+   int    err;
+
+   err = mp_init(&pow2);
+   if (err != MP_OKAY) {
+      return err;
+   }
+
+   /* pow2 = 2**p */
+   err = mp_2expt(&pow2, mp_count_bits(a));
+   if (err == MP_OKAY) {
+      /* d = pow2 - a */
+      err = s_mp_sub(&pow2, a, d);
+   }
+
+   /* the temporary is released whether or not the steps above worked */
+   mp_clear(&pow2);
+   return err;
+}
+#endif
+
+/* End: bn_mp_reduce_2k_setup_l.c */
+
 /* Start: bn_mp_reduce_is_2k.c */
 #include <tommath.h>
 #ifdef BN_MP_REDUCE_IS_2K_C
@@ -6471,9 +6584,9 @@
    mp_digit iz;
    
    if (a->used == 0) {
-      return 0;
+      return MP_NO;
    } else if (a->used == 1) {
-      return 1;
+      return MP_YES;
    } else if (a->used > 1) {
       iy = mp_count_bits(a);
       iz = 1;
@@ -6482,7 +6595,7 @@
       /* Test every bit from the second digit up, must be 1 */
       for (ix = DIGIT_BIT; ix < iy; ix++) {
           if ((a->dp[iw] & iz) == 0) {
-             return 0;
+             return MP_NO;
           }
           iz <<= 1;
           if (iz > (mp_digit)MP_MASK) {
@@ -6491,13 +6604,57 @@
           }
       }
    }
-   return 1;
+   return MP_YES;
 }
 
 #endif
 
 /* End: bn_mp_reduce_is_2k.c */
 
+/* Start: bn_mp_reduce_is_2k_l.c */
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_IS_2K_L_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
+
+/* determines if reduce_2k_l can be used with a as the modulus */
+int mp_reduce_is_2k_l(mp_int *a)
+{
+   int pos, full;
+
+   /* no digits at all (or a nonsensical negative count) never qualifies */
+   if (a->used <= 0) {
+      return MP_NO;
+   }
+   /* a single digit always qualifies */
+   if (a->used == 1) {
+      return MP_YES;
+   }
+   /* count the digits that are equal to MP_MASK */
+   full = 0;
+   for (pos = 0; pos < a->used; pos++) {
+      full += (a->dp[pos] == MP_MASK) ? 1 : 0;
+   }
+   /* accept when at least half of the digits (integer division) match */
+   return (full < (a->used / 2)) ? MP_NO : MP_YES;
+}
+
+#endif
+
+/* End: bn_mp_reduce_is_2k_l.c */
+
 /* Start: bn_mp_reduce_setup.c */
 #include <tommath.h>
 #ifdef BN_MP_REDUCE_SETUP_C
@@ -7132,8 +7289,7 @@
  */
 
 /* store in signed [big endian] format */
-int
-mp_to_signed_bin (mp_int * a, unsigned char *b)
+int mp_to_signed_bin (mp_int * a, unsigned char *b)
 {
   int     res;
 
@@ -7147,6 +7303,37 @@
 
 /* End: bn_mp_to_signed_bin.c */
 
+/* Start: bn_mp_to_signed_bin_n.c */
+#include <tommath.h>
+#ifdef BN_MP_TO_SIGNED_BIN_N_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
+
+/* store in signed [big endian] format, refusing an output buffer that is too small */
+int mp_to_signed_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen)
+{
+   /* size reported by mp_signed_bin_size for a */
+   unsigned long need = (unsigned long)mp_signed_bin_size(a);
+   if (need > *outlen) { return MP_VAL; }
+   *outlen = need;
+   return mp_to_signed_bin(a, b);
+}
+#endif
+
+/* End: bn_mp_to_signed_bin_n.c */
+
 /* Start: bn_mp_to_unsigned_bin.c */
 #include <tommath.h>
 #ifdef BN_MP_TO_UNSIGNED_BIN_C
@@ -7166,8 +7353,7 @@
  */
 
 /* store in unsigned [big endian] format */
-int
-mp_to_unsigned_bin (mp_int * a, unsigned char *b)
+int mp_to_unsigned_bin (mp_int * a, unsigned char *b)
 {
   int     x, res;
   mp_int  t;
@@ -7196,6 +7382,37 @@
 
 /* End: bn_mp_to_unsigned_bin.c */
 
+/* Start: bn_mp_to_unsigned_bin_n.c */
+#include <tommath.h>
+#ifdef BN_MP_TO_UNSIGNED_BIN_N_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
+
+/* store in unsigned [big endian] format, refusing an output buffer that is too small */
+int mp_to_unsigned_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen)
+{
+   /* size reported by mp_unsigned_bin_size for a */
+   unsigned long need = (unsigned long)mp_unsigned_bin_size(a);
+   if (need > *outlen) { return MP_VAL; }
+   *outlen = need;
+   return mp_to_unsigned_bin(a, b);
+}
+#endif
+
+/* End: bn_mp_to_unsigned_bin_n.c */
+
 /* Start: bn_mp_toom_mul.c */
 #include <tommath.h>
 #ifdef BN_MP_TOOM_MUL_C
@@ -7216,9 +7433,10 @@
 
 /* multiplication using the Toom-Cook 3-way algorithm 
  *
- * Much more complicated than Karatsuba but has a lower asymptotic running time of 
- * O(N**1.464).  This algorithm is only particularly useful on VERY large
- * inputs (we're talking 1000s of digits here...).
+ * Much more complicated than Karatsuba but has a lower 
+ * asymptotic running time of O(N**1.464).  This algorithm is 
+ * only particularly useful on VERY large inputs 
+ * (we're talking 1000s of digits here...).
 */
 int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
 {
@@ -7888,8 +8106,7 @@
  */
 
 /* get the size for an unsigned equivalent */
-int
-mp_unsigned_bin_size (mp_int * a)
+int mp_unsigned_bin_size (mp_int * a)
 {
   int     size = mp_count_bits (a);
   return (size / 8 + ((size & 7) != 0 ? 1 : 0));
@@ -7938,7 +8155,7 @@
   }
 
   for (ix = 0; ix < px; ix++) {
-
+     t.dp[ix] ^= x->dp[ix];
   }
   mp_clamp (&t);
   mp_exch (c, &t);
@@ -7968,12 +8185,18 @@
  */
 
 /* set to zero */
-void
-mp_zero (mp_int * a)
-{
+void mp_zero (mp_int * a)
+{
+  int       n;
+  mp_digit *tmp;
+
   a->sign = MP_ZPOS;
   a->used = 0;
-  memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
+
+  tmp = a->dp;   /* start at the first digit of the buffer */
+  for (n = 0; n < a->alloc; n++) {   /* clears all a->alloc digits, not only the used ones */
+     *tmp++ = 0;
+  }
 }
 #endif
 
@@ -7996,7 +8219,7 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-const mp_digit __prime_tab[] = {
+const mp_digit ltm_prime_tab[] = {
   0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
   0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
   0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
@@ -8212,11 +8435,12 @@
    #define TAB_SIZE 256
 #endif
 
-int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
 {
   mp_int  M[TAB_SIZE], res, mu;
   mp_digit buf;
   int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+  int (*redux)(mp_int*,mp_int*,mp_int*);
 
   /* find window size */
   x = mp_count_bits (X);
@@ -8261,11 +8485,20 @@
 
   /* create mu, used for Barrett reduction */
   if ((err = mp_init (&mu)) != MP_OKAY) {
-    goto __M;
-  }
-  if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
-    goto __MU;
-  }
+    goto LBL_M;
+  }
+  
+  if (redmode == 0) {
+     if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
+        goto LBL_MU;
+     }
+     redux = mp_reduce;
+  } else {
+     if ((err = mp_reduce_2k_setup_l (P, &mu)) != MP_OKAY) {
+        goto LBL_MU;
+     }
+     redux = mp_reduce_2k_l;
+  }    
 
   /* create M table
    *
@@ -8276,23 +8509,26 @@
    * computed though accept for M[0] and M[1]
    */
   if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
-    goto __MU;
+    goto LBL_MU;
   }
 
   /* compute the value at M[1<<(winsize-1)] by squaring 
    * M[1] (winsize-1) times 
    */
   if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
-    goto __MU;
+    goto LBL_MU;
   }
 
   for (x = 0; x < (winsize - 1); x++) {
+    /* square it */
     if ((err = mp_sqr (&M[1 << (winsize - 1)], 
                        &M[1 << (winsize - 1)])) != MP_OKAY) {
-      goto __MU;
-    }
-    if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
-      goto __MU;
+      goto LBL_MU;
+    }
+
+    /* reduce modulo P */
+    if ((err = redux (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
+      goto LBL_MU;
     }
   }
 
@@ -8301,16 +8537,16 @@
    */
   for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
     if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
-      goto __MU;
-    }
-    if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) {
-      goto __MU;
+      goto LBL_MU;
+    }
+    if ((err = redux (&M[x], P, &mu)) != MP_OKAY) {
+      goto LBL_MU;
     }
   }
 
   /* setup result */
   if ((err = mp_init (&res)) != MP_OKAY) {
-    goto __MU;
+    goto LBL_MU;
   }
   mp_set (&res, 1);
 
@@ -8350,10 +8586,10 @@
     /* if the bit is zero and mode == 1 then we square */
     if (mode == 1 && y == 0) {
       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-        goto __RES;
+      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
+        goto LBL_RES;
       }
       continue;
     }
@@ -8367,19 +8603,19 @@
       /* square first */
       for (x = 0; x < winsize; x++) {
         if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-          goto __RES;
+          goto LBL_RES;
         }
-        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-          goto __RES;
+        if ((err = redux (&res, P, &mu)) != MP_OKAY) {
+          goto LBL_RES;
         }
       }
 
       /* then multiply */
       if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-        goto __RES;
+      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
+        goto LBL_RES;
       }
 
       /* empty window and reset */
@@ -8394,20 +8630,20 @@
     /* square then multiply if the bit is set */
     for (x = 0; x < bitcpy; x++) {
       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
       }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-        goto __RES;
+      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
+        goto LBL_RES;
       }
 
       bitbuf <<= 1;
       if ((bitbuf & (1 << winsize)) != 0) {
         /* then multiply */
         if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
-          goto __RES;
+          goto LBL_RES;
         }
-        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-          goto __RES;
+        if ((err = redux (&res, P, &mu)) != MP_OKAY) {
+          goto LBL_RES;
         }
       }
     }
@@ -8415,9 +8651,9 @@
 
   mp_exch (&res, Y);
   err = MP_OKAY;
-__RES:mp_clear (&res);
-__MU:mp_clear (&mu);
-__M:
+LBL_RES:mp_clear (&res);
+LBL_MU:mp_clear (&mu);
+LBL_M:
   mp_clear(&M[1]);
   for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
     mp_clear (&M[x]);
@@ -8450,8 +8686,7 @@
  * HAC pp. 595, Algorithm 14.12  Modified so you can control how 
  * many digits of output are created.
  */
-int
-s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
   mp_int  t;
   int     res, pa, pb, ix, iy;
@@ -8619,8 +8854,7 @@
  */
 
 /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
-int
-s_mp_sqr (mp_int * a, mp_int * b)
+int s_mp_sqr (mp_int * a, mp_int * b)
 {
   mp_int  t;
   int     res, ix, iy, pa;
@@ -8797,11 +9031,12 @@
  CPU                    /Compiler     /MUL CUTOFF/SQR CUTOFF
 -------------------------------------------------------------
  Intel P4 Northwood     /GCC v3.4.1   /        88/       128/LTM 0.32 ;-)
+ AMD Athlon64           /GCC v3.4.4   /        74/       124/LTM 0.34
  
 */
 
-int     KARATSUBA_MUL_CUTOFF = 88,      /* Min. number of digits before Karatsuba multiplication is used. */
-        KARATSUBA_SQR_CUTOFF = 128,     /* Min. number of digits before Karatsuba squaring is used. */
+int     KARATSUBA_MUL_CUTOFF = 74,      /* Min. number of digits before Karatsuba multiplication is used. */
+        KARATSUBA_SQR_CUTOFF = 124,     /* Min. number of digits before Karatsuba squaring is used. */
         
         TOOM_MUL_CUTOFF      = 350,      /* no optimal values of these are known yet so set em high */
         TOOM_SQR_CUTOFF      = 400; 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tombc/grammar.txt	Fri May 06 08:59:30 2005 +0000
@@ -0,0 +1,35 @@
+program       := program statement | statement | empty
+statement     := { statement }                                                                              | 
+                 identifier = numexpression;                                                                | 
+                 identifier[numexpression] = numexpression;                                                 |
+                 function(expressionlist);                                                                  | 
+                 for (identifier = numexpression; numexpression; identifier = numexpression) { statement }  |
+                 while (numexpression) { statement }                                                        | 
+                 if (numexpression) { statement } elif                                                      | 
+                 break;                                                                                     | 
+                 continue;                                                                                  
+                 
+elif          := else statement | empty
+function      := abs | countbits | exptmod | jacobi | print | isprime | nextprime | issquare | readinteger | exit
+expressionlist := expressionlist, expression | expression
+
+// LR(1) !!!?
+expression    := string | numexpression
+numexpression := cmpexpr && cmpexpr | cmpexpr \|\| cmpexpr | cmpexpr
+cmpexpr       := boolexpr  < boolexpr | boolexpr  > boolexpr | boolexpr == boolexpr | 
+                 boolexpr <= boolexpr | boolexpr >= boolexpr | boolexpr
+boolexpr      := shiftexpr & shiftexpr | shiftexpr ^ shiftexpr | shiftexpr \| shiftexpr | shiftexpr
+shiftexpr     := addsubexpr << addsubexpr | addsubexpr >> addsubexpr | addsubexpr
+addsubexpr    := mulexpr + mulexpr | mulexpr - mulexpr | mulexpr
+mulexpr       := expr * expr       | expr / expr | expr % expr | expr
+expr          := -nexpr | nexpr 
+nexpr         := integer | identifier | ( numexpression ) | identifier[numexpression] 
+
+identifier    := identifier digits | identifier alpha | alpha
+alpha         := a ... z | A ... Z
+integer       := hexnumber | digits 
+hexnumber     := 0xhexdigits
+hexdigits     := hexdigits hexdigit | hexdigit
+hexdigit      := 0 ... 9 | a ... f | A ... F
+digits        := digits digit | digit 
+digit         := 0 ... 9
--- a/tommath.h	Sun Dec 19 11:33:56 2004 +0000
+++ b/tommath.h	Fri May 06 08:59:30 2005 +0000
@@ -429,6 +429,15 @@
 /* reduces a modulo b where b is of the form 2**p - k [0 <= a] */
 int mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d);
 
+/* returns true if a can be reduced with mp_reduce_2k_l */
+int mp_reduce_is_2k_l(mp_int *a);
+
+/* determines k value for 2k reduction */
+int mp_reduce_2k_setup_l(mp_int *a, mp_int *d);
+
+/* reduces a modulo b where b is of the form 2**p - k [0 <= a] */
+int mp_reduce_2k_l(mp_int *a, mp_int *n, mp_int *d);
+
 /* d = a**b (mod c) */
 int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
 
@@ -442,7 +451,7 @@
 #endif
 
 /* table of first PRIME_SIZE primes */
-extern const mp_digit __prime_tab[];
+extern const mp_digit ltm_prime_tab[];
 
 /* result=1 if a is divisible by one of the first PRIME_SIZE primes */
 int mp_prime_is_divisible(mp_int *a, int *result);
@@ -511,12 +520,14 @@
 int mp_unsigned_bin_size(mp_int *a);
 int mp_read_unsigned_bin(mp_int *a, unsigned char *b, int c);
 int mp_to_unsigned_bin(mp_int *a, unsigned char *b);
+int mp_to_unsigned_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen);
 
 int mp_signed_bin_size(mp_int *a);
 int mp_read_signed_bin(mp_int *a, unsigned char *b, int c);
 int mp_to_signed_bin(mp_int *a, unsigned char *b);
+int mp_to_signed_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen);
 
-int mp_read_radix(mp_int *a, char *str, int radix);
+int mp_read_radix(mp_int *a, const char *str, int radix);
 int mp_toradix(mp_int *a, char *str, int radix);
 int mp_toradix_n(mp_int * a, char *str, int radix, int maxlen);
 int mp_radix_size(mp_int *a, int radix, int *size);
@@ -554,7 +565,7 @@
 int mp_invmod_slow (mp_int * a, mp_int * b, mp_int * c);
 int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);
 int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y, int mode);
-int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y);
+int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int mode);
 void bn_reverse(unsigned char *s, int len);
 
 extern const char *mp_s_rmap;
Binary file tommath.pdf has changed
--- a/tommath.src	Sun Dec 19 11:33:56 2004 +0000
+++ b/tommath.src	Fri May 06 08:59:30 2005 +0000
@@ -49,7 +49,7 @@
 \begin{document}
 \frontmatter
 \pagestyle{empty}
-\title{Implementing Multiple Precision Arithmetic \\ ~ \\ Draft Edition }
+\title{Multi--Precision Math}
 \author{\mbox{
 %\begin{small}
 \begin{tabular}{c}
@@ -66,7 +66,7 @@
 }
 }
 \maketitle
-This text has been placed in the public domain.  This text corresponds to the v0.30 release of the 
+This text has been placed in the public domain.  This text corresponds to the v0.35 release of the 
 LibTomMath project.
 
 \begin{alltt}
@@ -85,66 +85,32 @@
 
 \tableofcontents
 \listoffigures
-\chapter*{Prefaces to the Draft Edition}
-I started this text in April 2003 to complement my LibTomMath library.  That is, explain how to implement the functions
-contained in LibTomMath.  The goal is to have a textbook that any Computer Science student can use when implementing their
-own multiple precision arithmetic.  The plan I wanted to follow was flesh out all the
-ideas and concepts I had floating around in my head and then work on it afterwards refining a little bit at a time.  Chance
-would have it that I ended up with my summer off from Algonquin College and I was given four months solid to work on the
-text.  
-
-Choosing to not waste any time I dove right into the project even before my spring semester was finished.  I wrote a bit
-off and on at first.  The moment my exams were finished I jumped into long 12 to 16 hour days.  The result after only
-a couple of months was a ten chapter, three hundred page draft that I quickly had distributed to anyone who wanted
-to read it.  I had Jean-Luc Cooke print copies for me and I brought them to Crypto'03 in Santa Barbara.  So far I have
-managed to grab a certain level of attention having people from around the world ask me for copies of the text was certain
-rewarding.
-
-Now we are past December 2003.  By this time I had pictured that I would have at least finished my second draft of the text.  
-Currently I am far off from this goal.  I've done partial re-writes of chapters one, two and three but they are not even
-finished yet.  I haven't given up on the project, only had some setbacks.  First O'Reilly declined to publish the text then
-Addison-Wesley and Greg is tried another which I don't know the name of.  However, at this point I want to focus my energy
-onto finishing the book not securing a contract.
-
-So why am I writing this text?  It seems like a lot of work right?  Most certainly it is a lot of work writing a textbook.  
-Even the simplest introductory material has to be lined with references and figures.  A lot of the text has to be re-written
-from point form to prose form to ensure an easier read.  Why am I doing all this work for free then?  Simple. My philosophy
-is quite simply ``Open Source.  Open Academia.  Open Minds'' which means that to achieve a goal of open minds, that is,
-people willing to accept new ideas and explore the unknown you have to make available material they can access freely 
-without hinderance.  
-
-I've been writing free software since I was about sixteen but only recently have I hit upon software that people have come
-to depend upon.  I started LibTomCrypt in December 2001 and now several major companies use it as integral portions of their
-software.  Several educational institutions use it as a matter of course and many freelance developers use it as
-part of their projects.  To further my contributions I started the LibTomMath project in December 2002 aimed at providing
-multiple precision arithmetic routines that students could learn from.  That is write routines that are not only easy
-to understand and follow but provide quite impressive performance considering they are all in standard portable ISO C.  
-
-The second leg of my philosophy is ``Open Academia'' which is where this textbook comes in.  In the end, when all is
-said and done the text will be useable by educational institutions as a reference on multiple precision arithmetic.  
-
-At this time I feel I should share a little information about myself.  The most common question I was asked at 
-Crypto'03, perhaps just out of professional courtesy, was which school I either taught at or attended.  The unfortunate
-truth is that I neither teach at or attend a school of academic reputation.  I'm currently at Algonquin College which 
-is what I'd like to call ``somewhat academic but mostly vocational'' college.  In otherwords, job training.
-
-I'm a 21 year old computer science student mostly self-taught in the areas I am aware of (which includes a half-dozen
-computer science fields, a few fields of mathematics and some English).  I look forward to teaching someday but I am
-still far off from that goal.  
-
-Now it would be improper for me to not introduce the rest of the texts co-authors.  While they are only contributing 
-corrections and editorial feedback their support has been tremendously helpful in presenting the concepts laid out
-in the text so far.  Greg has always been there for me.  He has tracked my LibTom projects since their inception and even
-sent cheques to help pay tuition from time to time.  His background has provided a wonderful source to bounce ideas off
-of and improve the quality of my writing.  Mads is another fellow who has just ``been there''.  I don't even recall what
-his interest in the LibTom projects is but I'm definitely glad he has been around.  His ability to catch logical errors
-in my written English have saved me on several occasions to say the least.
-
-What to expect next?  Well this is still a rough draft.  I've only had the chance to update a few chapters.  However, I've
-been getting the feeling that people are starting to use my text and I owe them some updated material.  My current tenative
-plan is to edit one chapter every two weeks starting January 4th.  It seems insane but my lower course load at college
-should provide ample time.  By Crypto'04 I plan to have a 2nd draft of the text polished and ready to hand out to as many
-people who will take it.
+\chapter*{Prefaces}
+When I tell people about my LibTom projects and that I release them as public domain they are often puzzled.  
+They ask why I did it and especially why I continue to work on them for free.  The best I can explain it is ``Because I can.''  
+Which seems odd and perhaps too terse for adult conversation. I often qualify it with ``I am able, I am willing.'' which 
+perhaps explains it better.  I am the first to admit there is not anything that special with what I have done.  Perhaps
+others can see that too and then we would have a society to be proud of.  My LibTom projects are what I am doing to give 
+back to society in the form of tools and knowledge that can help others in their endeavours.
+
+I started writing this book because it was the most logical task to further my goal of open academia.  The LibTomMath source
+code itself was written to be easy to follow and learn from.  There are times, however, where pure C source code does not
+explain the algorithms properly.  Hence this book.  The book literally starts with the foundation of the library and works
+itself outwards to the more complicated algorithms.  The use of both pseudo--code and verbatim source code provides a duality
+of ``theory'' and ``practice'' that the computer science students of the world shall appreciate.  I never deviate too far
+from relatively straightforward algebra and I hope that this book can be a valuable learning asset.
+
+This book and indeed much of the LibTom projects would not exist in their current form if it was not for a plethora
+of kind people donating their time, resources and kind words to help support my work.  Writing a text of significant
+length (along with the source code) is a tiresome and lengthy process.  Currently the LibTom project is four years old,
+comprises literally thousands of users and over 100,000 lines of source code, TeX and other material.  People like Mads and Greg 
+were there at the beginning to encourage me to work well.  It is amazing how timely validation from others can boost morale to 
+continue the project. Definitely my parents were there for me by providing room and board during the many months of work in 2003.  
+
+To my many friends whom I have met through the years I thank you for the good times and the words of encouragement.  I hope I
+honour your kind gestures with this project.
+
+Open Source.  Open Academia.  Open Minds.
 
 \begin{flushright} Tom St Denis \end{flushright}
 
@@ -937,7 +903,7 @@
 
 EXAM,bn_mp_grow.c
 
-A quick optimization is to first determine if a memory re-allocation is required at all.  The if statement (line @23,if@) checks
+A quick optimization is to first determine if a memory re-allocation is required at all.  The if statement (line @24,alloc@) checks
 if the \textbf{alloc} member of the mp\_int is smaller than the requested digit count.  If the count is not larger than \textbf{alloc}
 the function skips the re-allocation part thus saving time.
 
@@ -1310,7 +1276,7 @@
 With the mp\_int representation of an integer, calculating the absolute value is trivial.  The mp\_abs algorithm will compute
 the absolute value of an mp\_int.
 
-\newpage\begin{figure}[here]
+\begin{figure}[here]
 \begin{center}
 \begin{tabular}{l}
 \hline Algorithm \textbf{mp\_abs}. \\
@@ -1335,6 +1301,9 @@
 
 EXAM,bn_mp_abs.c
 
+This fairly trivial algorithm first eliminates non--required duplications (line @27,a != b@) and then sets the
+\textbf{sign} flag to \textbf{MP\_ZPOS}.
+
 \subsection{Integer Negation}
 With the mp\_int representation of an integer, calculating the negation is also trivial.  The mp\_neg algorithm will compute
 the negative of an mp\_int input.
@@ -1368,11 +1337,15 @@
 
 EXAM,bn_mp_neg.c
 
+Like mp\_abs() this function avoids non--required duplications (line @21,a != b@) and then sets the sign.  We
+have to make sure that only non--zero values get a \textbf{sign} of \textbf{MP\_NEG}.  If the mp\_int is zero
+then the \textbf{sign} is hard--coded to \textbf{MP\_ZPOS}.
+
 \section{Small Constants}
 \subsection{Setting Small Constants}
 Often a mp\_int must be set to a relatively small value such as $1$ or $2$.  For these cases the mp\_set algorithm is useful.
 
-\begin{figure}[here]
+\newpage\begin{figure}[here]
 \begin{center}
 \begin{tabular}{l}
 \hline Algorithm \textbf{mp\_set}. \\
@@ -1397,11 +1370,14 @@
 
 EXAM,bn_mp_set.c
 
-Line @21,mp_zero@ calls mp\_zero() to clear the mp\_int and reset the sign.  Line @22,MP_MASK@ copies the digit 
-into the least significant location.  Note the usage of a new constant \textbf{MP\_MASK}.  This constant is used to quickly
-reduce an integer modulo $\beta$.  Since $\beta$ is of the form $2^k$ for any suitable $k$ it suffices to perform a binary AND with 
-$MP\_MASK = 2^k - 1$ to perform the reduction.  Finally line @23,a->used@ will set the \textbf{used} member with respect to the 
-digit actually set. This function will always make the integer positive.
+First we zero (line @21,mp_zero@) the mp\_int to make sure that the other members are initialized for a 
+small positive constant.  mp\_zero() ensures that the \textbf{sign} is positive and the \textbf{used} count
+is zero.  Next we set the digit and reduce it modulo $\beta$ (line @22,MP_MASK@).  After this step we have to 
+check if the resulting digit is zero or not.  If it is not then we set the \textbf{used} count to one, otherwise
+to zero.
+
+We can quickly reduce modulo $\beta$ since it is of the form $2^k$ and a quick binary AND operation with 
+$2^k - 1$ will perform the same operation.
 
 One important limitation of this function is that it will only set one digit.  The size of a digit is not fixed, meaning source that uses 
 this function should take that into account.  Only trivially small constants can be set using this function.
@@ -1503,10 +1479,12 @@
 
 EXAM,bn_mp_cmp_mag.c
 
-The two if statements on lines @24,if@ and @28,if@ compare the number of digits in the two inputs.  These two are performed before all of the digits
-are compared since it is a very cheap test to perform and can potentially save considerable time.  The implementation given is also not valid 
-without those two statements.  $b.alloc$ may be smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the 
-array of digits.
+The two if statements (lines @24,if@ and @28,if@) compare the number of digits in the two inputs.  These two are 
+performed before all of the digits are compared since it is a very cheap test to perform and can potentially save 
+considerable time.  The implementation given is also not valid without those two statements.  $b.alloc$ may be 
+smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the array of digits.
+
+
 
 \subsection{Signed Comparisons}
 Comparing with sign considerations is also fairly critical in several routines (\textit{division for example}).  Based on an unsigned magnitude 
@@ -1539,9 +1517,9 @@
 
 EXAM,bn_mp_cmp.c
 
-The two if statements on lines @22,if@ and @26,if@ perform the initial sign comparison.  If the signs are not the equal then which ever
-has the positive sign is larger.   At line @30,if@, the inputs are compared based on magnitudes.  If the signs were both negative then 
-the unsigned comparison is performed in the opposite direction (\textit{line @31,mp_cmp_mag@}).  Otherwise, the signs are assumed to 
+The two if statements (lines @22,if@ and @26,if@) perform the initial sign comparison.  If the signs are not equal then whichever
+has the positive sign is larger.   The inputs are compared (line @30,if@) based on magnitudes.  If the signs were both 
+negative then the unsigned comparison is performed in the opposite direction (line @31,mp_cmp_mag@).  Otherwise, the signs are assumed to 
 be both positive and a forward direction unsigned comparison is performed.
 
 \section*{Exercises}
@@ -1664,19 +1642,21 @@
 
 EXAM,bn_s_mp_add.c
 
-Lines @27,if@ to @35,}@ perform the initial sorting of the inputs and determine the $min$ and $max$ variables.  Note that $x$ is a pointer to a 
-mp\_int assigned to the largest input, in effect it is a local alias.  Lines @37,init@ to @42,}@ ensure that the destination is grown to 
-accomodate the result of the addition. 
+We first sort (lines @27,if@ to @35,}@) the inputs based on magnitude and determine the $min$ and $max$ variables.
+Note that $x$ is a pointer to an mp\_int assigned to the largest input, in effect it is a local alias.  Next we
+grow the destination (lines @37,init@ to @42,}@) to ensure that it can accommodate the result of the addition. 
 
 Similar to the implementation of mp\_copy this function uses the braced code and local aliases coding style.  The three aliases that are on 
 lines @56,tmpa@, @59,tmpb@ and @62,tmpc@ represent the two inputs and destination variables respectively.  These aliases are used to ensure the
 compiler does not have to dereference $a$, $b$ or $c$ (respectively) to access the digits of the respective mp\_int.
 
-The initial carry $u$ is cleared on line @65,u = 0@, note that $u$ is of type mp\_digit which ensures type compatibility within the 
-implementation.  The initial addition loop begins on line @66,for@ and ends on line @75,}@.  Similarly the conditional addition loop
-begins on line @81,for@ and ends on line @90,}@.  The addition is finished with the final carry being stored in $tmpc$ on line @94,tmpc++@.  
-Note the ``++'' operator on the same line.  After line @94,tmpc++@ $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$.  This is useful
-for the next loop on lines @97,for@ to @99,}@ which set any old upper digits to zero.
+The initial carry $u$ will be cleared (line @65,u = 0@), note that $u$ is of type mp\_digit which ensures type 
+compatibility within the implementation.  The initial addition (line @66,for@ to @75,}@) adds digits from
+both inputs until the smallest input runs out of digits.  Similarly the conditional addition loop
+(line @81,for@ to @90,}@) adds the remaining digits from the larger of the two inputs.  The addition is finished 
+with the final carry being stored in $tmpc$ (line @94,tmpc++@).  Note the ``++'' operator within the same expression.
+After line @94,tmpc++@, $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$.  This is useful
+for the next loop (line @97,for@ to @99,}@) which sets any old upper digits to zero.
 
 \subsection{Low Level Subtraction}
 The low level unsigned subtraction algorithm is very similar to the low level unsigned addition algorithm.  The principle difference is that the
@@ -1692,7 +1672,7 @@
 mp\_digit (\textit{this implies $2^{\gamma} > \beta$}).  
 
 For example, the default for LibTomMath is to use a ``unsigned long'' for the mp\_digit ``type'' while $\beta = 2^{28}$.  In ISO C an ``unsigned long''
-data type must be able to represent $0 \le x < 2^{32}$ meaning that in this case $\gamma = 32$.
+data type must be able to represent $0 \le x < 2^{32}$ meaning that in this case $\gamma \ge 32$.
 
 \newpage\begin{figure}[!here]
 \begin{center}
@@ -1759,20 +1739,23 @@
 
 EXAM,bn_s_mp_sub.c
 
-Line @24,min@ and @25,max@ perform the initial hardcoded sorting of the inputs.  In reality the $min$ and $max$ variables are only aliases and are only 
-used to make the source code easier to read.  Again the pointer alias optimization is used within this algorithm.  Lines @42,tmpa@, @43,tmpb@ and @44,tmpc@ initialize the aliases for 
-$a$, $b$ and $c$ respectively.
-
-The first subtraction loop occurs on lines @47,u = 0@ through @61,}@.  The theory behind the subtraction loop is exactly the same as that for
-the addition loop.  As remarked earlier there is an implementation reason for using the ``awkward'' method of extracting the carry 
-(\textit{see line @57, >>@}).  The traditional method for extracting the carry would be to shift by $lg(\beta)$ positions and logically AND 
-the least significant bit.  The AND operation is required because all of the bits above the $\lg(\beta)$'th bit will be set to one after a carry
-occurs from subtraction.  This carry extraction requires two relatively cheap operations to extract the carry.  The other method is to simply 
-shift the most significant bit to the least significant bit thus extracting the carry with a single cheap operation.  This optimization only works on
-twos compliment machines which is a safe assumption to make.
-
-If $a$ has a larger magnitude than $b$ an additional loop (\textit{see lines @64,for@ through @73,}@}) is required to propagate the carry through
-$a$ and copy the result to $c$.  
+Like low level addition we ``sort'' the inputs.  Except in this case the sorting is hardcoded 
+(lines @24,min@ and @25,max@).  In reality the $min$ and $max$ variables are only aliases and are only 
+used to make the source code easier to read.  Again the pointer alias optimization is used 
+within this algorithm.  The aliases $tmpa$, $tmpb$ and $tmpc$ are initialized
+(lines @42,tmpa@, @43,tmpb@ and @44,tmpc@) for $a$, $b$ and $c$ respectively.
+
+The first subtraction loop (lines @47,u = 0@ through @61,}@) subtracts digits from both inputs until the smaller of
+the two inputs has been exhausted.  As remarked earlier there is an implementation reason for using the ``awkward'' 
+method of extracting the carry (line @57, >>@).  The traditional method for extracting the carry would be to shift 
+by $lg(\beta)$ positions and logically AND the least significant bit.  The AND operation is required because all of 
+the bits above the $\lg(\beta)$'th bit will be set to one after a carry occurs from subtraction.  This carry 
+extraction requires two relatively cheap operations to extract the carry.  The other method is to simply shift the 
+most significant bit to the least significant bit thus extracting the carry with a single cheap operation.  This 
+optimization only works on two's complement machines, which is a safe assumption to make.
+
+If $a$ has a larger magnitude than $b$ an additional loop (lines @64,for@ through @73,}@) is required to propagate 
+the carry through $a$ and copy the result to $c$.  
 
 \subsection{High Level Addition}
 Now that both lower level addition and subtraction algorithms have been established an effective high level signed addition algorithm can be
@@ -2098,10 +2081,11 @@
 
 EXAM,bn_mp_lshd.c
 
-The if statement on line @24,if@ ensures that the $b$ variable is greater than zero.  The \textbf{used} count is incremented by $b$ before
-the copy loop begins.  This elminates the need for an additional variable in the for loop.  The variable $top$ on line @42,top@ is an alias
-for the leading digit while $bottom$ on line @45,bottom@ is an alias for the trailing edge.  The aliases form a window of exactly $b$ digits
-over the input.  
+The if statement (line @24,if@) ensures that the $b$ variable is greater than zero since we do not interpret negative
+shift counts properly.  The \textbf{used} count is incremented by $b$ before the copy loop begins.  This eliminates 
+the need for an additional variable in the for loop.  The variable $top$ (line @42,top@) is an alias
+for the leading digit while $bottom$ (line @45,bottom@) is an alias for the trailing edge.  The aliases form a 
+window of exactly $b$ digits over the input.  
 
 \subsection{Division by $x$}
 
@@ -2151,9 +2135,9 @@
 
 EXAM,bn_mp_rshd.c
 
-The only noteworthy element of this routine is the lack of a return type.  
-
--- Will update later to give it a return type...Tom
+The only noteworthy element of this routine is the lack of a return type since it cannot fail.  Like mp\_lshd() we
+form a sliding window except we copy in the other direction.  After the window (line @59,for (;@) we then zero
+the upper digits of the input to make sure the result is correct.
 
 \section{Powers of Two}
 
@@ -2214,7 +2198,15 @@
 
 EXAM,bn_mp_mul_2d.c
 
-Notes to be revised when code is updated. -- Tom
+The shifting is performed in--place which means the first step (line @24,a != c@) is to copy the input to the 
+destination.  We avoid calling mp\_copy() by making sure the mp\_ints are different.  The destination then
+has to be grown (line @31,grow@) to accommodate the result.
+
+If the shift count $b$ is larger than $lg(\beta)$ then a call to mp\_lshd() is used to handle all of the multiples 
+of $lg(\beta)$, leaving only a remaining shift of $lg(\beta) - 1$ or fewer bits.  Inside the actual shift 
+loop (lines @45,if@ to @76,}@) we make use of pre--computed values $shift$ and $mask$.   These are used to
+extract the carry bit(s) to pass into the next iteration of the loop.  The $r$ and $rr$ variables form a 
+chain between consecutive iterations to propagate the carry.  
 
 \subsection{Division by Power of Two}
 
@@ -2263,7 +2255,8 @@
 result of the remainder operation until the end.  This allows $d$ and $a$ to represent the same mp\_int without modifying $a$ before
 the quotient is obtained.
 
-The remainder of the source code is essentially the same as the source code for mp\_mul\_2d.  (-- Fix this paragraph up later, Tom).
+The remainder of the source code is essentially the same as the source code for mp\_mul\_2d.  The only significant difference is
+the direction of the shifts.
 
 \subsection{Remainder of Division by Power of Two}
 
@@ -2306,7 +2299,13 @@
 
 EXAM,bn_mp_mod_2d.c
 
--- Add comments later, Tom.
+We first avoid cases of $b \le 0$ by simply mp\_zero()'ing the destination in such cases.  Next if $2^b$ is larger
+than the input we just mp\_copy() the input and return right away.  After this point we know we must actually
+perform some work to produce the remainder.
+
+Recalling that reducing modulo $2^k$ and a binary ``and'' with $2^k - 1$ are numerically equivalent we can quickly reduce 
+the number.  First we zero any digits above the last digit in $2^b$ (line @41,for@).  Next we reduce the 
+leading digit of both (line @45,&=@) and then mp\_clamp().
 
 \section*{Exercises}
 \begin{tabular}{cl}
@@ -2464,33 +2463,46 @@
 
 EXAM,bn_s_mp_mul_digs.c
 
-Lines @31,if@ to @35,}@ determine if the Comba method can be used first.  The conditions for using the Comba routine are that min$(a.used, b.used) < \delta$ and
-the number of digits of output is less than \textbf{MP\_WARRAY}.  This new constant is used to control 
-the stack usage in the Comba routines.  By default it is set to $\delta$ but can be reduced when memory is at a premium.
-
-Of particular importance is the calculation of the $ix+iy$'th column on lines @64,mp_word@, @65,mp_word@ and @66,mp_word@.  Note how all of the
-variables are cast to the type \textbf{mp\_word}, which is also the type of variable $\hat r$.  That is to ensure that double precision operations 
-are used instead of single precision.  The multiplication on line @65,) * (@ makes use of a specific GCC optimizer behaviour.  On the outset it looks like 
-the compiler will have to use a double precision multiplication to produce the result required.  Such an operation would be horribly slow on most 
-processors and drag this to a crawl.  However, GCC is smart enough to realize that double wide output single precision multipliers can be used.  For 
-example, the instruction ``MUL'' on the x86 processor can multiply two 32-bit values and produce a 64-bit result.  
+First we determine (line @30,if@) if the Comba method can be used since it is faster.  The conditions for 
+using the Comba routine are that min$(a.used, b.used) < \delta$ and the number of digits of output is less than 
+\textbf{MP\_WARRAY}.  This new constant is used to control the stack usage in the Comba routines.  By default it is 
+set to $\delta$ but can be reduced when memory is at a premium.
+
+If we cannot use the Comba method we proceed to set up the baseline routine.  We allocate the destination mp\_int
+$t$ (line @36,init@) to the exact size of the output to avoid further re--allocations.  At this point we now 
+begin the $O(n^2)$ loop.
+
+This implementation of multiplication has the caveat that it can be trimmed to only produce a variable number of
+digits as output.  In each iteration of the outer loop the $pb$ variable is set (line @48,MIN@) to the maximum 
+number of inner loop iterations.  
+
+Inside the inner loop we calculate $\hat r$ as the mp\_word product of the two mp\_digits and the addition of the
+carry from the previous iteration.  A particularly important observation is that most modern optimizing 
+C compilers (GCC for instance) can recognize that a $N \times N \rightarrow 2N$ multiplication is all that 
+is required for the product.  In x86 terms for example, this means using the MUL instruction.
+
+Each digit of the product is stored in turn (line @68,tmpt@) and the carry propagated (line @71,>>@) to the 
+next iteration.
 
 \subsection{Faster Multiplication by the ``Comba'' Method}
 MARK,COMBA
 
-One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be computed and propagated upwards.  This
-makes the nested loop very sequential and hard to unroll and implement in parallel.  The ``Comba'' \cite{COMBA} method is named after little known 
-(\textit{in cryptographic venues}) Paul G. Comba who described a method of implementing fast multipliers that do not require nested 
-carry fixup operations.  As an interesting aside it seems that Paul Barrett describes a similar technique in
-his 1986 paper \cite{BARRETT} written five years before.
-
-At the heart of the Comba technique is once again the long-hand algorithm.  Except in this case a slight twist is placed on how
-the columns of the result are produced.  In the standard long-hand algorithm rows of products are produced then added together to form the 
-final result.  In the baseline algorithm the columns are added together after each iteration to get the result instantaneously.  
-
-In the Comba algorithm the columns of the result are produced entirely independently of each other.  That is at the $O(n^2)$ level a 
-simple multiplication and addition step is performed.  The carries of the columns are propagated after the nested loop to reduce the amount
-of work requiored. Succintly the first step of the algorithm is to compute the product vector $\vec x$ as follows. 
+One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be 
+computed and propagated upwards.  This makes the nested loop very sequential and hard to unroll and implement 
+in parallel.  The ``Comba'' \cite{COMBA} method is named after little known (\textit{in cryptographic venues}) Paul G. 
+Comba who described a method of implementing fast multipliers that do not require nested carry fixup operations.  As an 
+interesting aside it seems that Paul Barrett describes a similar technique in his 1986 paper \cite{BARRETT} written 
+five years before.
+
+At the heart of the Comba technique is once again the long-hand algorithm.  Except in this case a slight 
+twist is placed on how the columns of the result are produced.  In the standard long-hand algorithm rows of products 
+are produced then added together to form the final result.  In the baseline algorithm the columns are added together 
+after each iteration to get the result instantaneously.  
+
+In the Comba algorithm the columns of the result are produced entirely independently of each other.  That is at 
+the $O(n^2)$ level a simple multiplication and addition step is performed.  The carries of the columns are propagated 
+after the nested loop to reduce the amount of work required.  Succinctly the first step of the algorithm is to compute 
+the product vector $\vec x$ as follows. 
 
 \begin{equation}
 \vec x_n = \sum_{i+j = n} a_ib_j, \forall n \in \lbrace 0, 1, 2, \ldots, i + j \rbrace
@@ -2584,38 +2596,32 @@
 \textbf{Input}.   mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
 \textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
 \hline \\
-Place an array of \textbf{MP\_WARRAY} double precision digits named $\hat W$ on the stack. \\
+Place an array of \textbf{MP\_WARRAY} single precision digits named $W$ on the stack. \\
 1.  If $c.alloc < digs$ then grow $c$ to $digs$ digits. (\textit{mp\_grow}) \\
 2.  If step 1 failed return(\textit{MP\_MEM}).\\
 \\
-Zero the temporary array $\hat W$. \\
-3.  for $n$ from $0$ to $digs - 1$ do \\
-\hspace{3mm}3.1  $\hat W_n \leftarrow 0$ \\
-\\
-Compute the columns. \\
-4.  for $ix$ from $0$ to $a.used - 1$ do \\
-\hspace{3mm}4.1  $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\
-\hspace{3mm}4.2  If $pb < 1$ then goto step 5. \\
-\hspace{3mm}4.3  for $iy$ from $0$ to $pb - 1$ do \\
-\hspace{6mm}4.3.1  $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}b_{iy}$ \\
+3.  $pa \leftarrow \mbox{MIN}(digs, a.used + b.used)$ \\
 \\
-Propagate the carries upwards. \\
-5.  $oldused \leftarrow c.used$ \\
-6.  $c.used \leftarrow digs$ \\
-7.  If $digs > 1$ then do \\
-\hspace{3mm}7.1.  for $ix$ from $1$ to $digs - 1$ do \\
-\hspace{6mm}7.1.1  $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix-1} / \beta \rfloor$ \\
-\hspace{6mm}7.1.2  $c_{ix - 1} \leftarrow \hat W_{ix - 1} \mbox{ (mod }\beta\mbox{)}$ \\
-8.  else do \\
-\hspace{3mm}8.1  $ix \leftarrow 0$ \\
-9.  $c_{ix} \leftarrow \hat W_{ix} \mbox{ (mod }\beta\mbox{)}$ \\
+4.  $\_ \hat W \leftarrow 0$ \\
+5.  for $ix$ from 0 to $pa - 1$ do \\
+\hspace{3mm}5.1  $ty \leftarrow \mbox{MIN}(b.used - 1, ix)$ \\
+\hspace{3mm}5.2  $tx \leftarrow ix - ty$ \\
+\hspace{3mm}5.3  $iy \leftarrow \mbox{MIN}(a.used - tx, ty + 1)$ \\
+\hspace{3mm}5.4  for $iz$ from 0 to $iy - 1$ do \\
+\hspace{6mm}5.4.1  $\_ \hat W \leftarrow \_ \hat W + a_{tx+iz}b_{ty-iz}$ \\
+\hspace{3mm}5.5  $W_{ix} \leftarrow \_ \hat W (\mbox{mod }\beta)$\\
+\hspace{3mm}5.6  $\_ \hat W \leftarrow \lfloor \_ \hat W / \beta \rfloor$ \\
+6.  $W_{pa} \leftarrow \_ \hat W (\mbox{mod }\beta)$ \\
 \\
-Zero excess digits. \\
-10.  If $digs < oldused$ then do \\
-\hspace{3mm}10.1  for $n$ from $digs$ to $oldused - 1$ do \\
-\hspace{6mm}10.1.1  $c_n \leftarrow 0$ \\
-11.  Clamp excessive digits of $c$.  (\textit{mp\_clamp}) \\
-12.  Return(\textit{MP\_OKAY}). \\
+7.  $oldused \leftarrow c.used$ \\
+8.  $c.used \leftarrow digs$ \\
+9.  for $ix$ from $0$ to $pa$ do \\
+\hspace{3mm}9.1  $c_{ix} \leftarrow W_{ix}$ \\
+10.  for $ix$ from $pa + 1$ to $oldused - 1$ do \\
+\hspace{3mm}10.1 $c_{ix} \leftarrow 0$ \\
+\\
+11.  Clamp $c$. \\
+12.  Return MP\_OKAY. \\
 \hline
 \end{tabular}
 \end{center}
@@ -2625,15 +2631,24 @@
 \end{figure}
 
 \textbf{Algorithm fast\_s\_mp\_mul\_digs.}
-This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision.  The algorithm
-essentially peforms the same calculation as algorithm s\_mp\_mul\_digs, just much faster.
-
-The array $\hat W$ is meant to be on the stack when the algorithm is used.  The size of the array does not change which is ideal.  Note also that 
-unlike algorithm s\_mp\_mul\_digs no temporary mp\_int is required since the result is calculated directly in $\hat W$.  
-
-The $O(n^2)$ loop on step four is where the Comba method's advantages begin to show through in comparison to the baseline algorithm.  The lack of
-a carry variable or propagation in this loop allows the loop to be performed with only single precision multiplication and additions.  Now that each
-iteration of the inner loop can be performed independent of the others the inner loop can be performed with a high level of parallelism.
+This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision.
+
+The outer loop of this algorithm is more complicated than that of the baseline multiplier.  This is because on the inside of the 
+loop we want to produce one column per pass.  This allows the accumulator $\_ \hat W$ to be placed in CPU registers and
+reduce the memory bandwidth to two \textbf{mp\_digit} reads per iteration.
+
+The $ty$ variable is set to the minimum count of $ix$ or the number of digits in $b$.  That way if $a$ has more digits than
+$b$ this will be limited to $b.used - 1$.  The $tx$ variable is set to the distance past $b.used$ the variable
+$ix$ is.  This is used for the immediately subsequent statement where we find $iy$.  
+
+The variable $iy$ is the minimum digits we can read from either $a$ or $b$ before running out.  Computing one column at a time
+means we have to scan one integer upwards and the other downwards.  $a$ starts at $tx$ and $b$ starts at $ty$.  In each
+pass we are producing the $ix$'th output column and we note that $tx + ty = ix$.  As we move $tx$ upwards we have to 
+move $ty$ downwards so the equality remains valid.  The $iy$ variable is the number of iterations until 
+$tx \ge a.used$ or $ty < 0$ occurs.
+
+After every inner pass we store the lower half of the accumulator into $W_{ix}$ and then propagate the carry of the accumulator
+into the next round by dividing $\_ \hat W$ by $\beta$.
 
 To measure the benefits of the Comba method over the baseline method consider the number of operations that are required.  If the 
 cost in terms of time of a multiply and addition is $p$ and the cost of a carry propagation is $q$ then a baseline multiplication would require 
@@ -2643,20 +2658,20 @@
 
 EXAM,bn_fast_s_mp_mul_digs.c
 
-The memset on line @47,memset@ clears the initial $\hat W$ array to zero in a single step. Like the slower baseline multiplication
-implementation a series of aliases (\textit{lines @67, tmpx@, @70, tmpy@ and @75,_W@}) are used to simplify the inner $O(n^2)$ loop.  
-In this case a new alias $\_\hat W$ has been added which refers to the double precision columns offset by $ix$ in each pass.  
-
-The inner loop on lines @83,for@, @84,mp_word@ and @85,}@ is where the algorithm will spend the majority of the time, which is why it has been 
-stripped to the bones of any extra baggage\footnote{Hence the pointer aliases.}.  On x86 processors the multiplication and additions amount to at the 
-very least five instructions (\textit{two loads, two additions, one multiply}) while on the ARMv4 processors they amount to only three 
-(\textit{one load, one store, one multiply-add}).   For both of the x86 and ARMv4 processors the GCC compiler performs a good job at unrolling the loop 
-and scheduling the instructions so there are very few dependency stalls.
-
-In theory the difference between the baseline and comba algorithms is a mere $O(qn)$ time difference.  However, in the $O(n^2)$ nested loop of the
-baseline method there are dependency stalls as the algorithm must wait for the multiplier to finish before propagating the carry to the next 
-digit.  As a result fewer of the often multiple execution units\footnote{The AMD Athlon has three execution units and the Intel P4 has four.} can
-be simultaneously used.  
+As per the pseudo--code we first calculate $pa$ (line @47,MIN@) as the number of digits to output.  Next we begin the outer loop
+to produce the individual columns of the product.  We use the two aliases $tmpx$ and $tmpy$ (lines @61,tmpx@, @62,tmpy@) to point
+inside the two multiplicands quickly.  
+
+The inner loop (lines @70,for@ to @72,}@) of this implementation is where the tradeoff comes into play.  Originally this comba 
+implementation was ``row--major'' which means it adds to each of the columns in each pass.  After the outer loop it would then fix 
+the carries.  This was very fast except it had an annoying drawback.  You had to read a mp\_word and two mp\_digits and write 
+one mp\_word per iteration.  On processors such as the Athlon XP and P4 this did not matter much since the cache bandwidth 
+is very high and it can keep the ALU fed with data.  It did, however, matter on older and embedded cpus where cache is often 
+slower and also often doesn't exist.  This new algorithm only performs two reads per iteration under the assumption that the 
+compiler has aliased $\_ \hat W$ to a CPU register.
+
+After the inner loop we store the current accumulator in $W$ and shift $\_ \hat W$ (lines @75,W[ix]@, @78,>>@) to forward it as 
+a carry for the next pass.  After the outer loop we use the final carry (line @82,W[ix]@) as the last digit of the product.  
 
 \subsection{Polynomial Basis Multiplication}
 To break the $O(n^2)$ barrier in multiplication requires a completely different look at integer multiplication.  In the following algorithms
@@ -2976,13 +2991,26 @@
 
 EXAM,bn_mp_toom_mul.c
 
--- Comments to be added during editing phase.
+The first obvious thing to note is that this algorithm is complicated.  The complexity is worth it if you are multiplying very 
+large numbers.  For example, a 10,000 digit multiplication takes approximately 99,282,205 fewer single precision multiplications with
+Toom--Cook than a Comba or baseline approach (this is a savings of more than 99$\%$).  For most ``crypto'' sized numbers this
+algorithm is not practical as Karatsuba has a much lower cutoff point.
+
+First we split $a$ and $b$ into three roughly equal portions.  This has been accomplished (lines @40,mod@ to @69,rshd@) with 
+combinations of mp\_rshd() and mp\_mod\_2d() function calls.  At this point $a = a2 \cdot \beta^2 + a1 \cdot \beta + a0$ and similarly
+for $b$.  
+
+Next we compute the five points $w0, w1, w2, w3$ and $w4$.  Recall that $w0$ and $w4$ can be computed directly from the portions so
+we get those out of the way first (lines @72,mul@ and @77,mul@).  Next we compute $w1, w2$ and $w3$ using Horner's method.
+
+After this point we solve for the actual values of $w1, w2$ and $w3$ by reducing the $5 \times 5$ system which is relatively
+straightforward.  
 
 \subsection{Signed Multiplication}
 Now that algorithms to handle multiplications of every useful dimensions have been developed, a rather simple finishing touch is required.  So far all
 of the multiplication algorithms have been unsigned multiplications which leaves only a signed multiplication algorithm to be established.  
 
-\newpage\begin{figure}[!here]
+\begin{figure}[!here]
 \begin{small}
 \begin{center}
 \begin{tabular}{l}
@@ -3065,7 +3093,7 @@
 The baseline squaring algorithm is meant to be a catch-all squaring algorithm.  It will handle any of the input sizes that the faster routines
 will not handle.  
 
-\newpage\begin{figure}[!here]
+\begin{figure}[!here]
 \begin{small}
 \begin{center}
 \begin{tabular}{l}
@@ -3121,9 +3149,14 @@
 
 EXAM,bn_s_mp_sqr.c
 
-Inside the outer loop (\textit{see line @32,for@}) the square term is calculated on line @35,r =@.  Line @42,>>@ extracts the carry from the square
-term.  Aliases for $a_{ix}$ and $t_{ix+iy}$ are initialized on lines @45,tmpx@ and @48,tmpt@ respectively.  The doubling is performed using two
-additions (\textit{see line @57,r + r@}) since it is usually faster than shifting,if not at least as fast.  
+Inside the outer loop (line @32,for@) the square term is calculated on line @35,r =@.  The carry (line @42,>>@) has been
+extracted from the mp\_word accumulator using a right shift.  Aliases for $a_{ix}$ and $t_{ix+iy}$ are initialized 
+(lines @45,tmpx@ and @48,tmpt@) to simplify the inner loop.  The doubling is performed using two
+additions (line @57,r + r@) since it is usually faster than shifting, if not at least as fast.  
+
+The important observation is that the inner loop does not begin at $iy = 0$ like for multiplication.  As such the inner loops
+get progressively shorter as the algorithm proceeds.  This is what leads to the savings compared to using a multiplication to
+square a number. 
 
 \subsection{Faster Squaring by the ``Comba'' Method}
 A major drawback to the baseline method is the requirement for single precision shifting inside the $O(n^2)$ nested loop.  Squaring has an additional
@@ -3135,9 +3168,9 @@
 that $2a + 2b + 2c = 2(a + b + c)$.  That is the sum of all of the double products is equal to double the sum of all the products.  For example,
 $ab + ba + ac + ca = 2ab + 2ac = 2(ab + ac)$.  
 
-However, we cannot simply double all of the columns, since the squares appear only once per row.  The most practical solution is to have two mp\_word
-arrays.  One array will hold the squares and the other array will hold the double products.  With both arrays the doubling and carry propagation can be 
-moved to a $O(n)$ work level outside the $O(n^2)$ level.  
+However, we cannot simply double all of the columns, since the squares appear only once per row.  The most practical solution is to have two 
+mp\_word arrays.  One array will hold the squares and the other array will hold the double products.  With both arrays the doubling and 
+carry propagation can be moved to a $O(n)$ work level outside the $O(n^2)$ level.  In this case, we have an even simpler solution in mind.
 
 \newpage\begin{figure}[!here]
 \begin{small}
@@ -3147,34 +3180,34 @@
 \textbf{Input}.   mp\_int $a$ \\
 \textbf{Output}.  $b \leftarrow a^2$ \\
 \hline \\
-Place two arrays of \textbf{MP\_WARRAY} mp\_words named $\hat W$ and $\hat {X}$ on the stack. \\
+Place an array of \textbf{MP\_WARRAY} mp\_digits named $W$ on the stack. \\
 1.  If $b.alloc < 2a.used + 1$ then grow $b$ to $2a.used + 1$ digits.  (\textit{mp\_grow}). \\
 2.  If step 1 failed return(\textit{MP\_MEM}). \\
-3.  for $ix$ from $0$ to $2a.used + 1$ do \\
-\hspace{3mm}3.1  $\hat W_{ix} \leftarrow 0$ \\
-\hspace{3mm}3.2  $\hat {X}_{ix} \leftarrow 0$ \\
-4.  for $ix$ from $0$ to $a.used - 1$ do \\
-\hspace{3mm}Compute the square.\\
-\hspace{3mm}4.1  $\hat {X}_{ix+ix} \leftarrow \left ( a_{ix} \right )^2$ \\
-\\
-\hspace{3mm}Compute the double products.\\
-\hspace{3mm}4.2  for $iy$ from $ix + 1$ to $a.used - 1$ do \\
-\hspace{6mm}4.2.1  $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}a_{iy}$ \\
-5.  $oldused \leftarrow b.used$ \\
-6.  $b.used \leftarrow 2a.used + 1$ \\
 \\
-Double the products and propagate the carries simultaneously. \\
-7.  $\hat W_0 \leftarrow 2 \hat W_0 + \hat {X}_0$ \\
-8.  for $ix$ from $1$ to $2a.used$ do \\
-\hspace{3mm}8.1 $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ \\
-\hspace{3mm}8.2 $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix - 1} / \beta \rfloor$ \\
-\hspace{3mm}8.3 $b_{ix-1} \leftarrow W_{ix-1} \mbox{ (mod }\beta\mbox{)}$ \\
-9.  $b_{2a.used} \leftarrow \hat W_{2a.used} \mbox{ (mod }\beta\mbox{)}$ \\
-10.  if $2a.used + 1 < oldused$ then do \\
-\hspace{3mm}10.1  for $ix$ from $2a.used + 1$ to $oldused$ do \\
-\hspace{6mm}10.1.1  $b_{ix} \leftarrow 0$ \\
-11.  Clamp excess digits from $b$.  (\textit{mp\_clamp}) \\
-12.  Return(\textit{MP\_OKAY}). \\ 
+3.  $pa \leftarrow 2 \cdot a.used$ \\
+4.  $\hat W1 \leftarrow 0$ \\
+5.  for $ix$ from $0$ to $pa - 1$ do \\
+\hspace{3mm}5.1  $\_ \hat W \leftarrow 0$ \\
+\hspace{3mm}5.2  $ty \leftarrow \mbox{MIN}(a.used - 1, ix)$ \\
+\hspace{3mm}5.3  $tx \leftarrow ix - ty$ \\
+\hspace{3mm}5.4  $iy \leftarrow \mbox{MIN}(a.used - tx, ty + 1)$ \\
+\hspace{3mm}5.5  $iy \leftarrow \mbox{MIN}(iy, \lfloor \left (ty - tx + 1 \right )/2 \rfloor)$ \\
+\hspace{3mm}5.6  for $iz$ from $0$ to $iy - 1$ do \\
+\hspace{6mm}5.6.1  $\_ \hat W \leftarrow \_ \hat W + a_{tx + iz}a_{ty - iz}$ \\
+\hspace{3mm}5.7  $\_ \hat W \leftarrow 2 \cdot \_ \hat W  + \hat W1$ \\
+\hspace{3mm}5.8  if $ix$ is even then \\
+\hspace{6mm}5.8.1  $\_ \hat W \leftarrow \_ \hat W + \left ( a_{\lfloor ix/2 \rfloor}\right )^2$ \\
+\hspace{3mm}5.9  $W_{ix} \leftarrow \_ \hat W (\mbox{mod }\beta)$ \\
+\hspace{3mm}5.10  $\hat W1 \leftarrow \lfloor \_ \hat W / \beta \rfloor$ \\
+\\
+6.  $oldused \leftarrow b.used$ \\
+7.  $b.used \leftarrow 2 \cdot a.used$ \\
+8.  for $ix$ from $0$ to $pa - 1$ do \\
+\hspace{3mm}8.1  $b_{ix} \leftarrow W_{ix}$ \\
+9.  for $ix$ from $pa$ to $oldused - 1$ do \\
+\hspace{3mm}9.1  $b_{ix} \leftarrow 0$ \\
+10.  Clamp excess digits from $b$.  (\textit{mp\_clamp}) \\
+11.  Return(\textit{MP\_OKAY}). \\ 
 \hline
 \end{tabular}
 \end{center}
@@ -3183,24 +3216,24 @@
 \end{figure}
 
 \textbf{Algorithm fast\_s\_mp\_sqr.}
-This algorithm computes the square of an input using the Comba technique.  It is designed to be a replacement for algorithm s\_mp\_sqr when
-the number of input digits is less than \textbf{MP\_WARRAY} and less than $\delta \over 2$.  
-
-This routine requires two arrays of mp\_words to be placed on the stack.  The first array $\hat W$ will hold the double products and the second
-array $\hat X$ will hold the squares.  Though only at most $MP\_WARRAY \over 2$ words of $\hat X$ are used, it has proven faster on most 
-processors to simply make it a full size array.
-
-The loop on step 3 will zero the two arrays to prepare them for the squaring step.  Step 4.1 computes the squares of the product.  Note how 
-it simply assigns the value into the $\hat X$ array.  The nested loop on step 4.2 computes the doubles of the products.  This loop
-computes the sum of the products for each column.  They are not doubled until later.
-
-After the squaring loop, the products stored in $\hat W$ musted be doubled and the carries propagated forwards.  It makes sense to do both
-operations at the same time.  The expression $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ computes the sum of the double product and the
-squares in place.  
+This algorithm computes the square of an input using the Comba technique.  It is designed to be a replacement for algorithm 
+s\_mp\_sqr when the number of input digits is less than \textbf{MP\_WARRAY} and less than $\delta \over 2$.  
+This algorithm is very similar to the Comba multiplier except with a few key differences we shall make note of.
+
+First, we have an accumulator and carry variables $\_ \hat W$ and $\hat W1$ respectively.  This is because the inner loop
+products are to be doubled.  If we had added the previous carry in we would be doubling too much.  Next we perform an
+additional MIN condition on $iy$ (step 5.5) to prevent overlapping digits.  For example, $a_3 \cdot a_5$ is equal to
+$a_5 \cdot a_3$.  In the multiplication case both products would be computed, since $5 < a.used$ and $3 \ge 0$ are maintained.  Here we double the sum
+of the products just outside the inner loop, so we have to avoid computing both.  This is also a good thing since we perform
+fewer multiplications and the routine ends up being faster.
+
+Finally the last difference is the addition of the ``square'' term outside the inner loop (step 5.8).  We add in the square
+only to even outputs and it is the square of the term at the $\lfloor ix / 2 \rfloor$ position.
 
 EXAM,bn_fast_s_mp_sqr.c
 
--- Write something deep and insightful later, Tom.
+This implementation is essentially a copy of Comba multiplication with the appropriate changes added to make it faster for 
+the special case of squaring.  
 
 \subsection{Polynomial Basis Squaring}
 The same algorithm that performs optimal polynomial basis multiplication can be used to perform polynomial basis squaring.  The minor exception
@@ -3312,14 +3345,13 @@
 is exactly at the point where Comba squaring can no longer be used (\textit{128 digits}).  On slower processors such as the Intel P4
 it is actually below the Comba limit (\textit{at 110 digits}).
 
-This routine uses the same error trap coding style as mp\_karatsuba\_sqr.  As the temporary variables are initialized errors are redirected to
-the error trap higher up.  If the algorithm completes without error the error code is set to \textbf{MP\_OKAY} and mp\_clears are executed normally.
-
-\textit{Last paragraph sucks.  re-write! -- Tom}
+This routine uses the same error trap coding style as mp\_karatsuba\_sqr.  As the temporary variables are initialized errors are 
+redirected to the error trap higher up.  If the algorithm completes without error the error code is set to \textbf{MP\_OKAY} and 
+mp\_clears are executed normally.
 
 \subsection{Toom-Cook Squaring}
 The Toom-Cook squaring algorithm mp\_toom\_sqr is heavily based on the algorithm mp\_toom\_mul with the exception that squarings are used
-instead of multiplication to find the five relations..  The reader is encouraged to read the description of the latter algorithm and try to 
+instead of multiplication to find the five relations.  The reader is encouraged to read the description of the latter algorithm and try to 
 derive their own Toom-Cook squaring algorithm.  
 
 \subsection{High Level Squaring}
@@ -3362,12 +3394,9 @@
 $\left [ 3 \right ] $ & Devise an efficient algorithm for selection of the radix point to handle inputs \\
                       & that have different number of digits in Karatsuba multiplication. \\
                       & \\
-$\left [ 3 \right ] $ & In ~SQUARE~ the fact that every column of a squaring is made up \\
+$\left [ 2 \right ] $ & In ~SQUARE~ the fact that every column of a squaring is made up \\
                       & of double products and at most one square is stated.  Prove this statement. \\
                       & \\                      
-$\left [ 2 \right ] $ & In the Comba squaring algorithm half of the $\hat X$ variables are not used. \\
-                      & Revise algorithm fast\_s\_mp\_sqr to shrink the $\hat X$ array. \\
-                      & \\
 $\left [ 3 \right ] $ & Prove the equation for Karatsuba squaring. \\
                       & \\
 $\left [ 1 \right ] $ & Prove that Karatsuba squaring requires $O \left (n^{lg(3)} \right )$ time. \\
@@ -3375,6 +3404,14 @@
 $\left [ 2 \right ] $ & Determine the minimal ratio between addition and multiplication clock cycles \\
                       & required for equation $6.7$ to be true.  \\
                       & \\
+$\left [ 3 \right ] $ & Implement a threaded version of Comba multiplication (and squaring) where you \\
+                      & compute subsets of the columns in each thread.  Determine a cutoff point where \\
+                      & it is effective and add the logic to mp\_mul() and mp\_sqr(). \\
+                      &\\
+$\left [ 4 \right ] $ & Same as the previous but also modify the Karatsuba and Toom-Cook.  You must \\
+                      & increase the throughput of mp\_exptmod() for random odd moduli in the range \\
+                      & $512 \ldots 4096$ bits significantly ($> 2x$) to complete this challenge. \\
+                      & \\
 \end{tabular}
 
 \chapter{Modular Reduction}
@@ -3394,7 +3431,7 @@
 Modular reductions are normally used to create either finite groups, rings or fields.  The most common usage for performance driven modular reductions 
 is in modular exponentiation algorithms.  That is to compute $d = a^b \mbox{ (mod }c\mbox{)}$ as fast as possible.  This operation is used in the 
 RSA and Diffie-Hellman public key algorithms, for example.  Modular multiplication and squaring also appears as a fundamental operation in 
-Elliptic Curve cryptographic algorithms.  As will be discussed in the subsequent chapter there exist fast algorithms for computing modular 
+elliptic curve cryptographic algorithms.  As will be discussed in the subsequent chapter there exist fast algorithms for computing modular 
 exponentiations without having to perform (\textit{in this example}) $b - 1$ multiplications.  These algorithms will produce partial results in the 
 range $0 \le x < c^2$ which can be taken advantage of to create several efficient algorithms.   They have also been used to create redundancy check 
 algorithms known as CRCs, error correction codes such as Reed-Solomon and solve a variety of number theoeretic problems.  
@@ -3610,7 +3647,7 @@
 In order to use algorithm mp\_reduce the value of $\mu$ must be calculated in advance.  Ideally this value should be computed once and stored for
 future use so that the Barrett algorithm can be used without delay.  
 
-\begin{figure}[!here]
+\newpage\begin{figure}[!here]
 \begin{small}
 \begin{center}
 \begin{tabular}{l}
@@ -5818,6 +5855,8 @@
 defined.  The Legendre function computes whether or not an integer $a$ is a quadratic residue modulo an odd prime $p$.  Numerically it is
 equivalent to equation \ref{eqn:legendre}.
 
+\textit{-- Tom, don't be an ass, cite your source here...!}
+
 \begin{equation}
 a^{(p-1)/2} \equiv \begin{array}{rl}
                               -1 &  \mbox{if }a\mbox{ is a quadratic non-residue.} \\
--- a/tommath.tex	Sun Dec 19 11:33:56 2004 +0000
+++ b/tommath.tex	Fri May 06 08:59:30 2005 +0000
@@ -49,7 +49,7 @@
 \begin{document}
 \frontmatter
 \pagestyle{empty}
-\title{Implementing Multiple Precision Arithmetic \\ ~ \\ Draft Edition }
+\title{Multi--Precision Math}
 \author{\mbox{
 %\begin{small}
 \begin{tabular}{c}
@@ -66,7 +66,7 @@
 }
 }
 \maketitle
-This text has been placed in the public domain.  This text corresponds to the v0.30 release of the 
+This text has been placed in the public domain.  This text corresponds to the v0.35 release of the 
 LibTomMath project.
 
 \begin{alltt}
@@ -85,66 +85,32 @@
 
 \tableofcontents
 \listoffigures
-\chapter*{Prefaces to the Draft Edition}
-I started this text in April 2003 to complement my LibTomMath library.  That is, explain how to implement the functions
-contained in LibTomMath.  The goal is to have a textbook that any Computer Science student can use when implementing their
-own multiple precision arithmetic.  The plan I wanted to follow was flesh out all the
-ideas and concepts I had floating around in my head and then work on it afterwards refining a little bit at a time.  Chance
-would have it that I ended up with my summer off from Algonquin College and I was given four months solid to work on the
-text.  
-
-Choosing to not waste any time I dove right into the project even before my spring semester was finished.  I wrote a bit
-off and on at first.  The moment my exams were finished I jumped into long 12 to 16 hour days.  The result after only
-a couple of months was a ten chapter, three hundred page draft that I quickly had distributed to anyone who wanted
-to read it.  I had Jean-Luc Cooke print copies for me and I brought them to Crypto'03 in Santa Barbara.  So far I have
-managed to grab a certain level of attention having people from around the world ask me for copies of the text was certain
-rewarding.
-
-Now we are past December 2003.  By this time I had pictured that I would have at least finished my second draft of the text.  
-Currently I am far off from this goal.  I've done partial re-writes of chapters one, two and three but they are not even
-finished yet.  I haven't given up on the project, only had some setbacks.  First O'Reilly declined to publish the text then
-Addison-Wesley and Greg is tried another which I don't know the name of.  However, at this point I want to focus my energy
-onto finishing the book not securing a contract.
-
-So why am I writing this text?  It seems like a lot of work right?  Most certainly it is a lot of work writing a textbook.  
-Even the simplest introductory material has to be lined with references and figures.  A lot of the text has to be re-written
-from point form to prose form to ensure an easier read.  Why am I doing all this work for free then?  Simple. My philosophy
-is quite simply ``Open Source.  Open Academia.  Open Minds'' which means that to achieve a goal of open minds, that is,
-people willing to accept new ideas and explore the unknown you have to make available material they can access freely 
-without hinderance.  
-
-I've been writing free software since I was about sixteen but only recently have I hit upon software that people have come
-to depend upon.  I started LibTomCrypt in December 2001 and now several major companies use it as integral portions of their
-software.  Several educational institutions use it as a matter of course and many freelance developers use it as
-part of their projects.  To further my contributions I started the LibTomMath project in December 2002 aimed at providing
-multiple precision arithmetic routines that students could learn from.  That is write routines that are not only easy
-to understand and follow but provide quite impressive performance considering they are all in standard portable ISO C.  
-
-The second leg of my philosophy is ``Open Academia'' which is where this textbook comes in.  In the end, when all is
-said and done the text will be useable by educational institutions as a reference on multiple precision arithmetic.  
-
-At this time I feel I should share a little information about myself.  The most common question I was asked at 
-Crypto'03, perhaps just out of professional courtesy, was which school I either taught at or attended.  The unfortunate
-truth is that I neither teach at or attend a school of academic reputation.  I'm currently at Algonquin College which 
-is what I'd like to call ``somewhat academic but mostly vocational'' college.  In otherwords, job training.
-
-I'm a 21 year old computer science student mostly self-taught in the areas I am aware of (which includes a half-dozen
-computer science fields, a few fields of mathematics and some English).  I look forward to teaching someday but I am
-still far off from that goal.  
-
-Now it would be improper for me to not introduce the rest of the texts co-authors.  While they are only contributing 
-corrections and editorial feedback their support has been tremendously helpful in presenting the concepts laid out
-in the text so far.  Greg has always been there for me.  He has tracked my LibTom projects since their inception and even
-sent cheques to help pay tuition from time to time.  His background has provided a wonderful source to bounce ideas off
-of and improve the quality of my writing.  Mads is another fellow who has just ``been there''.  I don't even recall what
-his interest in the LibTom projects is but I'm definitely glad he has been around.  His ability to catch logical errors
-in my written English have saved me on several occasions to say the least.
-
-What to expect next?  Well this is still a rough draft.  I've only had the chance to update a few chapters.  However, I've
-been getting the feeling that people are starting to use my text and I owe them some updated material.  My current tenative
-plan is to edit one chapter every two weeks starting January 4th.  It seems insane but my lower course load at college
-should provide ample time.  By Crypto'04 I plan to have a 2nd draft of the text polished and ready to hand out to as many
-people who will take it.
+\chapter*{Prefaces}
+When I tell people about my LibTom projects and that I release them as public domain they are often puzzled.  
+They ask why I did it and especially why I continue to work on them for free.  The best I can explain it is ``Because I can.''  
+Which seems odd and perhaps too terse for adult conversation. I often qualify it with ``I am able, I am willing.'' which 
+perhaps explains it better.  I am the first to admit there is not anything that special with what I have done.  Perhaps
+others can see that too and then we would have a society to be proud of.  My LibTom projects are what I am doing to give 
+back to society in the form of tools and knowledge that can help others in their endeavours.
+
+I started writing this book because it was the most logical task to further my goal of open academia.  The LibTomMath source
+code itself was written to be easy to follow and learn from.  There are times, however, where pure C source code does not
+explain the algorithms properly.  Hence this book.  The book literally starts with the foundation of the library and works
+itself outwards to the more complicated algorithms.  The use of both pseudo--code and verbatim source code provides a duality
+of ``theory'' and ``practice'' that the computer science students of the world shall appreciate.  I never deviate too far
+from relatively straightforward algebra and I hope that this book can be a valuable learning asset.
+
+This book and indeed much of the LibTom projects would not exist in their current form if it was not for a plethora
+of kind people donating their time, resources and kind words to help support my work.  Writing a text of significant
+length (along with the source code) is a tiresome and lengthy process.  Currently the LibTom project is four years old,
+comprises literally thousands of users and over 100,000 lines of source code, TeX and other material.  People like Mads and Greg 
+were there at the beginning to encourage me to work well.  It is amazing how timely validation from others can boost morale to 
+continue the project. Definitely my parents were there for me by providing room and board during the many months of work in 2003.  
+
+To my many friends whom I have met through the years I thank you for the good times and the words of encouragement.  I hope I
+honour your kind gestures with this project.
+
+Open Source.  Open Academia.  Open Minds.
 
 \begin{flushright} Tom St Denis \end{flushright}
 
@@ -1045,7 +1011,7 @@
 \end{alltt}
 \end{small}
 
-A quick optimization is to first determine if a memory re-allocation is required at all.  The if statement (line 23) checks
+A quick optimization is to first determine if a memory re-allocation is required at all.  The if statement (line 24) checks
 if the \textbf{alloc} member of the mp\_int is smaller than the requested digit count.  If the count is not larger than \textbf{alloc}
 the function skips the re-allocation part thus saving time.
 
@@ -1590,14 +1556,20 @@
 \begin{alltt}
 016   
 017   /* set to zero */
-018   void
-019   mp_zero (mp_int * a)
-020   \{
-021     a->sign = MP_ZPOS;
-022     a->used = 0;
-023     memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
-024   \}
-025   #endif
+018   void mp_zero (mp_int * a)
+019   \{
+020     int       n;
+021     mp_digit *tmp;
+022   
+023     a->sign = MP_ZPOS;
+024     a->used = 0;
+025   
+026     tmp = a->dp;
+027     for (n = 0; n < a->alloc; n++) \{
+028        *tmp++ = 0;
+029     \}
+030   \}
+031   #endif
 \end{alltt}
 \end{small}
 
@@ -1609,7 +1581,7 @@
 With the mp\_int representation of an integer, calculating the absolute value is trivial.  The mp\_abs algorithm will compute
 the absolute value of an mp\_int.
 
-\newpage\begin{figure}[here]
+\begin{figure}[here]
 \begin{center}
 \begin{tabular}{l}
 \hline Algorithm \textbf{mp\_abs}. \\
@@ -1662,6 +1634,9 @@
 \end{alltt}
 \end{small}
 
+This fairly trivial algorithm first eliminates non--required duplications (line 27) and then sets the
+\textbf{sign} flag to \textbf{MP\_ZPOS}.
+
 \subsection{Integer Negation}
 With the mp\_int representation of an integer, calculating the negation is also trivial.  The mp\_neg algorithm will compute
 the negative of an mp\_int input.
@@ -1702,23 +1677,33 @@
 018   int mp_neg (mp_int * a, mp_int * b)
 019   \{
 020     int     res;
-021     if ((res = mp_copy (a, b)) != MP_OKAY) \{
-022       return res;
-023     \}
-024     if (mp_iszero(b) != MP_YES) \{
-025        b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
-026     \}
-027     return MP_OKAY;
-028   \}
-029   #endif
+021     if (a != b) \{
+022        if ((res = mp_copy (a, b)) != MP_OKAY) \{
+023           return res;
+024        \}
+025     \}
+026   
+027     if (mp_iszero(b) != MP_YES) \{
+028        b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+029     \} else \{
+030        b->sign = MP_ZPOS;
+031     \}
+032   
+033     return MP_OKAY;
+034   \}
+035   #endif
 \end{alltt}
 \end{small}
 
+Like mp\_abs() this function avoids non--required duplications (line 21) and then sets the sign.  We
+have to make sure that only non--zero values get a \textbf{sign} of \textbf{MP\_NEG}.  If the mp\_int is zero
+then the \textbf{sign} is hard--coded to \textbf{MP\_ZPOS}.
+
 \section{Small Constants}
 \subsection{Setting Small Constants}
 Often a mp\_int must be set to a relatively small value such as $1$ or $2$.  For these cases the mp\_set algorithm is useful.
 
-\begin{figure}[here]
+\newpage\begin{figure}[here]
 \begin{center}
 \begin{tabular}{l}
 \hline Algorithm \textbf{mp\_set}. \\
@@ -1757,11 +1742,14 @@
 \end{alltt}
 \end{small}
 
-Line 20 calls mp\_zero() to clear the mp\_int and reset the sign.  Line 21 copies the digit 
-into the least significant location.  Note the usage of a new constant \textbf{MP\_MASK}.  This constant is used to quickly
-reduce an integer modulo $\beta$.  Since $\beta$ is of the form $2^k$ for any suitable $k$ it suffices to perform a binary AND with 
-$MP\_MASK = 2^k - 1$ to perform the reduction.  Finally line 22 will set the \textbf{used} member with respect to the 
-digit actually set. This function will always make the integer positive.
+First we zero (line 20) the mp\_int to make sure that the other members are initialized for a 
+small positive constant.  mp\_zero() ensures that the \textbf{sign} is positive and the \textbf{used} count
+is zero.  Next we set the digit and reduce it modulo $\beta$ (line 21).  After this step we have to 
+check if the resulting digit is zero or not.  If it is not then we set the \textbf{used} count to one, otherwise
+to zero.
+
+We can quickly reduce modulo $\beta$ since it is of the form $2^k$ and a quick binary AND operation with 
+$2^k - 1$ will perform the same operation.
 
 One important limitation of this function is that it will only set one digit.  The size of a digit is not fixed, meaning source that uses 
 this function should take that into account.  Only trivially small constants can be set using this function.
@@ -1936,10 +1924,12 @@
 \end{alltt}
 \end{small}
 
-The two if statements on lines 24 and 28 compare the number of digits in the two inputs.  These two are performed before all of the digits
-are compared since it is a very cheap test to perform and can potentially save considerable time.  The implementation given is also not valid 
-without those two statements.  $b.alloc$ may be smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the 
-array of digits.
+The two if statements (lines 24 and 28) compare the number of digits in the two inputs.  These two are 
+performed before all of the digits are compared since it is a very cheap test to perform and can potentially save 
+considerable time.  The implementation given is also not valid without those two statements.  $b.alloc$ may be 
+smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the array of digits.
+
+
 
 \subsection{Signed Comparisons}
 Comparing with sign considerations is also fairly critical in several routines (\textit{division for example}).  Based on an unsigned magnitude 
@@ -2000,9 +1990,9 @@
 \end{alltt}
 \end{small}
 
-The two if statements on lines 22 and 23 perform the initial sign comparison.  If the signs are not the equal then which ever
-has the positive sign is larger.   At line 31, the inputs are compared based on magnitudes.  If the signs were both negative then 
-the unsigned comparison is performed in the opposite direction (\textit{line 33}).  Otherwise, the signs are assumed to 
+The two if statements (lines 22 and 23) perform the initial sign comparison.  If the signs are not equal then whichever
+has the positive sign is larger.   The inputs are compared (line 31) based on magnitudes.  If the signs were both 
+negative then the unsigned comparison is performed in the opposite direction (line 33).  Otherwise, the signs are assumed to 
 be both positive and a forward direction unsigned comparison is performed.
 
 \section*{Exercises}
@@ -2218,19 +2208,21 @@
 \end{alltt}
 \end{small}
 
-Lines 27 to 35 perform the initial sorting of the inputs and determine the $min$ and $max$ variables.  Note that $x$ is a pointer to a 
-mp\_int assigned to the largest input, in effect it is a local alias.  Lines 37 to 42 ensure that the destination is grown to 
-accomodate the result of the addition. 
+We first sort (lines 27 to 35) the inputs based on magnitude and determine the $min$ and $max$ variables.
+Note that $x$ is a pointer to an mp\_int assigned to the largest input, in effect it is a local alias.  Next we
+grow the destination (lines 37 to 42) to ensure that it can accommodate the result of the addition. 
 
 Similar to the implementation of mp\_copy this function uses the braced code and local aliases coding style.  The three aliases that are on 
 lines 55, 58 and 61 represent the two inputs and destination variables respectively.  These aliases are used to ensure the
 compiler does not have to dereference $a$, $b$ or $c$ (respectively) to access the digits of the respective mp\_int.
 
-The initial carry $u$ is cleared on line 64, note that $u$ is of type mp\_digit which ensures type compatibility within the 
-implementation.  The initial addition loop begins on line 65 and ends on line 74.  Similarly the conditional addition loop
-begins on line 80 and ends on line 90.  The addition is finished with the final carry being stored in $tmpc$ on line 93.  
-Note the ``++'' operator on the same line.  After line 93 $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$.  This is useful
-for the next loop on lines 96 to 99 which set any old upper digits to zero.
+The initial carry $u$ will be cleared (line 64), note that $u$ is of type mp\_digit which ensures type 
+compatibility within the implementation.  The initial addition (lines 65 to 74) adds digits from
+both inputs until the smaller input runs out of digits.  Similarly the conditional addition loop
+(lines 80 to 90) adds the remaining digits from the larger of the two inputs.  The addition is finished 
+with the final carry being stored in $tmpc$ (line 93).  Note the ``++'' operator within the same expression.
+After line 93, $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$.  This is useful
+for the next loop (lines 96 to 99) which sets any old upper digits to zero.
 
 \subsection{Low Level Subtraction}
 The low level unsigned subtraction algorithm is very similar to the low level unsigned addition algorithm.  The principle difference is that the
@@ -2245,7 +2237,7 @@
 mp\_digit (\textit{this implies $2^{\gamma} > \beta$}).  
 
 For example, the default for LibTomMath is to use a ``unsigned long'' for the mp\_digit ``type'' while $\beta = 2^{28}$.  In ISO C an ``unsigned long''
-data type must be able to represent $0 \le x < 2^{32}$ meaning that in this case $\gamma = 32$.
+data type must be able to represent $0 \le x < 2^{32}$ meaning that in this case $\gamma \ge 32$.
 
 \newpage\begin{figure}[!here]
 \begin{center}
@@ -2387,20 +2379,23 @@
 \end{alltt}
 \end{small}
 
-Line 24 and 25 perform the initial hardcoded sorting of the inputs.  In reality the $min$ and $max$ variables are only aliases and are only 
-used to make the source code easier to read.  Again the pointer alias optimization is used within this algorithm.  Lines 41, 42 and 43 initialize the aliases for 
-$a$, $b$ and $c$ respectively.
-
-The first subtraction loop occurs on lines 46 through 60.  The theory behind the subtraction loop is exactly the same as that for
-the addition loop.  As remarked earlier there is an implementation reason for using the ``awkward'' method of extracting the carry 
-(\textit{see line 56}).  The traditional method for extracting the carry would be to shift by $lg(\beta)$ positions and logically AND 
-the least significant bit.  The AND operation is required because all of the bits above the $\lg(\beta)$'th bit will be set to one after a carry
-occurs from subtraction.  This carry extraction requires two relatively cheap operations to extract the carry.  The other method is to simply 
-shift the most significant bit to the least significant bit thus extracting the carry with a single cheap operation.  This optimization only works on
-twos compliment machines which is a safe assumption to make.
-
-If $a$ has a larger magnitude than $b$ an additional loop (\textit{see lines 63 through 72}) is required to propagate the carry through
-$a$ and copy the result to $c$.  
+Like low level addition we ``sort'' the inputs.  Except in this case the sorting is hardcoded 
+(lines 24 and 25).  In reality the $min$ and $max$ variables are only aliases and are only 
+used to make the source code easier to read.  Again the pointer alias optimization is used 
+within this algorithm.  The aliases $tmpa$, $tmpb$ and $tmpc$ are initialized
+(lines 41, 42 and 43) for $a$, $b$ and $c$ respectively.
+
+The first subtraction loop (lines 46 through 60) subtracts digits from both inputs until the smaller of
+the two inputs has been exhausted.  As remarked earlier there is an implementation reason for using the ``awkward'' 
+method of extracting the carry (line 56).  The traditional method for extracting the carry would be to shift 
+by $lg(\beta)$ positions and logically AND the least significant bit.  The AND operation is required because all of 
+the bits above the $\lg(\beta)$'th bit will be set to one after a carry occurs from subtraction.  This carry 
+extraction requires two relatively cheap operations to extract the carry.  The other method is to simply shift the 
+most significant bit to the least significant bit thus extracting the carry with a single cheap operation.  This 
+optimization only works on two's complement machines which is a safe assumption to make.
+
+If $a$ has a larger magnitude than $b$ an additional loop (lines 63 through 72) is required to propagate 
+the carry through $a$ and copy the result to $c$.  
 
 \subsection{High Level Addition}
 Now that both lower level addition and subtraction algorithms have been established an effective high level signed addition algorithm can be
@@ -2985,10 +2980,11 @@
 \end{alltt}
 \end{small}
 
-The if statement on line 23 ensures that the $b$ variable is greater than zero.  The \textbf{used} count is incremented by $b$ before
-the copy loop begins.  This elminates the need for an additional variable in the for loop.  The variable $top$ on line 41 is an alias
-for the leading digit while $bottom$ on line 44 is an alias for the trailing edge.  The aliases form a window of exactly $b$ digits
-over the input.  
+The if statement (line 23) ensures that the $b$ variable is greater than zero since we do not interpret negative
+shift counts properly.  The \textbf{used} count is incremented by $b$ before the copy loop begins.  This eliminates 
+the need for an additional variable in the for loop.  The variable $top$ (line 41) is an alias
+for the leading digit while $bottom$ (line 44) is an alias for the trailing edge.  The aliases form a 
+window of exactly $b$ digits over the input.  
 
 \subsection{Division by $x$}
 
@@ -3095,9 +3091,9 @@
 \end{alltt}
 \end{small}
 
-The only noteworthy element of this routine is the lack of a return type.  
-
--- Will update later to give it a return type...Tom
+The only noteworthy element of this routine is the lack of a return type since it cannot fail.  Like mp\_lshd() we
+form a sliding window except we copy in the other direction.  After the window (line 59) we then zero
+the upper digits of the input to make sure the result is correct.
 
 \section{Powers of Two}
 
@@ -3228,7 +3224,15 @@
 \end{alltt}
 \end{small}
 
-Notes to be revised when code is updated. -- Tom
+The shifting is performed in--place which means the first step (line 24) is to copy the input to the 
+destination.  We avoid calling mp\_copy() by making sure the mp\_ints are different.  The destination then
+has to be grown (line 31) to accommodate the result.
+
+If the shift count $b$ is larger than $lg(\beta)$ then a call to mp\_lshd() is used to handle all of the multiples 
+of $lg(\beta)$, leaving only a remaining shift of $lg(\beta) - 1$ or fewer bits.  Inside the actual shift 
+loop (lines 45 to 76) we make use of pre--computed values $shift$ and $mask$.   These are used to
+extract the carry bit(s) to pass into the next iteration of the loop.  The $r$ and $rr$ variables form a 
+chain between consecutive iterations to propagate the carry.  
 
 \subsection{Division by Power of Two}
 
@@ -3361,7 +3365,8 @@
 result of the remainder operation until the end.  This allows $d$ and $a$ to represent the same mp\_int without modifying $a$ before
 the quotient is obtained.
 
-The remainder of the source code is essentially the same as the source code for mp\_mul\_2d.  (-- Fix this paragraph up later, Tom).
+The remainder of the source code is essentially the same as the source code for mp\_mul\_2d.  The only significant difference is
+the direction of the shifts.
 
 \subsection{Remainder of Division by Power of Two}
 
@@ -3420,7 +3425,7 @@
 027     \}
 028   
 029     /* if the modulus is larger than the value than return */
-030     if (b > (int) (a->used * DIGIT_BIT)) \{
+030     if (b >= (int) (a->used * DIGIT_BIT)) \{
 031       res = mp_copy (a, c);
 032       return res;
 033     \}
@@ -3446,7 +3451,13 @@
 \end{alltt}
 \end{small}
 
--- Add comments later, Tom.
+We first avoid cases of $b \le 0$ by simply mp\_zero()'ing the destination in such cases.  Next if $2^b$ is larger
+than the input we just mp\_copy() the input and return right away.  After this point we know we must actually
+perform some work to produce the remainder.
+
+Recalling that reducing modulo $2^k$ and a binary ``and'' with $2^k - 1$ are numerically equivalent we can quickly reduce 
+the number.  First we zero any digits above the last digit in $2^b$ (line 41).  Next we reduce the 
+leading digit of both (line 45) and then mp\_clamp().
 
 \section*{Exercises}
 \begin{tabular}{cl}
@@ -3611,101 +3622,113 @@
 018    * HAC pp. 595, Algorithm 14.12  Modified so you can control how 
 019    * many digits of output are created.
 020    */
-021   int
-022   s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
-023   \{
-024     mp_int  t;
-025     int     res, pa, pb, ix, iy;
-026     mp_digit u;
-027     mp_word r;
-028     mp_digit tmpx, *tmpt, *tmpy;
-029   
-030     /* can we use the fast multiplier? */
-031     if (((digs) < MP_WARRAY) &&
-032         MIN (a->used, b->used) < 
-033             (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) \{
-034       return fast_s_mp_mul_digs (a, b, c, digs);
-035     \}
-036   
-037     if ((res = mp_init_size (&t, digs)) != MP_OKAY) \{
-038       return res;
-039     \}
-040     t.used = digs;
-041   
-042     /* compute the digits of the product directly */
-043     pa = a->used;
-044     for (ix = 0; ix < pa; ix++) \{
-045       /* set the carry to zero */
-046       u = 0;
-047   
-048       /* limit ourselves to making digs digits of output */
-049       pb = MIN (b->used, digs - ix);
-050   
-051       /* setup some aliases */
-052       /* copy of the digit from a used within the nested loop */
-053       tmpx = a->dp[ix];
-054       
-055       /* an alias for the destination shifted ix places */
-056       tmpt = t.dp + ix;
-057       
-058       /* an alias for the digits of b */
-059       tmpy = b->dp;
-060   
-061       /* compute the columns of the output and propagate the carry */
-062       for (iy = 0; iy < pb; iy++) \{
-063         /* compute the column as a mp_word */
-064         r       = ((mp_word)*tmpt) +
-065                   ((mp_word)tmpx) * ((mp_word)*tmpy++) +
-066                   ((mp_word) u);
-067   
-068         /* the new column is the lower part of the result */
-069         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-070   
-071         /* get the carry word from the result */
-072         u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
-073       \}
-074       /* set carry if it is placed below digs */
-075       if (ix + iy < digs) \{
-076         *tmpt = u;
-077       \}
-078     \}
-079   
-080     mp_clamp (&t);
-081     mp_exch (&t, c);
-082   
-083     mp_clear (&t);
-084     return MP_OKAY;
-085   \}
-086   #endif
+021   int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+022   \{
+023     mp_int  t;
+024     int     res, pa, pb, ix, iy;
+025     mp_digit u;
+026     mp_word r;
+027     mp_digit tmpx, *tmpt, *tmpy;
+028   
+029     /* can we use the fast multiplier? */
+030     if (((digs) < MP_WARRAY) &&
+031         MIN (a->used, b->used) < 
+032             (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) \{
+033       return fast_s_mp_mul_digs (a, b, c, digs);
+034     \}
+035   
+036     if ((res = mp_init_size (&t, digs)) != MP_OKAY) \{
+037       return res;
+038     \}
+039     t.used = digs;
+040   
+041     /* compute the digits of the product directly */
+042     pa = a->used;
+043     for (ix = 0; ix < pa; ix++) \{
+044       /* set the carry to zero */
+045       u = 0;
+046   
+047       /* limit ourselves to making digs digits of output */
+048       pb = MIN (b->used, digs - ix);
+049   
+050       /* setup some aliases */
+051       /* copy of the digit from a used within the nested loop */
+052       tmpx = a->dp[ix];
+053       
+054       /* an alias for the destination shifted ix places */
+055       tmpt = t.dp + ix;
+056       
+057       /* an alias for the digits of b */
+058       tmpy = b->dp;
+059   
+060       /* compute the columns of the output and propagate the carry */
+061       for (iy = 0; iy < pb; iy++) \{
+062         /* compute the column as a mp_word */
+063         r       = ((mp_word)*tmpt) +
+064                   ((mp_word)tmpx) * ((mp_word)*tmpy++) +
+065                   ((mp_word) u);
+066   
+067         /* the new column is the lower part of the result */
+068         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+069   
+070         /* get the carry word from the result */
+071         u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+072       \}
+073       /* set carry if it is placed below digs */
+074       if (ix + iy < digs) \{
+075         *tmpt = u;
+076       \}
+077     \}
+078   
+079     mp_clamp (&t);
+080     mp_exch (&t, c);
+081   
+082     mp_clear (&t);
+083     return MP_OKAY;
+084   \}
+085   #endif
 \end{alltt}
 \end{small}
 
-Lines 31 to 35 determine if the Comba method can be used first.  The conditions for using the Comba routine are that min$(a.used, b.used) < \delta$ and
-the number of digits of output is less than \textbf{MP\_WARRAY}.  This new constant is used to control 
-the stack usage in the Comba routines.  By default it is set to $\delta$ but can be reduced when memory is at a premium.
-
-Of particular importance is the calculation of the $ix+iy$'th column on lines 64, 65 and 66.  Note how all of the
-variables are cast to the type \textbf{mp\_word}, which is also the type of variable $\hat r$.  That is to ensure that double precision operations 
-are used instead of single precision.  The multiplication on line 65 makes use of a specific GCC optimizer behaviour.  On the outset it looks like 
-the compiler will have to use a double precision multiplication to produce the result required.  Such an operation would be horribly slow on most 
-processors and drag this to a crawl.  However, GCC is smart enough to realize that double wide output single precision multipliers can be used.  For 
-example, the instruction ``MUL'' on the x86 processor can multiply two 32-bit values and produce a 64-bit result.  
+First we determine (line 30) if the Comba method can be used since it is faster.  The conditions for 
+using the Comba routine are that min$(a.used, b.used) < \delta$ and the number of digits of output is less than 
+\textbf{MP\_WARRAY}.  This new constant is used to control the stack usage in the Comba routines.  By default it is 
+set to $\delta$ but can be reduced when memory is at a premium.
+
+If we cannot use the Comba method we proceed to setup the baseline routine.  We allocate the destination mp\_int
+$t$ (line 36) to the exact size of the output to avoid further re--allocations.  At this point we now 
+begin the $O(n^2)$ loop.
+
+This implementation of multiplication has the caveat that it can be trimmed to only produce a variable number of
+digits as output.  In each iteration of the outer loop the $pb$ variable is set (line 48) to the maximum 
+number of inner loop iterations.  
+
+Inside the inner loop we calculate $\hat r$ as the mp\_word product of the two mp\_digits and the addition of the
+carry from the previous iteration.  A particularly important observation is that most modern optimizing 
+C compilers (GCC for instance) can recognize that a $N \times N \rightarrow 2N$ multiplication is all that 
+is required for the product.  In x86 terms for example, this means using the MUL instruction.
+
+Each digit of the product is stored in turn (line 68) and the carry propagated (line 71) to the 
+next iteration.
 
 \subsection{Faster Multiplication by the ``Comba'' Method}
 
-One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be computed and propagated upwards.  This
-makes the nested loop very sequential and hard to unroll and implement in parallel.  The ``Comba'' \cite{COMBA} method is named after little known 
-(\textit{in cryptographic venues}) Paul G. Comba who described a method of implementing fast multipliers that do not require nested 
-carry fixup operations.  As an interesting aside it seems that Paul Barrett describes a similar technique in
-his 1986 paper \cite{BARRETT} written five years before.
-
-At the heart of the Comba technique is once again the long-hand algorithm.  Except in this case a slight twist is placed on how
-the columns of the result are produced.  In the standard long-hand algorithm rows of products are produced then added together to form the 
-final result.  In the baseline algorithm the columns are added together after each iteration to get the result instantaneously.  
-
-In the Comba algorithm the columns of the result are produced entirely independently of each other.  That is at the $O(n^2)$ level a 
-simple multiplication and addition step is performed.  The carries of the columns are propagated after the nested loop to reduce the amount
-of work requiored. Succintly the first step of the algorithm is to compute the product vector $\vec x$ as follows. 
+One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be 
+computed and propagated upwards.  This makes the nested loop very sequential and hard to unroll and implement 
+in parallel.  The ``Comba'' \cite{COMBA} method is named after little known (\textit{in cryptographic venues}) Paul G. 
+Comba who described a method of implementing fast multipliers that do not require nested carry fixup operations.  As an 
+interesting aside it seems that Paul Barrett describes a similar technique in his 1986 paper \cite{BARRETT} written 
+five years before.
+
+At the heart of the Comba technique is once again the long-hand algorithm.  Except in this case a slight 
+twist is placed on how the columns of the result are produced.  In the standard long-hand algorithm rows of products 
+are produced then added together to form the final result.  In the baseline algorithm the columns are added together 
+after each iteration to get the result instantaneously.  
+
+In the Comba algorithm the columns of the result are produced entirely independently of each other.  That is at 
+the $O(n^2)$ level a simple multiplication and addition step is performed.  The carries of the columns are propagated 
+after the nested loop to reduce the amount of work required.  Succinctly the first step of the algorithm is to compute 
+the product vector $\vec x$ as follows. 
 
 \begin{equation}
 \vec x_n = \sum_{i+j = n} a_ib_j, \forall n \in \lbrace 0, 1, 2, \ldots, i + j \rbrace
@@ -3799,38 +3822,32 @@
 \textbf{Input}.   mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
 \textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
 \hline \\
-Place an array of \textbf{MP\_WARRAY} double precision digits named $\hat W$ on the stack. \\
+Place an array of \textbf{MP\_WARRAY} single precision digits named $W$ on the stack. \\
 1.  If $c.alloc < digs$ then grow $c$ to $digs$ digits. (\textit{mp\_grow}) \\
 2.  If step 1 failed return(\textit{MP\_MEM}).\\
 \\
-Zero the temporary array $\hat W$. \\
-3.  for $n$ from $0$ to $digs - 1$ do \\
-\hspace{3mm}3.1  $\hat W_n \leftarrow 0$ \\
-\\
-Compute the columns. \\
-4.  for $ix$ from $0$ to $a.used - 1$ do \\
-\hspace{3mm}4.1  $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\
-\hspace{3mm}4.2  If $pb < 1$ then goto step 5. \\
-\hspace{3mm}4.3  for $iy$ from $0$ to $pb - 1$ do \\
-\hspace{6mm}4.3.1  $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}b_{iy}$ \\
+3.  $pa \leftarrow \mbox{MIN}(digs, a.used + b.used)$ \\
 \\
-Propagate the carries upwards. \\
-5.  $oldused \leftarrow c.used$ \\
-6.  $c.used \leftarrow digs$ \\
-7.  If $digs > 1$ then do \\
-\hspace{3mm}7.1.  for $ix$ from $1$ to $digs - 1$ do \\
-\hspace{6mm}7.1.1  $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix-1} / \beta \rfloor$ \\
-\hspace{6mm}7.1.2  $c_{ix - 1} \leftarrow \hat W_{ix - 1} \mbox{ (mod }\beta\mbox{)}$ \\
-8.  else do \\
-\hspace{3mm}8.1  $ix \leftarrow 0$ \\
-9.  $c_{ix} \leftarrow \hat W_{ix} \mbox{ (mod }\beta\mbox{)}$ \\
+4.  $\_ \hat W \leftarrow 0$ \\
+5.  for $ix$ from 0 to $pa - 1$ do \\
+\hspace{3mm}5.1  $ty \leftarrow \mbox{MIN}(b.used - 1, ix)$ \\
+\hspace{3mm}5.2  $tx \leftarrow ix - ty$ \\
+\hspace{3mm}5.3  $iy \leftarrow \mbox{MIN}(a.used - tx, ty + 1)$ \\
+\hspace{3mm}5.4  for $iz$ from 0 to $iy - 1$ do \\
+\hspace{6mm}5.4.1  $\_ \hat W \leftarrow \_ \hat W + a_{tx+iz}b_{ty-iz}$ \\
+\hspace{3mm}5.5  $W_{ix} \leftarrow \_ \hat W (\mbox{mod }\beta)$\\
+\hspace{3mm}5.6  $\_ \hat W \leftarrow \lfloor \_ \hat W / \beta \rfloor$ \\
+6.  $W_{pa} \leftarrow \_ \hat W (\mbox{mod }\beta)$ \\
 \\
-Zero excess digits. \\
-10.  If $digs < oldused$ then do \\
-\hspace{3mm}10.1  for $n$ from $digs$ to $oldused - 1$ do \\
-\hspace{6mm}10.1.1  $c_n \leftarrow 0$ \\
-11.  Clamp excessive digits of $c$.  (\textit{mp\_clamp}) \\
-12.  Return(\textit{MP\_OKAY}). \\
+7.  $oldused \leftarrow c.used$ \\
+8.  $c.used \leftarrow digs$ \\
+9.  for $ix$ from $0$ to $pa$ do \\
+\hspace{3mm}9.1  $c_{ix} \leftarrow W_{ix}$ \\
+10.  for $ix$ from $pa + 1$ to $oldused - 1$ do \\
+\hspace{3mm}10.1 $c_{ix} \leftarrow 0$ \\
+\\
+11.  Clamp $c$. \\
+12.  Return MP\_OKAY. \\
 \hline
 \end{tabular}
 \end{center}
@@ -3840,15 +3857,24 @@
 \end{figure}
 
 \textbf{Algorithm fast\_s\_mp\_mul\_digs.}
-This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision.  The algorithm
-essentially peforms the same calculation as algorithm s\_mp\_mul\_digs, just much faster.
-
-The array $\hat W$ is meant to be on the stack when the algorithm is used.  The size of the array does not change which is ideal.  Note also that 
-unlike algorithm s\_mp\_mul\_digs no temporary mp\_int is required since the result is calculated directly in $\hat W$.  
-
-The $O(n^2)$ loop on step four is where the Comba method's advantages begin to show through in comparison to the baseline algorithm.  The lack of
-a carry variable or propagation in this loop allows the loop to be performed with only single precision multiplication and additions.  Now that each
-iteration of the inner loop can be performed independent of the others the inner loop can be performed with a high level of parallelism.
+This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision.
+
+The outer loop of this algorithm is more complicated than that of the baseline multiplier.  This is because on the inside of the 
+loop we want to produce one column per pass.  This allows the accumulator $\_ \hat W$ to be placed in CPU registers and
+reduces the memory bandwidth to two \textbf{mp\_digit} reads per iteration.
+
+The $ty$ variable is set to the minimum count of $ix$ or the number of digits in $b$.  That way if $a$ has more digits than
+$b$ this will be limited to $b.used - 1$.  The $tx$ variable is set to the distance past $b.used$ the variable
+$ix$ is.  This is used for the immediately subsequent statement where we find $iy$.  
+
+The variable $iy$ is the minimum digits we can read from either $a$ or $b$ before running out.  Computing one column at a time
+means we have to scan one integer upwards and the other downwards.  $a$ starts at $tx$ and $b$ starts at $ty$.  In each
+pass we are producing the $ix$'th output column and we note that $tx + ty = ix$.  As we move $tx$ upwards we have to 
+move $ty$ downwards so the equality remains valid.  The $iy$ variable is the number of iterations until 
+$tx \ge a.used$ or $ty < 0$ occurs.
+
+After every inner pass we store the lower half of the accumulator into $W_{ix}$ and then propagate the carry of the accumulator
+into the next round by dividing $\_ \hat W$ by $\beta$.
 
 To measure the benefits of the Comba method over the baseline method consider the number of operations that are required.  If the 
 cost in terms of time of a multiply and addition is $p$ and the cost of a carry propagation is $q$ then a baseline multiplication would require 
@@ -3877,94 +3903,95 @@
 030    * Based on Algorithm 14.12 on pp.595 of HAC.
 031    *
 032    */
-033   int
-034   fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
-035   \{
-036     int     olduse, res, pa, ix, iz;
-037     mp_digit W[MP_WARRAY];
-038     register mp_word  _W;
-039   
-040     /* grow the destination as required */
-041     if (c->alloc < digs) \{
-042       if ((res = mp_grow (c, digs)) != MP_OKAY) \{
-043         return res;
-044       \}
-045     \}
-046   
-047     /* number of output digits to produce */
-048     pa = MIN(digs, a->used + b->used);
-049   
-050     /* clear the carry */
-051     _W = 0;
-052     for (ix = 0; ix <= pa; ix++) \{ 
-053         int      tx, ty;
-054         int      iy;
-055         mp_digit *tmpx, *tmpy;
-056   
-057         /* get offsets into the two bignums */
-058         ty = MIN(b->used-1, ix);
-059         tx = ix - ty;
-060   
-061         /* setup temp aliases */
-062         tmpx = a->dp + tx;
-063         tmpy = b->dp + ty;
-064   
-065         /* this is the number of times the loop will iterrate, essentially its
-       
-066            while (tx++ < a->used && ty-- >= 0) \{ ... \}
-067          */
-068         iy = MIN(a->used-tx, ty+1);
-069   
-070         /* execute loop */
-071         for (iz = 0; iz < iy; ++iz) \{
-072            _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
-073         \}
-074   
-075         /* store term */
-076         W[ix] = ((mp_digit)_W) & MP_MASK;
-077   
-078         /* make next carry */
-079         _W = _W >> ((mp_word)DIGIT_BIT);
-080     \}
-081   
-082     /* setup dest */
-083     olduse  = c->used;
-084     c->used = digs;
-085   
-086     \{
-087       register mp_digit *tmpc;
-088       tmpc = c->dp;
-089       for (ix = 0; ix < digs; ix++) \{
-090         /* now extract the previous digit [below the carry] */
-091         *tmpc++ = W[ix];
-092       \}
-093   
-094       /* clear unused digits [that existed in the old copy of c] */
-095       for (; ix < olduse; ix++) \{
-096         *tmpc++ = 0;
-097       \}
-098     \}
-099     mp_clamp (c);
-100     return MP_OKAY;
-101   \}
-102   #endif
+033   int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+034   \{
+035     int     olduse, res, pa, ix, iz;
+036     mp_digit W[MP_WARRAY];
+037     register mp_word  _W;
+038   
+039     /* grow the destination as required */
+040     if (c->alloc < digs) \{
+041       if ((res = mp_grow (c, digs)) != MP_OKAY) \{
+042         return res;
+043       \}
+044     \}
+045   
+046     /* number of output digits to produce */
+047     pa = MIN(digs, a->used + b->used);
+048   
+049     /* clear the carry */
+050     _W = 0;
+051     for (ix = 0; ix < pa; ix++) \{ 
+052         int      tx, ty;
+053         int      iy;
+054         mp_digit *tmpx, *tmpy;
+055   
+056         /* get offsets into the two bignums */
+057         ty = MIN(b->used-1, ix);
+058         tx = ix - ty;
+059   
+060         /* setup temp aliases */
+061         tmpx = a->dp + tx;
+062         tmpy = b->dp + ty;
+063   
+064         /* this is the number of times the loop will iterrate, essentially 
+065            while (tx++ < a->used && ty-- >= 0) \{ ... \}
+066          */
+067         iy = MIN(a->used-tx, ty+1);
+068   
+069         /* execute loop */
+070         for (iz = 0; iz < iy; ++iz) \{
+071            _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
+072         \}
+073   
+074         /* store term */
+075         W[ix] = ((mp_digit)_W) & MP_MASK;
+076   
+077         /* make next carry */
+078         _W = _W >> ((mp_word)DIGIT_BIT);
+079     \}
+080   
+081     /* store final carry */
+082     W[ix] = (mp_digit)(_W & MP_MASK);
+083   
+084     /* setup dest */
+085     olduse  = c->used;
+086     c->used = pa;
+087   
+088     \{
+089       register mp_digit *tmpc;
+090       tmpc = c->dp;
+091       for (ix = 0; ix < pa+1; ix++) \{
+092         /* now extract the previous digit [below the carry] */
+093         *tmpc++ = W[ix];
+094       \}
+095   
+096       /* clear unused digits [that existed in the old copy of c] */
+097       for (; ix < olduse; ix++) \{
+098         *tmpc++ = 0;
+099       \}
+100     \}
+101     mp_clamp (c);
+102     return MP_OKAY;
+103   \}
+104   #endif
 \end{alltt}
 \end{small}
 
-The memset on line @47,memset@ clears the initial $\hat W$ array to zero in a single step. Like the slower baseline multiplication
-implementation a series of aliases (\textit{lines 62, 63 and 76}) are used to simplify the inner $O(n^2)$ loop.  
-In this case a new alias $\_\hat W$ has been added which refers to the double precision columns offset by $ix$ in each pass.  
-
-The inner loop on lines 89, 79 and 80 is where the algorithm will spend the majority of the time, which is why it has been 
-stripped to the bones of any extra baggage\footnote{Hence the pointer aliases.}.  On x86 processors the multiplication and additions amount to at the 
-very least five instructions (\textit{two loads, two additions, one multiply}) while on the ARMv4 processors they amount to only three 
-(\textit{one load, one store, one multiply-add}).   For both of the x86 and ARMv4 processors the GCC compiler performs a good job at unrolling the loop 
-and scheduling the instructions so there are very few dependency stalls.
-
-In theory the difference between the baseline and comba algorithms is a mere $O(qn)$ time difference.  However, in the $O(n^2)$ nested loop of the
-baseline method there are dependency stalls as the algorithm must wait for the multiplier to finish before propagating the carry to the next 
-digit.  As a result fewer of the often multiple execution units\footnote{The AMD Athlon has three execution units and the Intel P4 has four.} can
-be simultaneously used.  
+As per the pseudo--code we first calculate $pa$ (line 47) as the number of digits to output.  Next we begin the outer loop
+to produce the individual columns of the product.  We use the two aliases $tmpx$ and $tmpy$ (lines 61, 62) to point
+inside the two multiplicands quickly.  
+
+The inner loop (lines 70 to 72) of this implementation is where the tradeoff comes into play.  Originally this comba 
+implementation was ``row--major'' which means it adds to each of the columns in each pass.  After the outer loop it would then fix 
+the carries.  This was very fast except it had an annoying drawback.  You had to read a mp\_word and two mp\_digits and write 
+one mp\_word per iteration.  On processors such as the Athlon XP and P4 this did not matter much since the cache bandwidth 
+is very high and it can keep the ALU fed with data.  It did, however, matter on older and embedded cpus where cache is often 
+slower and also often doesn't exist.  This new algorithm only performs two reads per iteration under the assumption that the 
+compiler has aliased $\_ \hat W$ to a CPU register.
+
+After the inner loop we store the current accumulator in $W$ and shift $\_ \hat W$ (lines 75, 78) to forward it as 
+a carry for the next pass.  After the outer loop we use the final carry (line 82) as the last digit of the product.  
 
 \subsection{Polynomial Basis Multiplication}
 To break the $O(n^2)$ barrier in multiplication requires a completely different look at integer multiplication.  In the following algorithms
@@ -4441,277 +4468,290 @@
 016   
 017   /* multiplication using the Toom-Cook 3-way algorithm 
 018    *
-019    * Much more complicated than Karatsuba but has a lower asymptotic running t
-      ime of 
-020    * O(N**1.464).  This algorithm is only particularly useful on VERY large
-021    * inputs (we're talking 1000s of digits here...).
-022   */
-023   int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
-024   \{
-025       mp_int w0, w1, w2, w3, w4, tmp1, tmp2, a0, a1, a2, b0, b1, b2;
-026       int res, B;
-027           
-028       /* init temps */
-029       if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, 
-030                                &a0, &a1, &a2, &b0, &b1, 
-031                                &b2, &tmp1, &tmp2, NULL)) != MP_OKAY) \{
-032          return res;
-033       \}
-034       
-035       /* B */
-036       B = MIN(a->used, b->used) / 3;
-037       
-038       /* a = a2 * B**2 + a1 * B + a0 */
-039       if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) \{
-040          goto ERR;
-041       \}
-042   
-043       if ((res = mp_copy(a, &a1)) != MP_OKAY) \{
-044          goto ERR;
-045       \}
-046       mp_rshd(&a1, B);
-047       mp_mod_2d(&a1, DIGIT_BIT * B, &a1);
-048   
-049       if ((res = mp_copy(a, &a2)) != MP_OKAY) \{
-050          goto ERR;
-051       \}
-052       mp_rshd(&a2, B*2);
-053       
-054       /* b = b2 * B**2 + b1 * B + b0 */
-055       if ((res = mp_mod_2d(b, DIGIT_BIT * B, &b0)) != MP_OKAY) \{
-056          goto ERR;
-057       \}
-058   
-059       if ((res = mp_copy(b, &b1)) != MP_OKAY) \{
-060          goto ERR;
-061       \}
-062       mp_rshd(&b1, B);
-063       mp_mod_2d(&b1, DIGIT_BIT * B, &b1);
-064   
-065       if ((res = mp_copy(b, &b2)) != MP_OKAY) \{
-066          goto ERR;
-067       \}
-068       mp_rshd(&b2, B*2);
-069       
-070       /* w0 = a0*b0 */
-071       if ((res = mp_mul(&a0, &b0, &w0)) != MP_OKAY) \{
-072          goto ERR;
-073       \}
-074       
-075       /* w4 = a2 * b2 */
-076       if ((res = mp_mul(&a2, &b2, &w4)) != MP_OKAY) \{
-077          goto ERR;
-078       \}
-079       
-080       /* w1 = (a2 + 2(a1 + 2a0))(b2 + 2(b1 + 2b0)) */
-081       if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) \{
-082          goto ERR;
-083       \}
-084       if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) \{
-085          goto ERR;
-086       \}
-087       if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) \{
-088          goto ERR;
-089       \}
-090       if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) \{
-091          goto ERR;
-092       \}
-093       
-094       if ((res = mp_mul_2(&b0, &tmp2)) != MP_OKAY) \{
-095          goto ERR;
-096       \}
-097       if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) \{
-098          goto ERR;
-099       \}
-100       if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) \{
-101          goto ERR;
-102       \}
-103       if ((res = mp_add(&tmp2, &b2, &tmp2)) != MP_OKAY) \{
-104          goto ERR;
-105       \}
-106       
-107       if ((res = mp_mul(&tmp1, &tmp2, &w1)) != MP_OKAY) \{
-108          goto ERR;
-109       \}
-110       
-111       /* w3 = (a0 + 2(a1 + 2a2))(b0 + 2(b1 + 2b2)) */
-112       if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) \{
-113          goto ERR;
-114       \}
-115       if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) \{
-116          goto ERR;
-117       \}
-118       if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) \{
-119          goto ERR;
-120       \}
-121       if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) \{
-122          goto ERR;
-123       \}
-124       
-125       if ((res = mp_mul_2(&b2, &tmp2)) != MP_OKAY) \{
-126          goto ERR;
-127       \}
-128       if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) \{
-129          goto ERR;
-130       \}
-131       if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) \{
-132          goto ERR;
-133       \}
-134       if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) \{
-135          goto ERR;
-136       \}
-137       
-138       if ((res = mp_mul(&tmp1, &tmp2, &w3)) != MP_OKAY) \{
-139          goto ERR;
-140       \}
-141       
-142   
-143       /* w2 = (a2 + a1 + a0)(b2 + b1 + b0) */
-144       if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) \{
-145          goto ERR;
-146       \}
-147       if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) \{
-148          goto ERR;
-149       \}
-150       if ((res = mp_add(&b2, &b1, &tmp2)) != MP_OKAY) \{
-151          goto ERR;
-152       \}
-153       if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) \{
-154          goto ERR;
-155       \}
-156       if ((res = mp_mul(&tmp1, &tmp2, &w2)) != MP_OKAY) \{
-157          goto ERR;
-158       \}
-159       
-160       /* now solve the matrix 
-161       
-162          0  0  0  0  1
-163          1  2  4  8  16
-164          1  1  1  1  1
-165          16 8  4  2  1
-166          1  0  0  0  0
-167          
-168          using 12 subtractions, 4 shifts, 
-169                 2 small divisions and 1 small multiplication 
-170        */
-171        
-172        /* r1 - r4 */
-173        if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) \{
-174           goto ERR;
-175        \}
-176        /* r3 - r0 */
-177        if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) \{
-178           goto ERR;
-179        \}
-180        /* r1/2 */
-181        if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) \{
-182           goto ERR;
-183        \}
-184        /* r3/2 */
-185        if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) \{
-186           goto ERR;
-187        \}
-188        /* r2 - r0 - r4 */
-189        if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) \{
-190           goto ERR;
-191        \}
-192        if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) \{
-193           goto ERR;
-194        \}
-195        /* r1 - r2 */
-196        if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) \{
-197           goto ERR;
-198        \}
-199        /* r3 - r2 */
-200        if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) \{
-201           goto ERR;
-202        \}
-203        /* r1 - 8r0 */
-204        if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) \{
-205           goto ERR;
-206        \}
-207        if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) \{
-208           goto ERR;
-209        \}
-210        /* r3 - 8r4 */
-211        if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) \{
-212           goto ERR;
-213        \}
-214        if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) \{
-215           goto ERR;
-216        \}
-217        /* 3r2 - r1 - r3 */
-218        if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) \{
-219           goto ERR;
-220        \}
-221        if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) \{
-222           goto ERR;
-223        \}
-224        if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) \{
-225           goto ERR;
-226        \}
-227        /* r1 - r2 */
-228        if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) \{
-229           goto ERR;
-230        \}
-231        /* r3 - r2 */
-232        if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) \{
-233           goto ERR;
-234        \}
-235        /* r1/3 */
-236        if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) \{
-237           goto ERR;
-238        \}
-239        /* r3/3 */
-240        if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) \{
-241           goto ERR;
-242        \}
-243        
-244        /* at this point shift W[n] by B*n */
-245        if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) \{
-246           goto ERR;
-247        \}
-248        if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) \{
-249           goto ERR;
-250        \}
-251        if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) \{
-252           goto ERR;
-253        \}
-254        if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) \{
-255           goto ERR;
-256        \}     
-257        
-258        if ((res = mp_add(&w0, &w1, c)) != MP_OKAY) \{
-259           goto ERR;
-260        \}
-261        if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) \{
-262           goto ERR;
-263        \}
-264        if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) \{
-265           goto ERR;
-266        \}
-267        if ((res = mp_add(&tmp1, c, c)) != MP_OKAY) \{
-268           goto ERR;
-269        \}     
-270        
-271   ERR:
-272        mp_clear_multi(&w0, &w1, &w2, &w3, &w4, 
-273                       &a0, &a1, &a2, &b0, &b1, 
-274                       &b2, &tmp1, &tmp2, NULL);
-275        return res;
-276   \}     
-277        
-278   #endif
+019    * Much more complicated than Karatsuba but has a lower 
+020    * asymptotic running time of O(N**1.464).  This algorithm is 
+021    * only particularly useful on VERY large inputs 
+022    * (we're talking 1000s of digits here...).
+023   */
+024   int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
+025   \{
+026       mp_int w0, w1, w2, w3, w4, tmp1, tmp2, a0, a1, a2, b0, b1, b2;
+027       int res, B;
+028           
+029       /* init temps */
+030       if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, 
+031                                &a0, &a1, &a2, &b0, &b1, 
+032                                &b2, &tmp1, &tmp2, NULL)) != MP_OKAY) \{
+033          return res;
+034       \}
+035       
+036       /* B */
+037       B = MIN(a->used, b->used) / 3;
+038       
+039       /* a = a2 * B**2 + a1 * B + a0 */
+040       if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) \{
+041          goto ERR;
+042       \}
+043   
+044       if ((res = mp_copy(a, &a1)) != MP_OKAY) \{
+045          goto ERR;
+046       \}
+047       mp_rshd(&a1, B);
+048       mp_mod_2d(&a1, DIGIT_BIT * B, &a1);
+049   
+050       if ((res = mp_copy(a, &a2)) != MP_OKAY) \{
+051          goto ERR;
+052       \}
+053       mp_rshd(&a2, B*2);
+054       
+055       /* b = b2 * B**2 + b1 * B + b0 */
+056       if ((res = mp_mod_2d(b, DIGIT_BIT * B, &b0)) != MP_OKAY) \{
+057          goto ERR;
+058       \}
+059   
+060       if ((res = mp_copy(b, &b1)) != MP_OKAY) \{
+061          goto ERR;
+062       \}
+063       mp_rshd(&b1, B);
+064       mp_mod_2d(&b1, DIGIT_BIT * B, &b1);
+065   
+066       if ((res = mp_copy(b, &b2)) != MP_OKAY) \{
+067          goto ERR;
+068       \}
+069       mp_rshd(&b2, B*2);
+070       
+071       /* w0 = a0*b0 */
+072       if ((res = mp_mul(&a0, &b0, &w0)) != MP_OKAY) \{
+073          goto ERR;
+074       \}
+075       
+076       /* w4 = a2 * b2 */
+077       if ((res = mp_mul(&a2, &b2, &w4)) != MP_OKAY) \{
+078          goto ERR;
+079       \}
+080       
+081       /* w1 = (a2 + 2(a1 + 2a0))(b2 + 2(b1 + 2b0)) */
+082       if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) \{
+083          goto ERR;
+084       \}
+085       if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) \{
+086          goto ERR;
+087       \}
+088       if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) \{
+089          goto ERR;
+090       \}
+091       if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) \{
+092          goto ERR;
+093       \}
+094       
+095       if ((res = mp_mul_2(&b0, &tmp2)) != MP_OKAY) \{
+096          goto ERR;
+097       \}
+098       if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) \{
+099          goto ERR;
+100       \}
+101       if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) \{
+102          goto ERR;
+103       \}
+104       if ((res = mp_add(&tmp2, &b2, &tmp2)) != MP_OKAY) \{
+105          goto ERR;
+106       \}
+107       
+108       if ((res = mp_mul(&tmp1, &tmp2, &w1)) != MP_OKAY) \{
+109          goto ERR;
+110       \}
+111       
+112       /* w3 = (a0 + 2(a1 + 2a2))(b0 + 2(b1 + 2b2)) */
+113       if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) \{
+114          goto ERR;
+115       \}
+116       if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) \{
+117          goto ERR;
+118       \}
+119       if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) \{
+120          goto ERR;
+121       \}
+122       if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) \{
+123          goto ERR;
+124       \}
+125       
+126       if ((res = mp_mul_2(&b2, &tmp2)) != MP_OKAY) \{
+127          goto ERR;
+128       \}
+129       if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) \{
+130          goto ERR;
+131       \}
+132       if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) \{
+133          goto ERR;
+134       \}
+135       if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) \{
+136          goto ERR;
+137       \}
+138       
+139       if ((res = mp_mul(&tmp1, &tmp2, &w3)) != MP_OKAY) \{
+140          goto ERR;
+141       \}
+142       
+143   
+144       /* w2 = (a2 + a1 + a0)(b2 + b1 + b0) */
+145       if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) \{
+146          goto ERR;
+147       \}
+148       if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) \{
+149          goto ERR;
+150       \}
+151       if ((res = mp_add(&b2, &b1, &tmp2)) != MP_OKAY) \{
+152          goto ERR;
+153       \}
+154       if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) \{
+155          goto ERR;
+156       \}
+157       if ((res = mp_mul(&tmp1, &tmp2, &w2)) != MP_OKAY) \{
+158          goto ERR;
+159       \}
+160       
+161       /* now solve the matrix 
+162       
+163          0  0  0  0  1
+164          1  2  4  8  16
+165          1  1  1  1  1
+166          16 8  4  2  1
+167          1  0  0  0  0
+168          
+169          using 12 subtractions, 4 shifts, 
+170                 2 small divisions and 1 small multiplication 
+171        */
+172        
+173        /* r1 - r4 */
+174        if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) \{
+175           goto ERR;
+176        \}
+177        /* r3 - r0 */
+178        if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) \{
+179           goto ERR;
+180        \}
+181        /* r1/2 */
+182        if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) \{
+183           goto ERR;
+184        \}
+185        /* r3/2 */
+186        if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) \{
+187           goto ERR;
+188        \}
+189        /* r2 - r0 - r4 */
+190        if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) \{
+191           goto ERR;
+192        \}
+193        if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) \{
+194           goto ERR;
+195        \}
+196        /* r1 - r2 */
+197        if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) \{
+198           goto ERR;
+199        \}
+200        /* r3 - r2 */
+201        if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) \{
+202           goto ERR;
+203        \}
+204        /* r1 - 8r0 */
+205        if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) \{
+206           goto ERR;
+207        \}
+208        if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) \{
+209           goto ERR;
+210        \}
+211        /* r3 - 8r4 */
+212        if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) \{
+213           goto ERR;
+214        \}
+215        if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) \{
+216           goto ERR;
+217        \}
+218        /* 3r2 - r1 - r3 */
+219        if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) \{
+220           goto ERR;
+221        \}
+222        if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) \{
+223           goto ERR;
+224        \}
+225        if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) \{
+226           goto ERR;
+227        \}
+228        /* r1 - r2 */
+229        if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) \{
+230           goto ERR;
+231        \}
+232        /* r3 - r2 */
+233        if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) \{
+234           goto ERR;
+235        \}
+236        /* r1/3 */
+237        if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) \{
+238           goto ERR;
+239        \}
+240        /* r3/3 */
+241        if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) \{
+242           goto ERR;
+243        \}
+244        
+245        /* at this point shift W[n] by B*n */
+246        if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) \{
+247           goto ERR;
+248        \}
+249        if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) \{
+250           goto ERR;
+251        \}
+252        if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) \{
+253           goto ERR;
+254        \}
+255        if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) \{
+256           goto ERR;
+257        \}     
+258        
+259        if ((res = mp_add(&w0, &w1, c)) != MP_OKAY) \{
+260           goto ERR;
+261        \}
+262        if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) \{
+263           goto ERR;
+264        \}
+265        if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) \{
+266           goto ERR;
+267        \}
+268        if ((res = mp_add(&tmp1, c, c)) != MP_OKAY) \{
+269           goto ERR;
+270        \}     
+271        
+272   ERR:
+273        mp_clear_multi(&w0, &w1, &w2, &w3, &w4, 
+274                       &a0, &a1, &a2, &b0, &b1, 
+275                       &b2, &tmp1, &tmp2, NULL);
+276        return res;
+277   \}     
+278        
+279   #endif
 \end{alltt}
 \end{small}
 
--- Comments to be added during editing phase.
+The first obvious thing to note is that this algorithm is complicated.  The complexity is worth it if you are multiplying very 
+large numbers.  For example, a 10,000 digit multiplication takes approximately 99,282,205 fewer single precision multiplications with
+Toom--Cook than a Comba or baseline approach (this is a savings of more than 99$\%$).  For most ``crypto'' sized numbers this
+algorithm is not practical as Karatsuba has a much lower cutoff point.
+
+First we split $a$ and $b$ into three roughly equal portions.  This has been accomplished (lines 40 to 69) with 
+combinations of mp\_rshd() and mp\_mod\_2d() function calls.  At this point $a = a2 \cdot \beta^2 + a1 \cdot \beta + a0$ and similarly
+for $b$.  
+
+Next we compute the five points $w0, w1, w2, w3$ and $w4$.  Recall that $w0$ and $w4$ can be computed directly from the portions so
+we get those out of the way first (lines 72 and 77).  Next we compute $w1, w2$ and $w3$ using Horner's method.
+
+After this point we solve for the actual values of $w1, w2$ and $w3$ by reducing the $5 \times 5$ system which is relatively
+straightforward.  
 
 \subsection{Signed Multiplication}
 Now that algorithms to handle multiplications of every useful dimensions have been developed, a rather simple finishing touch is required.  So far all
 of the multiplication algorithms have been unsigned multiplications which leaves only a signed multiplication algorithm to be established.  
 
-\newpage\begin{figure}[!here]
+\begin{figure}[!here]
 \begin{small}
 \begin{center}
 \begin{tabular}{l}
@@ -4844,7 +4884,7 @@
 The baseline squaring algorithm is meant to be a catch-all squaring algorithm.  It will handle any of the input sizes that the faster routines
 will not handle.  
 
-\newpage\begin{figure}[!here]
+\begin{figure}[!here]
 \begin{small}
 \begin{center}
 \begin{tabular}{l}
@@ -4904,75 +4944,79 @@
 \begin{alltt}
 016   
 017   /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
-018   int
-019   s_mp_sqr (mp_int * a, mp_int * b)
-020   \{
-021     mp_int  t;
-022     int     res, ix, iy, pa;
-023     mp_word r;
-024     mp_digit u, tmpx, *tmpt;
-025   
-026     pa = a->used;
-027     if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) \{
-028       return res;
-029     \}
-030   
-031     /* default used is maximum possible size */
-032     t.used = 2*pa + 1;
-033   
-034     for (ix = 0; ix < pa; ix++) \{
-035       /* first calculate the digit at 2*ix */
-036       /* calculate double precision result */
-037       r = ((mp_word) t.dp[2*ix]) +
-038           ((mp_word)a->dp[ix])*((mp_word)a->dp[ix]);
-039   
-040       /* store lower part in result */
-041       t.dp[ix+ix] = (mp_digit) (r & ((mp_word) MP_MASK));
-042   
-043       /* get the carry */
-044       u           = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
-045   
-046       /* left hand side of A[ix] * A[iy] */
-047       tmpx        = a->dp[ix];
-048   
-049       /* alias for where to store the results */
-050       tmpt        = t.dp + (2*ix + 1);
-051       
-052       for (iy = ix + 1; iy < pa; iy++) \{
-053         /* first calculate the product */
-054         r       = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
-055   
-056         /* now calculate the double precision result, note we use
-057          * addition instead of *2 since it's easier to optimize
-058          */
-059         r       = ((mp_word) *tmpt) + r + r + ((mp_word) u);
-060   
-061         /* store lower part */
-062         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-063   
-064         /* get carry */
-065         u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
-066       \}
-067       /* propagate upwards */
-068       while (u != ((mp_digit) 0)) \{
-069         r       = ((mp_word) *tmpt) + ((mp_word) u);
-070         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-071         u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
-072       \}
-073     \}
-074   
-075     mp_clamp (&t);
-076     mp_exch (&t, b);
-077     mp_clear (&t);
-078     return MP_OKAY;
-079   \}
-080   #endif
+018   int s_mp_sqr (mp_int * a, mp_int * b)
+019   \{
+020     mp_int  t;
+021     int     res, ix, iy, pa;
+022     mp_word r;
+023     mp_digit u, tmpx, *tmpt;
+024   
+025     pa = a->used;
+026     if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) \{
+027       return res;
+028     \}
+029   
+030     /* default used is maximum possible size */
+031     t.used = 2*pa + 1;
+032   
+033     for (ix = 0; ix < pa; ix++) \{
+034       /* first calculate the digit at 2*ix */
+035       /* calculate double precision result */
+036       r = ((mp_word) t.dp[2*ix]) +
+037           ((mp_word)a->dp[ix])*((mp_word)a->dp[ix]);
+038   
+039       /* store lower part in result */
+040       t.dp[ix+ix] = (mp_digit) (r & ((mp_word) MP_MASK));
+041   
+042       /* get the carry */
+043       u           = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+044   
+045       /* left hand side of A[ix] * A[iy] */
+046       tmpx        = a->dp[ix];
+047   
+048       /* alias for where to store the results */
+049       tmpt        = t.dp + (2*ix + 1);
+050       
+051       for (iy = ix + 1; iy < pa; iy++) \{
+052         /* first calculate the product */
+053         r       = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
+054   
+055         /* now calculate the double precision result, note we use
+056          * addition instead of *2 since it's easier to optimize
+057          */
+058         r       = ((mp_word) *tmpt) + r + r + ((mp_word) u);
+059   
+060         /* store lower part */
+061         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+062   
+063         /* get carry */
+064         u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+065       \}
+066       /* propagate upwards */
+067       while (u != ((mp_digit) 0)) \{
+068         r       = ((mp_word) *tmpt) + ((mp_word) u);
+069         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+070         u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+071       \}
+072     \}
+073   
+074     mp_clamp (&t);
+075     mp_exch (&t, b);
+076     mp_clear (&t);
+077     return MP_OKAY;
+078   \}
+079   #endif
 \end{alltt}
 \end{small}
 
-Inside the outer loop (\textit{see line 34}) the square term is calculated on line 37.  Line 44 extracts the carry from the square
-term.  Aliases for $a_{ix}$ and $t_{ix+iy}$ are initialized on lines 47 and 50 respectively.  The doubling is performed using two
-additions (\textit{see line 59}) since it is usually faster than shifting,if not at least as fast.  
+Inside the outer loop (line 33) the square term is calculated on line 36.  The carry (line 43) has been
+extracted from the mp\_word accumulator using a right shift.  Aliases for $a_{ix}$ and $t_{ix+iy}$ are initialized 
+(lines 46 and 49) to simplify the inner loop.  The doubling is performed using two
+additions (line 58) since it is usually faster than shifting, if not at least as fast.  
+
+The important observation is that the inner loop does not begin at $iy = 0$ like for multiplication.  As such the inner loops
+get progressively shorter as the algorithm proceeds.  This is what leads to the savings compared to using a multiplication to
+square a number. 
 
 \subsection{Faster Squaring by the ``Comba'' Method}
 A major drawback to the baseline method is the requirement for single precision shifting inside the $O(n^2)$ nested loop.  Squaring has an additional
@@ -4984,9 +5028,9 @@
 that $2a + 2b + 2c = 2(a + b + c)$.  That is the sum of all of the double products is equal to double the sum of all the products.  For example,
 $ab + ba + ac + ca = 2ab + 2ac = 2(ab + ac)$.  
 
-However, we cannot simply double all of the columns, since the squares appear only once per row.  The most practical solution is to have two mp\_word
-arrays.  One array will hold the squares and the other array will hold the double products.  With both arrays the doubling and carry propagation can be 
-moved to a $O(n)$ work level outside the $O(n^2)$ level.  
+However, we cannot simply double all of the columns, since the squares appear only once per row.  The most practical solution is to have two 
+mp\_word arrays.  One array will hold the squares and the other array will hold the double products.  With both arrays the doubling and 
+carry propagation can be moved to a $O(n)$ work level outside the $O(n^2)$ level.  In this case, we have an even simpler solution in mind.
 
 \newpage\begin{figure}[!here]
 \begin{small}
@@ -4996,34 +5040,34 @@
 \textbf{Input}.   mp\_int $a$ \\
 \textbf{Output}.  $b \leftarrow a^2$ \\
 \hline \\
-Place two arrays of \textbf{MP\_WARRAY} mp\_words named $\hat W$ and $\hat {X}$ on the stack. \\
+Place an array of \textbf{MP\_WARRAY} mp\_digits named $W$ on the stack. \\
 1.  If $b.alloc < 2a.used + 1$ then grow $b$ to $2a.used + 1$ digits.  (\textit{mp\_grow}). \\
 2.  If step 1 failed return(\textit{MP\_MEM}). \\
-3.  for $ix$ from $0$ to $2a.used + 1$ do \\
-\hspace{3mm}3.1  $\hat W_{ix} \leftarrow 0$ \\
-\hspace{3mm}3.2  $\hat {X}_{ix} \leftarrow 0$ \\
-4.  for $ix$ from $0$ to $a.used - 1$ do \\
-\hspace{3mm}Compute the square.\\
-\hspace{3mm}4.1  $\hat {X}_{ix+ix} \leftarrow \left ( a_{ix} \right )^2$ \\
-\\
-\hspace{3mm}Compute the double products.\\
-\hspace{3mm}4.2  for $iy$ from $ix + 1$ to $a.used - 1$ do \\
-\hspace{6mm}4.2.1  $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}a_{iy}$ \\
-5.  $oldused \leftarrow b.used$ \\
-6.  $b.used \leftarrow 2a.used + 1$ \\
 \\
-Double the products and propagate the carries simultaneously. \\
-7.  $\hat W_0 \leftarrow 2 \hat W_0 + \hat {X}_0$ \\
-8.  for $ix$ from $1$ to $2a.used$ do \\
-\hspace{3mm}8.1 $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ \\
-\hspace{3mm}8.2 $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix - 1} / \beta \rfloor$ \\
-\hspace{3mm}8.3 $b_{ix-1} \leftarrow W_{ix-1} \mbox{ (mod }\beta\mbox{)}$ \\
-9.  $b_{2a.used} \leftarrow \hat W_{2a.used} \mbox{ (mod }\beta\mbox{)}$ \\
-10.  if $2a.used + 1 < oldused$ then do \\
-\hspace{3mm}10.1  for $ix$ from $2a.used + 1$ to $oldused$ do \\
-\hspace{6mm}10.1.1  $b_{ix} \leftarrow 0$ \\
-11.  Clamp excess digits from $b$.  (\textit{mp\_clamp}) \\
-12.  Return(\textit{MP\_OKAY}). \\ 
+3.  $pa \leftarrow 2 \cdot a.used$ \\
+4.  $\hat W1 \leftarrow 0$ \\
+5.  for $ix$ from $0$ to $pa - 1$ do \\
+\hspace{3mm}5.1  $\_ \hat W \leftarrow 0$ \\
+\hspace{3mm}5.2  $ty \leftarrow \mbox{MIN}(a.used - 1, ix)$ \\
+\hspace{3mm}5.3  $tx \leftarrow ix - ty$ \\
+\hspace{3mm}5.4  $iy \leftarrow \mbox{MIN}(a.used - tx, ty + 1)$ \\
+\hspace{3mm}5.5  $iy \leftarrow \mbox{MIN}(iy, \lfloor \left (ty - tx + 1 \right )/2 \rfloor)$ \\
+\hspace{3mm}5.6  for $iz$ from $0$ to $iy - 1$ do \\
+\hspace{6mm}5.6.1  $\_ \hat W \leftarrow \_ \hat W + a_{tx + iz}a_{ty - iz}$ \\
+\hspace{3mm}5.7  $\_ \hat W \leftarrow 2 \cdot \_ \hat W  + \hat W1$ \\
+\hspace{3mm}5.8  if $ix$ is even then \\
+\hspace{6mm}5.8.1  $\_ \hat W \leftarrow \_ \hat W + \left ( a_{\lfloor ix/2 \rfloor}\right )^2$ \\
+\hspace{3mm}5.9  $W_{ix} \leftarrow \_ \hat W (\mbox{mod }\beta)$ \\
+\hspace{3mm}5.10  $\hat W1 \leftarrow \lfloor \_ \hat W / \beta \rfloor$ \\
+\\
+6.  $oldused \leftarrow b.used$ \\
+7.  $b.used \leftarrow 2 \cdot a.used$ \\
+8.  for $ix$ from $0$ to $pa - 1$ do \\
+\hspace{3mm}8.1  $b_{ix} \leftarrow W_{ix}$ \\
+9.  for $ix$ from $pa$ to $oldused - 1$ do \\
+\hspace{3mm}9.1  $b_{ix} \leftarrow 0$ \\
+10.  Clamp excess digits from $b$.  (\textit{mp\_clamp}) \\
+11.  Return(\textit{MP\_OKAY}). \\ 
 \hline
 \end{tabular}
 \end{center}
@@ -5032,146 +5076,123 @@
 \end{figure}
 
 \textbf{Algorithm fast\_s\_mp\_sqr.}
-This algorithm computes the square of an input using the Comba technique.  It is designed to be a replacement for algorithm s\_mp\_sqr when
-the number of input digits is less than \textbf{MP\_WARRAY} and less than $\delta \over 2$.  
-
-This routine requires two arrays of mp\_words to be placed on the stack.  The first array $\hat W$ will hold the double products and the second
-array $\hat X$ will hold the squares.  Though only at most $MP\_WARRAY \over 2$ words of $\hat X$ are used, it has proven faster on most 
-processors to simply make it a full size array.
-
-The loop on step 3 will zero the two arrays to prepare them for the squaring step.  Step 4.1 computes the squares of the product.  Note how 
-it simply assigns the value into the $\hat X$ array.  The nested loop on step 4.2 computes the doubles of the products.  This loop
-computes the sum of the products for each column.  They are not doubled until later.
-
-After the squaring loop, the products stored in $\hat W$ musted be doubled and the carries propagated forwards.  It makes sense to do both
-operations at the same time.  The expression $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ computes the sum of the double product and the
-squares in place.  
+This algorithm computes the square of an input using the Comba technique.  It is designed to be a replacement for algorithm 
+s\_mp\_sqr when the number of input digits is less than \textbf{MP\_WARRAY} and less than $\delta \over 2$.  
+This algorithm is very similar to the Comba multiplier except with a few key differences we shall make note of.
+
+First, we have an accumulator and carry variables $\_ \hat W$ and $\hat W1$ respectively.  This is because the inner loop
+products are to be doubled.  If we had added the previous carry in we would be doubling too much.  Next we perform an
+additional MIN condition on $iy$ (step 5.5) to prevent overlapping digits.  For example, $a_3 \cdot a_5$ is equal to
+$a_5 \cdot a_3$.  In the multiplication case both products would be computed, since $5 < a.used$ and $3 \ge 0$ are maintained; here, since we double the sum
+of the products just outside the inner loop, we have to avoid computing both.  This is also a good thing since we perform
+fewer multiplications and the routine ends up being faster.
+
+Finally the last difference is the addition of the ``square'' term outside the inner loop (step 5.8).  We add in the square
+only to even outputs and it is the square of the term at the $\lfloor ix / 2 \rfloor$ position.
 
 \vspace{+3mm}\begin{small}
 \hspace{-5.1mm}{\bf File}: bn\_fast\_s\_mp\_sqr.c
 \vspace{-3mm}
 \begin{alltt}
 016   
-017   /* fast squaring
-018    *
-019    * This is the comba method where the columns of the product
-020    * are computed first then the carries are computed.  This
-021    * has the effect of making a very simple inner loop that
-022    * is executed the most
-023    *
-024    * W2 represents the outer products and W the inner.
-025    *
-026    * A further optimizations is made because the inner
-027    * products are of the form "A * B * 2".  The *2 part does
-028    * not need to be computed until the end which is good
-029    * because 64-bit shifts are slow!
-030    *
-031    * Based on Algorithm 14.16 on pp.597 of HAC.
-032    *
-033    */
-034   /* the jist of squaring...
-035   
-036   you do like mult except the offset of the tmpx [one that starts closer to ze
-      ro]
-037   can't equal the offset of tmpy.  So basically you set up iy like before then
-       you min it with
-038   (ty-tx) so that it never happens.  You double all those you add in the inner
-       loop
-039   
-040   After that loop you do the squares and add them in.
-041   
-042   Remove W2 and don't memset W
-043   
-044   */
-045   
-046   int fast_s_mp_sqr (mp_int * a, mp_int * b)
-047   \{
-048     int       olduse, res, pa, ix, iz;
-049     mp_digit   W[MP_WARRAY], *tmpx;
-050     mp_word   W1;
-051   
-052     /* grow the destination as required */
-053     pa = a->used + a->used;
-054     if (b->alloc < pa) \{
-055       if ((res = mp_grow (b, pa)) != MP_OKAY) \{
-056         return res;
-057       \}
-058     \}
-059   
-060     /* number of output digits to produce */
-061     W1 = 0;
-062     for (ix = 0; ix <= pa; ix++) \{ 
-063         int      tx, ty, iy;
-064         mp_word  _W;
-065         mp_digit *tmpy;
-066   
-067         /* clear counter */
-068         _W = 0;
+017   /* the jist of squaring...
+018    * you do like mult except the offset of the tmpx [one that 
+019    * starts closer to zero] can't equal the offset of tmpy.  
+020    * So basically you set up iy like before then you min it with
+021    * (ty-tx) so that it never happens.  You double all those 
+022    * you add in the inner loop
+023   
+024   After that loop you do the squares and add them in.
+025   */
+026   
+027   int fast_s_mp_sqr (mp_int * a, mp_int * b)
+028   \{
+029     int       olduse, res, pa, ix, iz;
+030     mp_digit   W[MP_WARRAY], *tmpx;
+031     mp_word   W1;
+032   
+033     /* grow the destination as required */
+034     pa = a->used + a->used;
+035     if (b->alloc < pa) \{
+036       if ((res = mp_grow (b, pa)) != MP_OKAY) \{
+037         return res;
+038       \}
+039     \}
+040   
+041     /* number of output digits to produce */
+042     W1 = 0;
+043     for (ix = 0; ix < pa; ix++) \{ 
+044         int      tx, ty, iy;
+045         mp_word  _W;
+046         mp_digit *tmpy;
+047   
+048         /* clear counter */
+049         _W = 0;
+050   
+051         /* get offsets into the two bignums */
+052         ty = MIN(a->used-1, ix);
+053         tx = ix - ty;
+054   
+055         /* setup temp aliases */
+056         tmpx = a->dp + tx;
+057         tmpy = a->dp + ty;
+058   
+059         /* this is the number of times the loop will iterrate, essentially
+060            while (tx++ < a->used && ty-- >= 0) \{ ... \}
+061          */
+062         iy = MIN(a->used-tx, ty+1);
+063   
+064         /* now for squaring tx can never equal ty 
+065          * we halve the distance since they approach at a rate of 2x
+066          * and we have to round because odd cases need to be executed
+067          */
+068         iy = MIN(iy, (ty-tx+1)>>1);
 069   
-070         /* get offsets into the two bignums */
-071         ty = MIN(a->used-1, ix);
-072         tx = ix - ty;
-073   
-074         /* setup temp aliases */
-075         tmpx = a->dp + tx;
-076         tmpy = a->dp + ty;
+070         /* execute loop */
+071         for (iz = 0; iz < iy; iz++) \{
+072            _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
+073         \}
+074   
+075         /* double the inner product and add carry */
+076         _W = _W + _W + W1;
 077   
-078         /* this is the number of times the loop will iterrate, essentially its
-       
-079            while (tx++ < a->used && ty-- >= 0) \{ ... \}
-080          */
-081         iy = MIN(a->used-tx, ty+1);
+078         /* even columns have the square term in them */
+079         if ((ix&1) == 0) \{
+080            _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
+081         \}
 082   
-083         /* now for squaring tx can never equal ty 
-084          * we halve the distance since they approach at a rate of 2x
-085          * and we have to round because odd cases need to be executed
-086          */
-087         iy = MIN(iy, (ty-tx+1)>>1);
-088   
-089         /* execute loop */
-090         for (iz = 0; iz < iy; iz++) \{
-091            _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
-092         \}
+083         /* store it */
+084         W[ix] = (mp_digit)(_W & MP_MASK);
+085   
+086         /* make next carry */
+087         W1 = _W >> ((mp_word)DIGIT_BIT);
+088     \}
+089   
+090     /* setup dest */
+091     olduse  = b->used;
+092     b->used = a->used+a->used;
 093   
-094         /* double the inner product and add carry */
-095         _W = _W + _W + W1;
-096   
-097         /* even columns have the square term in them */
-098         if ((ix&1) == 0) \{
-099            _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
-100         \}
-101   
-102         /* store it */
-103         W[ix] = _W;
-104   
-105         /* make next carry */
-106         W1 = _W >> ((mp_word)DIGIT_BIT);
-107     \}
-108   
-109     /* setup dest */
-110     olduse  = b->used;
-111     b->used = a->used+a->used;
-112   
-113     \{
-114       mp_digit *tmpb;
-115       tmpb = b->dp;
-116       for (ix = 0; ix < pa; ix++) \{
-117         *tmpb++ = W[ix] & MP_MASK;
-118       \}
-119   
-120       /* clear unused digits [that existed in the old copy of c] */
-121       for (; ix < olduse; ix++) \{
-122         *tmpb++ = 0;
-123       \}
-124     \}
-125     mp_clamp (b);
-126     return MP_OKAY;
-127   \}
-128   #endif
+094     \{
+095       mp_digit *tmpb;
+096       tmpb = b->dp;
+097       for (ix = 0; ix < pa; ix++) \{
+098         *tmpb++ = W[ix] & MP_MASK;
+099       \}
+100   
+101       /* clear unused digits [that existed in the old copy of c] */
+102       for (; ix < olduse; ix++) \{
+103         *tmpb++ = 0;
+104       \}
+105     \}
+106     mp_clamp (b);
+107     return MP_OKAY;
+108   \}
+109   #endif
 \end{alltt}
 \end{small}
 
--- Write something deep and insightful later, Tom.
+This implementation is essentially a copy of Comba multiplication with the appropriate changes added to make it faster for 
+the special case of squaring.  
 
 \subsection{Polynomial Basis Squaring}
 The same algorithm that performs optimal polynomial basis multiplication can be used to perform polynomial basis squaring.  The minor exception
@@ -5389,14 +5410,13 @@
 is exactly at the point where Comba squaring can no longer be used (\textit{128 digits}).  On slower processors such as the Intel P4
 it is actually below the Comba limit (\textit{at 110 digits}).
 
-This routine uses the same error trap coding style as mp\_karatsuba\_sqr.  As the temporary variables are initialized errors are redirected to
-the error trap higher up.  If the algorithm completes without error the error code is set to \textbf{MP\_OKAY} and mp\_clears are executed normally.
-
-\textit{Last paragraph sucks.  re-write! -- Tom}
+This routine uses the same error trap coding style as mp\_karatsuba\_sqr.  As the temporary variables are initialized errors are 
+redirected to the error trap higher up.  If the algorithm completes without error the error code is set to \textbf{MP\_OKAY} and 
+mp\_clears are executed normally.
 
 \subsection{Toom-Cook Squaring}
 The Toom-Cook squaring algorithm mp\_toom\_sqr is heavily based on the algorithm mp\_toom\_mul with the exception that squarings are used
-instead of multiplication to find the five relations..  The reader is encouraged to read the description of the latter algorithm and try to 
+instead of multiplication to find the five relations.  The reader is encouraged to read the description of the latter algorithm and try to 
 derive their own Toom-Cook squaring algorithm.  
 
 \subsection{High Level Squaring}
@@ -5482,12 +5502,9 @@
 $\left [ 3 \right ] $ & Devise an efficient algorithm for selection of the radix point to handle inputs \\
                       & that have different number of digits in Karatsuba multiplication. \\
                       & \\
-$\left [ 3 \right ] $ & In section 5.3 the fact that every column of a squaring is made up \\
+$\left [ 2 \right ] $ & In section 5.3 the fact that every column of a squaring is made up \\
                       & of double products and at most one square is stated.  Prove this statement. \\
                       & \\                      
-$\left [ 2 \right ] $ & In the Comba squaring algorithm half of the $\hat X$ variables are not used. \\
-                      & Revise algorithm fast\_s\_mp\_sqr to shrink the $\hat X$ array. \\
-                      & \\
 $\left [ 3 \right ] $ & Prove the equation for Karatsuba squaring. \\
                       & \\
 $\left [ 1 \right ] $ & Prove that Karatsuba squaring requires $O \left (n^{lg(3)} \right )$ time. \\
@@ -5495,6 +5512,14 @@
 $\left [ 2 \right ] $ & Determine the minimal ratio between addition and multiplication clock cycles \\
                       & required for equation $6.7$ to be true.  \\
                       & \\
+$\left [ 3 \right ] $ & Implement a threaded version of Comba multiplication (and squaring) where you \\
+                      & compute subsets of the columns in each thread.  Determine a cutoff point where \\
+                      & it is effective and add the logic to mp\_mul() and mp\_sqr(). \\
+                      &\\
+$\left [ 4 \right ] $ & Same as the previous but also modify the Karatsuba and Toom-Cook.  You must \\
+                      & increase the throughput of mp\_exptmod() for random odd moduli in the range \\
+                      & $512 \ldots 4096$ bits significantly ($> 2x$) to complete this challenge. \\
+                      & \\
 \end{tabular}
 
 \chapter{Modular Reduction}
@@ -5513,7 +5538,7 @@
 Modular reductions are normally used to create either finite groups, rings or fields.  The most common usage for performance driven modular reductions 
 is in modular exponentiation algorithms.  That is to compute $d = a^b \mbox{ (mod }c\mbox{)}$ as fast as possible.  This operation is used in the 
 RSA and Diffie-Hellman public key algorithms, for example.  Modular multiplication and squaring also appear as a fundamental operation in 
-Elliptic Curve cryptographic algorithms.  As will be discussed in the subsequent chapter there exist fast algorithms for computing modular 
+elliptic curve cryptographic algorithms.  As will be discussed in the subsequent chapter there exist fast algorithms for computing modular 
 exponentiations without having to perform (\textit{in this example}) $b - 1$ multiplications.  These algorithms will produce partial results in the 
 range $0 \le x < c^2$ which can be taken advantage of to create several efficient algorithms.   They have also been used to create redundancy check 
 algorithms known as CRCs, error correction codes such as Reed-Solomon and solve a variety of number theoretic problems.  
@@ -5727,95 +5752,94 @@
 018    * precomputed via mp_reduce_setup.
 019    * From HAC pp.604 Algorithm 14.42
 020    */
-021   int
-022   mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
-023   \{
-024     mp_int  q;
-025     int     res, um = m->used;
-026   
-027     /* q = x */
-028     if ((res = mp_init_copy (&q, x)) != MP_OKAY) \{
-029       return res;
-030     \}
-031   
-032     /* q1 = x / b**(k-1)  */
-033     mp_rshd (&q, um - 1);         
-034   
-035     /* according to HAC this optimization is ok */
-036     if (((unsigned long) um) > (((mp_digit)1) << (DIGIT_BIT - 1))) \{
-037       if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) \{
-038         goto CLEANUP;
-039       \}
-040     \} else \{
-041   #ifdef BN_S_MP_MUL_HIGH_DIGS_C
-042       if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) \{
-043         goto CLEANUP;
-044       \}
-045   #elif defined(BN_FAST_S_MP_MUL_HIGH_DIGS_C)
-046       if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) \{
-047         goto CLEANUP;
-048       \}
-049   #else 
-050       \{ 
-051         res = MP_VAL;
-052         goto CLEANUP;
-053       \}
-054   #endif
-055     \}
-056   
-057     /* q3 = q2 / b**(k+1) */
-058     mp_rshd (&q, um + 1);         
-059   
-060     /* x = x mod b**(k+1), quick (no division) */
-061     if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) \{
-062       goto CLEANUP;
-063     \}
-064   
-065     /* q = q * m mod b**(k+1), quick (no division) */
-066     if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) \{
-067       goto CLEANUP;
-068     \}
-069   
-070     /* x = x - q */
-071     if ((res = mp_sub (x, &q, x)) != MP_OKAY) \{
-072       goto CLEANUP;
-073     \}
-074   
-075     /* If x < 0, add b**(k+1) to it */
-076     if (mp_cmp_d (x, 0) == MP_LT) \{
-077       mp_set (&q, 1);
-078       if ((res = mp_lshd (&q, um + 1)) != MP_OKAY)
-079         goto CLEANUP;
-080       if ((res = mp_add (x, &q, x)) != MP_OKAY)
-081         goto CLEANUP;
-082     \}
-083   
-084     /* Back off if it's too big */
-085     while (mp_cmp (x, m) != MP_LT) \{
-086       if ((res = s_mp_sub (x, m, x)) != MP_OKAY) \{
-087         goto CLEANUP;
-088       \}
-089     \}
-090     
-091   CLEANUP:
-092     mp_clear (&q);
-093   
-094     return res;
-095   \}
-096   #endif
+021   int mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
+022   \{
+023     mp_int  q;
+024     int     res, um = m->used;
+025   
+026     /* q = x */
+027     if ((res = mp_init_copy (&q, x)) != MP_OKAY) \{
+028       return res;
+029     \}
+030   
+031     /* q1 = x / b**(k-1)  */
+032     mp_rshd (&q, um - 1);         
+033   
+034     /* according to HAC this optimization is ok */
+035     if (((unsigned long) um) > (((mp_digit)1) << (DIGIT_BIT - 1))) \{
+036       if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) \{
+037         goto CLEANUP;
+038       \}
+039     \} else \{
+040   #ifdef BN_S_MP_MUL_HIGH_DIGS_C
+041       if ((res = s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) \{
+042         goto CLEANUP;
+043       \}
+044   #elif defined(BN_FAST_S_MP_MUL_HIGH_DIGS_C)
+045       if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) \{
+046         goto CLEANUP;
+047       \}
+048   #else 
+049       \{ 
+050         res = MP_VAL;
+051         goto CLEANUP;
+052       \}
+053   #endif
+054     \}
+055   
+056     /* q3 = q2 / b**(k+1) */
+057     mp_rshd (&q, um + 1);         
+058   
+059     /* x = x mod b**(k+1), quick (no division) */
+060     if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) \{
+061       goto CLEANUP;
+062     \}
+063   
+064     /* q = q * m mod b**(k+1), quick (no division) */
+065     if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) \{
+066       goto CLEANUP;
+067     \}
+068   
+069     /* x = x - q */
+070     if ((res = mp_sub (x, &q, x)) != MP_OKAY) \{
+071       goto CLEANUP;
+072     \}
+073   
+074     /* If x < 0, add b**(k+1) to it */
+075     if (mp_cmp_d (x, 0) == MP_LT) \{
+076       mp_set (&q, 1);
+077       if ((res = mp_lshd (&q, um + 1)) != MP_OKAY)
+078         goto CLEANUP;
+079       if ((res = mp_add (x, &q, x)) != MP_OKAY)
+080         goto CLEANUP;
+081     \}
+082   
+083     /* Back off if it's too big */
+084     while (mp_cmp (x, m) != MP_LT) \{
+085       if ((res = s_mp_sub (x, m, x)) != MP_OKAY) \{
+086         goto CLEANUP;
+087       \}
+088     \}
+089     
+090   CLEANUP:
+091     mp_clear (&q);
+092   
+093     return res;
+094   \}
+095   #endif
 \end{alltt}
 \end{small}
 
 The first multiplication that determines the quotient can be performed by only producing the digits from $m - 1$ and up.  This essentially halves
 the number of single precision multiplications required.  However, the optimization is only safe if $\beta$ is much larger than the number of digits
-in the modulus.  In the source code this is evaluated on lines 36 to 44 where algorithm s\_mp\_mul\_high\_digs is used when it is
+in the modulus.  In the source code this is evaluated on lines 36 to 43 where algorithm s\_mp\_mul\_high\_digs is used when it is
 safe to do so.  
 
 \subsection{The Barrett Setup Algorithm}
 In order to use algorithm mp\_reduce the value of $\mu$ must be calculated in advance.  Ideally this value should be computed once and stored for
 future use so that the Barrett algorithm can be used without delay.  
 
-\begin{figure}[!here]
+\newpage\begin{figure}[!here]
 \begin{small}
 \begin{center}
 \begin{tabular}{l}
@@ -6311,161 +6335,160 @@
 022    *
 023    * Based on Algorithm 14.32 on pp.601 of HAC.
 024   */
-025   int
-026   fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
-027   \{
-028     int     ix, res, olduse;
-029     mp_word W[MP_WARRAY];
-030   
-031     /* get old used count */
-032     olduse = x->used;
-033   
-034     /* grow a as required */
-035     if (x->alloc < n->used + 1) \{
-036       if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) \{
-037         return res;
-038       \}
-039     \}
-040   
-041     /* first we have to get the digits of the input into
-042      * an array of double precision words W[...]
-043      */
-044     \{
-045       register mp_word *_W;
-046       register mp_digit *tmpx;
-047   
-048       /* alias for the W[] array */
-049       _W   = W;
-050   
-051       /* alias for the digits of  x*/
-052       tmpx = x->dp;
-053   
-054       /* copy the digits of a into W[0..a->used-1] */
-055       for (ix = 0; ix < x->used; ix++) \{
-056         *_W++ = *tmpx++;
-057       \}
-058   
-059       /* zero the high words of W[a->used..m->used*2] */
-060       for (; ix < n->used * 2 + 1; ix++) \{
-061         *_W++ = 0;
-062       \}
-063     \}
-064   
-065     /* now we proceed to zero successive digits
-066      * from the least significant upwards
-067      */
-068     for (ix = 0; ix < n->used; ix++) \{
-069       /* mu = ai * m' mod b
-070        *
-071        * We avoid a double precision multiplication (which isn't required)
-072        * by casting the value down to a mp_digit.  Note this requires
-073        * that W[ix-1] have  the carry cleared (see after the inner loop)
-074        */
-075       register mp_digit mu;
-076       mu = (mp_digit) (((W[ix] & MP_MASK) * rho) & MP_MASK);
-077   
-078       /* a = a + mu * m * b**i
-079        *
-080        * This is computed in place and on the fly.  The multiplication
-081        * by b**i is handled by offseting which columns the results
-082        * are added to.
-083        *
-084        * Note the comba method normally doesn't handle carries in the
-085        * inner loop In this case we fix the carry from the previous
-086        * column since the Montgomery reduction requires digits of the
-087        * result (so far) [see above] to work.  This is
-088        * handled by fixing up one carry after the inner loop.  The
-089        * carry fixups are done in order so after these loops the
-090        * first m->used words of W[] have the carries fixed
-091        */
-092       \{
-093         register int iy;
-094         register mp_digit *tmpn;
-095         register mp_word *_W;
-096   
-097         /* alias for the digits of the modulus */
-098         tmpn = n->dp;
-099   
-100         /* Alias for the columns set by an offset of ix */
-101         _W = W + ix;
-102   
-103         /* inner loop */
-104         for (iy = 0; iy < n->used; iy++) \{
-105             *_W++ += ((mp_word)mu) * ((mp_word)*tmpn++);
-106         \}
-107       \}
-108   
-109       /* now fix carry for next digit, W[ix+1] */
-110       W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
-111     \}
-112   
-113     /* now we have to propagate the carries and
-114      * shift the words downward [all those least
-115      * significant digits we zeroed].
-116      */
-117     \{
-118       register mp_digit *tmpx;
-119       register mp_word *_W, *_W1;
-120   
-121       /* nox fix rest of carries */
-122   
-123       /* alias for current word */
-124       _W1 = W + ix;
-125   
-126       /* alias for next word, where the carry goes */
-127       _W = W + ++ix;
-128   
-129       for (; ix <= n->used * 2 + 1; ix++) \{
-130         *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
-131       \}
-132   
-133       /* copy out, A = A/b**n
-134        *
-135        * The result is A/b**n but instead of converting from an
-136        * array of mp_word to mp_digit than calling mp_rshd
-137        * we just copy them in the right order
-138        */
-139   
-140       /* alias for destination word */
-141       tmpx = x->dp;
-142   
-143       /* alias for shifted double precision result */
-144       _W = W + n->used;
-145   
-146       for (ix = 0; ix < n->used + 1; ix++) \{
-147         *tmpx++ = (mp_digit)(*_W++ & ((mp_word) MP_MASK));
-148       \}
-149   
-150       /* zero oldused digits, if the input a was larger than
-151        * m->used+1 we'll have to clear the digits
-152        */
-153       for (; ix < olduse; ix++) \{
-154         *tmpx++ = 0;
-155       \}
-156     \}
-157   
-158     /* set the max used and clamp */
-159     x->used = n->used + 1;
-160     mp_clamp (x);
-161   
-162     /* if A >= m then A = A - m */
-163     if (mp_cmp_mag (x, n) != MP_LT) \{
-164       return s_mp_sub (x, n, x);
-165     \}
-166     return MP_OKAY;
-167   \}
-168   #endif
+025   int fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
+026   \{
+027     int     ix, res, olduse;
+028     mp_word W[MP_WARRAY];
+029   
+030     /* get old used count */
+031     olduse = x->used;
+032   
+033     /* grow a as required */
+034     if (x->alloc < n->used + 1) \{
+035       if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) \{
+036         return res;
+037       \}
+038     \}
+039   
+040     /* first we have to get the digits of the input into
+041      * an array of double precision words W[...]
+042      */
+043     \{
+044       register mp_word *_W;
+045       register mp_digit *tmpx;
+046   
+047       /* alias for the W[] array */
+048       _W   = W;
+049   
+050       /* alias for the digits of  x*/
+051       tmpx = x->dp;
+052   
+053       /* copy the digits of a into W[0..a->used-1] */
+054       for (ix = 0; ix < x->used; ix++) \{
+055         *_W++ = *tmpx++;
+056       \}
+057   
+058       /* zero the high words of W[a->used..m->used*2] */
+059       for (; ix < n->used * 2 + 1; ix++) \{
+060         *_W++ = 0;
+061       \}
+062     \}
+063   
+064     /* now we proceed to zero successive digits
+065      * from the least significant upwards
+066      */
+067     for (ix = 0; ix < n->used; ix++) \{
+068       /* mu = ai * m' mod b
+069        *
+070        * We avoid a double precision multiplication (which isn't required)
+071        * by casting the value down to a mp_digit.  Note this requires
+072        * that W[ix-1] have  the carry cleared (see after the inner loop)
+073        */
+074       register mp_digit mu;
+075       mu = (mp_digit) (((W[ix] & MP_MASK) * rho) & MP_MASK);
+076   
+077       /* a = a + mu * m * b**i
+078        *
+079        * This is computed in place and on the fly.  The multiplication
+080        * by b**i is handled by offseting which columns the results
+081        * are added to.
+082        *
+083        * Note the comba method normally doesn't handle carries in the
+084        * inner loop In this case we fix the carry from the previous
+085        * column since the Montgomery reduction requires digits of the
+086        * result (so far) [see above] to work.  This is
+087        * handled by fixing up one carry after the inner loop.  The
+088        * carry fixups are done in order so after these loops the
+089        * first m->used words of W[] have the carries fixed
+090        */
+091       \{
+092         register int iy;
+093         register mp_digit *tmpn;
+094         register mp_word *_W;
+095   
+096         /* alias for the digits of the modulus */
+097         tmpn = n->dp;
+098   
+099         /* Alias for the columns set by an offset of ix */
+100         _W = W + ix;
+101   
+102         /* inner loop */
+103         for (iy = 0; iy < n->used; iy++) \{
+104             *_W++ += ((mp_word)mu) * ((mp_word)*tmpn++);
+105         \}
+106       \}
+107   
+108       /* now fix carry for next digit, W[ix+1] */
+109       W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
+110     \}
+111   
+112     /* now we have to propagate the carries and
+113      * shift the words downward [all those least
+114      * significant digits we zeroed].
+115      */
+116     \{
+117       register mp_digit *tmpx;
+118       register mp_word *_W, *_W1;
+119   
+120       /* nox fix rest of carries */
+121   
+122       /* alias for current word */
+123       _W1 = W + ix;
+124   
+125       /* alias for next word, where the carry goes */
+126       _W = W + ++ix;
+127   
+128       for (; ix <= n->used * 2 + 1; ix++) \{
+129         *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
+130       \}
+131   
+132       /* copy out, A = A/b**n
+133        *
+134        * The result is A/b**n but instead of converting from an
+135        * array of mp_word to mp_digit than calling mp_rshd
+136        * we just copy them in the right order
+137        */
+138   
+139       /* alias for destination word */
+140       tmpx = x->dp;
+141   
+142       /* alias for shifted double precision result */
+143       _W = W + n->used;
+144   
+145       for (ix = 0; ix < n->used + 1; ix++) \{
+146         *tmpx++ = (mp_digit)(*_W++ & ((mp_word) MP_MASK));
+147       \}
+148   
+149       /* zero oldused digits, if the input a was larger than
+150        * m->used+1 we'll have to clear the digits
+151        */
+152       for (; ix < olduse; ix++) \{
+153         *tmpx++ = 0;
+154       \}
+155     \}
+156   
+157     /* set the max used and clamp */
+158     x->used = n->used + 1;
+159     mp_clamp (x);
+160   
+161     /* if A >= m then A = A - m */
+162     if (mp_cmp_mag (x, n) != MP_LT) \{
+163       return s_mp_sub (x, n, x);
+164     \}
+165     return MP_OKAY;
+166   \}
+167   #endif
 \end{alltt}
 \end{small}
 
-The $\hat W$ array is first filled with digits of $x$ on line 48 then the rest of the digits are zeroed on line 55.  Both loops share
+The $\hat W$ array is first filled with digits of $x$ on line 50 then the rest of the digits are zeroed on line 54.  Both loops share
 the same alias variables to make the code easier to read.  
 
 The value of $\mu$ is calculated in an interesting fashion.  First the value $\hat W_{ix}$ is reduced modulo $\beta$ and cast to a mp\_digit.  This
-forces the compiler to use a single precision multiplication and prevents any concerns about loss of precision.   Line 110 fixes the carry 
+forces the compiler to use a single precision multiplication and prevents any concerns about loss of precision.   Line 109 fixes the carry 
 for the next iteration of the loop by propagating the carry from $\hat W_{ix}$ to $\hat W_{ix+1}$.
 
-The for loop on line 109 propagates the rest of the carries upwards through the columns.  The for loop on line 126 reduces the columns
+The for loop on line 108 propagates the rest of the carries upwards through the columns.  The for loop on line 125 reduces the columns
 modulo $\beta$ and shifts them $k$ places at the same time.  The alias $\_ \hat W$ actually refers to the array $\hat W$ starting at the $n.used$'th
 digit, that is $\_ \hat W_{t} = \hat W_{n.used + t}$.  
 
@@ -6739,7 +6762,7 @@
 019    * Based on algorithm from the paper
 020    *
 021    * "Generating Efficient Primes for Discrete Log Cryptosystems"
-022    *                 Chae Hoon Lim, Pil Loong Lee,
+022    *                 Chae Hoon Lim, Pil Joong Lee,
 023    *          POSTECH Information Research Laboratories
 024    *
 025    * The modulus must be of a special format [see manual]
@@ -6965,51 +6988,50 @@
 \begin{alltt}
 016   
 017   /* reduces a modulo n where n is of the form 2**p - d */
-018   int
-019   mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d)
-020   \{
-021      mp_int q;
-022      int    p, res;
-023      
-024      if ((res = mp_init(&q)) != MP_OKAY) \{
-025         return res;
-026      \}
-027      
-028      p = mp_count_bits(n);    
-029   top:
-030      /* q = a/2**p, a = a mod 2**p */
-031      if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) \{
-032         goto ERR;
-033      \}
-034      
-035      if (d != 1) \{
-036         /* q = q * d */
-037         if ((res = mp_mul_d(&q, d, &q)) != MP_OKAY) \{ 
-038            goto ERR;
-039         \}
-040      \}
-041      
-042      /* a = a + q */
-043      if ((res = s_mp_add(a, &q, a)) != MP_OKAY) \{
-044         goto ERR;
-045      \}
-046      
-047      if (mp_cmp_mag(a, n) != MP_LT) \{
-048         s_mp_sub(a, n, a);
-049         goto top;
-050      \}
-051      
-052   ERR:
-053      mp_clear(&q);
-054      return res;
-055   \}
-056   
-057   #endif
+018   int mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d)
+019   \{
+020      mp_int q;
+021      int    p, res;
+022      
+023      if ((res = mp_init(&q)) != MP_OKAY) \{
+024         return res;
+025      \}
+026      
+027      p = mp_count_bits(n);    
+028   top:
+029      /* q = a/2**p, a = a mod 2**p */
+030      if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) \{
+031         goto ERR;
+032      \}
+033      
+034      if (d != 1) \{
+035         /* q = q * d */
+036         if ((res = mp_mul_d(&q, d, &q)) != MP_OKAY) \{ 
+037            goto ERR;
+038         \}
+039      \}
+040      
+041      /* a = a + q */
+042      if ((res = s_mp_add(a, &q, a)) != MP_OKAY) \{
+043         goto ERR;
+044      \}
+045      
+046      if (mp_cmp_mag(a, n) != MP_LT) \{
+047         s_mp_sub(a, n, a);
+048         goto top;
+049      \}
+050      
+051   ERR:
+052      mp_clear(&q);
+053      return res;
+054   \}
+055   
+056   #endif
 \end{alltt}
 \end{small}
 
 The algorithm mp\_count\_bits calculates the number of bits in an mp\_int which is used to find the initial value of $p$.  The call to mp\_div\_2d
-on line 31 calculates both the quotient $q$ and the remainder $a$ required.  By doing both in a single function call the code size
+on line 30 calculates both the quotient $q$ and the remainder $a$ required.  By doing both in a single function call the code size
 is kept fairly small.  The multiplication by $k$ is only performed if $k > 1$. This allows reductions modulo $2^p - 1$ to be performed without
 any multiplications.  
 
@@ -7049,32 +7071,31 @@
 \begin{alltt}
 016   
 017   /* determines the setup value */
-018   int 
-019   mp_reduce_2k_setup(mp_int *a, mp_digit *d)
-020   \{
-021      int res, p;
-022      mp_int tmp;
-023      
-024      if ((res = mp_init(&tmp)) != MP_OKAY) \{
-025         return res;
-026      \}
-027      
-028      p = mp_count_bits(a);
-029      if ((res = mp_2expt(&tmp, p)) != MP_OKAY) \{
-030         mp_clear(&tmp);
-031         return res;
-032      \}
-033      
-034      if ((res = s_mp_sub(&tmp, a, &tmp)) != MP_OKAY) \{
-035         mp_clear(&tmp);
-036         return res;
-037      \}
-038      
-039      *d = tmp.dp[0];
-040      mp_clear(&tmp);
-041      return MP_OKAY;
-042   \}
-043   #endif
+018   int mp_reduce_2k_setup(mp_int *a, mp_digit *d)
+019   \{
+020      int res, p;
+021      mp_int tmp;
+022      
+023      if ((res = mp_init(&tmp)) != MP_OKAY) \{
+024         return res;
+025      \}
+026      
+027      p = mp_count_bits(a);
+028      if ((res = mp_2expt(&tmp, p)) != MP_OKAY) \{
+029         mp_clear(&tmp);
+030         return res;
+031      \}
+032      
+033      if ((res = s_mp_sub(&tmp, a, &tmp)) != MP_OKAY) \{
+034         mp_clear(&tmp);
+035         return res;
+036      \}
+037      
+038      *d = tmp.dp[0];
+039      mp_clear(&tmp);
+040      return MP_OKAY;
+041   \}
+042   #endif
 \end{alltt}
 \end{small}
 
@@ -7127,9 +7148,9 @@
 021      mp_digit iz;
 022      
 023      if (a->used == 0) \{
-024         return 0;
+024         return MP_NO;
 025      \} else if (a->used == 1) \{
-026         return 1;
+026         return MP_YES;
 027      \} else if (a->used > 1) \{
 028         iy = mp_count_bits(a);
 029         iz = 1;
@@ -7138,7 +7159,7 @@
 032         /* Test every bit from the second digit up, must be 1 */
 033         for (ix = DIGIT_BIT; ix < iy; ix++) \{
 034             if ((a->dp[iw] & iz) == 0) \{
-035                return 0;
+035                return MP_NO;
 036             \}
 037             iz <<= 1;
 038             if (iz > (mp_digit)MP_MASK) \{
@@ -7147,7 +7168,7 @@
 041             \}
 042         \}
 043      \}
-044      return 1;
+044      return MP_YES;
 045   \}
 046   
 047   #endif
@@ -7594,43 +7615,51 @@
 060        return err;
 061   #else 
 062        /* no invmod */
-063        return MP_VAL
+063        return MP_VAL;
 064   #endif
 065     \}
 066   
-067   #ifdef BN_MP_DR_IS_MODULUS_C
-068     /* is it a DR modulus? */
-069     dr = mp_dr_is_modulus(P);
-070   #else
-071     dr = 0;
+067   /* modified diminished radix reduction */
+068   #if defined(BN_MP_REDUCE_IS_2K_L_C) && defined(BN_MP_REDUCE_2K_L_C)
+069     if (mp_reduce_is_2k_l(P) == MP_YES) \{
+070        return s_mp_exptmod(G, X, P, Y, 1);
+071     \}
 072   #endif
 073   
-074   #ifdef BN_MP_REDUCE_IS_2K_C
-075     /* if not, is it a uDR modulus? */
-076     if (dr == 0) \{
-077        dr = mp_reduce_is_2k(P) << 1;
-078     \}
-079   #endif
-080       
-081     /* if the modulus is odd or dr != 0 use the fast method */
-082   #ifdef BN_MP_EXPTMOD_FAST_C
-083     if (mp_isodd (P) == 1 || dr !=  0) \{
-084       return mp_exptmod_fast (G, X, P, Y, dr);
-085     \} else \{
-086   #endif
-087   #ifdef BN_S_MP_EXPTMOD_C
-088       /* otherwise use the generic Barrett reduction technique */
-089       return s_mp_exptmod (G, X, P, Y);
-090   #else
-091       /* no exptmod for evens */
-092       return MP_VAL;
-093   #endif
-094   #ifdef BN_MP_EXPTMOD_FAST_C
-095     \}
-096   #endif
-097   \}
-098   
-099   #endif
+074   #ifdef BN_MP_DR_IS_MODULUS_C
+075     /* is it a DR modulus? */
+076     dr = mp_dr_is_modulus(P);
+077   #else
+078     /* default to no */
+079     dr = 0;
+080   #endif
+081   
+082   #ifdef BN_MP_REDUCE_IS_2K_C
+083     /* if not, is it a unrestricted DR modulus? */
+084     if (dr == 0) \{
+085        dr = mp_reduce_is_2k(P) << 1;
+086     \}
+087   #endif
+088       
+089     /* if the modulus is odd or dr != 0 use the montgomery method */
+090   #ifdef BN_MP_EXPTMOD_FAST_C
+091     if (mp_isodd (P) == 1 || dr !=  0) \{
+092       return mp_exptmod_fast (G, X, P, Y, dr);
+093     \} else \{
+094   #endif
+095   #ifdef BN_S_MP_EXPTMOD_C
+096       /* otherwise use the generic Barrett reduction technique */
+097       return s_mp_exptmod (G, X, P, Y, 0);
+098   #else
+099       /* no exptmod for evens */
+100       return MP_VAL;
+101   #endif
+102   #ifdef BN_MP_EXPTMOD_FAST_C
+103     \}
+104   #endif
+105   \}
+106   
+107   #endif
 \end{alltt}
 \end{small}
 
@@ -7639,8 +7668,8 @@
 the modular inverse of $G$ and $tmpX$ is assigned the absolute value of $X$.  The algorithm will recurse with these new values with a positive
 exponent.
 
-If the exponent is positive the algorithm resumes the exponentiation.  Line 69 determines if the modulus is of the restricted Diminished Radix 
-form.  If it is not line 77 attempts to determine if it is of a unrestricted Diminished Radix form.  The integer $dr$ will take on one
+If the exponent is positive the algorithm resumes the exponentiation.  Line 76 determines if the modulus is of the restricted Diminished Radix 
+form.  If it is not line 85 attempts to determine if it is of an unrestricted Diminished Radix form.  The integer $dr$ will take on one
 of three values.
 
 \begin{enumerate}
@@ -7649,7 +7678,7 @@
 \item $dr = 2$ means that the modulus is of unrestricted Diminished Radix form.
 \end{enumerate}
 
-Line 67 determines if the fast modular exponentiation algorithm can be used.  It is allowed if $dr \ne 0$ or if the modulus is odd.  Otherwise,
+Line 91 determines if the fast modular exponentiation algorithm can be used.  It is allowed if $dr \ne 0$ or if the modulus is odd.  Otherwise,
 the slower s\_mp\_exptmod algorithm is used which uses Barrett reduction.  
 
 \subsection{Barrett Modular Exponentiation}
@@ -7817,230 +7846,244 @@
 020      #define TAB_SIZE 256
 021   #endif
 022   
-023   int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+023   int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmod
+      e)
 024   \{
 025     mp_int  M[TAB_SIZE], res, mu;
 026     mp_digit buf;
 027     int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
-028   
-029     /* find window size */
-030     x = mp_count_bits (X);
-031     if (x <= 7) \{
-032       winsize = 2;
-033     \} else if (x <= 36) \{
-034       winsize = 3;
-035     \} else if (x <= 140) \{
-036       winsize = 4;
-037     \} else if (x <= 450) \{
-038       winsize = 5;
-039     \} else if (x <= 1303) \{
-040       winsize = 6;
-041     \} else if (x <= 3529) \{
-042       winsize = 7;
-043     \} else \{
-044       winsize = 8;
-045     \}
-046   
-047   #ifdef MP_LOW_MEM
-048       if (winsize > 5) \{
-049          winsize = 5;
-050       \}
-051   #endif
-052   
-053     /* init M array */
-054     /* init first cell */
-055     if ((err = mp_init(&M[1])) != MP_OKAY) \{
-056        return err; 
-057     \}
-058   
-059     /* now init the second half of the array */
-060     for (x = 1<<(winsize-1); x < (1 << winsize); x++) \{
-061       if ((err = mp_init(&M[x])) != MP_OKAY) \{
-062         for (y = 1<<(winsize-1); y < x; y++) \{
-063           mp_clear (&M[y]);
-064         \}
-065         mp_clear(&M[1]);
-066         return err;
-067       \}
-068     \}
-069   
-070     /* create mu, used for Barrett reduction */
-071     if ((err = mp_init (&mu)) != MP_OKAY) \{
-072       goto __M;
-073     \}
-074     if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) \{
-075       goto __MU;
-076     \}
-077   
-078     /* create M table
-079      *
-080      * The M table contains powers of the base, 
-081      * e.g. M[x] = G**x mod P
-082      *
-083      * The first half of the table is not 
-084      * computed though accept for M[0] and M[1]
-085      */
-086     if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) \{
-087       goto __MU;
-088     \}
-089   
-090     /* compute the value at M[1<<(winsize-1)] by squaring 
-091      * M[1] (winsize-1) times 
-092      */
-093     if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) \{
-094       goto __MU;
-095     \}
-096   
-097     for (x = 0; x < (winsize - 1); x++) \{
-098       if ((err = mp_sqr (&M[1 << (winsize - 1)], 
-099                          &M[1 << (winsize - 1)])) != MP_OKAY) \{
-100         goto __MU;
-101       \}
-102       if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) \{
-103         goto __MU;
-104       \}
+028     int (*redux)(mp_int*,mp_int*,mp_int*);
+029   
+030     /* find window size */
+031     x = mp_count_bits (X);
+032     if (x <= 7) \{
+033       winsize = 2;
+034     \} else if (x <= 36) \{
+035       winsize = 3;
+036     \} else if (x <= 140) \{
+037       winsize = 4;
+038     \} else if (x <= 450) \{
+039       winsize = 5;
+040     \} else if (x <= 1303) \{
+041       winsize = 6;
+042     \} else if (x <= 3529) \{
+043       winsize = 7;
+044     \} else \{
+045       winsize = 8;
+046     \}
+047   
+048   #ifdef MP_LOW_MEM
+049       if (winsize > 5) \{
+050          winsize = 5;
+051       \}
+052   #endif
+053   
+054     /* init M array */
+055     /* init first cell */
+056     if ((err = mp_init(&M[1])) != MP_OKAY) \{
+057        return err; 
+058     \}
+059   
+060     /* now init the second half of the array */
+061     for (x = 1<<(winsize-1); x < (1 << winsize); x++) \{
+062       if ((err = mp_init(&M[x])) != MP_OKAY) \{
+063         for (y = 1<<(winsize-1); y < x; y++) \{
+064           mp_clear (&M[y]);
+065         \}
+066         mp_clear(&M[1]);
+067         return err;
+068       \}
+069     \}
+070   
+071     /* create mu, used for Barrett reduction */
+072     if ((err = mp_init (&mu)) != MP_OKAY) \{
+073       goto LBL_M;
+074     \}
+075     
+076     if (redmode == 0) \{
+077        if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) \{
+078           goto LBL_MU;
+079        \}
+080        redux = mp_reduce;
+081     \} else \{
+082        if ((err = mp_reduce_2k_setup_l (P, &mu)) != MP_OKAY) \{
+083           goto LBL_MU;
+084        \}
+085        redux = mp_reduce_2k_l;
+086     \}    
+087   
+088     /* create M table
+089      *
+090      * The M table contains powers of the base, 
+091      * e.g. M[x] = G**x mod P
+092      *
+093      * The first half of the table is not 
+094      * computed though except for M[0] and M[1]
+095      */
+096     if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) \{
+097       goto LBL_MU;
+098     \}
+099   
+100     /* compute the value at M[1<<(winsize-1)] by squaring 
+101      * M[1] (winsize-1) times 
+102      */
+103     if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) \{
+104       goto LBL_MU;
 105     \}
 106   
-107     /* create upper table, that is M[x] = M[x-1] * M[1] (mod P)
-108      * for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
-109      */
-110     for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) \{
-111       if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) \{
-112         goto __MU;
-113       \}
-114       if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) \{
-115         goto __MU;
-116       \}
-117     \}
-118   
-119     /* setup result */
-120     if ((err = mp_init (&res)) != MP_OKAY) \{
-121       goto __MU;
-122     \}
-123     mp_set (&res, 1);
-124   
-125     /* set initial mode and bit cnt */
-126     mode   = 0;
-127     bitcnt = 1;
-128     buf    = 0;
-129     digidx = X->used - 1;
-130     bitcpy = 0;
-131     bitbuf = 0;
-132   
-133     for (;;) \{
-134       /* grab next digit as required */
-135       if (--bitcnt == 0) \{
-136         /* if digidx == -1 we are out of digits */
-137         if (digidx == -1) \{
-138           break;
-139         \}
-140         /* read next digit and reset the bitcnt */
-141         buf    = X->dp[digidx--];
-142         bitcnt = (int) DIGIT_BIT;
-143       \}
-144   
-145       /* grab the next msb from the exponent */
-146       y     = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
-147       buf <<= (mp_digit)1;
-148   
-149       /* if the bit is zero and mode == 0 then we ignore it
-150        * These represent the leading zero bits before the first 1 bit
-151        * in the exponent.  Technically this opt is not required but it
-152        * does lower the # of trivial squaring/reductions used
-153        */
-154       if (mode == 0 && y == 0) \{
-155         continue;
+107     for (x = 0; x < (winsize - 1); x++) \{
+108       /* square it */
+109       if ((err = mp_sqr (&M[1 << (winsize - 1)], 
+110                          &M[1 << (winsize - 1)])) != MP_OKAY) \{
+111         goto LBL_MU;
+112       \}
+113   
+114       /* reduce modulo P */
+115       if ((err = redux (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) \{
+116         goto LBL_MU;
+117       \}
+118     \}
+119   
+120     /* create upper table, that is M[x] = M[x-1] * M[1] (mod P)
+121      * for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
+122      */
+123     for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) \{
+124       if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) \{
+125         goto LBL_MU;
+126       \}
+127       if ((err = redux (&M[x], P, &mu)) != MP_OKAY) \{
+128         goto LBL_MU;
+129       \}
+130     \}
+131   
+132     /* setup result */
+133     if ((err = mp_init (&res)) != MP_OKAY) \{
+134       goto LBL_MU;
+135     \}
+136     mp_set (&res, 1);
+137   
+138     /* set initial mode and bit cnt */
+139     mode   = 0;
+140     bitcnt = 1;
+141     buf    = 0;
+142     digidx = X->used - 1;
+143     bitcpy = 0;
+144     bitbuf = 0;
+145   
+146     for (;;) \{
+147       /* grab next digit as required */
+148       if (--bitcnt == 0) \{
+149         /* if digidx == -1 we are out of digits */
+150         if (digidx == -1) \{
+151           break;
+152         \}
+153         /* read next digit and reset the bitcnt */
+154         buf    = X->dp[digidx--];
+155         bitcnt = (int) DIGIT_BIT;
 156       \}
 157   
-158       /* if the bit is zero and mode == 1 then we square */
-159       if (mode == 1 && y == 0) \{
-160         if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
-161           goto __RES;
-162         \}
-163         if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{
-164           goto __RES;
-165         \}
-166         continue;
-167       \}
-168   
-169       /* else we add it to the window */
-170       bitbuf |= (y << (winsize - ++bitcpy));
-171       mode    = 2;
-172   
-173       if (bitcpy == winsize) \{
-174         /* ok window is filled so square as required and multiply  */
-175         /* square first */
-176         for (x = 0; x < winsize; x++) \{
-177           if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
-178             goto __RES;
-179           \}
-180           if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{
-181             goto __RES;
-182           \}
-183         \}
-184   
-185         /* then multiply */
-186         if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) \{
-187           goto __RES;
-188         \}
-189         if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{
-190           goto __RES;
-191         \}
-192   
-193         /* empty window and reset */
-194         bitcpy = 0;
-195         bitbuf = 0;
-196         mode   = 1;
-197       \}
-198     \}
-199   
-200     /* if bits remain then square/multiply */
-201     if (mode == 2 && bitcpy > 0) \{
-202       /* square then multiply if the bit is set */
-203       for (x = 0; x < bitcpy; x++) \{
-204         if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
-205           goto __RES;
-206         \}
-207         if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{
-208           goto __RES;
-209         \}
-210   
-211         bitbuf <<= 1;
-212         if ((bitbuf & (1 << winsize)) != 0) \{
-213           /* then multiply */
-214           if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) \{
-215             goto __RES;
-216           \}
-217           if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{
-218             goto __RES;
-219           \}
-220         \}
-221       \}
-222     \}
+158       /* grab the next msb from the exponent */
+159       y     = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
+160       buf <<= (mp_digit)1;
+161   
+162       /* if the bit is zero and mode == 0 then we ignore it
+163        * These represent the leading zero bits before the first 1 bit
+164        * in the exponent.  Technically this opt is not required but it
+165        * does lower the # of trivial squaring/reductions used
+166        */
+167       if (mode == 0 && y == 0) \{
+168         continue;
+169       \}
+170   
+171       /* if the bit is zero and mode == 1 then we square */
+172       if (mode == 1 && y == 0) \{
+173         if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
+174           goto LBL_RES;
+175         \}
+176         if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
+177           goto LBL_RES;
+178         \}
+179         continue;
+180       \}
+181   
+182       /* else we add it to the window */
+183       bitbuf |= (y << (winsize - ++bitcpy));
+184       mode    = 2;
+185   
+186       if (bitcpy == winsize) \{
+187         /* ok window is filled so square as required and multiply  */
+188         /* square first */
+189         for (x = 0; x < winsize; x++) \{
+190           if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
+191             goto LBL_RES;
+192           \}
+193           if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
+194             goto LBL_RES;
+195           \}
+196         \}
+197   
+198         /* then multiply */
+199         if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) \{
+200           goto LBL_RES;
+201         \}
+202         if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
+203           goto LBL_RES;
+204         \}
+205   
+206         /* empty window and reset */
+207         bitcpy = 0;
+208         bitbuf = 0;
+209         mode   = 1;
+210       \}
+211     \}
+212   
+213     /* if bits remain then square/multiply */
+214     if (mode == 2 && bitcpy > 0) \{
+215       /* square then multiply if the bit is set */
+216       for (x = 0; x < bitcpy; x++) \{
+217         if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
+218           goto LBL_RES;
+219         \}
+220         if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
+221           goto LBL_RES;
+222         \}
 223   
-224     mp_exch (&res, Y);
-225     err = MP_OKAY;
-226   __RES:mp_clear (&res);
-227   __MU:mp_clear (&mu);
-228   __M:
-229     mp_clear(&M[1]);
-230     for (x = 1<<(winsize-1); x < (1 << winsize); x++) \{
-231       mp_clear (&M[x]);
-232     \}
-233     return err;
-234   \}
-235   #endif
+224         bitbuf <<= 1;
+225         if ((bitbuf & (1 << winsize)) != 0) \{
+226           /* then multiply */
+227           if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) \{
+228             goto LBL_RES;
+229           \}
+230           if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
+231             goto LBL_RES;
+232           \}
+233         \}
+234       \}
+235     \}
+236   
+237     mp_exch (&res, Y);
+238     err = MP_OKAY;
+239   LBL_RES:mp_clear (&res);
+240   LBL_MU:mp_clear (&mu);
+241   LBL_M:
+242     mp_clear(&M[1]);
+243     for (x = 1<<(winsize-1); x < (1 << winsize); x++) \{
+244       mp_clear (&M[x]);
+245     \}
+246     return err;
+247   \}
+248   #endif
 \end{alltt}
 \end{small}
 
-Lines 31 through 41 determine the optimal window size based on the length of the exponent in bits.  The window divisions are sorted
+Lines 32 through 46 determine the optimal window size based on the length of the exponent in bits.  The window divisions are sorted
 from smallest to greatest so that in each \textbf{if} statement only one condition must be tested.  For example, by the \textbf{if} statement 
-on line 33 the value of $x$ is already known to be greater than $140$.  
-
-The conditional piece of code beginning on line 47 allows the window size to be restricted to five bits.  This logic is used to ensure
+on line 32 the value of $x$ is already known to be greater than $140$.  
+
+The conditional piece of code beginning on line 48 allows the window size to be restricted to five bits.  This logic is used to ensure
 the table of precomputed powers of $G$ remains relatively small.  
 
-The for loop on line 60 initializes the $M$ array while lines 61 and 74 compute the value of $\mu$ required for
+The for loop on line 61 initializes the $M$ array while lines 72 and 77 compute the value of $\mu$ required for
 Barrett reduction.  
 
 -- More later.
@@ -8386,23 +8429,23 @@
 048   
 049     mp_set(&tq, 1);
 050     n = mp_count_bits(a) - mp_count_bits(b);
-051     if (((res = mp_copy(a, &ta)) != MP_OKAY) ||
-052         ((res = mp_copy(b, &tb)) != MP_OKAY) || 
+051     if (((res = mp_abs(a, &ta)) != MP_OKAY) ||
+052         ((res = mp_abs(b, &tb)) != MP_OKAY) || 
 053         ((res = mp_mul_2d(&tb, n, &tb)) != MP_OKAY) ||
 054         ((res = mp_mul_2d(&tq, n, &tq)) != MP_OKAY)) \{
-055         goto __ERR;
+055         goto LBL_ERR;
 056     \}
 057   
 058     while (n-- >= 0) \{
 059        if (mp_cmp(&tb, &ta) != MP_GT) \{
 060           if (((res = mp_sub(&ta, &tb, &ta)) != MP_OKAY) ||
 061               ((res = mp_add(&q, &tq, &q)) != MP_OKAY)) \{
-062              goto __ERR;
+062              goto LBL_ERR;
 063           \}
 064        \}
 065        if (((res = mp_div_2d(&tb, 1, &tb, NULL)) != MP_OKAY) ||
 066            ((res = mp_div_2d(&tq, 1, &tq, NULL)) != MP_OKAY)) \{
-067              goto __ERR;
+067              goto LBL_ERR;
 068        \}
 069     \}
 070   
@@ -8411,13 +8454,13 @@
 073     n2 = (a->sign == b->sign ? MP_ZPOS : MP_NEG);
 074     if (c != NULL) \{
 075        mp_exch(c, &q);
-076        c->sign  = n2;
+076        c->sign  = (mp_iszero(c) == MP_YES) ? MP_ZPOS : n2;
 077     \}
 078     if (d != NULL) \{
 079        mp_exch(d, &ta);
-080        d->sign = n;
+080        d->sign = (mp_iszero(d) == MP_YES) ? MP_ZPOS : n;
 081     \}
-082   __ERR:
+082   LBL_ERR:
 083      mp_clear_multi(&ta, &tb, &tq, &q, NULL);
 084      return res;
 085   \}
@@ -8466,19 +8509,19 @@
 128     q.used = a->used + 2;
 129   
 130     if ((res = mp_init (&t1)) != MP_OKAY) \{
-131       goto __Q;
+131       goto LBL_Q;
 132     \}
 133   
 134     if ((res = mp_init (&t2)) != MP_OKAY) \{
-135       goto __T1;
+135       goto LBL_T1;
 136     \}
 137   
 138     if ((res = mp_init_copy (&x, a)) != MP_OKAY) \{
-139       goto __T2;
+139       goto LBL_T2;
 140     \}
 141   
 142     if ((res = mp_init_copy (&y, b)) != MP_OKAY) \{
-143       goto __X;
+143       goto LBL_X;
 144     \}
 145   
 146     /* fix the sign */
@@ -8490,10 +8533,10 @@
 152     if (norm < (int)(DIGIT_BIT-1)) \{
 153        norm = (DIGIT_BIT-1) - norm;
 154        if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) \{
-155          goto __Y;
+155          goto LBL_Y;
 156        \}
 157        if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) \{
-158          goto __Y;
+158          goto LBL_Y;
 159        \}
 160     \} else \{
 161        norm = 0;
@@ -8505,13 +8548,13 @@
 167   
 168     /* while (x >= y*b**n-t) do \{ q[n-t] += 1; x -= y*b**\{n-t\} \} */
 169     if ((res = mp_lshd (&y, n - t)) != MP_OKAY) \{ /* y = y*b**\{n-t\} */
-170       goto __Y;
+170       goto LBL_Y;
 171     \}
 172   
 173     while (mp_cmp (&x, &y) != MP_LT) \{
 174       ++(q.dp[n - t]);
 175       if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) \{
-176         goto __Y;
+176         goto LBL_Y;
 177       \}
 178     \}
 179   
@@ -8553,7 +8596,7 @@
 215         t1.dp[1] = y.dp[t];
 216         t1.used = 2;
 217         if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) \{
-218           goto __Y;
+218           goto LBL_Y;
 219         \}
 220   
 221         /* find right hand */
@@ -8565,27 +8608,27 @@
 227   
 228       /* step 3.3 x = x - q\{i-t-1\} * y * b**\{i-t-1\} */
 229       if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) \{
-230         goto __Y;
+230         goto LBL_Y;
 231       \}
 232   
 233       if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) \{
-234         goto __Y;
+234         goto LBL_Y;
 235       \}
 236   
 237       if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) \{
-238         goto __Y;
+238         goto LBL_Y;
 239       \}
 240   
 241       /* if x < 0 then \{ x = x + y*b**\{i-t-1\}; q\{i-t-1\} -= 1; \} */
 242       if (x.sign == MP_NEG) \{
 243         if ((res = mp_copy (&y, &t1)) != MP_OKAY) \{
-244           goto __Y;
+244           goto LBL_Y;
 245         \}
 246         if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) \{
-247           goto __Y;
+247           goto LBL_Y;
 248         \}
 249         if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) \{
-250           goto __Y;
+250           goto LBL_Y;
 251         \}
 252   
 253         q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
@@ -8612,11 +8655,11 @@
 274   
 275     res = MP_OKAY;
 276   
-277   __Y:mp_clear (&y);
-278   __X:mp_clear (&x);
-279   __T2:mp_clear (&t2);
-280   __T1:mp_clear (&t1);
-281   __Q:mp_clear (&q);
+277   LBL_Y:mp_clear (&y);
+278   LBL_X:mp_clear (&x);
+279   LBL_T2:mp_clear (&t2);
+280   LBL_T1:mp_clear (&t1);
+281   LBL_Q:mp_clear (&q);
 282     return res;
 283   \}
 284   
@@ -8870,21 +8913,22 @@
 056       u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
 057     \}
 058   
-059     /* store final carry [if any] */
+059     /* store final carry [if any] and increment ix offset  */
 060     *tmpc++ = u;
-061   
-062     /* now zero digits above the top */
-063     while (ix++ < olduse) \{
-064        *tmpc++ = 0;
-065     \}
-066   
-067     /* set used count */
-068     c->used = a->used + 1;
-069     mp_clamp(c);
-070   
-071     return MP_OKAY;
-072   \}
-073   #endif
+061     ++ix;
+062   
+063     /* now zero digits above the top */
+064     while (ix++ < olduse) \{
+065        *tmpc++ = 0;
+066     \}
+067   
+068     /* set used count */
+069     c->used = a->used + 1;
+070     mp_clamp(c);
+071   
+072     return MP_OKAY;
+073   \}
+074   #endif
 \end{alltt}
 \end{small}
 
@@ -9130,11 +9174,11 @@
 039     \}
 040   
 041     if ((res = mp_init (&t2)) != MP_OKAY) \{
-042       goto __T1;
+042       goto LBL_T1;
 043     \}
 044   
 045     if ((res = mp_init (&t3)) != MP_OKAY) \{
-046       goto __T2;
+046       goto LBL_T2;
 047     \}
 048   
 049     /* if a is negative fudge the sign but keep track */
@@ -9147,52 +9191,52 @@
 056     do \{
 057       /* t1 = t2 */
 058       if ((res = mp_copy (&t2, &t1)) != MP_OKAY) \{
-059         goto __T3;
+059         goto LBL_T3;
 060       \}
 061   
 062       /* t2 = t1 - ((t1**b - a) / (b * t1**(b-1))) */
 063       
 064       /* t3 = t1**(b-1) */
 065       if ((res = mp_expt_d (&t1, b - 1, &t3)) != MP_OKAY) \{   
-066         goto __T3;
+066         goto LBL_T3;
 067       \}
 068   
 069       /* numerator */
 070       /* t2 = t1**b */
 071       if ((res = mp_mul (&t3, &t1, &t2)) != MP_OKAY) \{    
-072         goto __T3;
+072         goto LBL_T3;
 073       \}
 074   
 075       /* t2 = t1**b - a */
 076       if ((res = mp_sub (&t2, a, &t2)) != MP_OKAY) \{  
-077         goto __T3;
+077         goto LBL_T3;
 078       \}
 079   
 080       /* denominator */
 081       /* t3 = t1**(b-1) * b  */
 082       if ((res = mp_mul_d (&t3, b, &t3)) != MP_OKAY) \{    
-083         goto __T3;
+083         goto LBL_T3;
 084       \}
 085   
 086       /* t3 = (t1**b - a)/(b * t1**(b-1)) */
 087       if ((res = mp_div (&t2, &t3, &t3, NULL)) != MP_OKAY) \{  
-088         goto __T3;
+088         goto LBL_T3;
 089       \}
 090   
 091       if ((res = mp_sub (&t1, &t3, &t2)) != MP_OKAY) \{
-092         goto __T3;
+092         goto LBL_T3;
 093       \}
 094     \}  while (mp_cmp (&t1, &t2) != MP_EQ);
 095   
 096     /* result can be off by a few so check */
 097     for (;;) \{
 098       if ((res = mp_expt_d (&t1, b, &t2)) != MP_OKAY) \{
-099         goto __T3;
+099         goto LBL_T3;
 100       \}
 101   
 102       if (mp_cmp (&t2, a) == MP_GT) \{
 103         if ((res = mp_sub_d (&t1, 1, &t1)) != MP_OKAY) \{
-104            goto __T3;
+104            goto LBL_T3;
 105         \}
 106       \} else \{
 107         break;
@@ -9210,9 +9254,9 @@
 119   
 120     res = MP_OKAY;
 121   
-122   __T3:mp_clear (&t3);
-123   __T2:mp_clear (&t2);
-124   __T1:mp_clear (&t1);
+122   LBL_T3:mp_clear (&t3);
+123   LBL_T2:mp_clear (&t2);
+124   LBL_T1:mp_clear (&t1);
 125     return res;
 126   \}
 127   #endif
@@ -9272,14 +9316,14 @@
 028   
 029     /* first place a random non-zero digit */
 030     do \{
-031       d = ((mp_digit) abs (rand ()));
+031       d = ((mp_digit) abs (rand ())) & MP_MASK;
 032     \} while (d == 0);
 033   
 034     if ((res = mp_add_d (a, d, a)) != MP_OKAY) \{
 035       return res;
 036     \}
 037   
-038     while (digits-- > 0) \{
+038     while (--digits > 0) \{
 039       if ((res = mp_lshd (a, 1)) != MP_OKAY) \{
 040         return res;
 041       \}
@@ -9376,7 +9420,7 @@
 \begin{alltt}
 016   
 017   /* read a string [ASCII] in a given radix */
-018   int mp_read_radix (mp_int * a, char *str, int radix)
+018   int mp_read_radix (mp_int * a, const char *str, int radix)
 019   \{
 020     int     y, res, neg;
 021     char    ch;
@@ -9771,7 +9815,7 @@
 042     \}
 043   
 044     if ((res = mp_init_copy (&v, b)) != MP_OKAY) \{
-045       goto __U;
+045       goto LBL_U;
 046     \}
 047   
 048     /* must be positive for the remainder of the algorithm */
@@ -9785,24 +9829,24 @@
 056     if (k > 0) \{
 057        /* divide the power of two out */
 058        if ((res = mp_div_2d(&u, k, &u, NULL)) != MP_OKAY) \{
-059           goto __V;
+059           goto LBL_V;
 060        \}
 061   
 062        if ((res = mp_div_2d(&v, k, &v, NULL)) != MP_OKAY) \{
-063           goto __V;
+063           goto LBL_V;
 064        \}
 065     \}
 066   
 067     /* divide any remaining factors of two out */
 068     if (u_lsb != k) \{
 069        if ((res = mp_div_2d(&u, u_lsb - k, &u, NULL)) != MP_OKAY) \{
-070           goto __V;
+070           goto LBL_V;
 071        \}
 072     \}
 073   
 074     if (v_lsb != k) \{
 075        if ((res = mp_div_2d(&v, v_lsb - k, &v, NULL)) != MP_OKAY) \{
-076           goto __V;
+076           goto LBL_V;
 077        \}
 078     \}
 079   
@@ -9815,23 +9859,23 @@
 086        
 087        /* subtract smallest from largest */
 088        if ((res = s_mp_sub(&v, &u, &v)) != MP_OKAY) \{
-089           goto __V;
+089           goto LBL_V;
 090        \}
 091        
 092        /* Divide out all factors of two */
 093        if ((res = mp_div_2d(&v, mp_cnt_lsb(&v), &v, NULL)) != MP_OKAY) \{
-094           goto __V;
+094           goto LBL_V;
 095        \} 
 096     \} 
 097   
 098     /* multiply by 2**k which we divided out at the beginning */
 099     if ((res = mp_mul_2d (&u, k, c)) != MP_OKAY) \{
-100        goto __V;
+100        goto LBL_V;
 101     \}
 102     c->sign = MP_ZPOS;
 103     res = MP_OKAY;
-104   __V:mp_clear (&u);
-105   __U:mp_clear (&v);
+104   LBL_V:mp_clear (&u);
+105   LBL_U:mp_clear (&v);
 106     return res;
 107   \}
 108   #endif
@@ -9904,20 +9948,20 @@
 027   
 028     /* t1 = get the GCD of the two inputs */
 029     if ((res = mp_gcd (a, b, &t1)) != MP_OKAY) \{
-030       goto __T;
+030       goto LBL_T;
 031     \}
 032   
 033     /* divide the smallest by the GCD */
 034     if (mp_cmp_mag(a, b) == MP_LT) \{
 035        /* store quotient in t2 such that t2 * b is the LCM */
 036        if ((res = mp_div(a, &t1, &t2, NULL)) != MP_OKAY) \{
-037           goto __T;
+037           goto LBL_T;
 038        \}
 039        res = mp_mul(b, &t2, c);
 040     \} else \{
 041        /* store quotient in t2 such that t2 * a is the LCM */
 042        if ((res = mp_div(b, &t1, &t2, NULL)) != MP_OKAY) \{
-043           goto __T;
+043           goto LBL_T;
 044        \}
 045        res = mp_mul(a, &t2, c);
 046     \}
@@ -9925,7 +9969,7 @@
 048     /* fix the sign to positive */
 049     c->sign = MP_ZPOS;
 050   
-051   __T:
+051   LBL_T:
 052     mp_clear_multi (&t1, &t2, NULL);
 053     return res;
 054   \}
@@ -9938,6 +9982,8 @@
 defined.  The Legendre function computes whether or not an integer $a$ is a quadratic residue modulo an odd prime $p$.  Numerically it is
 equivalent to equation \ref{eqn:legendre}.
 
+\textit{-- TODO (Tom): cite the source for this here.}
+
 \begin{equation}
 a^{(p-1)/2} \equiv \begin{array}{rl}
                               -1 &  \mbox{if }a\mbox{ is a quadratic non-residue.} \\
@@ -10123,13 +10169,13 @@
 049     \}
 050   
 051     if ((res = mp_init (&p1)) != MP_OKAY) \{
-052       goto __A1;
+052       goto LBL_A1;
 053     \}
 054   
 055     /* divide out larger power of two */
 056     k = mp_cnt_lsb(&a1);
 057     if ((res = mp_div_2d(&a1, k, &a1, NULL)) != MP_OKAY) \{
-058        goto __P1;
+058        goto LBL_P1;
 059     \}
 060   
 061     /* step 4.  if e is even set s=1 */
@@ -10157,18 +10203,18 @@
 083     \} else \{
 084       /* n1 = n mod a1 */
 085       if ((res = mp_mod (p, &a1, &p1)) != MP_OKAY) \{
-086         goto __P1;
+086         goto LBL_P1;
 087       \}
 088       if ((res = mp_jacobi (&p1, &a1, &r)) != MP_OKAY) \{
-089         goto __P1;
+089         goto LBL_P1;
 090       \}
 091       *c = s * r;
 092     \}
 093   
 094     /* done */
 095     res = MP_OKAY;
-096   __P1:mp_clear (&p1);
-097   __A1:mp_clear (&a1);
+096   LBL_P1:mp_clear (&p1);
+097   LBL_A1:mp_clear (&a1);
 098     return res;
 099   \}
 100   #endif
@@ -10406,8 +10452,8 @@
 028     *result = MP_NO;
 029   
 030     for (ix = 0; ix < PRIME_SIZE; ix++) \{
-031       /* what is a mod __prime_tab[ix] */
-032       if ((err = mp_mod_d (a, __prime_tab[ix], &res)) != MP_OKAY) \{
+031       /* what is a mod ltm_prime_tab[ix] */
+032       if ((err = mp_mod_d (a, ltm_prime_tab[ix], &res)) != MP_OKAY) \{
 033         return err;
 034       \}
 035   
@@ -10431,7 +10477,7 @@
 \hspace{-5.1mm}{\bf File}: bn\_prime\_tab.c
 \vspace{-3mm}
 \begin{alltt}
-016   const mp_digit __prime_tab[] = \{
+016   const mp_digit ltm_prime_tab[] = \{
 017     0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
 018     0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
 019     0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
@@ -10547,7 +10593,7 @@
 042   
 043     /* compute t = b**a mod a */
 044     if ((err = mp_exptmod (b, a, a, &t)) != MP_OKAY) \{
-045       goto __T;
+045       goto LBL_T;
 046     \}
 047   
 048     /* is it equal to b? */
@@ -10556,7 +10602,7 @@
 051     \}
 052   
 053     err = MP_OKAY;
-054   __T:mp_clear (&t);
+054   LBL_T:mp_clear (&t);
 055     return err;
 056   \}
 057   #endif
@@ -10638,12 +10684,12 @@
 039       return err;
 040     \}
 041     if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) \{
-042       goto __N1;
+042       goto LBL_N1;
 043     \}
 044   
 045     /* set 2**s * r = n1 */
 046     if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) \{
-047       goto __N1;
+047       goto LBL_N1;
 048     \}
 049   
 050     /* count the number of least significant bits
@@ -10653,15 +10699,15 @@
 054   
 055     /* now divide n - 1 by 2**s */
 056     if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) \{
-057       goto __R;
+057       goto LBL_R;
 058     \}
 059   
 060     /* compute y = b**r mod a */
 061     if ((err = mp_init (&y)) != MP_OKAY) \{
-062       goto __R;
+062       goto LBL_R;
 063     \}
 064     if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) \{
-065       goto __Y;
+065       goto LBL_Y;
 066     \}
 067   
 068     /* if y != 1 and y != n1 do */
@@ -10670,12 +10716,12 @@
 071       /* while j <= s-1 and y != n1 */
 072       while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) \{
 073         if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) \{
-074            goto __Y;
+074            goto LBL_Y;
 075         \}
 076   
 077         /* if y == 1 then composite */
 078         if (mp_cmp_d (&y, 1) == MP_EQ) \{
-079            goto __Y;
+079            goto LBL_Y;
 080         \}
 081   
 082         ++j;
@@ -10683,15 +10729,15 @@
 084   
 085       /* if y != n1 then composite */
 086       if (mp_cmp (&y, &n1) != MP_EQ) \{
-087         goto __Y;
+087         goto LBL_Y;
 088       \}
 089     \}
 090   
 091     /* probably prime now */
 092     *result = MP_YES;
-093   __Y:mp_clear (&y);
-094   __R:mp_clear (&r);
-095   __N1:mp_clear (&n1);
+093   LBL_Y:mp_clear (&y);
+094   LBL_R:mp_clear (&r);
+095   LBL_N1:mp_clear (&n1);
 096     return err;
 097   \}
 098   #endif
--- a/tommath_class.h	Sun Dec 19 11:33:56 2004 +0000
+++ b/tommath_class.h	Fri May 06 08:59:30 2005 +0000
@@ -90,8 +90,11 @@
 #define BN_MP_READ_UNSIGNED_BIN_C
 #define BN_MP_REDUCE_C
 #define BN_MP_REDUCE_2K_C
+#define BN_MP_REDUCE_2K_L_C
 #define BN_MP_REDUCE_2K_SETUP_C
+#define BN_MP_REDUCE_2K_SETUP_L_C
 #define BN_MP_REDUCE_IS_2K_C
+#define BN_MP_REDUCE_IS_2K_L_C
 #define BN_MP_REDUCE_SETUP_C
 #define BN_MP_RSHD_C
 #define BN_MP_SET_C
@@ -105,7 +108,9 @@
 #define BN_MP_SUB_D_C
 #define BN_MP_SUBMOD_C
 #define BN_MP_TO_SIGNED_BIN_C
+#define BN_MP_TO_SIGNED_BIN_N_C
 #define BN_MP_TO_UNSIGNED_BIN_C
+#define BN_MP_TO_UNSIGNED_BIN_N_C
 #define BN_MP_TOOM_MUL_C
 #define BN_MP_TOOM_SQR_C
 #define BN_MP_TORADIX_C
@@ -132,7 +137,7 @@
    #define BN_MP_ISEVEN_C
    #define BN_MP_INIT_MULTI_C
    #define BN_MP_COPY_C
-   #define BN_MP_ABS_C
+   #define BN_MP_MOD_C
    #define BN_MP_SET_C
    #define BN_MP_DIV_2_C
    #define BN_MP_ISODD_C
@@ -242,6 +247,7 @@
    #define BN_MP_INIT_MULTI_C
    #define BN_MP_SET_C
    #define BN_MP_COUNT_BITS_C
+   #define BN_MP_ABS_C
    #define BN_MP_MUL_2D_C
    #define BN_MP_CMP_C
    #define BN_MP_SUB_C
@@ -323,11 +329,12 @@
    #define BN_MP_CLEAR_C
    #define BN_MP_ABS_C
    #define BN_MP_CLEAR_MULTI_C
+   #define BN_MP_REDUCE_IS_2K_L_C
+   #define BN_S_MP_EXPTMOD_C
    #define BN_MP_DR_IS_MODULUS_C
    #define BN_MP_REDUCE_IS_2K_C
    #define BN_MP_ISODD_C
    #define BN_MP_EXPTMOD_FAST_C
-   #define BN_S_MP_EXPTMOD_C
 #endif
 
 #if defined(BN_MP_EXPTMOD_FAST_C)
@@ -359,6 +366,7 @@
    #define BN_MP_DIV_C
    #define BN_MP_MUL_C
    #define BN_MP_SUB_C
+   #define BN_MP_NEG_C
    #define BN_MP_EXCH_C
    #define BN_MP_CLEAR_MULTI_C
 #endif
@@ -433,6 +441,7 @@
 #if defined(BN_MP_INVMOD_SLOW_C)
    #define BN_MP_ISZERO_C
    #define BN_MP_INIT_MULTI_C
+   #define BN_MP_MOD_C
    #define BN_MP_COPY_C
    #define BN_MP_ISEVEN_C
    #define BN_MP_SET_C
@@ -724,6 +733,17 @@
    #define BN_MP_CLEAR_C
 #endif
 
+#if defined(BN_MP_REDUCE_2K_L_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_COUNT_BITS_C
+   #define BN_MP_DIV_2D_C
+   #define BN_MP_MUL_C
+   #define BN_S_MP_ADD_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_S_MP_SUB_C
+   #define BN_MP_CLEAR_C
+#endif
+
 #if defined(BN_MP_REDUCE_2K_SETUP_C)
    #define BN_MP_INIT_C
    #define BN_MP_COUNT_BITS_C
@@ -732,11 +752,22 @@
    #define BN_S_MP_SUB_C
 #endif
 
+#if defined(BN_MP_REDUCE_2K_SETUP_L_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_2EXPT_C
+   #define BN_MP_COUNT_BITS_C
+   #define BN_S_MP_SUB_C
+   #define BN_MP_CLEAR_C
+#endif
+
 #if defined(BN_MP_REDUCE_IS_2K_C)
    #define BN_MP_REDUCE_2K_C
    #define BN_MP_COUNT_BITS_C
 #endif
 
+#if defined(BN_MP_REDUCE_IS_2K_L_C)
+#endif
+
 #if defined(BN_MP_REDUCE_SETUP_C)
    #define BN_MP_2EXPT_C
    #define BN_MP_DIV_C
@@ -814,6 +845,11 @@
    #define BN_MP_TO_UNSIGNED_BIN_C
 #endif
 
+#if defined(BN_MP_TO_SIGNED_BIN_N_C)
+   #define BN_MP_SIGNED_BIN_SIZE_C
+   #define BN_MP_TO_SIGNED_BIN_C
+#endif
+
 #if defined(BN_MP_TO_UNSIGNED_BIN_C)
    #define BN_MP_INIT_COPY_C
    #define BN_MP_ISZERO_C
@@ -821,6 +857,11 @@
    #define BN_MP_CLEAR_C
 #endif
 
+#if defined(BN_MP_TO_UNSIGNED_BIN_N_C)
+   #define BN_MP_UNSIGNED_BIN_SIZE_C
+   #define BN_MP_TO_UNSIGNED_BIN_C
+#endif
+
 #if defined(BN_MP_TOOM_MUL_C)
    #define BN_MP_INIT_MULTI_C
    #define BN_MP_MOD_2D_C
@@ -901,10 +942,12 @@
    #define BN_MP_INIT_C
    #define BN_MP_CLEAR_C
    #define BN_MP_REDUCE_SETUP_C
+   #define BN_MP_REDUCE_C
+   #define BN_MP_REDUCE_2K_SETUP_L_C
+   #define BN_MP_REDUCE_2K_L_C
    #define BN_MP_MOD_C
    #define BN_MP_COPY_C
    #define BN_MP_SQR_C
-   #define BN_MP_REDUCE_C
    #define BN_MP_MUL_C
    #define BN_MP_SET_C
    #define BN_MP_EXCH_C