diff tommath.tex @ 386:97db060d0ef5 libtommath-orig libtommath-0.40

Update to LibTomMath 0.40
author Matt Johnston <matt@ucc.asn.au>
date Thu, 11 Jan 2007 03:11:15 +0000
parents 91fbc376f010
children
line wrap: on
line diff
--- a/tommath.tex	Wed Mar 08 13:16:18 2006 +0000
+++ b/tommath.tex	Thu Jan 11 03:11:15 2007 +0000
@@ -66,7 +66,7 @@
 }
 }
 \maketitle
-This text has been placed in the public domain.  This text corresponds to the v0.35 release of the 
+This text has been placed in the public domain.  This text corresponds to the v0.39 release of the 
 LibTomMath project.
 
 \begin{alltt}
@@ -77,7 +77,7 @@
 Canada
 
 Phone: 1-613-836-3160
-Email: [email protected]
+Email: [email protected]
 \end{alltt}
 
 This text is formatted to the international B5 paper size of 176mm wide by 250mm tall using the \LaTeX{} 
@@ -268,7 +268,7 @@
 any form of useful performance in non-trivial applications.  
 
 To solve this problem the focus of this text is on the practical aspects of implementing a multiple precision integer
-package.  As a case study the ``LibTomMath''\footnote{Available at \url{http://math.libtomcrypt.org}} package is used 
+package.  As a case study the ``LibTomMath''\footnote{Available at \url{http://math.libtomcrypt.com}} package is used 
 to demonstrate algorithms with real implementations\footnote{In the ISO C programming language.} that have been field 
 tested and work very well.  The LibTomMath library is freely available on the Internet for all uses and this text 
 discusses a very large portion of the inner workings of the library.
@@ -814,6 +814,7 @@
 039     return MP_OKAY;
 040   \}
 041   #endif
+042   
 \end{alltt}
 \end{small}
 
@@ -902,6 +903,7 @@
 037     \}
 038   \}
 039   #endif
+040   
 \end{alltt}
 \end{small}
 
@@ -1008,6 +1010,7 @@
 050     return MP_OKAY;
 051   \}
 052   #endif
+053   
 \end{alltt}
 \end{small}
 
@@ -1096,6 +1099,7 @@
 041     return MP_OKAY;
 042   \}
 043   #endif
+044   
 \end{alltt}
 \end{small}
 
@@ -1183,6 +1187,7 @@
 052   \}
 053   
 054   #endif
+055   
 \end{alltt}
 \end{small}
 
@@ -1268,6 +1273,7 @@
 037     \}
 038   \}
 039   #endif
+040   
 \end{alltt}
 \end{small}
 
@@ -1405,6 +1411,7 @@
 061     return MP_OKAY;
 062   \}
 063   #endif
+064   
 \end{alltt}
 \end{small}
 
@@ -1519,6 +1526,7 @@
 025     return mp_copy (b, a);
 026   \}
 027   #endif
+028   
 \end{alltt}
 \end{small}
 
@@ -1570,6 +1578,7 @@
 029     \}
 030   \}
 031   #endif
+032   
 \end{alltt}
 \end{small}
 
@@ -1631,6 +1640,7 @@
 036     return MP_OKAY;
 037   \}
 038   #endif
+039   
 \end{alltt}
 \end{small}
 
@@ -1692,6 +1702,7 @@
 033     return MP_OKAY;
 034   \}
 035   #endif
+036   
 \end{alltt}
 \end{small}
 
@@ -1739,6 +1750,7 @@
 022     a->used  = (a->dp[0] != 0) ? 1 : 0;
 023   \}
 024   #endif
+025   
 \end{alltt}
 \end{small}
 
@@ -1819,6 +1831,7 @@
 041     return MP_OKAY;
 042   \}
 043   #endif
+044   
 \end{alltt}
 \end{small}
 
@@ -1921,6 +1934,7 @@
 048     return MP_EQ;
 049   \}
 050   #endif
+051   
 \end{alltt}
 \end{small}
 
@@ -1987,6 +2001,7 @@
 036     \}
 037   \}
 038   #endif
+039   
 \end{alltt}
 \end{small}
 
@@ -2205,6 +2220,7 @@
 102     return MP_OKAY;
 103   \}
 104   #endif
+105   
 \end{alltt}
 \end{small}
 
@@ -2376,6 +2392,7 @@
 082   \}
 083   
 084   #endif
+085   
 \end{alltt}
 \end{small}
 
@@ -2511,6 +2528,7 @@
 046   \}
 047   
 048   #endif
+049   
 \end{alltt}
 \end{small}
 
@@ -2623,6 +2641,7 @@
 052   \}
 053   
 054   #endif
+055   
 \end{alltt}
 \end{small}
 
@@ -2757,6 +2776,7 @@
 075     return MP_OKAY;
 076   \}
 077   #endif
+078   
 \end{alltt}
 \end{small}
 
@@ -2857,6 +2877,7 @@
 061     return MP_OKAY;
 062   \}
 063   #endif
+064   
 \end{alltt}
 \end{small}
 
@@ -2977,6 +2998,7 @@
 060     return MP_OKAY;
 061   \}
 062   #endif
+063   
 \end{alltt}
 \end{small}
 
@@ -3088,6 +3110,7 @@
 065     a->used -= b;
 066   \}
 067   #endif
+068   
 \end{alltt}
 \end{small}
 
@@ -3146,7 +3169,7 @@
 
 After the digits have been shifted appropriately at most $lg(\beta) - 1$ shifts are left to perform.  Step 5 calculates the number of remaining shifts 
 required.  If it is non-zero a modified shift loop is used to calculate the remaining product.  
-Essentially the loop is a generic version of algorith mp\_mul2 designed to handle any shift count in the range $1 \le x < lg(\beta)$.  The $mask$
+Essentially the loop is a generic version of algorithm mp\_mul\_2 designed to handle any shift count in the range $1 \le x < lg(\beta)$.  The $mask$
 variable is used to extract the upper $d$ bits to form the carry for the next iteration.  
 
 This algorithm is loosely measured as a $O(2n)$ algorithm which means that if the input is $n$-digits that it takes $2n$ ``time'' to 
@@ -3221,6 +3244,7 @@
 078     return MP_OKAY;
 079   \}
 080   #endif
+081   
 \end{alltt}
 \end{small}
 
@@ -3357,6 +3381,7 @@
 090     return MP_OKAY;
 091   \}
 092   #endif
+093   
 \end{alltt}
 \end{small}
 
@@ -3448,6 +3473,7 @@
 048     return MP_OKAY;
 049   \}
 050   #endif
+051   
 \end{alltt}
 \end{small}
 
@@ -3687,6 +3713,7 @@
 083     return MP_OKAY;
 084   \}
 085   #endif
+086   
 \end{alltt}
 \end{small}
 
@@ -3837,17 +3864,16 @@
 \hspace{6mm}5.4.1  $\_ \hat W \leftarrow \_ \hat W + a_{tx+iy}b_{ty-iy}$ \\
 \hspace{3mm}5.5  $W_{ix} \leftarrow \_ \hat W (\mbox{mod }\beta)$\\
 \hspace{3mm}5.6  $\_ \hat W \leftarrow \lfloor \_ \hat W / \beta \rfloor$ \\
-6.  $W_{pa} \leftarrow \_ \hat W (\mbox{mod }\beta)$ \\
 \\
-7.  $oldused \leftarrow c.used$ \\
-8.  $c.used \leftarrow digs$ \\
-9.  for $ix$ from $0$ to $pa$ do \\
-\hspace{3mm}9.1  $c_{ix} \leftarrow W_{ix}$ \\
-10.  for $ix$ from $pa + 1$ to $oldused - 1$ do \\
-\hspace{3mm}10.1 $c_{ix} \leftarrow 0$ \\
+6.  $oldused \leftarrow c.used$ \\
+7.  $c.used \leftarrow digs$ \\
+8.  for $ix$ from $0$ to $pa$ do \\
+\hspace{3mm}8.1  $c_{ix} \leftarrow W_{ix}$ \\
+9.  for $ix$ from $pa + 1$ to $oldused - 1$ do \\
+\hspace{3mm}9.1 $c_{ix} \leftarrow 0$ \\
 \\
-11.  Clamp $c$. \\
-12.  Return MP\_OKAY. \\
+10.  Clamp $c$. \\
+11.  Return MP\_OKAY. \\
 \hline
 \end{tabular}
 \end{center}
@@ -3942,39 +3968,38 @@
 069         /* execute loop */
 070         for (iz = 0; iz < iy; ++iz) \{
 071            _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
-072         \}
-073   
-074         /* store term */
-075         W[ix] = ((mp_digit)_W) & MP_MASK;
-076   
-077         /* make next carry */
-078         _W = _W >> ((mp_word)DIGIT_BIT);
-079     \}
-080   
-081     /* store final carry */
-082     W[ix] = (mp_digit)(_W & MP_MASK);
-083   
-084     /* setup dest */
-085     olduse  = c->used;
-086     c->used = pa;
-087   
-088     \{
-089       register mp_digit *tmpc;
-090       tmpc = c->dp;
-091       for (ix = 0; ix < pa+1; ix++) \{
-092         /* now extract the previous digit [below the carry] */
-093         *tmpc++ = W[ix];
-094       \}
-095   
-096       /* clear unused digits [that existed in the old copy of c] */
-097       for (; ix < olduse; ix++) \{
-098         *tmpc++ = 0;
-099       \}
-100     \}
-101     mp_clamp (c);
-102     return MP_OKAY;
-103   \}
-104   #endif
+072   
+073         \}
+074   
+075         /* store term */
+076         W[ix] = ((mp_digit)_W) & MP_MASK;
+077   
+078         /* make next carry */
+079         _W = _W >> ((mp_word)DIGIT_BIT);
+080    \}
+081   
+082     /* setup dest */
+083     olduse  = c->used;
+084     c->used = pa;
+085   
+086     \{
+087       register mp_digit *tmpc;
+088       tmpc = c->dp;
+089       for (ix = 0; ix < pa+1; ix++) \{
+090         /* now extract the previous digit [below the carry] */
+091         *tmpc++ = W[ix];
+092       \}
+093   
+094       /* clear unused digits [that existed in the old copy of c] */
+095       for (; ix < olduse; ix++) \{
+096         *tmpc++ = 0;
+097       \}
+098     \}
+099     mp_clamp (c);
+100     return MP_OKAY;
+101   \}
+102   #endif
+103   
 \end{alltt}
 \end{small}
 
@@ -3982,7 +4007,7 @@
 to produce the individual columns of the product.  We use the two aliases $tmpx$ and $tmpy$ (lines 61, 62) to point
 inside the two multiplicands quickly.  
 
-The inner loop (lines 70 to 72) of this implementation is where the tradeoff come into play.  Originally this comba 
+The inner loop (lines 70 to 73) of this implementation is where the tradeoff come into play.  Originally this comba 
 implementation was ``row--major'' which means it adds to each of the columns in each pass.  After the outer loop it would then fix 
 the carries.  This was very fast except it had an annoying drawback.  You had to read a mp\_word and two mp\_digits and write 
 one mp\_word per iteration.  On processors such as the Athlon XP and P4 this did not matter much since the cache bandwidth 
@@ -3990,8 +4015,8 @@
 slower and also often doesn't exist.  This new algorithm only performs two reads per iteration under the assumption that the 
 compiler has aliased $\_ \hat W$ to a CPU register.
 
-After the inner loop we store the current accumulator in $W$ and shift $\_ \hat W$ (lines 75, 78) to forward it as 
-a carry for the next pass.  After the outer loop we use the final carry (line 82) as the last digit of the product.  
+After the inner loop we store the current accumulator in $W$ and shift $\_ \hat W$ (lines 76, 79) to forward it as 
+a carry for the next pass.  After the outer loop we use the final carry (line 76) as the last digit of the product.  
 
 \subsection{Polynomial Basis Multiplication}
 To break the $O(n^2)$ barrier in multiplication requires a completely different look at integer multiplication.  In the following algorithms
@@ -4095,26 +4120,25 @@
 light algebra \cite{KARAP} that the following polynomial is equivalent to multiplication of the two integers the polynomials represent.
 
 \begin{equation}
-f(x) \cdot g(x) = acx^2 + ((a - b)(c - d) - (ac + bd))x + bd
+f(x) \cdot g(x) = acx^2 + ((a + b)(c + d) - (ac + bd))x + bd
 \end{equation}
 
 Using the observation that $ac$ and $bd$ could be re-used only three half sized multiplications would be required to produce the product.  Applying
 this algorithm recursively, the work factor becomes $O(n^{lg(3)})$ which is substantially better than the work factor $O(n^2)$ of the Comba technique.  It turns 
 out what Karatsuba did not know or at least did not publish was that this is simply polynomial basis multiplication with the points 
-$\zeta_0$, $\zeta_{\infty}$ and $-\zeta_{-1}$.  Consider the resultant system of equations.
+$\zeta_0$, $\zeta_{\infty}$ and $\zeta_{1}$.  Consider the resultant system of equations.
 
 \begin{center}
 \begin{tabular}{rcrcrcrc}
 $\zeta_{0}$ &      $=$ &  &  &  & & $w_0$ \\
-$-\zeta_{-1}$ &    $=$ & $-w_2$ & $+$ & $w_1$ & $-$ & $w_0$ \\
+$\zeta_{1}$ &      $=$ & $w_2$ & $+$ & $w_1$ & $+$ & $w_0$ \\
 $\zeta_{\infty}$ & $=$ & $w_2$ &  & &  & \\
 \end{tabular}
 \end{center}
 
 By adding the first and last equation to the equation in the middle the term $w_1$ can be isolated and all three coefficients solved for.  The simplicity
 of this system of equations has made Karatsuba fairly popular.  In fact the cutoff point is often fairly low\footnote{With LibTomMath 0.18 it is 70 and 109 digits for the Intel P4 and AMD Athlon respectively.}
-making it an ideal algorithm to speed up certain public key cryptosystems such as RSA and Diffie-Hellman.  It is worth noting that the point 
-$\zeta_1$ could be substituted for $-\zeta_{-1}$.  In this case the first and third row are subtracted instead of added to the second row.  
+making it an ideal algorithm to speed up certain public key cryptosystems such as RSA and Diffie-Hellman.  
 
 \newpage\begin{figure}[!here]
 \begin{small}
@@ -4137,13 +4161,13 @@
 Calculate the three products. \\
 8.  $x0y0 \leftarrow x0 \cdot y0$ (\textit{mp\_mul}) \\
 9.  $x1y1 \leftarrow x1 \cdot y1$ \\
-10.  $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\
-11.  $x0 \leftarrow y1 - y0$ \\
+10.  $t1 \leftarrow x1 + x0$ (\textit{mp\_add}) \\
+11.  $x0 \leftarrow y1 + y0$ \\
 12.  $t1 \leftarrow t1 \cdot x0$ \\
 \\
 Calculate the middle term. \\
 13.  $x0 \leftarrow x0y0 + x1y1$ \\
-14.  $t1 \leftarrow x0 - t1$ \\
+14.  $t1 \leftarrow t1 - x0$ (\textit{s\_mp\_sub}) \\
 \\
 Calculate the final product. \\
 15.  $t1 \leftarrow t1 \cdot \beta^B$ (\textit{mp\_lshd}) \\
@@ -4170,7 +4194,7 @@
 compute the lower halves.  Step 6 and 7 computer the upper halves.  
 
 After the halves have been computed the three intermediate half-size products must be computed.  Step 8 and 9 compute the trivial products
-$x0 \cdot y0$ and $x1 \cdot y1$.  The mp\_int $x0$ is used as a temporary variable after $x1 - x0$ has been computed.  By using $x0$ instead
+$x0 \cdot y0$ and $x1 \cdot y1$.  The mp\_int $x0$ is used as a temporary variable after $x1 + x0$ has been computed.  By using $x0$ instead
 of an additional temporary variable, the algorithm can avoid an addition memory allocation operation.
 
 The remaining steps 13 through 18 compute the Karatsuba polynomial through a variety of digit shifting and addition operations.
@@ -4191,12 +4215,12 @@
 025    * b = b1 * B**n + b0
 026    *
 027    * Then, a * b => 
-028      a1b1 * B**2n + ((a1 - a0)(b1 - b0) + a0b0 + a1b1) * B + a0b0
+028      a1b1 * B**2n + ((a1 + a0)(b1 + b0) - (a0b0 + a1b1)) * B + a0b0
 029    *
 030    * Note that a1b1 and a0b0 are used twice and only need to be 
 031    * computed once.  So in total three half size (half # of 
 032    * digit) multiplications are performed, a0b0, a1b1 and 
-033    * (a1-b1)(a0-b0)
+033    * (a1+b1)(a0+b0)
 034    *
 035    * Note that a multiplication of half the digits requires
 036    * 1/4th the number of single precision multiplications so in 
@@ -4287,19 +4311,19 @@
 121     if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
 122       goto X1Y1;          /* x1y1 = x1*y1 */
 123   
-124     /* now calc x1-x0 and y1-y0 */
-125     if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
+124     /* now calc x1+x0 and y1+y0 */
+125     if (s_mp_add (&x1, &x0, &t1) != MP_OKAY)
 126       goto X1Y1;          /* t1 = x1 - x0 */
-127     if (mp_sub (&y1, &y0, &x0) != MP_OKAY)
+127     if (s_mp_add (&y1, &y0, &x0) != MP_OKAY)
 128       goto X1Y1;          /* t2 = y1 - y0 */
 129     if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
-130       goto X1Y1;          /* t1 = (x1 - x0) * (y1 - y0) */
+130       goto X1Y1;          /* t1 = (x1 + x0) * (y1 + y0) */
 131   
 132     /* add x0y0 */
 133     if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
 134       goto X1Y1;          /* t2 = x0y0 + x1y1 */
-135     if (mp_sub (&x0, &t1, &t1) != MP_OKAY)
-136       goto X1Y1;          /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
+135     if (s_mp_sub (&t1, &x0, &t1) != MP_OKAY)
+136       goto X1Y1;          /* t1 = (x1+x0)*(y1+y0) - (x1y1 + x0y0) */
 137   
 138     /* shift by B */
 139     if (mp_lshd (&t1, B) != MP_OKAY)
@@ -4326,6 +4350,7 @@
 160     return err;
 161   \}
 162   #endif
+163   
 \end{alltt}
 \end{small}
 
@@ -4729,6 +4754,7 @@
 277   \}     
 278        
 279   #endif
+280   
 \end{alltt}
 \end{small}
 
@@ -4837,6 +4863,7 @@
 059     return res;
 060   \}
 061   #endif
+062   
 \end{alltt}
 \end{small}
 
@@ -5006,6 +5033,7 @@
 077     return MP_OKAY;
 078   \}
 079   #endif
+080   
 \end{alltt}
 \end{small}
 
@@ -5188,6 +5216,7 @@
 107     return MP_OKAY;
 108   \}
 109   #endif
+110   
 \end{alltt}
 \end{small}
 
@@ -5205,10 +5234,10 @@
 number with the following equation.
 
 \begin{equation}
-h(x) = a^2x^2 + \left (a^2 + b^2 - (a - b)^2 \right )x + b^2
+h(x) = a^2x^2 + \left ((a + b)^2 - (a^2 + b^2) \right )x + b^2
 \end{equation}
 
-Upon closer inspection this equation only requires the calculation of three half-sized squares: $a^2$, $b^2$ and $(a - b)^2$.  As in 
+Upon closer inspection this equation only requires the calculation of three half-sized squares: $a^2$, $b^2$ and $(a + b)^2$.  As in 
 Karatsuba multiplication, this algorithm can be applied recursively on the input and will achieve an asymptotic running time of 
 $O \left ( n^{lg(3)} \right )$.
 
@@ -5240,12 +5269,12 @@
 Calculate the three squares. \\
 6.  $x0x0 \leftarrow x0^2$ (\textit{mp\_sqr}) \\
 7.  $x1x1 \leftarrow x1^2$ \\
-8.  $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\
+8.  $t1 \leftarrow x1 + x0$ (\textit{s\_mp\_add}) \\
 9.  $t1 \leftarrow t1^2$ \\
 \\
 Compute the middle term. \\
 10.  $t2 \leftarrow x0x0 + x1x1$ (\textit{s\_mp\_add}) \\
-11.  $t1 \leftarrow t2 - t1$ \\
+11.  $t1 \leftarrow t1 - t2$ \\
 \\
 Compute final product. \\
 12.  $t1 \leftarrow t1\beta^B$ (\textit{mp\_lshd}) \\
@@ -5268,7 +5297,7 @@
 placed just below the middle.  Step 3, 4 and 5 compute the two halves required using $B$
 as the radix point.  The first two squares in steps 6 and 7 are rather straightforward while the last square is of a more compact form.
 
-By expanding $\left (x1 - x0 \right )^2$, the $x1^2$ and $x0^2$ terms in the middle disappear, that is $x1^2 + x0^2 - (x1 - x0)^2 = 2 \cdot x0 \cdot x1$.
+By expanding $\left (x1 + x0 \right )^2$, the $x1^2$ and $x0^2$ terms in the middle disappear, that is $(x0 - x1)^2 - (x1^2 + x0^2)  = 2 \cdot x0 \cdot x1$.
 Now if $5n$ single precision additions and a squaring of $n$-digits is faster than multiplying two $n$-digit numbers and doubling then
 this method is faster.  Assuming no further recursions occur, the difference can be estimated with the following inequality.
 
@@ -5363,8 +5392,8 @@
 079     if (mp_sqr (&x1, &x1x1) != MP_OKAY)
 080       goto X1X1;           /* x1x1 = x1*x1 */
 081   
-082     /* now calc (x1-x0)**2 */
-083     if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
+082     /* now calc (x1+x0)**2 */
+083     if (s_mp_add (&x1, &x0, &t1) != MP_OKAY)
 084       goto X1X1;           /* t1 = x1 - x0 */
 085     if (mp_sqr (&t1, &t1) != MP_OKAY)
 086       goto X1X1;           /* t1 = (x1 - x0) * (x1 - x0) */
@@ -5372,8 +5401,8 @@
 088     /* add x0y0 */
 089     if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
 090       goto X1X1;           /* t2 = x0x0 + x1x1 */
-091     if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
-092       goto X1X1;           /* t1 = x0x0 + x1x1 - (x1-x0)*(x1-x0) */
+091     if (s_mp_sub (&t1, &t2, &t1) != MP_OKAY)
+092       goto X1X1;           /* t1 = (x1+x0)**2 - (x0x0 + x1x1) */
 093   
 094     /* shift by B */
 095     if (mp_lshd (&t1, B) != MP_OKAY)
@@ -5398,6 +5427,7 @@
 114     return err;
 115   \}
 116   #endif
+117   
 \end{alltt}
 \end{small}
 
@@ -5494,6 +5524,7 @@
 051     return res;
 052   \}
 053   #endif
+054   
 \end{alltt}
 \end{small}
 
@@ -5827,6 +5858,7 @@
 093     return res;
 094   \}
 095   #endif
+096   
 \end{alltt}
 \end{small}
 
@@ -5879,6 +5911,7 @@
 027     return mp_div (a, b, a, NULL);
 028   \}
 029   #endif
+030   
 \end{alltt}
 \end{small}
 
@@ -5943,6 +5976,7 @@
 \hline $6$ & $x/2 = 139$ \\
 \hline $7$ & $x + n = 396$, $x/2 = 198$ \\
 \hline $8$ & $x/2 = 99$ \\
+\hline $9$ & $x + n = 356$, $x/2 = 178$ \\
 \hline
 \end{tabular}
 \end{center}
@@ -5951,8 +5985,8 @@
 \label{fig:MONT1}
 \end{figure}
 
-Consider the example in figure~\ref{fig:MONT1} which reduces $x = 5555$ modulo $n = 257$ when $k = 8$.  The result of the algorithm $r = 99$ is
-congruent to the value of $2^{-8} \cdot 5555 \mbox{ (mod }257\mbox{)}$.  When $r$ is multiplied by $2^8$ modulo $257$ the correct residue 
+Consider the example in figure~\ref{fig:MONT1} which reduces $x = 5555$ modulo $n = 257$ when $k = 9$ (note $\beta^k = 512$ which is larger than $n$).  The result of 
+the algorithm $r = 178$ is congruent to the value of $2^{-9} \cdot 5555 \mbox{ (mod }257\mbox{)}$.  When $r$ is multiplied by $2^9$ modulo $257$ the correct residue 
 $r \equiv 158$ is produced.  
 
 Let $k = \lfloor lg(n) \rfloor + 1$ represent the number of bits in $n$.  The current algorithm requires $2k^2$ single precision shifts
@@ -5964,10 +5998,10 @@
 \begin{center}
 \begin{tabular}{l}
 \hline Algorithm \textbf{Montgomery Reduction} (modified I). \\
-\textbf{Input}.   Integer $x$, $n$ and $k$ \\
+\textbf{Input}.   Integer $x$, $n$ and $k$ ($2^k > n$) \\
 \textbf{Output}.  $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\
 \hline \\
-1.  for $t$ from $0$ to $k - 1$ do \\
+1.  for $t$ from $1$ to $k$ do \\
 \hspace{3mm}1.1  If the $t$'th bit of $x$ is one then \\
 \hspace{6mm}1.1.1  $x \leftarrow x + 2^tn$ \\
 2.  Return $x/2^k$. \\
@@ -5995,7 +6029,8 @@
 \hline $6$ & $8896$ & $10001011000000$ \\
 \hline $7$ & $x + 2^{6}n = 25344$ & $110001100000000$ \\
 \hline $8$ & $25344$ & $110001100000000$ \\
-\hline -- & $x/2^k = 99$ & \\
+\hline $9$ & $x + 2^{7}n = 91136$ & $10110010000000000$ \\
+\hline -- & $x/2^k = 178$ & \\
 \hline
 \end{tabular}
 \end{center}
@@ -6004,7 +6039,7 @@
 \label{fig:MONT2}
 \end{figure}
 
-Figure~\ref{fig:MONT2} demonstrates the modified algorithm reducing $x = 5555$ modulo $n = 257$ with $k = 8$. 
+Figure~\ref{fig:MONT2} demonstrates the modified algorithm reducing $x = 5555$ modulo $n = 257$ with $k = 9$. 
 With this algorithm a single shift right at the end is the only right shift required to reduce the input instead of $k$ right shifts inside the 
 loop.  Note that for the iterations $t = 2, 5, 6$ and $8$ where the result $x$ is not changed.  In those iterations the $t$'th bit of $x$ is 
 zero and the appropriate multiple of $n$ does not need to be added to force the $t$'th bit of the result to zero.  
@@ -6018,7 +6053,7 @@
 \begin{center}
 \begin{tabular}{l}
 \hline Algorithm \textbf{Montgomery Reduction} (modified II). \\
-\textbf{Input}.   Integer $x$, $n$ and $k$ \\
+\textbf{Input}.   Integer $x$, $n$ and $k$ ($\beta^k > n$) \\
 \textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
 \hline \\
 1.  for $t$ from $0$ to $k - 1$ do \\
@@ -6234,6 +6269,7 @@
 111     return MP_OKAY;
 112   \}
 113   #endif
+114   
 \end{alltt}
 \end{small}
 
@@ -6478,6 +6514,7 @@
 165     return MP_OKAY;
 166   \}
 167   #endif
+168   
 \end{alltt}
 \end{small}
 
@@ -6505,7 +6542,7 @@
 \hline \\
 1.  $b \leftarrow n_0$ \\
 2.  If $b$ is even return(\textit{MP\_VAL}) \\
-3.  $x \leftarrow ((b + 2) \mbox{ AND } 4) << 1) + b$ \\
+3.  $x \leftarrow (((b + 2) \mbox{ AND } 4) << 1) + b$ \\
 4.  for $k$ from 0 to $\lceil lg(lg(\beta)) \rceil - 2$ do \\
 \hspace{3mm}4.1  $x \leftarrow x \cdot (2 - bx)$ \\
 5.  $\rho \leftarrow \beta - x \mbox{ (mod }\beta\mbox{)}$ \\
@@ -6559,11 +6596,13 @@
 047   #endif
 048   
 049     /* rho = -1/m mod b */
-050     *rho = (((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
+050     *rho = (unsigned long)(((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MAS
+      K;
 051   
 052     return MP_OKAY;
 053   \}
 054   #endif
+055   
 \end{alltt}
 \end{small}
 
@@ -6830,6 +6869,7 @@
 087     return MP_OKAY;
 088   \}
 089   #endif
+090   
 \end{alltt}
 \end{small}
 
@@ -6885,6 +6925,7 @@
 025   \}
 026   
 027   #endif
+028   
 \end{alltt}
 \end{small}
 
@@ -6943,6 +6984,7 @@
 036   \}
 037   
 038   #endif
+039   
 \end{alltt}
 \end{small}
 
@@ -7027,6 +7069,7 @@
 054   \}
 055   
 056   #endif
+057   
 \end{alltt}
 \end{small}
 
@@ -7096,6 +7139,7 @@
 040      return MP_OKAY;
 041   \}
 042   #endif
+043   
 \end{alltt}
 \end{small}
 
@@ -7172,6 +7216,7 @@
 045   \}
 046   
 047   #endif
+048   
 \end{alltt}
 \end{small}
 
@@ -7381,6 +7426,7 @@
 050     return MP_OKAY;
 051   \}
 052   #endif
+053   
 \end{alltt}
 \end{small}
 
@@ -7620,7 +7666,8 @@
 065     \}
 066   
 067   /* modified diminished radix reduction */
-068   #if defined(BN_MP_REDUCE_IS_2K_L_C) && defined(BN_MP_REDUCE_2K_L_C)
+068   #if defined(BN_MP_REDUCE_IS_2K_L_C) && defined(BN_MP_REDUCE_2K_L_C) && defin
+      ed(BN_S_MP_EXPTMOD_C)
 069     if (mp_reduce_is_2k_l(P) == MP_YES) \{
 070        return s_mp_exptmod(G, X, P, Y, 1);
 071     \}
@@ -7660,6 +7707,7 @@
 105   \}
 106   
 107   #endif
+108   
 \end{alltt}
 \end{small}
 
@@ -7839,252 +7887,252 @@
 \hspace{-5.1mm}{\bf File}: bn\_s\_mp\_exptmod.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   #ifdef MP_LOW_MEM
-018      #define TAB_SIZE 32
-019   #else
-020      #define TAB_SIZE 256
-021   #endif
-022   
-023   int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmod
+016   #ifdef MP_LOW_MEM
+017      #define TAB_SIZE 32
+018   #else
+019      #define TAB_SIZE 256
+020   #endif
+021   
+022   int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmod
       e)
-024   \{
-025     mp_int  M[TAB_SIZE], res, mu;
-026     mp_digit buf;
-027     int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
-028     int (*redux)(mp_int*,mp_int*,mp_int*);
-029   
-030     /* find window size */
-031     x = mp_count_bits (X);
-032     if (x <= 7) \{
-033       winsize = 2;
-034     \} else if (x <= 36) \{
-035       winsize = 3;
-036     \} else if (x <= 140) \{
-037       winsize = 4;
-038     \} else if (x <= 450) \{
-039       winsize = 5;
-040     \} else if (x <= 1303) \{
-041       winsize = 6;
-042     \} else if (x <= 3529) \{
-043       winsize = 7;
-044     \} else \{
-045       winsize = 8;
-046     \}
-047   
-048   #ifdef MP_LOW_MEM
-049       if (winsize > 5) \{
-050          winsize = 5;
-051       \}
-052   #endif
-053   
-054     /* init M array */
-055     /* init first cell */
-056     if ((err = mp_init(&M[1])) != MP_OKAY) \{
-057        return err; 
-058     \}
-059   
-060     /* now init the second half of the array */
-061     for (x = 1<<(winsize-1); x < (1 << winsize); x++) \{
-062       if ((err = mp_init(&M[x])) != MP_OKAY) \{
-063         for (y = 1<<(winsize-1); y < x; y++) \{
-064           mp_clear (&M[y]);
-065         \}
-066         mp_clear(&M[1]);
-067         return err;
-068       \}
-069     \}
-070   
-071     /* create mu, used for Barrett reduction */
-072     if ((err = mp_init (&mu)) != MP_OKAY) \{
-073       goto LBL_M;
-074     \}
-075     
-076     if (redmode == 0) \{
-077        if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) \{
-078           goto LBL_MU;
-079        \}
-080        redux = mp_reduce;
-081     \} else \{
-082        if ((err = mp_reduce_2k_setup_l (P, &mu)) != MP_OKAY) \{
-083           goto LBL_MU;
-084        \}
-085        redux = mp_reduce_2k_l;
-086     \}    
-087   
-088     /* create M table
-089      *
-090      * The M table contains powers of the base, 
-091      * e.g. M[x] = G**x mod P
-092      *
-093      * The first half of the table is not 
-094      * computed though accept for M[0] and M[1]
-095      */
-096     if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) \{
-097       goto LBL_MU;
-098     \}
-099   
-100     /* compute the value at M[1<<(winsize-1)] by squaring 
-101      * M[1] (winsize-1) times 
-102      */
-103     if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) \{
-104       goto LBL_MU;
-105     \}
-106   
-107     for (x = 0; x < (winsize - 1); x++) \{
-108       /* square it */
-109       if ((err = mp_sqr (&M[1 << (winsize - 1)], 
-110                          &M[1 << (winsize - 1)])) != MP_OKAY) \{
-111         goto LBL_MU;
-112       \}
-113   
-114       /* reduce modulo P */
-115       if ((err = redux (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) \{
-116         goto LBL_MU;
-117       \}
-118     \}
-119   
-120     /* create upper table, that is M[x] = M[x-1] * M[1] (mod P)
-121      * for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
-122      */
-123     for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) \{
-124       if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) \{
-125         goto LBL_MU;
-126       \}
-127       if ((err = redux (&M[x], P, &mu)) != MP_OKAY) \{
-128         goto LBL_MU;
-129       \}
-130     \}
-131   
-132     /* setup result */
-133     if ((err = mp_init (&res)) != MP_OKAY) \{
-134       goto LBL_MU;
-135     \}
-136     mp_set (&res, 1);
-137   
-138     /* set initial mode and bit cnt */
-139     mode   = 0;
-140     bitcnt = 1;
-141     buf    = 0;
-142     digidx = X->used - 1;
-143     bitcpy = 0;
-144     bitbuf = 0;
-145   
-146     for (;;) \{
-147       /* grab next digit as required */
-148       if (--bitcnt == 0) \{
-149         /* if digidx == -1 we are out of digits */
-150         if (digidx == -1) \{
-151           break;
-152         \}
-153         /* read next digit and reset the bitcnt */
-154         buf    = X->dp[digidx--];
-155         bitcnt = (int) DIGIT_BIT;
-156       \}
-157   
-158       /* grab the next msb from the exponent */
-159       y     = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
-160       buf <<= (mp_digit)1;
-161   
-162       /* if the bit is zero and mode == 0 then we ignore it
-163        * These represent the leading zero bits before the first 1 bit
-164        * in the exponent.  Technically this opt is not required but it
-165        * does lower the # of trivial squaring/reductions used
-166        */
-167       if (mode == 0 && y == 0) \{
-168         continue;
-169       \}
-170   
-171       /* if the bit is zero and mode == 1 then we square */
-172       if (mode == 1 && y == 0) \{
-173         if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
-174           goto LBL_RES;
-175         \}
-176         if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
-177           goto LBL_RES;
-178         \}
-179         continue;
-180       \}
-181   
-182       /* else we add it to the window */
-183       bitbuf |= (y << (winsize - ++bitcpy));
-184       mode    = 2;
-185   
-186       if (bitcpy == winsize) \{
-187         /* ok window is filled so square as required and multiply  */
-188         /* square first */
-189         for (x = 0; x < winsize; x++) \{
-190           if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
-191             goto LBL_RES;
-192           \}
-193           if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
-194             goto LBL_RES;
-195           \}
-196         \}
-197   
-198         /* then multiply */
-199         if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) \{
-200           goto LBL_RES;
-201         \}
-202         if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
-203           goto LBL_RES;
-204         \}
-205   
-206         /* empty window and reset */
-207         bitcpy = 0;
-208         bitbuf = 0;
-209         mode   = 1;
-210       \}
-211     \}
-212   
-213     /* if bits remain then square/multiply */
-214     if (mode == 2 && bitcpy > 0) \{
-215       /* square then multiply if the bit is set */
-216       for (x = 0; x < bitcpy; x++) \{
-217         if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
-218           goto LBL_RES;
-219         \}
-220         if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
-221           goto LBL_RES;
-222         \}
-223   
-224         bitbuf <<= 1;
-225         if ((bitbuf & (1 << winsize)) != 0) \{
-226           /* then multiply */
-227           if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) \{
-228             goto LBL_RES;
-229           \}
-230           if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
-231             goto LBL_RES;
-232           \}
-233         \}
-234       \}
-235     \}
-236   
-237     mp_exch (&res, Y);
-238     err = MP_OKAY;
-239   LBL_RES:mp_clear (&res);
-240   LBL_MU:mp_clear (&mu);
-241   LBL_M:
-242     mp_clear(&M[1]);
-243     for (x = 1<<(winsize-1); x < (1 << winsize); x++) \{
-244       mp_clear (&M[x]);
-245     \}
-246     return err;
-247   \}
-248   #endif
+023   \{
+024     mp_int  M[TAB_SIZE], res, mu;
+025     mp_digit buf;
+026     int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+027     int (*redux)(mp_int*,mp_int*,mp_int*);
+028   
+029     /* find window size */
+030     x = mp_count_bits (X);
+031     if (x <= 7) \{
+032       winsize = 2;
+033     \} else if (x <= 36) \{
+034       winsize = 3;
+035     \} else if (x <= 140) \{
+036       winsize = 4;
+037     \} else if (x <= 450) \{
+038       winsize = 5;
+039     \} else if (x <= 1303) \{
+040       winsize = 6;
+041     \} else if (x <= 3529) \{
+042       winsize = 7;
+043     \} else \{
+044       winsize = 8;
+045     \}
+046   
+047   #ifdef MP_LOW_MEM
+048       if (winsize > 5) \{
+049          winsize = 5;
+050       \}
+051   #endif
+052   
+053     /* init M array */
+054     /* init first cell */
+055     if ((err = mp_init(&M[1])) != MP_OKAY) \{
+056        return err; 
+057     \}
+058   
+059     /* now init the second half of the array */
+060     for (x = 1<<(winsize-1); x < (1 << winsize); x++) \{
+061       if ((err = mp_init(&M[x])) != MP_OKAY) \{
+062         for (y = 1<<(winsize-1); y < x; y++) \{
+063           mp_clear (&M[y]);
+064         \}
+065         mp_clear(&M[1]);
+066         return err;
+067       \}
+068     \}
+069   
+070     /* create mu, used for Barrett reduction */
+071     if ((err = mp_init (&mu)) != MP_OKAY) \{
+072       goto LBL_M;
+073     \}
+074     
+075     if (redmode == 0) \{
+076        if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) \{
+077           goto LBL_MU;
+078        \}
+079        redux = mp_reduce;
+080     \} else \{
+081        if ((err = mp_reduce_2k_setup_l (P, &mu)) != MP_OKAY) \{
+082           goto LBL_MU;
+083        \}
+084        redux = mp_reduce_2k_l;
+085     \}    
+086   
+087     /* create M table
+088      *
+089      * The M table contains powers of the base, 
+090      * e.g. M[x] = G**x mod P
+091      *
+092      * The first half of the table is not 
+093      * computed though accept for M[0] and M[1]
+094      */
+095     if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) \{
+096       goto LBL_MU;
+097     \}
+098   
+099     /* compute the value at M[1<<(winsize-1)] by squaring 
+100      * M[1] (winsize-1) times 
+101      */
+102     if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) \{
+103       goto LBL_MU;
+104     \}
+105   
+106     for (x = 0; x < (winsize - 1); x++) \{
+107       /* square it */
+108       if ((err = mp_sqr (&M[1 << (winsize - 1)], 
+109                          &M[1 << (winsize - 1)])) != MP_OKAY) \{
+110         goto LBL_MU;
+111       \}
+112   
+113       /* reduce modulo P */
+114       if ((err = redux (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) \{
+115         goto LBL_MU;
+116       \}
+117     \}
+118   
+119     /* create upper table, that is M[x] = M[x-1] * M[1] (mod P)
+120      * for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
+121      */
+122     for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) \{
+123       if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) \{
+124         goto LBL_MU;
+125       \}
+126       if ((err = redux (&M[x], P, &mu)) != MP_OKAY) \{
+127         goto LBL_MU;
+128       \}
+129     \}
+130   
+131     /* setup result */
+132     if ((err = mp_init (&res)) != MP_OKAY) \{
+133       goto LBL_MU;
+134     \}
+135     mp_set (&res, 1);
+136   
+137     /* set initial mode and bit cnt */
+138     mode   = 0;
+139     bitcnt = 1;
+140     buf    = 0;
+141     digidx = X->used - 1;
+142     bitcpy = 0;
+143     bitbuf = 0;
+144   
+145     for (;;) \{
+146       /* grab next digit as required */
+147       if (--bitcnt == 0) \{
+148         /* if digidx == -1 we are out of digits */
+149         if (digidx == -1) \{
+150           break;
+151         \}
+152         /* read next digit and reset the bitcnt */
+153         buf    = X->dp[digidx--];
+154         bitcnt = (int) DIGIT_BIT;
+155       \}
+156   
+157       /* grab the next msb from the exponent */
+158       y     = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
+159       buf <<= (mp_digit)1;
+160   
+161       /* if the bit is zero and mode == 0 then we ignore it
+162        * These represent the leading zero bits before the first 1 bit
+163        * in the exponent.  Technically this opt is not required but it
+164        * does lower the # of trivial squaring/reductions used
+165        */
+166       if (mode == 0 && y == 0) \{
+167         continue;
+168       \}
+169   
+170       /* if the bit is zero and mode == 1 then we square */
+171       if (mode == 1 && y == 0) \{
+172         if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
+173           goto LBL_RES;
+174         \}
+175         if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
+176           goto LBL_RES;
+177         \}
+178         continue;
+179       \}
+180   
+181       /* else we add it to the window */
+182       bitbuf |= (y << (winsize - ++bitcpy));
+183       mode    = 2;
+184   
+185       if (bitcpy == winsize) \{
+186         /* ok window is filled so square as required and multiply  */
+187         /* square first */
+188         for (x = 0; x < winsize; x++) \{
+189           if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
+190             goto LBL_RES;
+191           \}
+192           if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
+193             goto LBL_RES;
+194           \}
+195         \}
+196   
+197         /* then multiply */
+198         if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) \{
+199           goto LBL_RES;
+200         \}
+201         if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
+202           goto LBL_RES;
+203         \}
+204   
+205         /* empty window and reset */
+206         bitcpy = 0;
+207         bitbuf = 0;
+208         mode   = 1;
+209       \}
+210     \}
+211   
+212     /* if bits remain then square/multiply */
+213     if (mode == 2 && bitcpy > 0) \{
+214       /* square then multiply if the bit is set */
+215       for (x = 0; x < bitcpy; x++) \{
+216         if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
+217           goto LBL_RES;
+218         \}
+219         if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
+220           goto LBL_RES;
+221         \}
+222   
+223         bitbuf <<= 1;
+224         if ((bitbuf & (1 << winsize)) != 0) \{
+225           /* then multiply */
+226           if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) \{
+227             goto LBL_RES;
+228           \}
+229           if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
+230             goto LBL_RES;
+231           \}
+232         \}
+233       \}
+234     \}
+235   
+236     mp_exch (&res, Y);
+237     err = MP_OKAY;
+238   LBL_RES:mp_clear (&res);
+239   LBL_MU:mp_clear (&mu);
+240   LBL_M:
+241     mp_clear(&M[1]);
+242     for (x = 1<<(winsize-1); x < (1 << winsize); x++) \{
+243       mp_clear (&M[x]);
+244     \}
+245     return err;
+246   \}
+247   #endif
+248   
 \end{alltt}
 \end{small}
 
-Lines 21 through 40 determine the optimal window size based on the length of the exponent in bits.  The window divisions are sorted
+Lines 31 through 45 determine the optimal window size based on the length of the exponent in bits.  The window divisions are sorted
 from smallest to greatest so that in each \textbf{if} statement only one condition must be tested.  For example, by the \textbf{if} statement 
-on line 32 the value of $x$ is already known to be greater than $140$.  
-
-The conditional piece of code beginning on line 48 allows the window size to be restricted to five bits.  This logic is used to ensure
+on line 37 the value of $x$ is already known to be greater than $140$.  
+
+The conditional piece of code beginning on line 47 allows the window size to be restricted to five bits.  This logic is used to ensure
 the table of precomputed powers of $G$ remains relatively small.  
 
-The for loop on line 61 initializes the $M$ array while lines 62 and 77 compute the value of $\mu$ required for
-Barrett reduction.  
+The for loop on line 60 initializes the $M$ array while lines 71 and 76 through 85 initialize the reduction
+function that will be used for this modulus.
 
 -- More later.
 
@@ -8146,6 +8194,7 @@
 041     return MP_OKAY;
 042   \}
 043   #endif
+044   
 \end{alltt}
 \end{small}
 
@@ -8666,6 +8715,7 @@
 285   #endif
 286   
 287   #endif
+288   
 \end{alltt}
 \end{small}
 
@@ -8677,23 +8727,23 @@
 mp_div(&a, &b, &c, NULL);  /* c = [a/b] */
 \end{verbatim}
 
-Lines 37 and 44 handle the two trivial cases of inputs which are division by zero and dividend smaller than the divisor 
-respectively.  After the two trivial cases all of the temporary variables are initialized.  Line 105 determines the sign of 
-the quotient and line 76 ensures that both $x$ and $y$ are positive.  
-
-The number of bits in the leading digit is calculated on line 105.  Implictly an mp\_int with $r$ digits will require $lg(\beta)(r-1) + k$ bits
+Lines 108 and 113 handle the two trivial cases of inputs which are division by zero and dividend smaller than the divisor 
+respectively.  After the two trivial cases all of the temporary variables are initialized.  Line 147 determines the sign of 
+the quotient and line 148 ensures that both $x$ and $y$ are positive.  
+
+The number of bits in the leading digit is calculated on line 151.  Implictly an mp\_int with $r$ digits will require $lg(\beta)(r-1) + k$ bits
 of precision which when reduced modulo $lg(\beta)$ produces the value of $k$.  In this case $k$ is the number of bits in the leading digit which is
 exactly what is required.  For the algorithm to operate $k$ must equal $lg(\beta) - 1$ and when it does not the inputs must be normalized by shifting
 them to the left by $lg(\beta) - 1 - k$ bits.
 
 Throughout the variables $n$ and $t$ will represent the highest digit of $x$ and $y$ respectively.  These are first used to produce the 
-leading digit of the quotient.  The loop beginning on line 183 will produce the remainder of the quotient digits.
-
-The conditional ``continue'' on line 114 is used to prevent the algorithm from reading past the leading edge of $x$ which can occur when the
+leading digit of the quotient.  The loop beginning on line 184 will produce the remainder of the quotient digits.
+
+The conditional ``continue'' on line 186 is used to prevent the algorithm from reading past the leading edge of $x$ which can occur when the
 algorithm eliminates multiple non-zero digits in a single iteration.  This ensures that $x_i$ is always non-zero since by definition the digits
 above the $i$'th position $x$ must be zero in order for the quotient to be precise\footnote{Precise as far as integer division is concerned.}.  
 
-Lines 130, 130 and 134 through 134 manually construct the high accuracy estimations by setting the digits of the two mp\_int 
+Lines 214, 216 and 222 through 225 manually construct the high accuracy estimations by setting the digits of the two mp\_int 
 variables directly.  
 
 \section{Single Digit Helpers}
@@ -8757,69 +8807,73 @@
 039        /* fix sign  */
 040        a->sign = c->sign = MP_NEG;
 041   
-042        return res;
-043     \}
+042        /* clamp */
+043        mp_clamp(c);
 044   
-045     /* old number of used digits in c */
-046     oldused = c->used;
+045        return res;
+046     \}
 047   
-048     /* sign always positive */
-049     c->sign = MP_ZPOS;
+048     /* old number of used digits in c */
+049     oldused = c->used;
 050   
-051     /* source alias */
-052     tmpa    = a->dp;
+051     /* sign always positive */
+052     c->sign = MP_ZPOS;
 053   
-054     /* destination alias */
-055     tmpc    = c->dp;
+054     /* source alias */
+055     tmpa    = a->dp;
 056   
-057     /* if a is positive */
-058     if (a->sign == MP_ZPOS) \{
-059        /* add digit, after this we're propagating
-060         * the carry.
-061         */
-062        *tmpc   = *tmpa++ + b;
-063        mu      = *tmpc >> DIGIT_BIT;
-064        *tmpc++ &= MP_MASK;
-065   
-066        /* now handle rest of the digits */
-067        for (ix = 1; ix < a->used; ix++) \{
-068           *tmpc   = *tmpa++ + mu;
-069           mu      = *tmpc >> DIGIT_BIT;
-070           *tmpc++ &= MP_MASK;
-071        \}
-072        /* set final carry */
-073        ix++;
-074        *tmpc++  = mu;
-075   
-076        /* setup size */
-077        c->used = a->used + 1;
-078     \} else \{
-079        /* a was negative and |a| < b */
-080        c->used  = 1;
-081   
-082        /* the result is a single digit */
-083        if (a->used == 1) \{
-084           *tmpc++  =  b - a->dp[0];
-085        \} else \{
-086           *tmpc++  =  b;
-087        \}
-088   
-089        /* setup count so the clearing of oldused
-090         * can fall through correctly
-091         */
-092        ix       = 1;
-093     \}
-094   
-095     /* now zero to oldused */
-096     while (ix++ < oldused) \{
-097        *tmpc++ = 0;
-098     \}
-099     mp_clamp(c);
-100   
-101     return MP_OKAY;
-102   \}
+057     /* destination alias */
+058     tmpc    = c->dp;
+059   
+060     /* if a is positive */
+061     if (a->sign == MP_ZPOS) \{
+062        /* add digit, after this we're propagating
+063         * the carry.
+064         */
+065        *tmpc   = *tmpa++ + b;
+066        mu      = *tmpc >> DIGIT_BIT;
+067        *tmpc++ &= MP_MASK;
+068   
+069        /* now handle rest of the digits */
+070        for (ix = 1; ix < a->used; ix++) \{
+071           *tmpc   = *tmpa++ + mu;
+072           mu      = *tmpc >> DIGIT_BIT;
+073           *tmpc++ &= MP_MASK;
+074        \}
+075        /* set final carry */
+076        ix++;
+077        *tmpc++  = mu;
+078   
+079        /* setup size */
+080        c->used = a->used + 1;
+081     \} else \{
+082        /* a was negative and |a| < b */
+083        c->used  = 1;
+084   
+085        /* the result is a single digit */
+086        if (a->used == 1) \{
+087           *tmpc++  =  b - a->dp[0];
+088        \} else \{
+089           *tmpc++  =  b;
+090        \}
+091   
+092        /* setup count so the clearing of oldused
+093         * can fall through correctly
+094         */
+095        ix       = 1;
+096     \}
+097   
+098     /* now zero to oldused */
+099     while (ix++ < oldused) \{
+100        *tmpc++ = 0;
+101     \}
+102     mp_clamp(c);
 103   
-104   #endif
+104     return MP_OKAY;
+105   \}
+106   
+107   #endif
+108   
 \end{alltt}
 \end{small}
 
@@ -8929,6 +8983,7 @@
 072     return MP_OKAY;
 073   \}
 074   #endif
+075   
 \end{alltt}
 \end{small}
 
@@ -9074,6 +9129,7 @@
 103   \}
 104   
 105   #endif
+106   
 \end{alltt}
 \end{small}
 
@@ -9260,6 +9316,7 @@
 125     return res;
 126   \}
 127   #endif
+128   
 \end{alltt}
 \end{small}
 
@@ -9336,6 +9393,7 @@
 048     return MP_OKAY;
 049   \}
 050   #endif
+051   
 \end{alltt}
 \end{small}
 
@@ -9425,61 +9483,65 @@
 020     int     y, res, neg;
 021     char    ch;
 022   
-023     /* make sure the radix is ok */
-024     if (radix < 2 || radix > 64) \{
-025       return MP_VAL;
-026     \}
-027   
-028     /* if the leading digit is a 
-029      * minus set the sign to negative. 
-030      */
-031     if (*str == '-') \{
-032       ++str;
-033       neg = MP_NEG;
-034     \} else \{
-035       neg = MP_ZPOS;
-036     \}
-037   
-038     /* set the integer to the default of zero */
-039     mp_zero (a);
-040     
-041     /* process each digit of the string */
-042     while (*str) \{
-043       /* if the radix < 36 the conversion is case insensitive
-044        * this allows numbers like 1AB and 1ab to represent the same  value
-045        * [e.g. in hex]
-046        */
-047       ch = (char) ((radix < 36) ? toupper (*str) : *str);
-048       for (y = 0; y < 64; y++) \{
-049         if (ch == mp_s_rmap[y]) \{
-050            break;
-051         \}
-052       \}
-053   
-054       /* if the char was found in the map 
-055        * and is less than the given radix add it
-056        * to the number, otherwise exit the loop. 
-057        */
-058       if (y < radix) \{
-059         if ((res = mp_mul_d (a, (mp_digit) radix, a)) != MP_OKAY) \{
-060            return res;
-061         \}
-062         if ((res = mp_add_d (a, (mp_digit) y, a)) != MP_OKAY) \{
+023     /* zero the digit bignum */
+024     mp_zero(a);
+025   
+026     /* make sure the radix is ok */
+027     if (radix < 2 || radix > 64) \{
+028       return MP_VAL;
+029     \}
+030   
+031     /* if the leading digit is a 
+032      * minus set the sign to negative. 
+033      */
+034     if (*str == '-') \{
+035       ++str;
+036       neg = MP_NEG;
+037     \} else \{
+038       neg = MP_ZPOS;
+039     \}
+040   
+041     /* set the integer to the default of zero */
+042     mp_zero (a);
+043     
+044     /* process each digit of the string */
+045     while (*str) \{
+046       /* if the radix < 36 the conversion is case insensitive
+047        * this allows numbers like 1AB and 1ab to represent the same  value
+048        * [e.g. in hex]
+049        */
+050       ch = (char) ((radix < 36) ? toupper (*str) : *str);
+051       for (y = 0; y < 64; y++) \{
+052         if (ch == mp_s_rmap[y]) \{
+053            break;
+054         \}
+055       \}
+056   
+057       /* if the char was found in the map 
+058        * and is less than the given radix add it
+059        * to the number, otherwise exit the loop. 
+060        */
+061       if (y < radix) \{
+062         if ((res = mp_mul_d (a, (mp_digit) radix, a)) != MP_OKAY) \{
 063            return res;
 064         \}
-065       \} else \{
-066         break;
-067       \}
-068       ++str;
-069     \}
-070     
-071     /* set the sign only if a != 0 */
-072     if (mp_iszero(a) != 1) \{
-073        a->sign = neg;
-074     \}
-075     return MP_OKAY;
-076   \}
-077   #endif
+065         if ((res = mp_add_d (a, (mp_digit) y, a)) != MP_OKAY) \{
+066            return res;
+067         \}
+068       \} else \{
+069         break;
+070       \}
+071       ++str;
+072     \}
+073     
+074     /* set the sign only if a != 0 */
+075     if (mp_iszero(a) != 1) \{
+076        a->sign = neg;
+077     \}
+078     return MP_OKAY;
+079   \}
+080   #endif
+081   
 \end{alltt}
 \end{small}
 
@@ -9599,6 +9661,7 @@
 068   \}
 069   
 070   #endif
+071   
 \end{alltt}
 \end{small}
 
@@ -9728,33 +9791,30 @@
 \textbf{Input}.   mp\_int $a$ and $b$ \\
 \textbf{Output}.  The greatest common divisor $c = (a, b)$.  \\
 \hline \\
-1.  If $a = 0$ and $b \ne 0$ then \\
-\hspace{3mm}1.1  $c \leftarrow b$ \\
+1.  If $a = 0$ then \\
+\hspace{3mm}1.1  $c \leftarrow \vert b \vert $ \\
 \hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
-2.  If $a \ne 0$ and $b = 0$ then \\
-\hspace{3mm}2.1  $c \leftarrow a$ \\
+2.  If $b = 0$ then \\
+\hspace{3mm}2.1  $c \leftarrow \vert a \vert $ \\
 \hspace{3mm}2.2  Return(\textit{MP\_OKAY}). \\
-3.  If $a = b = 0$ then \\
-\hspace{3mm}3.1  $c \leftarrow 1$ \\
-\hspace{3mm}3.2  Return(\textit{MP\_OKAY}). \\
-4.  $u \leftarrow \vert a \vert, v \leftarrow \vert b \vert$ \\
-5.  $k \leftarrow 0$ \\
-6.  While $u.used > 0$ and $v.used > 0$ and $u_0 \equiv v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{3mm}6.1  $k \leftarrow k + 1$ \\
-\hspace{3mm}6.2  $u \leftarrow \lfloor u / 2 \rfloor$ \\
-\hspace{3mm}6.3  $v \leftarrow \lfloor v / 2 \rfloor$ \\
-7.  While $u.used > 0$ and $u_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{3mm}7.1  $u \leftarrow \lfloor u / 2 \rfloor$ \\
-8.  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{3mm}8.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
-9.  While $v.used > 0$ \\
-\hspace{3mm}9.1  If $\vert u \vert > \vert v \vert$ then \\
-\hspace{6mm}9.1.1  Swap $u$ and $v$. \\
-\hspace{3mm}9.2  $v \leftarrow \vert v \vert - \vert u \vert$ \\
-\hspace{3mm}9.3  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{6mm}9.3.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
-10.  $c \leftarrow u \cdot 2^k$ \\
-11.  Return(\textit{MP\_OKAY}). \\
+3.  $u \leftarrow \vert a \vert, v \leftarrow \vert b \vert$ \\
+4.  $k \leftarrow 0$ \\
+5.  While $u.used > 0$ and $v.used > 0$ and $u_0 \equiv v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}5.1  $k \leftarrow k + 1$ \\
+\hspace{3mm}5.2  $u \leftarrow \lfloor u / 2 \rfloor$ \\
+\hspace{3mm}5.3  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+6.  While $u.used > 0$ and $u_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}6.1  $u \leftarrow \lfloor u / 2 \rfloor$ \\
+7.  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}7.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+8.  While $v.used > 0$ \\
+\hspace{3mm}8.1  If $\vert u \vert > \vert v \vert$ then \\
+\hspace{6mm}8.1.1  Swap $u$ and $v$. \\
+\hspace{3mm}8.2  $v \leftarrow \vert v \vert - \vert u \vert$ \\
+\hspace{3mm}8.3  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{6mm}8.3.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+9.  $c \leftarrow u \cdot 2^k$ \\
+10.  Return(\textit{MP\_OKAY}). \\
 \hline
 \end{tabular}
 \end{center}
@@ -9766,17 +9826,17 @@
 Knuth \cite[pp. 338]{TAOCPV2} but has been modified to be simpler to explain.  In theory it achieves the same asymptotic working time as
 Algorithm B and in practice this appears to be true.  
 
-The first three steps handle the cases where either one of or both inputs are zero.  If either input is zero the greatest common divisor is the 
+The first two steps handle the cases where either one of or both inputs are zero.  If either input is zero the greatest common divisor is the 
 largest input or zero if they are both zero.  If the inputs are not trivial than $u$ and $v$ are assigned the absolute values of 
 $a$ and $b$ respectively and the algorithm will proceed to reduce the pair.
 
-Step six will divide out any common factors of two and keep track of the count in the variable $k$.  After this step two is no longer a
+Step five will divide out any common factors of two and keep track of the count in the variable $k$.  After this step, two is no longer a
 factor of the remaining greatest common divisor between $u$ and $v$ and can be safely evenly divided out of either whenever they are even.  Step 
-seven and eight ensure that the $u$ and $v$ respectively have no more factors of two.  At most only one of the while loops will iterate since 
+six and seven ensure that the $u$ and $v$ respectively have no more factors of two.  At most only one of the while--loops will iterate since 
 they cannot both be even.
 
-By step nine both of $u$ and $v$ are odd which is required for the inner logic.  First the pair are swapped such that $v$ is equal to
-or greater than $u$.  This ensures that the subtraction on step 9.2 will always produce a positive and even result.  Step 9.3 removes any
+By step eight both of $u$ and $v$ are odd which is required for the inner logic.  First the pair are swapped such that $v$ is equal to
+or greater than $u$.  This ensures that the subtraction on step 8.2 will always produce a positive and even result.  Step 8.3 removes any
 factors of two from the difference $u$ to ensure that in the next iteration of the loop both are once again odd.
 
 After $v = 0$ occurs the variable $u$ has the greatest common divisor of the pair $\left < u, v \right >$ just after step six.  The result
@@ -9794,108 +9854,101 @@
 021     int     k, u_lsb, v_lsb, res;
 022   
 023     /* either zero than gcd is the largest */
-024     if (mp_iszero (a) == 1 && mp_iszero (b) == 0) \{
+024     if (mp_iszero (a) == MP_YES) \{
 025       return mp_abs (b, c);
 026     \}
-027     if (mp_iszero (a) == 0 && mp_iszero (b) == 1) \{
+027     if (mp_iszero (b) == MP_YES) \{
 028       return mp_abs (a, c);
 029     \}
 030   
-031     /* optimized.  At this point if a == 0 then
-032      * b must equal zero too
-033      */
-034     if (mp_iszero (a) == 1) \{
-035       mp_zero(c);
-036       return MP_OKAY;
-037     \}
-038   
-039     /* get copies of a and b we can modify */
-040     if ((res = mp_init_copy (&u, a)) != MP_OKAY) \{
-041       return res;
-042     \}
-043   
-044     if ((res = mp_init_copy (&v, b)) != MP_OKAY) \{
-045       goto LBL_U;
-046     \}
+031     /* get copies of a and b we can modify */
+032     if ((res = mp_init_copy (&u, a)) != MP_OKAY) \{
+033       return res;
+034     \}
+035   
+036     if ((res = mp_init_copy (&v, b)) != MP_OKAY) \{
+037       goto LBL_U;
+038     \}
+039   
+040     /* must be positive for the remainder of the algorithm */
+041     u.sign = v.sign = MP_ZPOS;
+042   
+043     /* B1.  Find the common power of two for u and v */
+044     u_lsb = mp_cnt_lsb(&u);
+045     v_lsb = mp_cnt_lsb(&v);
+046     k     = MIN(u_lsb, v_lsb);
 047   
-048     /* must be positive for the remainder of the algorithm */
-049     u.sign = v.sign = MP_ZPOS;
-050   
-051     /* B1.  Find the common power of two for u and v */
-052     u_lsb = mp_cnt_lsb(&u);
-053     v_lsb = mp_cnt_lsb(&v);
-054     k     = MIN(u_lsb, v_lsb);
-055   
-056     if (k > 0) \{
-057        /* divide the power of two out */
-058        if ((res = mp_div_2d(&u, k, &u, NULL)) != MP_OKAY) \{
-059           goto LBL_V;
-060        \}
-061   
-062        if ((res = mp_div_2d(&v, k, &v, NULL)) != MP_OKAY) \{
-063           goto LBL_V;
-064        \}
-065     \}
-066   
-067     /* divide any remaining factors of two out */
-068     if (u_lsb != k) \{
-069        if ((res = mp_div_2d(&u, u_lsb - k, &u, NULL)) != MP_OKAY) \{
-070           goto LBL_V;
-071        \}
-072     \}
-073   
-074     if (v_lsb != k) \{
-075        if ((res = mp_div_2d(&v, v_lsb - k, &v, NULL)) != MP_OKAY) \{
-076           goto LBL_V;
+048     if (k > 0) \{
+049        /* divide the power of two out */
+050        if ((res = mp_div_2d(&u, k, &u, NULL)) != MP_OKAY) \{
+051           goto LBL_V;
+052        \}
+053   
+054        if ((res = mp_div_2d(&v, k, &v, NULL)) != MP_OKAY) \{
+055           goto LBL_V;
+056        \}
+057     \}
+058   
+059     /* divide any remaining factors of two out */
+060     if (u_lsb != k) \{
+061        if ((res = mp_div_2d(&u, u_lsb - k, &u, NULL)) != MP_OKAY) \{
+062           goto LBL_V;
+063        \}
+064     \}
+065   
+066     if (v_lsb != k) \{
+067        if ((res = mp_div_2d(&v, v_lsb - k, &v, NULL)) != MP_OKAY) \{
+068           goto LBL_V;
+069        \}
+070     \}
+071   
+072     while (mp_iszero(&v) == 0) \{
+073        /* make sure v is the largest */
+074        if (mp_cmp_mag(&u, &v) == MP_GT) \{
+075           /* swap u and v to make sure v is >= u */
+076           mp_exch(&u, &v);
 077        \}
-078     \}
-079   
-080     while (mp_iszero(&v) == 0) \{
-081        /* make sure v is the largest */
-082        if (mp_cmp_mag(&u, &v) == MP_GT) \{
-083           /* swap u and v to make sure v is >= u */
-084           mp_exch(&u, &v);
-085        \}
-086        
-087        /* subtract smallest from largest */
-088        if ((res = s_mp_sub(&v, &u, &v)) != MP_OKAY) \{
-089           goto LBL_V;
-090        \}
-091        
-092        /* Divide out all factors of two */
-093        if ((res = mp_div_2d(&v, mp_cnt_lsb(&v), &v, NULL)) != MP_OKAY) \{
-094           goto LBL_V;
-095        \} 
-096     \} 
-097   
-098     /* multiply by 2**k which we divided out at the beginning */
-099     if ((res = mp_mul_2d (&u, k, c)) != MP_OKAY) \{
-100        goto LBL_V;
-101     \}
-102     c->sign = MP_ZPOS;
-103     res = MP_OKAY;
-104   LBL_V:mp_clear (&u);
-105   LBL_U:mp_clear (&v);
-106     return res;
-107   \}
-108   #endif
+078        
+079        /* subtract smallest from largest */
+080        if ((res = s_mp_sub(&v, &u, &v)) != MP_OKAY) \{
+081           goto LBL_V;
+082        \}
+083        
+084        /* Divide out all factors of two */
+085        if ((res = mp_div_2d(&v, mp_cnt_lsb(&v), &v, NULL)) != MP_OKAY) \{
+086           goto LBL_V;
+087        \} 
+088     \} 
+089   
+090     /* multiply by 2**k which we divided out at the beginning */
+091     if ((res = mp_mul_2d (&u, k, c)) != MP_OKAY) \{
+092        goto LBL_V;
+093     \}
+094     c->sign = MP_ZPOS;
+095     res = MP_OKAY;
+096   LBL_V:mp_clear (&u);
+097   LBL_U:mp_clear (&v);
+098     return res;
+099   \}
+100   #endif
+101   
 \end{alltt}
 \end{small}
 
 This function makes use of the macros mp\_iszero and mp\_iseven.  The former evaluates to $1$ if the input mp\_int is equivalent to the 
 integer zero otherwise it evaluates to $0$.  The latter evaluates to $1$ if the input mp\_int represents a non-zero even integer otherwise
 it evaluates to $0$.  Note that just because mp\_iseven may evaluate to $0$ does not mean the input is odd, it could also be zero.  The three 
-trivial cases of inputs are handled on lines 24 through 37.  After those lines the inputs are assumed to be non-zero.
-
-Lines 34 and 40 make local copies $u$ and $v$ of the inputs $a$ and $b$ respectively.  At this point the common factors of two 
-must be divided out of the two inputs.  The while loop on line 80 iterates so long as both are even.  The local integer $k$ is used to
-keep track of how many factors of $2$ are pulled out of both values.  It is assumed that the number of factors will not exceed the maximum 
-value of a C ``int'' data type\footnote{Strictly speaking no array in C may have more than entries than are accessible by an ``int'' so this is not 
-a limitation.}.  
-
-At this point there are no more common factors of two in the two values.  The while loops on lines 80 and 80 remove any independent
-factors of two such that both $u$ and $v$ are guaranteed to be an odd integer before hitting the main body of the algorithm.  The while loop
-on line 80 performs the reduction of the pair until $v$ is equal to zero.  The unsigned comparison and subtraction algorithms are used in
+trivial cases of inputs are handled on lines 23 through 29.  After those lines the inputs are assumed to be non-zero.
+
+Lines 32 and 36 make local copies $u$ and $v$ of the inputs $a$ and $b$ respectively.  At this point the common factors of two 
+must be divided out of the two inputs.  The block starting at line 43 removes common factors of two by first counting the number of trailing
+zero bits in both.  The local integer $k$ is used to keep track of how many factors of $2$ are pulled out of both values.  It is assumed that 
+the number of factors will not exceed the maximum value of a C ``int'' data type\footnote{Strictly speaking no array in C may have more than 
+entries than are accessible by an ``int'' so this is not a limitation.}.  
+
+At this point there are no more common factors of two in the two values.  The divisions by a power of two on lines 61 and 67 remove 
+any independent factors of two such that both $u$ and $v$ are guaranteed to be an odd integer before hitting the main body of the algorithm.  The while loop
+on line 72 performs the reduction of the pair until $v$ is equal to zero.  The unsigned comparison and subtraction algorithms are used in
 place of the full signed routines since both values are guaranteed to be positive and the result of the subtraction is guaranteed to be non-negative.
 
 \section{Least Common Multiple}
@@ -9974,6 +10027,7 @@
 053     return res;
 054   \}
 055   #endif
+056   
 \end{alltt}
 \end{small}
 
@@ -10218,6 +10272,7 @@
 098     return res;
 099   \}
 100   #endif
+101   
 \end{alltt}
 \end{small}
 
@@ -10366,6 +10421,7 @@
 036     return MP_VAL;
 037   \}
 038   #endif
+039   
 \end{alltt}
 \end{small}
 
@@ -10467,6 +10523,7 @@
 043     return MP_OKAY;
 044   \}
 045   #endif
+046   
 \end{alltt}
 \end{small}
 
@@ -10518,6 +10575,7 @@
 054   #endif
 055   \};
 056   #endif
+057   
 \end{alltt}
 \end{small}
 
@@ -10606,6 +10664,7 @@
 055     return err;
 056   \}
 057   #endif
+058   
 \end{alltt}
 \end{small}
 
@@ -10741,6 +10800,7 @@
 096     return err;
 097   \}
 098   #endif
+099   
 \end{alltt}
 \end{small}