changeset 145:a96ff234ff19 libtommath

propagate of fc94c38452d9fd684a8e1eb9e3a73120aac0d38f and 3e4de4cbef3d9035a7b2f0c25e9f86e297f9f6d1 from branch 'au.asn.ucc.matt.ltm-orig' to 'au.asn.ucc.matt.ltm-db'
author Matt Johnston <matt@ucc.asn.au>
date Sun, 19 Dec 2004 15:57:19 +0000
parents d29b64170cf0 (diff) cc04b085e7dd (current diff)
children 81bc23421b45
files bn_mp_exptmod.c bn_mp_mul.c bn_mp_sqr.c bncore.c mtest/mpi-config.h mtest/mpi.c mtest/mpi.h pics/makefile tommath.h
diffstat 166 files changed, 35135 insertions(+), 3187 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bn.ilg	Sun Dec 19 15:57:19 2004 +0000
@@ -0,0 +1,6 @@
+This is makeindex, version 2.14 [02-Oct-2002] (kpathsea + Thai support).
+Scanning input file bn.idx....done (79 entries accepted, 0 rejected).
+Sorting entries....done (511 comparisons).
+Generating output file bn.ind....done (82 lines written, 0 warnings).
+Output written in bn.ind.
+Transcript written in bn.ilg.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bn.ind	Sun Dec 19 15:57:19 2004 +0000
@@ -0,0 +1,82 @@
+\begin{theindex}
+
+  \item mp\_add, \hyperpage{29}
+  \item mp\_add\_d, \hyperpage{52}
+  \item mp\_and, \hyperpage{29}
+  \item mp\_clear, \hyperpage{11}
+  \item mp\_clear\_multi, \hyperpage{12}
+  \item mp\_cmp, \hyperpage{24}
+  \item mp\_cmp\_d, \hyperpage{25}
+  \item mp\_cmp\_mag, \hyperpage{23}
+  \item mp\_div, \hyperpage{30}
+  \item mp\_div\_2, \hyperpage{26}
+  \item mp\_div\_2d, \hyperpage{28}
+  \item mp\_div\_d, \hyperpage{52}
+  \item mp\_dr\_reduce, \hyperpage{40}
+  \item mp\_dr\_setup, \hyperpage{40}
+  \item MP\_EQ, \hyperpage{22}
+  \item mp\_error\_to\_string, \hyperpage{10}
+  \item mp\_expt\_d, \hyperpage{43}
+  \item mp\_exptmod, \hyperpage{43}
+  \item mp\_exteuclid, \hyperpage{51}
+  \item mp\_gcd, \hyperpage{51}
+  \item mp\_get\_int, \hyperpage{20}
+  \item mp\_grow, \hyperpage{16}
+  \item MP\_GT, \hyperpage{22}
+  \item mp\_init, \hyperpage{11}
+  \item mp\_init\_copy, \hyperpage{13}
+  \item mp\_init\_multi, \hyperpage{12}
+  \item mp\_init\_set, \hyperpage{21}
+  \item mp\_init\_set\_int, \hyperpage{21}
+  \item mp\_init\_size, \hyperpage{14}
+  \item mp\_int, \hyperpage{10}
+  \item mp\_invmod, \hyperpage{52}
+  \item mp\_jacobi, \hyperpage{52}
+  \item mp\_lcm, \hyperpage{51}
+  \item mp\_lshd, \hyperpage{28}
+  \item MP\_LT, \hyperpage{22}
+  \item MP\_MEM, \hyperpage{9}
+  \item mp\_mod, \hyperpage{35}
+  \item mp\_mod\_d, \hyperpage{52}
+  \item mp\_montgomery\_calc\_normalization, \hyperpage{38}
+  \item mp\_montgomery\_reduce, \hyperpage{37}
+  \item mp\_montgomery\_setup, \hyperpage{37}
+  \item mp\_mul, \hyperpage{31}
+  \item mp\_mul\_2, \hyperpage{26}
+  \item mp\_mul\_2d, \hyperpage{28}
+  \item mp\_mul\_d, \hyperpage{52}
+  \item mp\_n\_root, \hyperpage{44}
+  \item mp\_neg, \hyperpage{29}
+  \item MP\_NO, \hyperpage{9}
+  \item MP\_OKAY, \hyperpage{9}
+  \item mp\_or, \hyperpage{29}
+  \item mp\_prime\_fermat, \hyperpage{45}
+  \item mp\_prime\_is\_divisible, \hyperpage{45}
+  \item mp\_prime\_is\_prime, \hyperpage{46}
+  \item mp\_prime\_miller\_rabin, \hyperpage{45}
+  \item mp\_prime\_next\_prime, \hyperpage{46}
+  \item mp\_prime\_rabin\_miller\_trials, \hyperpage{46}
+  \item mp\_prime\_random, \hyperpage{47}
+  \item mp\_prime\_random\_ex, \hyperpage{47}
+  \item mp\_radix\_size, \hyperpage{49}
+  \item mp\_read\_radix, \hyperpage{49}
+  \item mp\_read\_unsigned\_bin, \hyperpage{50}
+  \item mp\_reduce, \hyperpage{36}
+  \item mp\_reduce\_2k, \hyperpage{41}
+  \item mp\_reduce\_2k\_setup, \hyperpage{41}
+  \item mp\_reduce\_setup, \hyperpage{36}
+  \item mp\_rshd, \hyperpage{28}
+  \item mp\_set, \hyperpage{19}
+  \item mp\_set\_int, \hyperpage{20}
+  \item mp\_shrink, \hyperpage{15}
+  \item mp\_sqr, \hyperpage{33}
+  \item mp\_sub, \hyperpage{29}
+  \item mp\_sub\_d, \hyperpage{52}
+  \item mp\_to\_unsigned\_bin, \hyperpage{50}
+  \item mp\_toradix, \hyperpage{49}
+  \item mp\_unsigned\_bin\_size, \hyperpage{50}
+  \item MP\_VAL, \hyperpage{9}
+  \item mp\_xor, \hyperpage{29}
+  \item MP\_YES, \hyperpage{9}
+
+\end{theindex}
Binary file bn.pdf has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bn.tex	Sun Dec 19 15:57:19 2004 +0000
@@ -0,0 +1,1830 @@
+\documentclass[b5paper]{book}
+\usepackage{hyperref}
+\usepackage{makeidx}
+\usepackage{amssymb}
+\usepackage{color}
+\usepackage{alltt}
+\usepackage{graphicx}
+\usepackage{layout}
+\def\union{\cup}
+\def\intersect{\cap}
+\def\getsrandom{\stackrel{\rm R}{\gets}}
+\def\cross{\times}
+\def\cat{\hspace{0.5em} \| \hspace{0.5em}}
+\def\catn{$\|$}
+\def\divides{\hspace{0.3em} | \hspace{0.3em}}
+\def\nequiv{\not\equiv}
+\def\approx{\raisebox{0.2ex}{\mbox{\small $\sim$}}}
+\def\lcm{{\rm lcm}}
+\def\gcd{{\rm gcd}}
+\def\log{{\rm log}}
+\def\ord{{\rm ord}}
+\def\abs{{\mathit abs}}
+\def\rep{{\mathit rep}}
+\def\mod{{\mathit\ mod\ }}
+\renewcommand{\pmod}[1]{\ ({\rm mod\ }{#1})}
+\newcommand{\floor}[1]{\left\lfloor{#1}\right\rfloor}
+\newcommand{\ceil}[1]{\left\lceil{#1}\right\rceil}
+\def\Or{{\rm\ or\ }}
+\def\And{{\rm\ and\ }}
+\def\iff{\hspace{1em}\Longleftrightarrow\hspace{1em}}
+\def\implies{\Rightarrow}
+\def\undefined{{\rm ``undefined"}}
+\def\Proof{\vspace{1ex}\noindent {\bf Proof:}\hspace{1em}}
+\let\oldphi\phi
+\def\phi{\varphi}
+\def\Pr{{\rm Pr}}
+\newcommand{\str}[1]{{\mathbf{#1}}}
+\def\F{{\mathbb F}}
+\def\N{{\mathbb N}}
+\def\Z{{\mathbb Z}}
+\def\R{{\mathbb R}}
+\def\C{{\mathbb C}}
+\def\Q{{\mathbb Q}}
+\definecolor{DGray}{gray}{0.5}
+\newcommand{\emailaddr}[1]{\mbox{$<${#1}$>$}}
+\def\twiddle{\raisebox{0.3ex}{\mbox{\tiny $\sim$}}}
+\def\gap{\vspace{0.5ex}}
+\makeindex
+\begin{document}
+\frontmatter
+\pagestyle{empty}
+\title{LibTomMath User Manual \\ v0.32}
+\author{Tom St Denis \\ [email protected]}
+\maketitle
+This text, the library and the accompanying textbook are all hereby placed in the public domain.  This book has been 
+formatted for B5 [176x250] paper using the \LaTeX{} {\em book} macro package.
+
+\vspace{10cm}
+
+\begin{flushright}Open Source.  Open Academia.  Open Minds.
+
+\mbox{ }
+
+Tom St Denis,
+
+Ontario, Canada
+\end{flushright}
+
+\tableofcontents
+\listoffigures
+\mainmatter
+\pagestyle{headings}
+\chapter{Introduction}
+\section{What is LibTomMath?}
+LibTomMath is a library of source code which provides a series of efficient and carefully written functions for manipulating
+large integer numbers.  It was written in portable ISO C source code so that it will build on any platform with a conforming
+C compiler.  
+
+In a nutshell the library was written from scratch with verbose comments to help instruct computer science students how
+to implement ``bignum'' math.  However, the resulting code has proven to be very useful.  It has been used by numerous 
+universities, commercial and open source software developers.  It has been used on a variety of platforms ranging from
+Linux and Windows based x86 to ARM based Gameboys and PPC based MacOS machines.  
+
+\section{License}
+As of v0.25 the library source code has been placed in the public domain with every new release.  As of the v0.28
+release the textbook ``Implementing Multiple Precision Arithmetic'' has been placed in the public domain with every new
+release as well.  This textbook is meant to complement the project by providing a more solid walkthrough of the development
+algorithms used in the library.
+
+Since both\footnote{Note that the MPI files under mtest/ are copyrighted by Michael Fromberger.  They are not required to use LibTomMath.} are in the 
+public domain everyone is entitled to do with them as they see fit.
+
+\section{Building LibTomMath}
+
+LibTomMath is meant to be very ``GCC friendly'' as it comes with a makefile well suited for GCC.  However, the library will
+also build in MSVC and Borland C out of the box.  For any other ISO C compiler a makefile will have to be made by the end
+developer.  
+
+\subsection{Static Libraries}
+To build as a static library for GCC issue the following
+\begin{alltt}
+make
+\end{alltt}
+
+command.  This will build the library and archive the object files in ``libtommath.a''.  Now you link against 
+that and include ``tommath.h'' within your programs.  Alternatively to build with MSVC issue the following
+\begin{alltt}
+nmake -f makefile.msvc
+\end{alltt}
+
+This will build the library and archive the object files in ``tommath.lib''.  This has been tested with MSVC 
+version 6.00 with service pack 5.  
+
+\subsection{Shared Libraries}
+To build as a shared library for GCC issue the following
+\begin{alltt}
+make -f makefile.shared
+\end{alltt}
+This requires the ``libtool'' package (common on most Linux/BSD systems).  It will build LibTomMath as both shared
+and static then install (by default) into /usr/lib as well as install the header files in /usr/include.  The shared 
+library (resource) will be called ``libtommath.la'' while the static library is called ``libtommath.a''.  Generally 
+you use libtool to link your application against the shared object.  
+
+There is limited support for making a ``DLL'' in windows via the ``makefile.cygwin\_dll'' makefile.  It requires 
+Cygwin to work with since it requires the auto-export/import functionality.  The resulting DLL and import library 
+``libtommath.dll.a'' can be used to link LibTomMath dynamically to any Windows program using Cygwin.
+
+\subsection{Testing}
+To build the library and the test harness type
+
+\begin{alltt}
+make test
+\end{alltt}
+
+This will build the library, ``test'' and ``mtest/mtest''.  The ``test'' program will accept test vectors and verify the
+results.  ``mtest/mtest'' will generate test vectors using the MPI library by Michael Fromberger\footnote{A copy of MPI
+is included in the package}.  Simply pipe mtest into test using
+
+\begin{alltt}
+mtest/mtest | test
+\end{alltt}
+
+If you do not have a ``/dev/urandom'' style RNG source you will have to write your own PRNG and simply pipe that into 
+mtest.  For example, if your PRNG program is called ``myprng'' simply invoke
+
+\begin{alltt}
+myprng | mtest/mtest | test
+\end{alltt}
+
+This will output a row of numbers that are increasing.  Each column is a different test (such as addition, multiplication, etc)
+that is being performed.  The numbers represent how many times the test was invoked.  If an error is detected the program
+will exit with a dump of the relevant numbers it was working with.
+
+\section{Build Configuration}
+LibTomMath can be configured at build time in three phases we shall call ``depends'', ``tweaks'' and ``trims''.  
+Each phase changes how the library is built and they are applied one after another respectively.  
+
+To make the system more powerful you can tweak the build process.  Classes are defined in the file
+``tommath\_superclass.h''.  By default, the symbol ``LTM\_ALL'' shall be defined which simply 
+instructs the system to build all of the functions.  This is how LibTomMath used to be packaged.  This will give you 
+access to every function LibTomMath offers.
+
+However, there are cases where such a build is not optimal.  For instance, you want to perform RSA operations.  You 
+don't need the vast majority of the library to perform these operations.  Aside from LTM\_ALL there is 
+another pre--defined class ``SC\_RSA\_1'' which works in conjunction with the RSA from LibTomCrypt.  Additional 
+classes can be defined based on the needs of the user.
+
+\subsection{Build Depends}
+In the file tommath\_class.h you will see a large list of C ``defines'' followed by a series of ``ifdefs''
+which further define symbols.  All of the symbols (technically they're macros $\ldots$) represent a given C source
+file.  For instance, BN\_MP\_ADD\_C represents the file ``bn\_mp\_add.c''.  When a define has been enabled the
+function in the respective file will be compiled and linked into the library.  Accordingly when the define
+is absent the file will not be compiled and not contribute any size to the library.
+
+You will also note that the header tommath\_class.h is actually recursively included (it includes itself twice).  
+This is to help resolve as many dependencies as possible.  In the last pass the symbol LTM\_LAST will be defined.  
+This is useful for ``trims''.
+
+\subsection{Build Tweaks}
+A tweak is an algorithm ``alternative''.  For example, to provide tradeoffs (usually between size and speed).
+They can be enabled at any pass of the configuration phase.
+
+\begin{small}
+\begin{center}
+\begin{tabular}{|l|l|}
+\hline \textbf{Define} & \textbf{Purpose} \\
+\hline BN\_MP\_DIV\_SMALL & Enables a slower, smaller and equally \\
+                          & functional mp\_div() function \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+
+\subsection{Build Trims}
+A trim is a manner of removing functionality from a function that is not required.  For instance, to perform
+RSA cryptography you only require exponentiation with odd moduli so even moduli support can be safely removed.  
+Build trims are meant to be defined on the last pass of the configuration which means they are to be defined
+only if LTM\_LAST has been defined.
+
+\subsubsection{Moduli Related}
+\begin{small}
+\begin{center}
+\begin{tabular}{|l|l|}
+\hline \textbf{Restriction} & \textbf{Undefine} \\
+\hline Exponentiation with odd moduli only & BN\_S\_MP\_EXPTMOD\_C \\
+                                           & BN\_MP\_REDUCE\_C \\
+                                           & BN\_MP\_REDUCE\_SETUP\_C \\
+                                           & BN\_S\_MP\_MUL\_HIGH\_DIGS\_C \\
+                                           & BN\_FAST\_S\_MP\_MUL\_HIGH\_DIGS\_C \\
+\hline Exponentiation with random odd moduli & (The above plus the following) \\
+                                           & BN\_MP\_REDUCE\_2K\_C \\
+                                           & BN\_MP\_REDUCE\_2K\_SETUP\_C \\
+                                           & BN\_MP\_REDUCE\_IS\_2K\_C \\
+                                           & BN\_MP\_DR\_IS\_MODULUS\_C \\
+                                           & BN\_MP\_DR\_REDUCE\_C \\
+                                           & BN\_MP\_DR\_SETUP\_C \\
+\hline Modular inverse odd moduli only     & BN\_MP\_INVMOD\_SLOW\_C \\
+\hline Modular inverse (both, smaller/slower) & BN\_FAST\_MP\_INVMOD\_C \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+
+\subsubsection{Operand Size Related}
+\begin{small}
+\begin{center}
+\begin{tabular}{|l|l|}
+\hline \textbf{Restriction} & \textbf{Undefine} \\
+\hline Moduli $\le 2560$ bits              & BN\_MP\_MONTGOMERY\_REDUCE\_C \\
+                                           & BN\_S\_MP\_MUL\_DIGS\_C \\
+                                           & BN\_S\_MP\_MUL\_HIGH\_DIGS\_C \\
+                                           & BN\_S\_MP\_SQR\_C \\
+\hline Polynomial Schmolynomial            & BN\_MP\_KARATSUBA\_MUL\_C \\
+                                           & BN\_MP\_KARATSUBA\_SQR\_C \\
+                                           & BN\_MP\_TOOM\_MUL\_C \\ 
+                                           & BN\_MP\_TOOM\_SQR\_C \\
+
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+
+
+\section{Purpose of LibTomMath}
+Unlike the GNU MP (GMP) Library, LIP, OpenSSL or various other commercial kits (Miracl), LibTomMath was not written with 
+bleeding edge performance in mind.  First and foremost LibTomMath was written to be entirely open.  Not only is the 
+source code public domain (unlike various other GPL/etc licensed code), not only is the code freely downloadable but the
+source code is also accessible for computer science students attempting to learn ``BigNum'' or multiple precision
+arithmetic techniques. 
+
+LibTomMath was written to be an instructive collection of source code.  This is why there are many comments, only one
+function per source file and often I use a ``middle-road'' approach where I don't cut corners for an extra 2\% speed
+increase.
+
+Source code alone cannot really teach how the algorithms work which is why I also wrote a textbook that accompanies
+the library (beat that!).
+
+So you may be thinking ``should I use LibTomMath?'' and the answer is a definite maybe.  Let me tabulate what I think
+are the pros and cons of LibTomMath by comparing it to the math routines from GnuPG\footnote{GnuPG v1.2.3 versus LibTomMath v0.28}.
+
+\newpage\begin{figure}[h]
+\begin{small}
+\begin{center}
+\begin{tabular}{|l|c|c|l|}
+\hline \textbf{Criteria} & \textbf{Pro} & \textbf{Con} & \textbf{Notes} \\
+\hline Few lines of code per file & X & & GnuPG $ = 300.9$, LibTomMath  $ = 76.04$ \\
+\hline Commented function prototypes & X && GnuPG function names are cryptic. \\
+\hline Speed && X & LibTomMath is slower.  \\
+\hline Totally free & X & & GPL has unfavourable restrictions.\\
+\hline Large function base & X & & GnuPG is barebones. \\
+\hline Four modular reduction algorithms & X & & Faster modular exponentiation. \\
+\hline Portable & X & & GnuPG requires configuration to build. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{LibTomMath Valuation}
+\end{figure}
+
+It may seem odd to compare LibTomMath to GnuPG since the math in GnuPG is only a small portion of the entire application. 
+However, LibTomMath was written with cryptography in mind.  It provides essentially all of the functions a cryptosystem
+would require when working with large integers.  
+
+So it may feel tempting to just rip the math code out of GnuPG (or GnuMP where it was taken from originally) in your
+own application but I think there are reasons not to.  While LibTomMath is slower than libraries such as GnuMP it is
+not normally significantly slower.  On x86 machines the difference is normally a factor of two when performing modular
+exponentiations.
+
+Essentially the only time you wouldn't use LibTomMath is when blazing speed is the primary concern.
+
+\chapter{Getting Started with LibTomMath}
+\section{Building Programs}
+In order to use LibTomMath you must include ``tommath.h'' and link against the appropriate library file (typically 
+libtommath.a).  There is no library initialization required and the entire library is thread safe.
+
+\section{Return Codes}
+There are three possible return codes a function may return.
+
+\index{MP\_OKAY}\index{MP\_YES}\index{MP\_NO}\index{MP\_VAL}\index{MP\_MEM}
+\begin{figure}[h!]
+\begin{center}
+\begin{small}
+\begin{tabular}{|l|l|}
+\hline \textbf{Code} & \textbf{Meaning} \\
+\hline MP\_OKAY & The function succeeded. \\
+\hline MP\_VAL  & The function input was invalid. \\
+\hline MP\_MEM  & Heap memory exhausted. \\
+\hline &\\
+\hline MP\_YES  & Response is yes. \\
+\hline MP\_NO   & Response is no. \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Return Codes}
+\end{figure}
+
+The last two codes listed are not actually ``return'ed'' by a function.  They are placed in an integer (the caller must
+provide the address of an integer it can store to) which the caller can access.  To convert one of the three return codes
+to a string use the following function.
+
+\index{mp\_error\_to\_string}
+\begin{alltt}
+char *mp_error_to_string(int code);
+\end{alltt}
+
+This will return a pointer to a string which describes the given error code.  It will not work for the return codes 
+MP\_YES and MP\_NO.  
+
+\section{Data Types}
+The basic ``multiple precision integer'' type is known as the ``mp\_int'' within LibTomMath.  This data type is used to
+organize all of the data required to manipulate the integer it represents.  Within LibTomMath it has been prototyped
+as the following.
+
+\index{mp\_int}
+\begin{alltt}
+typedef struct  \{
+    int used, alloc, sign;
+    mp_digit *dp;
+\} mp_int;
+\end{alltt}
+
+Where ``mp\_digit'' is a data type that represents individual digits of the integer.  By default, an mp\_digit is the
+ISO C ``unsigned long'' data type and each digit is $28-$bits long.  The mp\_digit type can be configured to suit other
+platforms by defining the appropriate macros.  
+
+All LTM functions that use the mp\_int type will expect a pointer to an mp\_int structure.  You must allocate memory to
+hold the structure itself by yourself (whether off stack or heap it doesn't matter).  The very first thing that must be
+done to use an mp\_int is that it must be initialized.
+
+\section{Function Organization}
+
+The arithmetic functions of the library are all organized to have the same style prototype.  That is source operands
+are passed on the left and the destination is on the right.  For instance,
+
+\begin{alltt}
+mp_add(&a, &b, &c);       /* c = a + b */
+mp_mul(&a, &a, &c);       /* c = a * a */
+mp_div(&a, &b, &c, &d);   /* c = [a/b], d = a mod b */
+\end{alltt}
+
+Another feature of the way the functions have been implemented is that source operands can be destination operands as well.
+For instance,
+
+\begin{alltt}
+mp_add(&a, &b, &b);       /* b = a + b */
+mp_div(&a, &b, &a, &c);   /* a = [a/b], c = a mod b */
+\end{alltt}
+
+This allows operands to be re-used which can make programming simpler.
+
+\section{Initialization}
+\subsection{Single Initialization}
+A single mp\_int can be initialized with the ``mp\_init'' function. 
+
+\index{mp\_init}
+\begin{alltt}
+int mp_init (mp_int * a);
+\end{alltt}
+
+This function expects a pointer to an mp\_int structure and will initialize the members of the structure so the mp\_int
+represents the default integer which is zero.  If the function returns MP\_OKAY then the mp\_int is ready to be used
+by the other LibTomMath functions.
+
+\begin{small} \begin{alltt}
+int main(void)
+\{
+   mp_int number;
+   int result;
+
+   if ((result = mp_init(&number)) != MP_OKAY) \{
+      printf("Error initializing the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   /* use the number */
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} \end{small}
+
+\subsection{Single Free}
+When you are finished with an mp\_int it is ideal to return the heap it used back to the system.  The following function 
+provides this functionality.
+
+\index{mp\_clear}
+\begin{alltt}
+void mp_clear (mp_int * a);
+\end{alltt}
+
+The function expects a pointer to a previously initialized mp\_int structure and frees the heap it uses.  It sets the 
+pointer\footnote{The ``dp'' member.} within the mp\_int to \textbf{NULL} which is used to prevent double free situations. 
+It is legal to call mp\_clear() twice on the same mp\_int in a row.  
+
+\begin{small} \begin{alltt}
+int main(void)
+\{
+   mp_int number;
+   int result;
+
+   if ((result = mp_init(&number)) != MP_OKAY) \{
+      printf("Error initializing the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   /* use the number */
+
+   /* We're done with it. */
+   mp_clear(&number);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} \end{small}
+
+\subsection{Multiple Initializations}
+Certain algorithms require more than one large integer.  In these instances it is ideal to initialize all of the mp\_int
+variables in an ``all or nothing'' fashion.  That is, they are either all initialized successfully or they are all
+not initialized.
+
+The  mp\_init\_multi() function provides this functionality.
+
+\index{mp\_init\_multi} \index{mp\_clear\_multi}
+\begin{alltt}
+int mp_init_multi(mp_int *mp, ...);
+\end{alltt}
+
+It accepts a \textbf{NULL} terminated list of pointers to mp\_int structures.  It will attempt to initialize them all
+at once.  If the function returns MP\_OKAY then all of the mp\_int variables are ready to use, otherwise none of them
+are available for use.  A complementary mp\_clear\_multi() function allows multiple mp\_int variables to be free'd 
+from the heap at the same time.  
+
+\begin{small} \begin{alltt}
+int main(void)
+\{
+   mp_int num1, num2, num3;
+   int result;
+
+   if ((result = mp_init_multi(&num1, 
+                               &num2,
+                               &num3, NULL)) != MP_OKAY) \{
+      printf("Error initializing the numbers.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   /* use the numbers */
+
+   /* We're done with them. */
+   mp_clear_multi(&num1, &num2, &num3, NULL);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} \end{small}
+
+\subsection{Other Initializers}
+To initialize and make a copy of an mp\_int the mp\_init\_copy() function has been provided.  
+
+\index{mp\_init\_copy}
+\begin{alltt}
+int mp_init_copy (mp_int * a, mp_int * b);
+\end{alltt}
+
+This function will initialize $a$ and make it a copy of $b$ if all goes well.
+
+\begin{small} \begin{alltt}
+int main(void)
+\{
+   mp_int num1, num2;
+   int result;
+
+   /* initialize and do work on num1 ... */
+
+   /* We want a copy of num1 in num2 now */
+   if ((result = mp_init_copy(&num2, &num1)) != MP_OKAY) \{
+     printf("Error initializing the copy.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   /* now num2 is ready and contains a copy of num1 */
+
+   /* We're done with them. */
+   mp_clear_multi(&num1, &num2, NULL);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} \end{small}
+
+Another less common initializer is mp\_init\_size() which allows the user to initialize an mp\_int with a given
+default number of digits.  By default, all initializers allocate \textbf{MP\_PREC} digits.  This function lets
+you override this behaviour.
+
+\index{mp\_init\_size}
+\begin{alltt}
+int mp_init_size (mp_int * a, int size);
+\end{alltt}
+
+The $size$ parameter must be greater than zero.  If the function succeeds the mp\_int $a$ will be initialized
+to have $size$ digits (which are all initially zero).  
+
+\begin{small} \begin{alltt}
+int main(void)
+\{
+   mp_int number;
+   int result;
+
+   /* we need a 60-digit number */
+   if ((result = mp_init_size(&number, 60)) != MP_OKAY) \{
+      printf("Error initializing the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   /* use the number */
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} \end{small}
+
+\section{Maintenance Functions}
+
+\subsection{Reducing Memory Usage}
+When an mp\_int is in a state where it won't be changed again\footnote{A Diffie-Hellman modulus for instance.} excess
+digits can be removed to return memory to the heap with the mp\_shrink() function.
+
+\index{mp\_shrink}
+\begin{alltt}
+int mp_shrink (mp_int * a);
+\end{alltt}
+
+This will remove excess digits of the mp\_int $a$.  If the operation fails the mp\_int should be intact without the
+excess digits being removed.  Note that you can use a shrunk mp\_int in further computations, however, such operations
+will require heap operations which can be slow.  It is not ideal to shrink mp\_int variables that you will further
+modify in the system (unless you are seriously low on memory).  
+
+\begin{small} \begin{alltt}
+int main(void)
+\{
+   mp_int number;
+   int result;
+
+   if ((result = mp_init(&number)) != MP_OKAY) \{
+      printf("Error initializing the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   /* use the number [e.g. pre-computation]  */
+
+   /* We're done with it for now. */
+   if ((result = mp_shrink(&number)) != MP_OKAY) \{
+      printf("Error shrinking the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* use it .... */
+
+
+   /* we're done with it. */ 
+   mp_clear(&number);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} \end{small}
+
+\subsection{Adding additional digits}
+
+Within the mp\_int structure are two parameters which control the limitations of the array of digits that represent
+the integer the mp\_int is meant to equal.   The \textit{used} parameter dictates how many digits are significant, that is,
+contribute to the value of the mp\_int.  The \textit{alloc} parameter dictates how many digits are currently available in
+the array.  If you need to perform an operation that requires more digits you will have to mp\_grow() the mp\_int to
+your desired size.  
+
+\index{mp\_grow}
+\begin{alltt}
+int mp_grow (mp_int * a, int size);
+\end{alltt}
+
+This will grow the array of digits of $a$ to $size$.  If the \textit{alloc} parameter is already bigger than
+$size$ the function will not do anything.
+
+\begin{small} \begin{alltt}
+int main(void)
+\{
+   mp_int number;
+   int result;
+
+   if ((result = mp_init(&number)) != MP_OKAY) \{
+      printf("Error initializing the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   /* use the number */
+
+   /* We need to add 20 digits to the number  */
+   if ((result = mp_grow(&number, number.alloc + 20)) != MP_OKAY) \{
+      printf("Error growing the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+
+   /* use the number */
+
+   /* we're done with it. */ 
+   mp_clear(&number);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} \end{small}
+
+\chapter{Basic Operations}
+\section{Small Constants}
+Setting mp\_ints to small constants is a relatively common operation.  To accommodate these instances there are two
+small constant assignment functions.  The first function is used to set a single digit constant while the second sets
+an ISO C style ``unsigned long'' constant.  The reason for both functions is efficiency.  Setting a single digit is quick but the
+domain of a digit can change (it's always at least $0 \ldots 127$).  
+
+\subsection{Single Digit}
+
+Setting a single digit can be accomplished with the following function.
+
+\index{mp\_set}
+\begin{alltt}
+void mp_set (mp_int * a, mp_digit b);
+\end{alltt}
+
+This will zero the contents of $a$ and make it represent an integer equal to the value of $b$.  Note that this
+function has a return type of \textbf{void}.  It cannot cause an error so it is safe to assume the function
+succeeded.
+
+\begin{small} \begin{alltt}
+int main(void)
+\{
+   mp_int number;
+   int result;
+
+   if ((result = mp_init(&number)) != MP_OKAY) \{
+      printf("Error initializing the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   /* set the number to 5 */
+   mp_set(&number, 5);
+
+   /* we're done with it. */ 
+   mp_clear(&number);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} \end{small}
+
+\subsection{Long Constants}
+
+To set a constant that is the size of an ISO C ``unsigned long'' and larger than a single digit the following function 
+can be used.
+
+\index{mp\_set\_int}
+\begin{alltt}
+int mp_set_int (mp_int * a, unsigned long b);
+\end{alltt}
+
+This will assign the value of the 32-bit variable $b$ to the mp\_int $a$.  Unlike mp\_set() this function will always
+accept a 32-bit input regardless of the size of a single digit.  However, since the value may span several digits 
+this function can fail if it runs out of heap memory.
+
+To get the ``unsigned long'' copy of an mp\_int the following function can be used.
+
+\index{mp\_get\_int}
+\begin{alltt}
+unsigned long mp_get_int (mp_int * a);
+\end{alltt}
+
+This will return the 32 least significant bits of the mp\_int $a$.  
+
+\begin{small} \begin{alltt}
+int main(void)
+\{
+   mp_int number;
+   int result;
+
+   if ((result = mp_init(&number)) != MP_OKAY) \{
+      printf("Error initializing the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   /* set the number to 654321 (note this is bigger than 127) */
+   if ((result = mp_set_int(&number, 654321)) != MP_OKAY) \{
+      printf("Error setting the value of the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   printf("number == \%lu", mp_get_int(&number));
+
+   /* we're done with it. */ 
+   mp_clear(&number);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} \end{small}
+
+This should output the following if the program succeeds.
+
+\begin{alltt}
+number == 654321
+\end{alltt}
+
+\subsection{Initializing and Setting Constants}
+To both initialize and set small constants the following two functions are available.
+\index{mp\_init\_set} \index{mp\_init\_set\_int}
+\begin{alltt}
+int mp_init_set (mp_int * a, mp_digit b);
+int mp_init_set_int (mp_int * a, unsigned long b);
+\end{alltt}
+
+Both functions work like the previous counterparts except they first mp\_init $a$ before setting the values.  
+
+\begin{alltt}
+int main(void)
+\{
+   mp_int number1, number2;
+   int    result;
+
+   /* initialize and set a single digit */
+   if ((result = mp_init_set(&number1, 100)) != MP_OKAY) \{
+      printf("Error setting number1: \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}             
+
+   /* initialize and set a long */
+   if ((result = mp_init_set_int(&number2, 1023)) != MP_OKAY) \{
+      printf("Error setting number2: \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* display */
+   printf("Number1, Number2 == \%lu, \%lu",
+          mp_get_int(&number1), mp_get_int(&number2));
+
+   /* clear */
+   mp_clear_multi(&number1, &number2, NULL);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt}
+
+If this program succeeds it shall output.
+\begin{alltt}
+Number1, Number2 == 100, 1023
+\end{alltt}
+
+\section{Comparisons}
+
+Comparisons in LibTomMath are always performed in a ``left to right'' fashion.  There are three possible return codes
+for any comparison.
+
+\index{MP\_GT} \index{MP\_EQ} \index{MP\_LT}
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{|c|c|}
+\hline \textbf{Result Code} & \textbf{Meaning} \\
+\hline MP\_GT & $a > b$ \\
+\hline MP\_EQ & $a = b$ \\
+\hline MP\_LT & $a < b$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Comparison Codes for $a, b$}
+\label{fig:CMP}
+\end{figure}
+
+In figure \ref{fig:CMP} two integers $a$ and $b$ are being compared.  In this case $a$ is said to be ``to the left'' of 
+$b$.  
+
+\subsection{Unsigned comparison}
+
+An unsigned comparison considers only the digits themselves and not the associated \textit{sign} flag of the 
+mp\_int structures.  This is analogous to an absolute comparison.  The function mp\_cmp\_mag() will compare two
+mp\_int variables based on their digits only. 
+
+\index{mp\_cmp\_mag}
+\begin{alltt}
+int mp_cmp_mag(mp_int * a, mp_int * b);
+\end{alltt}
+This will compare $a$ to $b$ placing $a$ to the left of $b$.  This function cannot fail and will return one of the
+three compare codes listed in figure \ref{fig:CMP}.
+
+\begin{small} \begin{alltt}
+int main(void)
+\{
+   mp_int number1, number2;
+   int result;
+
+   if ((result = mp_init_multi(&number1, &number2, NULL)) != MP_OKAY) \{
+      printf("Error initializing the numbers.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   /* set the number1 to 5 */
+   mp_set(&number1, 5);
+  
+   /* set the number2 to -6 */
+   mp_set(&number2, 6);
+   if ((result = mp_neg(&number2, &number2)) != MP_OKAY) \{
+      printf("Error negating number2.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   switch(mp_cmp_mag(&number1, &number2)) \{
+       case MP_GT:  printf("|number1| > |number2|"); break;
+       case MP_EQ:  printf("|number1| = |number2|"); break;
+       case MP_LT:  printf("|number1| < |number2|"); break;
+   \}
+
+   /* we're done with it. */ 
+   mp_clear_multi(&number1, &number2, NULL);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} \end{small}
+
+If this program\footnote{This function uses the mp\_neg() function which is discussed in section \ref{sec:NEG}.} completes 
+successfully it should print the following.
+
+\begin{alltt}
+|number1| < |number2|
+\end{alltt}
+
+This is because $\vert -6 \vert = 6$ and obviously $5 < 6$.
+
+\subsection{Signed comparison}
+
+To compare two mp\_int variables based on their signed value the mp\_cmp() function is provided.
+
+\index{mp\_cmp}
+\begin{alltt}
+int mp_cmp(mp_int * a, mp_int * b);
+\end{alltt}
+
+This will compare $a$ to the left of $b$.  It will first compare the signs of the two mp\_int variables.  If they
+differ it will return immediately based on their signs.  If the signs are equal then it will compare the digits
+individually.  This function will return one of the compare conditions codes listed in figure \ref{fig:CMP}.
+
+\begin{small} \begin{alltt}
+int main(void)
+\{
+   mp_int number1, number2;
+   int result;
+
+   if ((result = mp_init_multi(&number1, &number2, NULL)) != MP_OKAY) \{
+      printf("Error initializing the numbers.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   /* set the number1 to 5 */
+   mp_set(&number1, 5);
+  
+   /* set the number2 to -6 */
+   mp_set(&number2, 6);
+   if ((result = mp_neg(&number2, &number2)) != MP_OKAY) \{
+      printf("Error negating number2.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   switch(mp_cmp(&number1, &number2)) \{
+       case MP_GT:  printf("number1 > number2"); break;
+       case MP_EQ:  printf("number1 = number2"); break;
+       case MP_LT:  printf("number1 < number2"); break;
+   \}
+
+   /* we're done with it. */ 
+   mp_clear_multi(&number1, &number2, NULL);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} \end{small}
+
+If this program\footnote{This function uses the mp\_neg() function which is discussed in section \ref{sec:NEG}.} completes 
+successfully it should print the following.
+
+\begin{alltt}
+number1 > number2
+\end{alltt}
+
+\subsection{Single Digit}
+
+To compare a single digit against an mp\_int the following function has been provided.
+
+\index{mp\_cmp\_d}
+\begin{alltt}
+int mp_cmp_d(mp_int * a, mp_digit b);
+\end{alltt}
+
+This will compare $a$ to the left of $b$ using a signed comparison.  Note that it will always treat $b$ as 
+positive.  This function is rather handy when you have to compare against small values such as $1$ (which often
+comes up in cryptography).  The function cannot fail and will return one of the three compare condition codes
+listed in figure \ref{fig:CMP}.
+
+
+\begin{small} \begin{alltt}
+int main(void)
+\{
+   mp_int number;
+   int result;
+
+   if ((result = mp_init(&number)) != MP_OKAY) \{
+      printf("Error initializing the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   /* set the number to 5 */
+   mp_set(&number, 5);
+
+   switch(mp_cmp_d(&number, 7)) \{
+       case MP_GT:  printf("number > 7"); break;
+       case MP_EQ:  printf("number = 7"); break;
+       case MP_LT:  printf("number < 7"); break;
+   \}
+
+   /* we're done with it. */ 
+   mp_clear(&number);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} \end{small}
+
+If this program functions properly it will print out the following.
+
+\begin{alltt}
+number < 7
+\end{alltt}
+
+\section{Logical Operations}
+
+Logical operations are operations that can be performed either with simple shifts or boolean operators such as
+AND, XOR and OR directly.  These operations are very quick.
+
+\subsection{Multiplication by two}
+
+Multiplications and divisions by any power of two can be performed with quick logical shifts either left or
+right depending on the operation.  
+
+When multiplying or dividing by two, the following special case routines can be used.
+\index{mp\_mul\_2} \index{mp\_div\_2}
+\begin{alltt}
+int mp_mul_2(mp_int * a, mp_int * b);
+int mp_div_2(mp_int * a, mp_int * b);
+\end{alltt}
+
+The former will assign twice $a$ to $b$ while the latter will assign half $a$ to $b$.  These functions are fast
+since the shift counts and masks are hardcoded into the routines.
+
+\begin{small} \begin{alltt}
+int main(void)
+\{
+   mp_int number;
+   int result;
+
+   if ((result = mp_init(&number)) != MP_OKAY) \{
+      printf("Error initializing the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   /* set the number to 5 */
+   mp_set(&number, 5);
+
+   /* multiply by two */
+   if ((result = mp\_mul\_2(&number, &number)) != MP_OKAY) \{
+      printf("Error multiplying the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+   switch(mp_cmp_d(&number, 7)) \{
+       case MP_GT:  printf("2*number > 7"); break;
+       case MP_EQ:  printf("2*number = 7"); break;
+       case MP_LT:  printf("2*number < 7"); break;
+   \}
+
+   /* now divide by two */
+   if ((result = mp\_div\_2(&number, &number)) != MP_OKAY) \{
+      printf("Error dividing the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+   switch(mp_cmp_d(&number, 7)) \{
+       case MP_GT:  printf("2*number/2 > 7"); break;
+       case MP_EQ:  printf("2*number/2 = 7"); break;
+       case MP_LT:  printf("2*number/2 < 7"); break;
+   \}
+
+   /* we're done with it. */ 
+   mp_clear(&number);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} \end{small}
+
+If this program is successful it will print out the following text.
+
+\begin{alltt}
+2*number > 7
+2*number/2 < 7
+\end{alltt}
+
+Since $10 > 7$ and $5 < 7$.  To multiply by a power of two the following function can be used.
+
+\index{mp\_mul\_2d}
+\begin{alltt}
+int mp_mul_2d(mp_int * a, int b, mp_int * c);
+\end{alltt}
+
+This will multiply $a$ by $2^b$ and store the result in ``c''.  If the value of $b$ is less than or equal to 
+zero the function will copy $a$ to ``c'' without performing any further actions.  
+
+To divide by a power of two use the following.
+
+\index{mp\_div\_2d}
+\begin{alltt}
+int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d);
+\end{alltt}
+Which will divide $a$ by $2^b$, store the quotient in ``c'' and the remainder in ``d''.  If $b \le 0$ then the
+function simply copies $a$ over to ``c'' and zeroes $d$.  The variable $d$ may be passed as a \textbf{NULL}
+value to signal that the remainder is not desired.
+
+\subsection{Polynomial Basis Operations}
+
+Strictly speaking the organization of the integers within the mp\_int structures is what is known as a 
+``polynomial basis''.  This simply means a field element is stored by divisions of a radix.  For example, if
+$f(x) = \sum_{i=0}^{k} y_ix^i$ for any vector $\vec y$ then the array of digits in $\vec y$ are said to be 
+the polynomial basis representation of $z$ if $f(\beta) = z$ for a given radix $\beta$.  
+
+To multiply by the polynomial $g(x) = x$ all you have to do is shift the digits of the basis left one place.  The
+following function provides this operation.
+
+\index{mp\_lshd}
+\begin{alltt}
+int mp_lshd (mp_int * a, int b);
+\end{alltt}
+
+This will multiply $a$ in place by $x^b$ which is equivalent to shifting the digits left $b$ places and inserting zeroes
+in the least significant digits.  Similarly to divide by a power of $x$ the following function is provided.
+
+\index{mp\_rshd}
+\begin{alltt}
+void mp_rshd (mp_int * a, int b)
+\end{alltt}
+This will divide $a$ in place by $x^b$ and discard the remainder.  This function cannot fail as it performs the operations
+in place and no new digits are required to complete it.
+
+\subsection{AND, OR and XOR Operations}
+
+While AND, OR and XOR operations are not typical ``bignum functions'' they can be useful in several instances.  The
+three functions are prototyped as follows.
+
+\index{mp\_or} \index{mp\_and} \index{mp\_xor}
+\begin{alltt}
+int mp_or  (mp_int * a, mp_int * b, mp_int * c);
+int mp_and (mp_int * a, mp_int * b, mp_int * c);
+int mp_xor (mp_int * a, mp_int * b, mp_int * c);
+\end{alltt}
+
+Which compute $c = a \odot b$ where $\odot$ is one of OR, AND or XOR.  
+
+\section{Addition and Subtraction}
+
+To compute an addition or subtraction the following two functions can be used.
+
+\index{mp\_add} \index{mp\_sub}
+\begin{alltt}
+int mp_add (mp_int * a, mp_int * b, mp_int * c);
+int mp_sub (mp_int * a, mp_int * b, mp_int * c);
+\end{alltt}
+
+Which perform $c = a \odot b$ where $\odot$ is one of signed addition or subtraction.  The operations are fully sign
+aware.
+
+\section{Sign Manipulation}
+\subsection{Negation}
+\label{sec:NEG}
+Simple integer negation can be performed with the following.
+
+\index{mp\_neg}
+\begin{alltt}
+int mp_neg (mp_int * a, mp_int * b);
+\end{alltt}
+
+Which assigns $-a$ to $b$.  
+
+\subsection{Absolute}
+Simple integer absolutes can be performed with the following.
+
+\index{mp\_abs}
+\begin{alltt}
+int mp_abs (mp_int * a, mp_int * b);
+\end{alltt}
+
+Which assigns $\vert a \vert$ to $b$.  
+
+\section{Integer Division and Remainder}
+To perform a complete and general integer division with remainder use the following function.
+
+\index{mp\_div}
+\begin{alltt}
+int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d);
+\end{alltt}
+                                                        
+This divides $a$ by $b$ and stores the quotient in $c$ and the remainder in $d$.  The signed quotient is computed such that 
+$bc + d = a$.  Note that either of $c$ or $d$ can be set to \textbf{NULL} if their value is not required.  If 
+$b$ is zero the function returns \textbf{MP\_VAL}.  
+
+
+\chapter{Multiplication and Squaring}
+\section{Multiplication}
+A full signed integer multiplication can be performed with the following.
+\index{mp\_mul}
+\begin{alltt}
+int mp_mul (mp_int * a, mp_int * b, mp_int * c);
+\end{alltt}
+Which assigns the full signed product $ab$ to $c$.  This function actually breaks into one of four cases which are 
+specific multiplication routines optimized for given parameters.  First there are the Toom-Cook multiplications which
+should only be used with very large inputs.  This is followed by the Karatsuba multiplications which are for moderate
+sized inputs.  Then followed by the Comba and baseline multipliers.
+
+Fortunately for the developer you don't really need to know this unless you really want to fine tune the system.  mp\_mul()
+will determine on its own\footnote{Some tweaking may be required.} what routine to use automatically when it is called.
+
+\begin{alltt}
+int main(void)
+\{
+   mp_int number1, number2;
+   int result;
+
+   /* Initialize the numbers */
+   if ((result = mp_init_multi(&number1, 
+                               &number2, NULL)) != MP_OKAY) \{
+      printf("Error initializing the numbers.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* set the terms */
+   if ((result = mp_set_int(&number1, 257)) != MP_OKAY) \{
+      printf("Error setting number1.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   if ((result = mp_set_int(&number2, 1023)) != MP_OKAY) \{
+      printf("Error setting number2.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* multiply them */
+   if ((result = mp_mul(&number1, &number2,
+                        &number1)) != MP_OKAY) \{
+      printf("Error multiplying terms.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* display */
+   printf("number1 * number2 == \%lu", mp_get_int(&number1));
+
+   /* free terms and return */
+   mp_clear_multi(&number1, &number2, NULL);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt}   
+
+If this program succeeds it shall output the following.
+
+\begin{alltt}
+number1 * number2 == 262911
+\end{alltt}
+
+\section{Squaring}
+Since squaring can be performed faster than multiplication it is performed by its own function instead of just using
+mp\_mul().
+
+\index{mp\_sqr}
+\begin{alltt}
+int mp_sqr (mp_int * a, mp_int * b);
+\end{alltt}
+
+Will square $a$ and store it in $b$.  Like the case of multiplication there are four different squaring
+algorithms all of which can be called from mp\_sqr().  It is ideal to use mp\_sqr over mp\_mul when squaring terms.
+
+\section{Tuning Polynomial Basis Routines}
+
+Both of the Toom-Cook and Karatsuba multiplication algorithms are faster than the traditional $O(n^2)$ approach that
+the Comba and baseline algorithms use.  At $O(n^{1.464973})$ and $O(n^{1.584962})$ running times respectively they require 
+considerably less work.  For example, a 10000-digit multiplication would take roughly 724,000 single precision
+multiplications with Toom-Cook or 100,000,000 single precision multiplications with the standard Comba (a factor
+of 138).
+
+So why not always use Karatsuba or Toom-Cook?   The simple answer is that they have so much overhead that they're not
+actually faster than Comba until you hit distinct  ``cutoff'' points.  For Karatsuba with the default configuration, 
+GCC 3.3.1 and an Athlon XP processor the cutoff point is roughly 110 digits (about 70 for the Intel P4).  That is, at 
+110 digits Karatsuba and Comba multiplications just about break even and for 110+ digits Karatsuba is faster.
+
+Toom-Cook has incredible overhead and is probably only useful for very large inputs.  So far no known cutoff points 
+exist and for the most part I just set the cutoff points very high to make sure they're not called.
+
+A demo program in the ``etc/'' directory of the project called ``tune.c'' can be used to find the cutoff points.  This
+can be built with GCC as follows
+
+\begin{alltt}
+make XXX
+\end{alltt}
+Where ``XXX'' is one of the following entries from the table \ref{fig:tuning}.
+
+\begin{figure}[here]
+\begin{center}
+\begin{small}
+\begin{tabular}{|l|l|}
+\hline \textbf{Value of XXX} & \textbf{Meaning} \\
+\hline tune & Builds portable tuning application \\
+\hline tune86 & Builds x86 (pentium and up) program for COFF \\
+\hline tune86c & Builds x86 program for Cygwin \\
+\hline tune86l & Builds x86 program for Linux (ELF format) \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Build Names for Tuning Programs}
+\label{fig:tuning}
+\end{figure}
+
+When the program is running it will output a series of measurements for different cutoff points.  It will first find
+good Karatsuba squaring and multiplication points.  Then it proceeds to find Toom-Cook points.  Note that the Toom-Cook
+tuning takes a very long time as the cutoff points are likely to be very high.
+
+\chapter{Modular Reduction}
+
+Modular reduction is the process of taking the remainder of one quantity divided by another.  Expressed 
+as (\ref{eqn:mod}) the modular reduction is equivalent to the remainder of $b$ divided by $c$.  
+
+\begin{equation}
+a \equiv b \mbox{ (mod }c\mbox{)}
+\label{eqn:mod}
+\end{equation}
+
+Of particular interest to cryptography are reductions where $b$ is limited to the range $0 \le b < c^2$ since particularly 
+fast reduction algorithms can be written for the limited range.  
+
+Note that one of the four optimized reduction algorithms is automatically chosen in the modular exponentiation
+algorithm mp\_exptmod when an appropriate modulus is detected.  
+
+\section{Straight Division}
+In order to effect an arbitrary modular reduction the following algorithm is provided.
+
+\index{mp\_mod}
+\begin{alltt}
+int mp_mod(mp_int *a, mp_int *b, mp_int *c);
+\end{alltt}
+
+This reduces $a$ modulo $b$ and stores the result in $c$.  The sign of $c$ shall agree with the sign 
+of $b$.  This algorithm accepts an input $a$ of any range and is not limited by $0 \le a < b^2$.
+
+\section{Barrett Reduction}
+
+Barrett reduction is a generic optimized reduction algorithm that requires pre--computation to achieve
+a decent speedup over straight division.  First a $mu$ value must be precomputed with the following function.
+
+\index{mp\_reduce\_setup}
+\begin{alltt}
+int mp_reduce_setup(mp_int *a, mp_int *b);
+\end{alltt}
+
+Given a modulus in $b$ this produces the required $mu$ value in $a$.  For any given modulus this only has to
+be computed once.  Modular reduction can now be performed with the following.
+
+\index{mp\_reduce}
+\begin{alltt}
+int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
+\end{alltt}
+
+This will reduce $a$ in place modulo $b$ with the precomputed $mu$ value in $c$.  $a$ must be in the range
+$0 \le a < b^2$.
+
+\begin{alltt}
+int main(void)
+\{
+   mp_int   a, b, c, mu;
+   int      result;
+
+   /* initialize a,b to desired values, mp_init mu, 
+    * c and set c to 1...we want to compute a^3 mod b 
+    */
+
+   /* get mu value */
+   if ((result = mp_reduce_setup(&mu, &b)) != MP_OKAY) \{
+      printf("Error getting mu.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* square a to get c = a^2 */
+   if ((result = mp_sqr(&a, &c)) != MP_OKAY) \{
+      printf("Error squaring.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* now reduce `c' modulo b */
+   if ((result = mp_reduce(&c, &b, &mu)) != MP_OKAY) \{
+      printf("Error reducing.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+   
+   /* multiply a to get c = a^3 */
+   if ((result = mp_mul(&a, &c, &c)) != MP_OKAY) \{
+      printf("Error multiplying.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* now reduce `c' modulo b  */
+   if ((result = mp_reduce(&c, &b, &mu)) != MP_OKAY) \{
+      printf("Error reducing.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+  
+   /* c now equals a^3 mod b */
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} 
+
+This program will calculate $a^3 \mbox{ mod }b$ if all the functions succeed.  
+
+\section{Montgomery Reduction}
+
+Montgomery is a specialized reduction algorithm for any odd moduli.  Like Barrett reduction a pre--computation
+step is required.  This is accomplished with the following.
+
+\index{mp\_montgomery\_setup}
+\begin{alltt}
+int mp_montgomery_setup(mp_int *a, mp_digit *mp);
+\end{alltt}
+
+For the given odd modulus $a$ the precomputation value is placed in $mp$.  The reduction is computed with the 
+following.
+
+\index{mp\_montgomery\_reduce}
+\begin{alltt}
+int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);
+\end{alltt}
+This reduces $a$ in place modulo $m$ with the pre--computed value $mp$.   $a$ must be in the range
+$0 \le a < m^2$.
+
+Montgomery reduction is faster than Barrett reduction for moduli smaller than the ``comba'' limit.  With the default
+setup for instance, the limit is $127$ digits ($3556$--bits).   Note that this function is not limited to
+$127$ digits just that it falls back to a baseline algorithm after that point.  
+
+An important observation is that this reduction does not return $a \mbox{ mod }m$ but $aR^{-1} \mbox{ mod }m$ 
+where $R = \beta^n$, $n$ is the number of digits in $m$ and $\beta$ is the radix used (default is $2^{28}$).  
+
+To quickly calculate $R$ the following function was provided.
+
+\index{mp\_montgomery\_calc\_normalization}
+\begin{alltt}
+int mp_montgomery_calc_normalization(mp_int *a, mp_int *b);
+\end{alltt}
+Which calculates $a = R$ for the odd modulus $b$ without using multiplication or division.  
+
+The normal modus operandi for Montgomery reductions is to normalize the integers before entering the system.  For
+example, to calculate $a^3 \mbox { mod }b$ using Montgomery reduction the value of $a$ can be normalized by
+multiplying it by $R$.  Consider the following code snippet.
+
+\begin{alltt}
+int main(void)
+\{
+   mp_int   a, b, c, R;
+   mp_digit mp;
+   int      result;
+
+   /* initialize a,b to desired values, 
+    * mp_init R, c and set c to 1.... 
+    */
+
+   /* get normalization */
+   if ((result = mp_montgomery_calc_normalization(&R, &b)) != MP_OKAY) \{
+      printf("Error getting norm.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* get mp value */
+   if ((result = mp_montgomery_setup(&b, &mp)) != MP_OKAY) \{
+      printf("Error setting up montgomery.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* normalize `a' so now a is equal to aR */
+   if ((result = mp_mulmod(&a, &R, &b, &a)) != MP_OKAY) \{
+      printf("Error computing aR.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* square a to get c = a^2R^2 */
+   if ((result = mp_sqr(&a, &c)) != MP_OKAY) \{
+      printf("Error squaring.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* now reduce `c' back down to c = a^2R^2 * R^-1 == a^2R */
+   if ((result = mp_montgomery_reduce(&c, &b, mp)) != MP_OKAY) \{
+      printf("Error reducing.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+   
+   /* multiply a to get c = a^3R^2 */
+   if ((result = mp_mul(&a, &c, &c)) != MP_OKAY) \{
+      printf("Error multiplying.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* now reduce `c' back down to c = a^3R^2 * R^-1 == a^3R */
+   if ((result = mp_montgomery_reduce(&c, &b, mp)) != MP_OKAY) \{
+      printf("Error reducing.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+   
+   /* now reduce (again) `c' back down to c = a^3R * R^-1 == a^3 */
+   if ((result = mp_montgomery_reduce(&c, &b, mp)) != MP_OKAY) \{
+      printf("Error reducing.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* c now equals a^3 mod b */
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} 
+
+This particular example does not look too efficient but it demonstrates the point of the algorithm.  By 
+normalizing the inputs the reduced results are always of the form $aR$ for some variable $a$.  This allows
+a single final reduction to correct for the normalization and the fast reduction used within the algorithm.
+
+For more details consider examining the file \textit{bn\_mp\_exptmod\_fast.c}.
+
+\section{Restricted Diminished Radix}
+
+``Diminished Radix'' reduction refers to reduction with respect to moduli that are amenable to simple
+digit shifting and small multiplications.  In this case the ``restricted'' variant refers to moduli of the
+form $\beta^k - p$ for some $k \ge 0$ and $0 < p < \beta$ where $\beta$ is the radix (default to $2^{28}$).  
+
+As in the case of Montgomery reduction there is a pre--computation phase required for a given modulus.
+
+\index{mp\_dr\_setup}
+\begin{alltt}
+void mp_dr_setup(mp_int *a, mp_digit *d);
+\end{alltt}
+
+This computes the value required for the modulus $a$ and stores it in $d$.  This function cannot fail
+and does not return any error codes.  After the pre--computation a reduction can be performed with the
+following.
+
+\index{mp\_dr\_reduce}
+\begin{alltt}
+int mp_dr_reduce(mp_int *a, mp_int *b, mp_digit mp);
+\end{alltt}
+
+This reduces $a$ in place modulo $b$ with the pre--computed value $mp$.  $b$ must be of a restricted
+diminished radix form and $a$ must be in the range $0 \le a < b^2$.  Diminished radix reductions are 
+much faster than both Barrett and Montgomery reductions as they have a much lower asymptotic running time.  
+
+Since the moduli are restricted this algorithm is not particularly useful for something like Rabin, RSA or
+BBS cryptographic purposes.  This reduction algorithm is useful for Diffie-Hellman and ECC where fixed
+primes are acceptable.  
+
+Note that unlike Montgomery reduction there is no normalization process.  The result of this function is
+equal to the correct residue.
+
+\section{Unrestricted Diminished Radix}
+
+Unrestricted reductions work much like the restricted counterparts except in this case the modulus is of the 
+form $2^k - p$ for $0 < p < \beta$.  In this sense the unrestricted reductions are more flexible as they 
+can be applied to a wider range of numbers.  
+
+\index{mp\_reduce\_2k\_setup}
+\begin{alltt}
+int mp_reduce_2k_setup(mp_int *a, mp_digit *d);
+\end{alltt}
+
+This will compute the required $d$ value for the given modulus $a$.  
+
+\index{mp\_reduce\_2k}
+\begin{alltt}
+int mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d);
+\end{alltt}
+
+This will reduce $a$ in place modulo $n$ with the pre--computed value $d$.  From my experience this routine is 
+slower than mp\_dr\_reduce but faster for most moduli sizes than the Montgomery reduction.  
+
+\chapter{Exponentiation}
+\section{Single Digit Exponentiation}
+\index{mp\_expt\_d}
+\begin{alltt}
+int mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
+\end{alltt}
+This computes $c = a^b$ using a simple binary left-to-right algorithm.  It is faster than repeated multiplications by 
+$a$ for all values of $b$ greater than three.  
+
+\section{Modular Exponentiation}
+\index{mp\_exptmod}
+\begin{alltt}
+int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+\end{alltt}
+This computes $Y \equiv G^X \mbox{ (mod }P\mbox{)}$ using a variable width sliding window algorithm.  This function
+will automatically detect the fastest modular reduction technique to use during the operation.  For negative values of 
+$X$ the operation is performed as $Y \equiv (G^{-1} \mbox{ mod }P)^{\vert X \vert} \mbox{ (mod }P\mbox{)}$ provided that 
+$gcd(G, P) = 1$.
+
+This function is actually a shell around the two internal exponentiation functions.  This routine will automatically
+detect when Barrett, Montgomery, Restricted and Unrestricted Diminished Radix based exponentiation can be used.  Generally
+moduli of a ``restricted diminished radix'' form lead to the fastest modular exponentiations, followed by Montgomery
+and the other two algorithms.
+
+\section{Root Finding}
+\index{mp\_n\_root}
+\begin{alltt}
+int mp_n_root (mp_int * a, mp_digit b, mp_int * c)
+\end{alltt}
+This computes $c = a^{1/b}$ such that $c^b \le a$ and $(c+1)^b > a$.  The implementation of this function is not 
+ideal for values of $b$ greater than three.  It will work but become very slow.  So unless you are working with very small
+numbers (less than 1000 bits) I'd avoid $b > 3$ situations.  Will return a positive root only for even roots and return
+a root with the sign of the input for odd roots.  For example, performing $4^{1/2}$ will return $2$ whereas $(-8)^{1/3}$ 
+will return $-2$.  
+
+This algorithm uses the ``Newton Approximation'' method and will converge on the correct root fairly quickly.  Since
+the algorithm requires raising $a$ to the power of $b$ it is not ideal to attempt to find roots for large
+values of $b$.  If particularly large roots are required then a factor method could be used instead.  For example,
+$a^{1/16}$ is equivalent to $\left (a^{1/4} \right)^{1/4}$.
+
+\chapter{Prime Numbers}
+\section{Trial Division}
+\index{mp\_prime\_is\_divisible}
+\begin{alltt}
+int mp_prime_is_divisible (mp_int * a, int *result)
+\end{alltt}
+This will attempt to evenly divide $a$ by a list of primes\footnote{Default is the first 256 primes.} and store the 
+outcome in ``result''.  That is if $result = 0$ then $a$ is not divisible by the primes, otherwise it is.  Note that 
+if the function does not return \textbf{MP\_OKAY} the value in ``result'' should be considered undefined\footnote{Currently
+the default is to set it to zero first.}.
+
+\section{Fermat Test}
+\index{mp\_prime\_fermat}
+\begin{alltt}
+int mp_prime_fermat (mp_int * a, mp_int * b, int *result)
+\end{alltt}
+Performs a Fermat primality test to the base $b$.  That is it computes $b^a \mbox{ mod }a$ and tests whether the value is
+equal to $b$ or not.  If the values are equal then $a$ is probably prime and $result$ is set to one.  Otherwise $result$
+is set to zero.
+
+\section{Miller-Rabin Test}
+\index{mp\_prime\_miller\_rabin}
+\begin{alltt}
+int mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
+\end{alltt}
+Performs a Miller-Rabin test to the base $b$ of $a$.  This test is much stronger than the Fermat test and is very hard to
+fool (besides with Carmichael numbers).  If $a$ passes the test (therefore is probably prime) $result$ is set to one.  
+Otherwise $result$ is set to zero.  
+
+Note that it is suggested that you use the Miller-Rabin test instead of the Fermat test since all of the failures of 
+Miller-Rabin are a subset of the failures of the Fermat test.
+
+\subsection{Required Number of Tests}
+Generally to ensure a number is very likely to be prime you have to perform the Miller-Rabin with at least a half-dozen
+or so unique bases.  However, it has been proven that the probability of failure goes down as the size of the input goes up.
+This is why a simple function has been provided to help out.
+
+\index{mp\_prime\_rabin\_miller\_trials}
+\begin{alltt}
+int mp_prime_rabin_miller_trials(int size)
+\end{alltt}
+This returns the number of trials required for a $2^{-96}$ (or lower) probability of failure for a given ``size'' expressed
+in bits.  This comes in handy especially since larger numbers are slower to test.  For example, a 512-bit number would
+require ten tests whereas a 1024-bit number would only require four tests. 
+
+You should always still perform a trial division before a Miller-Rabin test though.
+
+\section{Primality Testing}
+\index{mp\_prime\_is\_prime}
+\begin{alltt}
+int mp_prime_is_prime (mp_int * a, int t, int *result)
+\end{alltt}
+This will perform a trial division followed by $t$ rounds of Miller-Rabin tests on $a$ and store the result in $result$.  
+If $a$ passes all of the tests $result$ is set to one, otherwise it is set to zero.  Note that $t$ is bounded by 
+$1 \le t < PRIME\_SIZE$ where $PRIME\_SIZE$ is the number of primes in the prime number table (by default this is $256$).
+
+\section{Next Prime}
+\index{mp\_prime\_next\_prime}
+\begin{alltt}
+int mp_prime_next_prime(mp_int *a, int t, int bbs_style)
+\end{alltt}
+This finds the next prime after $a$ that passes mp\_prime\_is\_prime() with $t$ tests.  Set $bbs\_style$ to one if you 
+want only the next prime congruent to $3 \mbox{ mod } 4$, otherwise set it to zero to find any next prime.  
+
+\section{Random Primes}
+\index{mp\_prime\_random}
+\begin{alltt}
+int mp_prime_random(mp_int *a, int t, int size, int bbs, 
+                    ltm_prime_callback cb, void *dat)
+\end{alltt}
+This will find a prime greater than $256^{size}$ which can be ``bbs\_style'' or not depending on $bbs$ and must pass
+$t$ rounds of tests.  The ``ltm\_prime\_callback'' is a typedef for 
+
+\begin{alltt}
+typedef int ltm_prime_callback(unsigned char *dst, int len, void *dat);
+\end{alltt}
+
+This is a function that must read $len$ bytes into $dst$ (and return the amount stored).  The $dat$ variable is simply
+copied from the original input.  It can be used to pass RNG context data to the callback.  The function 
+mp\_prime\_random() is more suitable for generating primes which must be secret (as in the case of RSA) since there 
+is no skew on the least significant bits.
+
+\textit{Note:}  As of v0.30 of the LibTomMath library this function has been deprecated.  It is still available
+but users are encouraged to use the new mp\_prime\_random\_ex() function instead.
+
+\subsection{Extended Generation}
+\index{mp\_prime\_random\_ex}
+\begin{alltt}
+int mp_prime_random_ex(mp_int *a,    int t, 
+                       int     size, int flags, 
+                       ltm_prime_callback cb, void *dat);
+\end{alltt}
+This will generate a prime in $a$ using $t$ tests of the primality testing algorithms.  The variable $size$
+specifies the bit length of the prime desired.  The variable $flags$ specifies one of several options available
+(see fig. \ref{fig:primeopts}) which can be OR'ed together.  The callback parameters are used as in 
+mp\_prime\_random().
+
+\begin{figure}[here]
+\begin{center}
+\begin{small}
+\begin{tabular}{|r|l|}
+\hline \textbf{Flag}         & \textbf{Meaning} \\
+\hline LTM\_PRIME\_BBS       & Make the prime congruent to $3$ modulo $4$ \\
+\hline LTM\_PRIME\_SAFE      & Make a prime $p$ such that $(p - 1)/2$ is also prime. \\
+                             & This option implies LTM\_PRIME\_BBS as well. \\
+\hline LTM\_PRIME\_2MSB\_OFF & Makes sure that the bit adjacent to the most significant bit \\
+                             & is forced to zero.  \\
+\hline LTM\_PRIME\_2MSB\_ON  & Makes sure that the bit adjacent to the most significant bit \\
+                             & is forced to one. \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Primality Generation Options}
+\label{fig:primeopts}
+\end{figure}
+
+\chapter{Input and Output}
+\section{ASCII Conversions}
+\subsection{To ASCII}
+\index{mp\_toradix}
+\begin{alltt}
+int mp_toradix (mp_int * a, char *str, int radix);
+\end{alltt}
+This will store $a$ in ``str'' as a base-``radix'' string of ASCII chars.  This function appends a NUL character
+to terminate the string.  Valid values of ``radix'' lie in the range $[2, 64]$.  To determine the size (exact) required
+by the conversion before storing any data use the following function.
+
+\index{mp\_radix\_size}
+\begin{alltt}
+int mp_radix_size (mp_int * a, int radix, int *size)
+\end{alltt}
+This stores in ``size'' the number of characters (including space for the NUL terminator) required.  Upon error this 
+function returns an error code and ``size'' will be zero.  
+
+\subsection{From ASCII}
+\index{mp\_read\_radix}
+\begin{alltt}
+int mp_read_radix (mp_int * a, char *str, int radix);
+\end{alltt}
+This will read the base-``radix'' NUL terminated string from ``str'' into $a$.  It will stop reading when it reads a
+character it does not recognize (which happens to include the NUL char... imagine that...).  A single leading $-$ sign
+can be used to denote a negative number.
+
+\section{Binary Conversions}
+
+Converting an mp\_int to and from binary is another keen idea.
+
+\index{mp\_unsigned\_bin\_size}
+\begin{alltt}
+int mp_unsigned_bin_size(mp_int *a);
+\end{alltt}
+
+This will return the number of bytes (octets) required to store the unsigned copy of the integer $a$.
+
+\index{mp\_to\_unsigned\_bin}
+\begin{alltt}
+int mp_to_unsigned_bin(mp_int *a, unsigned char *b);
+\end{alltt}
+This will store $a$ into the buffer $b$ in big--endian format.  Fortunately this is exactly what DER (or is it ASN?)
+requires.  It does not store the sign of the integer.
+
+\index{mp\_read\_unsigned\_bin}
+\begin{alltt}
+int mp_read_unsigned_bin(mp_int *a, unsigned char *b, int c);
+\end{alltt}
+This will read in an unsigned big--endian array of bytes (octets) from $b$ of length $c$ into $a$.  The resulting
+integer $a$ will always be positive.
+
+For those who acknowledge the existence of negative numbers (heretic!) there are ``signed'' versions of the
+previous functions.
+
+\begin{alltt}
+int mp_signed_bin_size(mp_int *a);
+int mp_read_signed_bin(mp_int *a, unsigned char *b, int c);
+int mp_to_signed_bin(mp_int *a, unsigned char *b);
+\end{alltt}
+They operate essentially the same as the unsigned copies except they prefix the data with a zero or non--zero
+byte depending on the sign.  If the sign is zpos (i.e. not negative) the prefix is zero, otherwise the prefix
+is non--zero.  
+
+\chapter{Algebraic Functions}
+\section{Extended Euclidean Algorithm}
+\index{mp\_exteuclid}
+\begin{alltt}
+int mp_exteuclid(mp_int *a, mp_int *b, 
+                 mp_int *U1, mp_int *U2, mp_int *U3);
+\end{alltt}
+
+This finds the triple U1/U2/U3 using the Extended Euclidean algorithm such that the following equation holds.
+
+\begin{equation}
+a \cdot U1 + b \cdot U2 = U3
+\end{equation}
+
+Any of the U1/U2/U3 parameters can be set to \textbf{NULL} if they are not desired.  
+
+\section{Greatest Common Divisor}
+\index{mp\_gcd}
+\begin{alltt}
+int mp_gcd (mp_int * a, mp_int * b, mp_int * c)
+\end{alltt}
+This will compute the greatest common divisor of $a$ and $b$ and store it in $c$.
+
+\section{Least Common Multiple}
+\index{mp\_lcm}
+\begin{alltt}
+int mp_lcm (mp_int * a, mp_int * b, mp_int * c)
+\end{alltt}
+This will compute the least common multiple of $a$ and $b$ and store it in $c$.
+
+\section{Jacobi Symbol}
+\index{mp\_jacobi}
+\begin{alltt}
+int mp_jacobi (mp_int * a, mp_int * p, int *c)
+\end{alltt}
+This will compute the Jacobi symbol for $a$ with respect to $p$.  If $p$ is prime this essentially computes the Legendre
+symbol.  The result is stored in $c$ and can take on one of three values $\lbrace -1, 0, 1 \rbrace$.  If $p$ is prime
+then the result will be $-1$ when $a$ is not a quadratic residue modulo $p$.  The result will be $0$ if $p$ divides $a$
+and the result will be $1$ if $a$ is a quadratic residue modulo $p$.  
+
+\section{Modular Inverse}
+\index{mp\_invmod}
+\begin{alltt}
+int mp_invmod (mp_int * a, mp_int * b, mp_int * c)
+\end{alltt}
+Computes the multiplicative inverse of $a$ modulo $b$ and stores the result in $c$ such that $ac \equiv 1 \mbox{ (mod }b\mbox{)}$.
+
+\section{Single Digit Functions}
+
+For those using small numbers (\textit{snicker snicker}) there are several ``helper'' functions.
+
+\index{mp\_add\_d} \index{mp\_sub\_d} \index{mp\_mul\_d} \index{mp\_div\_d} \index{mp\_mod\_d}
+\begin{alltt}
+int mp_add_d(mp_int *a, mp_digit b, mp_int *c);
+int mp_sub_d(mp_int *a, mp_digit b, mp_int *c);
+int mp_mul_d(mp_int *a, mp_digit b, mp_int *c);
+int mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d);
+int mp_mod_d(mp_int *a, mp_digit b, mp_digit *c);
+\end{alltt}
+
+These work like the full mp\_int capable variants except the second parameter $b$ is a mp\_digit.  These
+functions are fairly handy if you have to work with relatively small numbers since you will not have to allocate
+an entire mp\_int to store a number like $1$ or $2$.
+
+\input{bn.ind}
+
+\end{document}
--- a/bn_error.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_error.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_ERROR_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 static const struct {
      int code;
@@ -39,3 +40,4 @@
    return "Invalid error code";
 }
 
+#endif
--- a/bn_fast_mp_invmod.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_fast_mp_invmod.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_FAST_MP_INVMOD_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,12 +14,11 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* computes the modular inverse via binary extended euclidean algorithm, 
  * that is c = 1/a mod b 
  *
- * Based on mp_invmod except this is optimized for the case where b is 
+ * Based on slow invmod except this is optimized for the case where b is 
  * odd as per HAC Note 14.64 on pp. 610
  */
 int
@@ -141,3 +142,4 @@
 __ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
   return res;
 }
+#endif
--- a/bn_fast_mp_montgomery_reduce.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_fast_mp_montgomery_reduce.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_FAST_MP_MONTGOMERY_REDUCE_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,11 +14,10 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* computes xR**-1 == x (mod N) via Montgomery Reduction
  *
- * This is an optimized implementation of mp_montgomery_reduce
+ * This is an optimized implementation of montgomery_reduce
  * which uses the comba method to quickly calculate the columns of the
  * reduction.
  *
@@ -165,3 +166,4 @@
   }
   return MP_OKAY;
 }
+#endif
--- a/bn_fast_s_mp_mul_digs.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_fast_s_mp_mul_digs.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_FAST_S_MP_MUL_DIGS_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* Fast (comba) multiplier
  *
@@ -33,8 +34,9 @@
 int
 fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
-  int     olduse, res, pa, ix;
-  mp_word W[MP_WARRAY];
+  int     olduse, res, pa, ix, iz;
+  mp_digit W[MP_WARRAY];
+  register mp_word  _W;
 
   /* grow the destination as required */
   if (c->alloc < digs) {
@@ -43,82 +45,52 @@
     }
   }
 
-  /* clear temp buf (the columns) */
-  memset (W, 0, sizeof (mp_word) * digs);
+  /* number of output digits to produce */
+  pa = MIN(digs, a->used + b->used);
 
-  /* calculate the columns */
-  pa = a->used;
-  for (ix = 0; ix < pa; ix++) {
-    /* this multiplier has been modified to allow you to 
-     * control how many digits of output are produced.  
-     * So at most we want to make upto "digs" digits of output.
-     *
-     * this adds products to distinct columns (at ix+iy) of W
-     * note that each step through the loop is not dependent on
-     * the previous which means the compiler can easily unroll
-     * the loop without scheduling problems
-     */
-    {
-      register mp_digit tmpx, *tmpy;
-      register mp_word *_W;
-      register int iy, pb;
+  /* clear the carry */
+  _W = 0;
+  for (ix = 0; ix <= pa; ix++) { 
+      int      tx, ty;
+      int      iy;
+      mp_digit *tmpx, *tmpy;
+
+      /* get offsets into the two bignums */
+      ty = MIN(b->used-1, ix);
+      tx = ix - ty;
 
-      /* alias for the the word on the left e.g. A[ix] * A[iy] */
-      tmpx = a->dp[ix];
+      /* setup temp aliases */
+      tmpx = a->dp + tx;
+      tmpy = b->dp + ty;
 
-      /* alias for the right side */
-      tmpy = b->dp;
-
-      /* alias for the columns, each step through the loop adds a new
-         term to each column
+      /* this is the number of times the loop will iterrate, essentially its 
+         while (tx++ < a->used && ty-- >= 0) { ... }
        */
-      _W = W + ix;
+      iy = MIN(a->used-tx, ty+1);
 
-      /* the number of digits is limited by their placement.  E.g.
-         we avoid multiplying digits that will end up above the # of
-         digits of precision requested
-       */
-      pb = MIN (b->used, digs - ix);
+      /* execute loop */
+      for (iz = 0; iz < iy; ++iz) {
+         _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
+      }
 
-      for (iy = 0; iy < pb; iy++) {
-        *_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++);
-      }
-    }
+      /* store term */
+      W[ix] = ((mp_digit)_W) & MP_MASK;
 
+      /* make next carry */
+      _W = _W >> ((mp_word)DIGIT_BIT);
   }
 
   /* setup dest */
-  olduse = c->used;
+  olduse  = c->used;
   c->used = digs;
 
   {
     register mp_digit *tmpc;
-
-    /* At this point W[] contains the sums of each column.  To get the
-     * correct result we must take the extra bits from each column and
-     * carry them down
-     *
-     * Note that while this adds extra code to the multiplier it 
-     * saves time since the carry propagation is removed from the 
-     * above nested loop.This has the effect of reducing the work 
-     * from N*(N+N*c)==N**2 + c*N**2 to N**2 + N*c where c is the 
-     * cost of the shifting.  On very small numbers this is slower 
-     * but on most cryptographic size numbers it is faster.
-     *
-     * In this particular implementation we feed the carries from
-     * behind which means when the loop terminates we still have one
-     * last digit to copy
-     */
     tmpc = c->dp;
-    for (ix = 1; ix < digs; ix++) {
-      /* forward the carry from the previous temp */
-      W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
-
+    for (ix = 0; ix < digs; ix++) {
       /* now extract the previous digit [below the carry] */
-      *tmpc++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
+      *tmpc++ = W[ix];
     }
-    /* fetch the last digit */
-    *tmpc++ = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK));
 
     /* clear unused digits [that existed in the old copy of c] */
     for (; ix < olduse; ix++) {
@@ -128,3 +100,4 @@
   mp_clamp (c);
   return MP_OKAY;
 }
+#endif
--- a/bn_fast_s_mp_mul_high_digs.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_fast_s_mp_mul_high_digs.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_FAST_S_MP_MUL_HIGH_DIGS_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,10 +14,9 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
- #include <tommath.h>
 
-/* this is a modified version of fast_s_mp_mul_digs that only produces
- * output digits *above* digs.  See the comments for fast_s_mp_mul_digs
+/* this is a modified version of fast_s_mul_digs that only produces
+ * output digits *above* digs.  See the comments for fast_s_mul_digs
  * to see how it works.
  *
  * This is used in the Barrett reduction since for one of the multiplications
@@ -26,73 +27,69 @@
 int
 fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
-  int     oldused, newused, res, pa, pb, ix;
-  mp_word W[MP_WARRAY];
+  int     olduse, res, pa, ix, iz;
+  mp_digit W[MP_WARRAY];
+  mp_word  _W;
 
-  /* calculate size of product and allocate more space if required */
-  newused = a->used + b->used + 1;
-  if (c->alloc < newused) {
-    if ((res = mp_grow (c, newused)) != MP_OKAY) {
+  /* grow the destination as required */
+  pa = a->used + b->used;
+  if (c->alloc < pa) {
+    if ((res = mp_grow (c, pa)) != MP_OKAY) {
       return res;
     }
   }
 
-  /* like the other comba method we compute the columns first */
-  pa = a->used;
-  pb = b->used;
-  memset (W + digs, 0, (pa + pb + 1 - digs) * sizeof (mp_word));
-  for (ix = 0; ix < pa; ix++) {
-    {
-      register mp_digit tmpx, *tmpy;
-      register int iy;
-      register mp_word *_W;
+  /* number of output digits to produce */
+  pa = a->used + b->used;
+  _W = 0;
+  for (ix = digs; ix <= pa; ix++) { 
+      int      tx, ty, iy;
+      mp_digit *tmpx, *tmpy;
 
-      /* work todo, that is we only calculate digits that are at "digs" or above  */
-      iy = digs - ix;
+      /* get offsets into the two bignums */
+      ty = MIN(b->used-1, ix);
+      tx = ix - ty;
 
-      /* copy of word on the left of A[ix] * B[iy] */
-      tmpx = a->dp[ix];
+      /* setup temp aliases */
+      tmpx = a->dp + tx;
+      tmpy = b->dp + ty;
 
-      /* alias for right side */
-      tmpy = b->dp + iy;
-     
-      /* alias for the columns of output.  Offset to be equal to or above the 
-       * smallest digit place requested 
+      /* this is the number of times the loop will iterrate, essentially its 
+         while (tx++ < a->used && ty-- >= 0) { ... }
        */
-      _W = W + digs;     
-      
-      /* skip cases below zero where ix > digs */
-      if (iy < 0) {
-         iy    = abs(iy);
-         tmpy += iy;
-         _W   += iy;
-         iy    = 0;
+      iy = MIN(a->used-tx, ty+1);
+
+      /* execute loop */
+      for (iz = 0; iz < iy; iz++) {
+         _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
       }
 
-      /* compute column products for digits above the minimum */
-      for (; iy < pb; iy++) {
-         *_W++ += ((mp_word) tmpx) * ((mp_word)*tmpy++);
-      }
-    }
+      /* store term */
+      W[ix] = ((mp_digit)_W) & MP_MASK;
+
+      /* make next carry */
+      _W = _W >> ((mp_word)DIGIT_BIT);
   }
 
   /* setup dest */
-  oldused = c->used;
-  c->used = newused;
+  olduse  = c->used;
+  c->used = pa;
+
+  {
+    register mp_digit *tmpc;
 
-  /* now convert the array W downto what we need
-   *
-   * See comments in bn_fast_s_mp_mul_digs.c
-   */
-  for (ix = digs + 1; ix < newused; ix++) {
-    W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
-    c->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
-  }
-  c->dp[newused - 1] = (mp_digit) (W[newused - 1] & ((mp_word) MP_MASK));
+    tmpc = c->dp + digs;
+    for (ix = digs; ix <= pa; ix++) {
+      /* now extract the previous digit [below the carry] */
+      *tmpc++ = W[ix];
+    }
 
-  for (; ix < oldused; ix++) {
-    c->dp[ix] = 0;
+    /* clear unused digits [that existed in the old copy of c] */
+    for (; ix < olduse; ix++) {
+      *tmpc++ = 0;
+    }
   }
   mp_clamp (c);
   return MP_OKAY;
 }
+#endif
--- a/bn_fast_s_mp_sqr.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_fast_s_mp_sqr.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_FAST_S_MP_SQR_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* fast squaring
  *
@@ -31,109 +32,98 @@
  * Based on Algorithm 14.16 on pp.597 of HAC.
  *
  */
+/* the jist of squaring...
+
+you do like mult except the offset of the tmpx [one that starts closer to zero]
+can't equal the offset of tmpy.  So basically you set up iy like before then you min it with
+(ty-tx) so that it never happens.  You double all those you add in the inner loop
+
+After that loop you do the squares and add them in.
+
+Remove W2 and don't memset W
+
+*/
+
 int fast_s_mp_sqr (mp_int * a, mp_int * b)
 {
-  int     olduse, newused, res, ix, pa;
-  mp_word W2[MP_WARRAY], W[MP_WARRAY];
+  int       olduse, res, pa, ix, iz;
+  mp_digit   W[MP_WARRAY], *tmpx;
+  mp_word   W1;
 
-  /* calculate size of product and allocate as required */
-  pa = a->used;
-  newused = pa + pa + 1;
-  if (b->alloc < newused) {
-    if ((res = mp_grow (b, newused)) != MP_OKAY) {
+  /* grow the destination as required */
+  pa = a->used + a->used;
+  if (b->alloc < pa) {
+    if ((res = mp_grow (b, pa)) != MP_OKAY) {
       return res;
     }
   }
 
-  /* zero temp buffer (columns)
-   * Note that there are two buffers.  Since squaring requires
-   * a outer and inner product and the inner product requires
-   * computing a product and doubling it (a relatively expensive
-   * op to perform n**2 times if you don't have to) the inner and
-   * outer products are computed in different buffers.  This way
-   * the inner product can be doubled using n doublings instead of
-   * n**2
-   */
-  memset (W,  0, newused * sizeof (mp_word));
-  memset (W2, 0, newused * sizeof (mp_word));
+  /* number of output digits to produce */
+  W1 = 0;
+  for (ix = 0; ix <= pa; ix++) { 
+      int      tx, ty, iy;
+      mp_word  _W;
+      mp_digit *tmpy;
+
+      /* clear counter */
+      _W = 0;
+
+      /* get offsets into the two bignums */
+      ty = MIN(a->used-1, ix);
+      tx = ix - ty;
+
+      /* setup temp aliases */
+      tmpx = a->dp + tx;
+      tmpy = a->dp + ty;
+
+      /* this is the number of times the loop will iterrate, essentially its 
+         while (tx++ < a->used && ty-- >= 0) { ... }
+       */
+      iy = MIN(a->used-tx, ty+1);
 
-  /* This computes the inner product.  To simplify the inner N**2 loop
-   * the multiplication by two is done afterwards in the N loop.
-   */
-  for (ix = 0; ix < pa; ix++) {
-    /* compute the outer product
-     *
-     * Note that every outer product is computed
-     * for a particular column only once which means that
-     * there is no need todo a double precision addition
-     * into the W2[] array.
-     */
-    W2[ix + ix] = ((mp_word)a->dp[ix]) * ((mp_word)a->dp[ix]);
+      /* now for squaring tx can never equal ty 
+       * we halve the distance since they approach at a rate of 2x
+       * and we have to round because odd cases need to be executed
+       */
+      iy = MIN(iy, (ty-tx+1)>>1);
+
+      /* execute loop */
+      for (iz = 0; iz < iy; iz++) {
+         _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
+      }
 
-    {
-      register mp_digit tmpx, *tmpy;
-      register mp_word *_W;
-      register int iy;
-
-      /* copy of left side */
-      tmpx = a->dp[ix];
+      /* double the inner product and add carry */
+      _W = _W + _W + W1;
 
-      /* alias for right side */
-      tmpy = a->dp + (ix + 1);
-
-      /* the column to store the result in */
-      _W = W + (ix + ix + 1);
+      /* even columns have the square term in them */
+      if ((ix&1) == 0) {
+         _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
+      }
 
-      /* inner products */
-      for (iy = ix + 1; iy < pa; iy++) {
-          *_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++);
-      }
-    }
+      /* store it */
+      W[ix] = _W;
+
+      /* make next carry */
+      W1 = _W >> ((mp_word)DIGIT_BIT);
   }
 
   /* setup dest */
   olduse  = b->used;
-  b->used = newused;
+  b->used = a->used+a->used;
 
-  /* now compute digits
-   *
-   * We have to double the inner product sums, add in the
-   * outer product sums, propagate carries and convert
-   * to single precision.
-   */
   {
-    register mp_digit *tmpb;
-
-    /* double first value, since the inner products are
-     * half of what they should be
-     */
-    W[0] += W[0] + W2[0];
-
+    mp_digit *tmpb;
     tmpb = b->dp;
-    for (ix = 1; ix < newused; ix++) {
-      /* double/add next digit */
-      W[ix] += W[ix] + W2[ix];
-
-      /* propagate carry forwards [from the previous digit] */
-      W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT));
+    for (ix = 0; ix < pa; ix++) {
+      *tmpb++ = W[ix] & MP_MASK;
+    }
 
-      /* store the current digit now that the carry isn't
-       * needed
-       */
-      *tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
-    }
-    /* set the last value.  Note even if the carry is zero
-     * this is required since the next step will not zero
-     * it if b originally had a value at b->dp[2*a.used]
-     */
-    *tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK));
-
-    /* clear high digits of b if there were any originally */
+    /* clear unused digits [that existed in the old copy of c] */
     for (; ix < olduse; ix++) {
       *tmpb++ = 0;
     }
   }
-
   mp_clamp (b);
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_2expt.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_2expt.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_2EXPT_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* computes a = 2**b 
  *
@@ -36,7 +37,8 @@
   a->used = b / DIGIT_BIT + 1;
 
   /* put the single bit in its place */
-  a->dp[b / DIGIT_BIT] = 1 << (b % DIGIT_BIT);
+  a->dp[b / DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);
 
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_abs.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_abs.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_ABS_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* b = |a| 
  *
@@ -35,3 +36,4 @@
 
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_add.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_add.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_ADD_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* high level addition (handles signs) */
 int mp_add (mp_int * a, mp_int * b, mp_int * c)
@@ -45,3 +46,4 @@
   return res;
 }
 
+#endif
--- a/bn_mp_add_d.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_add_d.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_ADD_D_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* single digit addition */
 int
@@ -101,3 +102,4 @@
   return MP_OKAY;
 }
 
+#endif
--- a/bn_mp_addmod.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_addmod.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_ADDMOD_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* d = a + b (mod c) */
 int
@@ -33,3 +34,4 @@
   mp_clear (&t);
   return res;
 }
+#endif
--- a/bn_mp_and.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_and.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_AND_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* AND two ints together */
 int
@@ -49,3 +50,4 @@
   mp_clear (&t);
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_clamp.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_clamp.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_CLAMP_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* trim unused digits 
  *
@@ -36,3 +37,4 @@
     a->sign = MP_ZPOS;
   }
 }
+#endif
--- a/bn_mp_clear.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_clear.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_CLEAR_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,16 +14,19 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* clear one (frees)  */
 void
 mp_clear (mp_int * a)
 {
+  int i;
+
   /* only do anything if a hasn't been freed previously */
   if (a->dp != NULL) {
     /* first zero the digits */
-    memset (a->dp, 0, sizeof (mp_digit) * a->used);
+    for (i = 0; i < a->used; i++) {
+        a->dp[i] = 0;
+    }
 
     /* free ram */
     XFREE(a->dp);
@@ -32,3 +37,4 @@
     a->sign  = MP_ZPOS;
   }
 }
+#endif
--- a/bn_mp_clear_multi.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_clear_multi.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_CLEAR_MULTI_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 #include <stdarg.h>
 
 void mp_clear_multi(mp_int *mp, ...) 
@@ -26,3 +27,4 @@
     }
     va_end(args);
 }
+#endif
--- a/bn_mp_cmp.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_cmp.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_CMP_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* compare two ints (signed)*/
 int
@@ -35,3 +36,4 @@
      return mp_cmp_mag(a, b);
   }
 }
+#endif
--- a/bn_mp_cmp_d.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_cmp_d.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_CMP_D_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* compare a digit */
 int mp_cmp_d(mp_int * a, mp_digit b)
@@ -36,3 +37,4 @@
     return MP_EQ;
   }
 }
+#endif
--- a/bn_mp_cmp_mag.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_cmp_mag.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_CMP_MAG_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* compare maginitude of two ints (unsigned) */
 int mp_cmp_mag (mp_int * a, mp_int * b)
@@ -47,3 +48,4 @@
   }
   return MP_EQ;
 }
+#endif
--- a/bn_mp_cnt_lsb.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_cnt_lsb.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_CNT_LSB_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 static const int lnz[16] = { 
    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
@@ -45,3 +46,4 @@
    return x;
 }
 
+#endif
--- a/bn_mp_copy.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_copy.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_COPY_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* copy, b = a */
 int
@@ -60,3 +61,4 @@
   b->sign = a->sign;
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_count_bits.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_count_bits.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_COUNT_BITS_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* returns the number of bits in an int */
 int
@@ -37,3 +38,4 @@
   }
   return r;
 }
+#endif
--- a/bn_mp_div.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_div.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_DIV_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,78 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
+
+#ifdef BN_MP_DIV_SMALL
+
+/* slower bit-bang (shift and subtract) division... also smaller.
+ * c = a / b (truncated), d = a - c*b; c and/or d may be NULL. */
+int mp_div(mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+{
+   mp_int ta, tb, tq, q;
+   int    res, n, n2;
+
+  /* is divisor zero ? */
+  if (mp_iszero (b) == 1) {
+    return MP_VAL;
+  }
+
+  /* if |a| < |b| then q=0, r = a */
+  if (mp_cmp_mag (a, b) == MP_LT) {
+    if (d != NULL) {
+      res = mp_copy (a, d);
+    } else {
+      res = MP_OKAY;
+    }
+    if (c != NULL) {
+      mp_zero (c);
+    }
+    return res;
+  }
+  /* init our temps; res must receive the error code itself */
+  if ((res = mp_init_multi(&ta, &tb, &tq, &q, NULL)) != MP_OKAY) {
+     return res;
+  }
+
+  /* ta = |a|, tb = |b| * 2**n, tq = 2**n; signs are applied after the loop */
+  mp_set(&tq, 1);
+  n = mp_count_bits(a) - mp_count_bits(b);
+  if (((res = mp_copy(a, &ta)) != MP_OKAY) ||
+      ((res = mp_copy(b, &tb)) != MP_OKAY) ||
+      ((res = mp_mul_2d(&tb, n, &tb)) != MP_OKAY) ||
+      ((res = mp_mul_2d(&tq, n, &tq)) != MP_OKAY)) {
+      goto __ERR;
+  }
+  ta.sign = tb.sign = MP_ZPOS;
+  while (n-- >= 0) {
+     if (mp_cmp(&tb, &ta) != MP_GT) {
+        if (((res = mp_sub(&ta, &tb, &ta)) != MP_OKAY) ||
+            ((res = mp_add(&q, &tq, &q)) != MP_OKAY)) {
+           goto __ERR;
+        }
+     }
+     if (((res = mp_div_2d(&tb, 1, &tb, NULL)) != MP_OKAY) ||
+         ((res = mp_div_2d(&tq, 1, &tq, NULL)) != MP_OKAY)) {
+           goto __ERR;
+     }
+  }
+
+  /* now q == |quotient| and ta == |remainder|; a zero remainder stays MP_ZPOS */
+  n  = (ta.used == 0) ? MP_ZPOS : a->sign;
+  n2 = (a->sign == b->sign ? MP_ZPOS : MP_NEG);
+  if (c != NULL) {
+     mp_exch(c, &q);
+     c->sign  = n2;
+  }
+  if (d != NULL) {
+     mp_exch(d, &ta);
+     d->sign = n;
+  }
+__ERR:
+   mp_clear_multi(&ta, &tb, &tq, &q, NULL);
+   return res;
+}
+
+#else
 
 /* integer signed division. 
  * c*b + d == a [e.g. a/b, c=quotient, d=remainder]
@@ -187,7 +260,7 @@
    */
   
   /* get sign before writing to c */
-  x.sign = a->sign;
+  x.sign = x.used == 0 ? MP_ZPOS : a->sign;
 
   if (c != NULL) {
     mp_clamp (&q);
@@ -209,3 +282,7 @@
 __Q:mp_clear (&q);
   return res;
 }
+
+#endif
+
+#endif
--- a/bn_mp_div_2.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_div_2.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_DIV_2_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* b = a/2 */
 int mp_div_2(mp_int * a, mp_int * b)
@@ -60,3 +61,4 @@
   mp_clamp (b);
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_div_2d.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_div_2d.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_DIV_2D_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* shift right by a certain bit count (store quotient in c, optional remainder in d) */
 int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
@@ -89,3 +90,4 @@
   mp_clear (&t);
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_div_3.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_div_3.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_DIV_3_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* divide by three (based on routine from MPI and the GMP manual) */
 int
@@ -71,3 +72,4 @@
   return res;
 }
 
+#endif
--- a/bn_mp_div_d.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_div_d.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_DIV_D_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 static int s_is_power_of_two(mp_digit b, int *p)
 {
@@ -54,7 +55,7 @@
   /* power of two ? */
   if (s_is_power_of_two(b, &ix) == 1) {
      if (d != NULL) {
-        *d = a->dp[0] & ((1<<ix) - 1);
+        *d = a->dp[0] & ((((mp_digit)1)<<ix) - 1);
      }
      if (c != NULL) {
         return mp_div_2d(a, ix, c, NULL);
@@ -62,10 +63,12 @@
      return MP_OKAY;
   }
 
+#ifdef BN_MP_DIV_3_C
   /* three? */
   if (b == 3) {
      return mp_div_3(a, c, d);
   }
+#endif
 
   /* no easy answer [c'est la vie].  Just division */
   if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
@@ -100,3 +103,4 @@
   return res;
 }
 
+#endif
--- a/bn_mp_dr_is_modulus.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_dr_is_modulus.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_DR_IS_MODULUS_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* determines if a number is a valid DR modulus */
 int mp_dr_is_modulus(mp_int *a)
@@ -35,3 +36,4 @@
    return 1;
 }
 
+#endif
--- a/bn_mp_dr_reduce.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_dr_reduce.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_DR_REDUCE_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* reduce "x" in place modulo "n" using the Diminished Radix algorithm.
  *
@@ -86,3 +87,4 @@
   }
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_dr_setup.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_dr_setup.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_DR_SETUP_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* determines the setup value */
 void mp_dr_setup(mp_int *a, mp_digit *d)
@@ -24,3 +25,4 @@
         ((mp_word)a->dp[0]));
 }
 
+#endif
--- a/bn_mp_exch.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_exch.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_EXCH_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* swap the elements of two integers, for cases where you can't simply swap the 
  * mp_int pointers around
@@ -26,3 +27,4 @@
   *a = *b;
   *b = t;
 }
+#endif
--- a/bn_mp_expt_d.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_expt_d.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_EXPT_D_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* calculate c = a**b  using a square-multiply algorithm */
 int mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
@@ -49,3 +50,4 @@
   mp_clear (&g);
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_exptmod.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_exptmod.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_EXPTMOD_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 
 /* this is a shell function that calls either the normal or Montgomery
@@ -31,6 +32,7 @@
 
   /* if exponent X is negative we have to recurse */
   if (X->sign == MP_NEG) {
+#ifdef BN_MP_INVMOD_C
      mp_int tmpG, tmpX;
      int err;
 
@@ -57,26 +59,42 @@
      err = mp_exptmod(&tmpG, &tmpX, P, Y);
      mp_clear_multi(&tmpG, &tmpX, NULL);
      return err;
+#else
+     /* built without BN_MP_INVMOD_C: a negative exponent cannot be handled */
+     return MP_VAL;
+#endif
   }
 
+#ifdef BN_MP_DR_IS_MODULUS_C
   /* is it a DR modulus? */
   dr = mp_dr_is_modulus(P);
+#else
+  dr = 0;
+#endif
 
+#ifdef BN_MP_REDUCE_IS_2K_C
   /* if not, is it a uDR modulus? */
   if (dr == 0) {
      dr = mp_reduce_is_2k(P) << 1;
   }
+#endif
     
   /* if the modulus is odd or dr != 0 use the fast method */
-#ifndef NO_FAST_EXPTMOD
+#ifdef BN_MP_EXPTMOD_FAST_C
   if (mp_isodd (P) == 1 || dr !=  0) {
     return mp_exptmod_fast (G, X, P, Y, dr);
-  }
-  else
+  } else {
 #endif
-  {
+#ifdef BN_S_MP_EXPTMOD_C
     /* otherwise use the generic Barrett reduction technique */
     return s_mp_exptmod (G, X, P, Y);
+#else
+    /* no exptmod for evens */
+    return MP_VAL;
+#endif
+#ifdef BN_MP_EXPTMOD_FAST_C
   }
+#endif
 }
 
+#endif
--- a/bn_mp_exptmod_fast.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_exptmod_fast.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_EXPTMOD_FAST_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
  *
@@ -84,29 +85,52 @@
 
   /* determine and setup reduction code */
   if (redmode == 0) {
+#ifdef BN_MP_MONTGOMERY_SETUP_C     
      /* now setup montgomery  */
      if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
         goto __M;
      }
+#else
+     err = MP_VAL;
+     goto __M;
+#endif
 
      /* automatically pick the comba one if available (saves quite a few calls/ifs) */
+#ifdef BN_FAST_MP_MONTGOMERY_REDUCE_C
      if (((P->used * 2 + 1) < MP_WARRAY) &&
           P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
         redux = fast_mp_montgomery_reduce;
-     } else {
+     } else 
+#endif
+     {
+#ifdef BN_MP_MONTGOMERY_REDUCE_C
         /* use slower baseline Montgomery method */
         redux = mp_montgomery_reduce;
+#else
+        err = MP_VAL;
+        goto __M;
+#endif
      }
   } else if (redmode == 1) {
+#if defined(BN_MP_DR_SETUP_C) && defined(BN_MP_DR_REDUCE_C)
      /* setup DR reduction for moduli of the form B**k - b */
      mp_dr_setup(P, &mp);
      redux = mp_dr_reduce;
+#else
+     err = MP_VAL;
+     goto __M;
+#endif
   } else {
+#if defined(BN_MP_REDUCE_2K_SETUP_C) && defined(BN_MP_REDUCE_2K_C)
      /* setup DR reduction for moduli of the form 2**k - b */
      if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
         goto __M;
      }
      redux = mp_reduce_2k;
+#else
+     err = MP_VAL;
+     goto __M;
+#endif
   }
 
   /* setup result */
@@ -116,16 +140,21 @@
 
   /* create M table
    *
-   * The M table contains powers of the input base, e.g. M[x] = G^x mod P
+   * The M table contains powers of the input base, e.g. M[x] = G**x mod P
    *
    * The first half of the table is not computed though accept for M[0] and M[1]
    */
 
   if (redmode == 0) {
+#ifdef BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
      /* now we need R mod m */
      if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
        goto __RES;
      }
+#else 
+     err = MP_VAL;
+     goto __RES;
+#endif
 
      /* now set M[1] to G * R mod m */
      if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
@@ -269,7 +298,7 @@
       * to reduce one more time to cancel out the factor
       * of R.
       */
-     if ((err = mp_montgomery_reduce (&res, P, mp)) != MP_OKAY) {
+     if ((err = redux(&res, P, mp)) != MP_OKAY) {
        goto __RES;
      }
   }
@@ -285,3 +314,5 @@
   }
   return err;
 }
+#endif
+
--- a/bn_mp_exteuclid.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_exteuclid.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_EXTEUCLID_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* Extended euclidean algorithm of (a, b) produces 
    a*u1 + b*u2 = u3
@@ -67,3 +68,4 @@
 _ERR: mp_clear_multi(&u1, &u2, &u3, &v1, &v2, &v3, &t1, &t2, &t3, &q, &tmp, NULL);
    return err;
 }
+#endif
--- a/bn_mp_fread.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_fread.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_FREAD_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* read a bigint from a file stream in ASCII */
 int mp_fread(mp_int *a, int radix, FILE *stream)
@@ -59,3 +60,4 @@
    return MP_OKAY;
 }
 
+#endif
--- a/bn_mp_fwrite.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_fwrite.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_FWRITE_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 int mp_fwrite(mp_int *a, int radix, FILE *stream)
 {
@@ -44,3 +45,4 @@
    return MP_OKAY;
 }
 
+#endif
--- a/bn_mp_gcd.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_gcd.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_GCD_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* Greatest Common Divisor using the binary method */
 int mp_gcd (mp_int * a, mp_int * b, mp_int * c)
@@ -105,3 +106,4 @@
 __U:mp_clear (&v);
   return res;
 }
+#endif
--- a/bn_mp_get_int.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_get_int.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_GET_INT_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* get the lower 32-bits of an mp_int */
 unsigned long mp_get_int(mp_int * a) 
@@ -37,3 +38,4 @@
   /* force result to 32-bits always so it is consistent on non 32-bit platforms */
   return res & 0xFFFFFFFFUL;
 }
+#endif
--- a/bn_mp_grow.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_grow.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_GROW_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* grow as required */
 int mp_grow (mp_int * a, int size)
@@ -49,3 +50,4 @@
   }
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_init.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_init.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_INIT_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,17 +14,23 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
-/* init a new bigint */
+/* init a new mp_int */
 int mp_init (mp_int * a)
 {
+  int i;
+
   /* allocate memory required and clear it */
-  a->dp = OPT_CAST(mp_digit) XCALLOC (sizeof (mp_digit), MP_PREC);
+  a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * MP_PREC);
   if (a->dp == NULL) {
     return MP_MEM;
   }
 
+  /* set the digits to zero */
+  for (i = 0; i < MP_PREC; i++) {
+      a->dp[i] = 0;
+  }
+
   /* set the used to zero, allocated digits to the default precision
    * and sign to positive */
   a->used  = 0;
@@ -31,3 +39,4 @@
 
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_init_copy.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_init_copy.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_INIT_COPY_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* creates "a" then copies b into it */
 int mp_init_copy (mp_int * a, mp_int * b)
@@ -24,3 +25,4 @@
   }
   return mp_copy (b, a);
 }
+#endif
--- a/bn_mp_init_multi.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_init_multi.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_INIT_MULTI_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 #include <stdarg.h>
 
 int mp_init_multi(mp_int *mp, ...) 
@@ -51,3 +52,4 @@
     return res;                /* Assumed ok, if error flagged above. */
 }
 
+#endif
--- a/bn_mp_init_set.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_init_set.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_INIT_SET_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* initialize and set a digit */
 int mp_init_set (mp_int * a, mp_digit b)
@@ -24,3 +25,4 @@
   mp_set(a, b);
   return err;
 }
+#endif
--- a/bn_mp_init_set_int.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_init_set_int.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_INIT_SET_INT_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* initialize and set a digit */
 int mp_init_set_int (mp_int * a, unsigned long b)
@@ -23,3 +24,4 @@
   }
   return mp_set_int(a, b);
 }
+#endif
--- a/bn_mp_init_size.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_init_size.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_INIT_SIZE_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,22 +14,31 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* init an mp_init for a given size */
 int mp_init_size (mp_int * a, int size)
 {
+  int x;
+
   /* pad size so there are always extra digits */
   size += (MP_PREC * 2) - (size % MP_PREC);	
   
   /* alloc mem */
-  a->dp = OPT_CAST(mp_digit) XCALLOC (sizeof (mp_digit), size);
+  a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * size);
   if (a->dp == NULL) {
     return MP_MEM;
   }
+
+  /* no digits in use yet; record the padded allocation and a positive sign */
   a->used  = 0;
   a->alloc = size;
   a->sign  = MP_ZPOS;
 
+  /* zero every digit by hand now that XCALLOC is no longer used */
+  for (x = 0; x < size; x++) {
+      a->dp[x] = 0;
+  }
+
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_invmod.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_invmod.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_INVMOD_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,163 +14,26 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* hac 14.61, pp608 */
 int mp_invmod (mp_int * a, mp_int * b, mp_int * c)
 {
-  mp_int  x, y, u, v, A, B, C, D;
-  int     res;
-
   /* b cannot be negative */
   if (b->sign == MP_NEG || mp_iszero(b) == 1) {
     return MP_VAL;
   }
 
+#ifdef BN_FAST_MP_INVMOD_C
   /* if the modulus is odd we can use a faster routine instead */
   if (mp_isodd (b) == 1) {
     return fast_mp_invmod (a, b, c);
   }
-  
-  /* init temps */
-  if ((res = mp_init_multi(&x, &y, &u, &v, 
-                           &A, &B, &C, &D, NULL)) != MP_OKAY) {
-     return res;
-  }
-
-  /* x = a, y = b */
-  if ((res = mp_copy (a, &x)) != MP_OKAY) {
-    goto __ERR;
-  }
-  if ((res = mp_copy (b, &y)) != MP_OKAY) {
-    goto __ERR;
-  }
-
-  /* 2. [modified] if x,y are both even then return an error! */
-  if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
-    res = MP_VAL;
-    goto __ERR;
-  }
-
-  /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
-  if ((res = mp_copy (&x, &u)) != MP_OKAY) {
-    goto __ERR;
-  }
-  if ((res = mp_copy (&y, &v)) != MP_OKAY) {
-    goto __ERR;
-  }
-  mp_set (&A, 1);
-  mp_set (&D, 1);
-
-top:
-  /* 4.  while u is even do */
-  while (mp_iseven (&u) == 1) {
-    /* 4.1 u = u/2 */
-    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto __ERR;
-    }
-    /* 4.2 if A or B is odd then */
-    if (mp_isodd (&A) == 1 || mp_isodd (&B) == 1) {
-      /* A = (A+y)/2, B = (B-x)/2 */
-      if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
-         goto __ERR;
-      }
-      if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
-         goto __ERR;
-      }
-    }
-    /* A = A/2, B = B/2 */
-    if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
-      goto __ERR;
-    }
-    if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
-      goto __ERR;
-    }
-  }
+#endif
 
-  /* 5.  while v is even do */
-  while (mp_iseven (&v) == 1) {
-    /* 5.1 v = v/2 */
-    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto __ERR;
-    }
-    /* 5.2 if C or D is odd then */
-    if (mp_isodd (&C) == 1 || mp_isodd (&D) == 1) {
-      /* C = (C+y)/2, D = (D-x)/2 */
-      if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
-         goto __ERR;
-      }
-      if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
-         goto __ERR;
-      }
-    }
-    /* C = C/2, D = D/2 */
-    if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
-      goto __ERR;
-    }
-    if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
-      goto __ERR;
-    }
-  }
-
-  /* 6.  if u >= v then */
-  if (mp_cmp (&u, &v) != MP_LT) {
-    /* u = u - v, A = A - C, B = B - D */
-    if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
-      goto __ERR;
-    }
-
-    if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
-      goto __ERR;
-    }
+#ifdef BN_MP_INVMOD_SLOW_C
+  return mp_invmod_slow(a, b, c);
+#endif
 
-    if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
-      goto __ERR;
-    }
-  } else {
-    /* v - v - u, C = C - A, D = D - B */
-    if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
-      goto __ERR;
-    }
-
-    if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
-      goto __ERR;
-    }
-
-    if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
-      goto __ERR;
-    }
-  }
-
-  /* if not zero goto step 4 */
-  if (mp_iszero (&u) == 0)
-    goto top;
-
-  /* now a = C, b = D, gcd == g*v */
-
-  /* if v != 1 then there is no inverse */
-  if (mp_cmp_d (&v, 1) != MP_EQ) {
-    res = MP_VAL;
-    goto __ERR;
-  }
-
-  /* if its too low */
-  while (mp_cmp_d(&C, 0) == MP_LT) {
-      if ((res = mp_add(&C, b, &C)) != MP_OKAY) {
-         goto __ERR;
-      }
-  }
-  
-  /* too big */
-  while (mp_cmp_mag(&C, b) != MP_LT) {
-      if ((res = mp_sub(&C, b, &C)) != MP_OKAY) {
-         goto __ERR;
-      }
-  }
-  
-  /* C is now the inverse */
-  mp_exch (&C, c);
-  res = MP_OKAY;
-__ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
-  return res;
+  return MP_VAL;
 }
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bn_mp_invmod_slow.c	Sun Dec 19 15:57:19 2004 +0000
@@ -0,0 +1,171 @@
+#include <tommath.h>
+#ifdef BN_MP_INVMOD_SLOW_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
+
+/* c = 1/a (mod b) using the binary extended gcd of HAC 14.61, pp608 */
+int mp_invmod_slow (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int  x, y, u, v, A, B, C, D;
+  int     res;
+
+  /* b cannot be negative or zero */
+  if (b->sign == MP_NEG || mp_iszero(b) == 1) {
+    return MP_VAL;
+  }
+
+  /* init temps */
+  if ((res = mp_init_multi(&x, &y, &u, &v, 
+                           &A, &B, &C, &D, NULL)) != MP_OKAY) {
+     return res;
+  }
+
+  /* x = a, y = b */
+  if ((res = mp_copy (a, &x)) != MP_OKAY) {
+    goto __ERR;
+  }
+  if ((res = mp_copy (b, &y)) != MP_OKAY) {
+    goto __ERR;
+  }
+
+  /* 2. [modified] if x,y are both even then return an error! */
+  if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
+    res = MP_VAL;
+    goto __ERR;
+  }
+
+  /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
+  if ((res = mp_copy (&x, &u)) != MP_OKAY) {
+    goto __ERR;
+  }
+  if ((res = mp_copy (&y, &v)) != MP_OKAY) {
+    goto __ERR;
+  }
+  mp_set (&A, 1);
+  mp_set (&D, 1);
+
+top:
+  /* 4.  while u is even do */
+  while (mp_iseven (&u) == 1) {
+    /* 4.1 u = u/2 */
+    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
+      goto __ERR;
+    }
+    /* 4.2 if A or B is odd then */
+    if (mp_isodd (&A) == 1 || mp_isodd (&B) == 1) {
+      /* A = (A+y)/2, B = (B-x)/2 */
+      if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
+         goto __ERR;
+      }
+      if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
+         goto __ERR;
+      }
+    }
+    /* A = A/2, B = B/2 */
+    if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
+      goto __ERR;
+    }
+    if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
+      goto __ERR;
+    }
+  }
+
+  /* 5.  while v is even do */
+  while (mp_iseven (&v) == 1) {
+    /* 5.1 v = v/2 */
+    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
+      goto __ERR;
+    }
+    /* 5.2 if C or D is odd then */
+    if (mp_isodd (&C) == 1 || mp_isodd (&D) == 1) {
+      /* C = (C+y)/2, D = (D-x)/2 */
+      if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
+         goto __ERR;
+      }
+      if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
+         goto __ERR;
+      }
+    }
+    /* C = C/2, D = D/2 */
+    if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
+      goto __ERR;
+    }
+    if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
+      goto __ERR;
+    }
+  }
+
+  /* 6.  if u >= v then */
+  if (mp_cmp (&u, &v) != MP_LT) {
+    /* u = u - v, A = A - C, B = B - D */
+    if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
+      goto __ERR;
+    }
+
+    if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
+      goto __ERR;
+    }
+
+    if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
+      goto __ERR;
+    }
+  } else {
+    /* v = v - u, C = C - A, D = D - B */
+    if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
+      goto __ERR;
+    }
+
+    if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
+      goto __ERR;
+    }
+
+    if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
+      goto __ERR;
+    }
+  }
+
+  /* if u is not zero goto step 4 */
+  if (mp_iszero (&u) == 0)
+    goto top;
+
+  /* now a = C, b = D, gcd == g*v */
+
+  /* if v != 1 then there is no inverse */
+  if (mp_cmp_d (&v, 1) != MP_EQ) {
+    res = MP_VAL;
+    goto __ERR;
+  }
+
+  /* if C is negative, add b until it is not */
+  while (mp_cmp_d(&C, 0) == MP_LT) {
+      if ((res = mp_add(&C, b, &C)) != MP_OKAY) {
+         goto __ERR;
+      }
+  }
+  
+  /* if |C| >= |b|, subtract b until it is in range */
+  while (mp_cmp_mag(&C, b) != MP_LT) {
+      if ((res = mp_sub(&C, b, &C)) != MP_OKAY) {
+         goto __ERR;
+      }
+  }
+  
+  /* C is now the inverse; swap it into c (the old c is cleared below) */
+  mp_exch (&C, c);
+  res = MP_OKAY;
+__ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
+  return res;
+}
+#endif
--- a/bn_mp_is_square.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_is_square.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_IS_SQUARE_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* Check if remainders are possible squares - fast exclude non-squares */
 static const char rem_128[128] = {
@@ -69,7 +70,7 @@
      return MP_OKAY;
   }
 
-  /* product of primes less than 2^31 */
+  /* product of primes less than 2**31 */
   if ((res = mp_init_set_int(&t,11L*13L*17L*19L*23L*29L*31L)) != MP_OKAY) {
      return res;
   }
@@ -101,3 +102,4 @@
 ERR:mp_clear(&t);
   return res;
 }
+#endif
--- a/bn_mp_jacobi.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_jacobi.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_JACOBI_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* computes the jacobi c = (a | n) (or Legendre if n is prime)
  * HAC pp. 73 Algorithm 2.149
@@ -97,3 +98,4 @@
 __A1:mp_clear (&a1);
   return res;
 }
+#endif
--- a/bn_mp_karatsuba_mul.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_karatsuba_mul.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_KARATSUBA_MUL_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* c = |a| * |b| using Karatsuba Multiplication using 
  * three half size multiplications
@@ -76,9 +77,6 @@
     goto X0Y0;
 
   /* now shift the digits */
-  x0.sign = x1.sign = a->sign;
-  y0.sign = y1.sign = b->sign;
-
   x0.used = y0.used = B;
   x1.used = a->used - B;
   y1.used = b->used - B;
@@ -162,3 +160,4 @@
 ERR:
   return err;
 }
+#endif
--- a/bn_mp_karatsuba_sqr.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_karatsuba_sqr.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_KARATSUBA_SQR_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,12 +14,11 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* Karatsuba squaring, computes b = a*a using three 
  * half size squarings
  *
- * See comments of mp_karatsuba_mul for details.  It 
+ * See comments of karatsuba_mul for details.  It 
  * is essentially the same algorithm but merely 
  * tuned to perform recursive squarings.
  */
@@ -113,3 +114,4 @@
 ERR:
   return err;
 }
+#endif
--- a/bn_mp_lcm.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_lcm.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_LCM_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* computes least common multiple as |a*b|/(a, b) */
 int mp_lcm (mp_int * a, mp_int * b, mp_int * c)
@@ -52,3 +53,4 @@
   mp_clear_multi (&t1, &t2, NULL);
   return res;
 }
+#endif
--- a/bn_mp_lshd.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_lshd.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_LSHD_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* shift left a certain amount of digits */
 int mp_lshd (mp_int * a, int b)
@@ -59,3 +60,4 @@
   }
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_mod.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_mod.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_MOD_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* c = a mod b, 0 <= c < b */
 int
@@ -40,3 +41,4 @@
   mp_clear (&t);
   return res;
 }
+#endif
--- a/bn_mp_mod_2d.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_mod_2d.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_MOD_2D_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* calc a value mod 2**b */
 int
@@ -47,3 +48,4 @@
   mp_clamp (c);
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_mod_d.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_mod_d.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_MOD_D_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,10 +14,10 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 int
 mp_mod_d (mp_int * a, mp_digit b, mp_digit * c)
 {
   return mp_div_d(a, b, NULL, c);
 }
+#endif
--- a/bn_mp_montgomery_calc_normalization.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_montgomery_calc_normalization.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,31 +14,31 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
-/* calculates a = B^n mod b for Montgomery reduction
- * Where B is the base [e.g. 2^DIGIT_BIT].
- * B^n mod b is computed by first computing
- * A = B^(n-1) which doesn't require a reduction but a simple OR.
- * then C = A * B = B^n is computed by performing upto DIGIT_BIT
+/* Computes a = B^n mod b for Montgomery reduction (B = 2^DIGIT_BIT), using
  * shifts with subtractions when the result is greater than b.
  *
  * The method is slightly modified to shift B unconditionally upto just under
  * the leading bit of b.  This saves alot of multiple precision shifting.
  */
-int
-mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
+int mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
 {
   int     x, bits, res;
 
   /* how many bits of last digit does b use */
   bits = mp_count_bits (b) % DIGIT_BIT;
 
-  /* compute A = B^(n-1) * 2^(bits-1) */
-  if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
-    return res;
+
+  if (b->used > 1) {
+     if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
+        return res;
+     }
+  } else {
+     mp_set(a, 1);
+     bits = 1;
   }
 
+
   /* now compute C = A * B mod b */
   for (x = bits - 1; x < (int)DIGIT_BIT; x++) {
     if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
@@ -51,3 +53,4 @@
 
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_montgomery_reduce.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_montgomery_reduce.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_MONTGOMERY_REDUCE_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* computes xR**-1 == x (mod N) via Montgomery Reduction */
 int
@@ -23,7 +24,7 @@
 
   /* can the fast reduction [comba] method be used?
    *
-   * Note that unlike in mp_mul you're safely allowed *less*
+   * Note that unlike in mul you're safely allowed *less*
    * than the available columns [255 per default] since carries
    * are fixed up in the inner loop.
    */
@@ -46,7 +47,7 @@
     /* mu = ai * rho mod b
      *
      * The value of rho must be precalculated via
-     * bn_mp_montgomery_setup() such that
+     * montgomery_setup() such that
      * it equals -1/n0 mod b this allows the
      * following inner loop to reduce the
      * input one digit at a time
@@ -110,3 +111,4 @@
 
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_montgomery_setup.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_montgomery_setup.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_MONTGOMERY_SETUP_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* setups the montgomery reduction stuff */
 int
@@ -47,7 +48,8 @@
 #endif
 
   /* rho = -1/m mod b */
-  *rho = (((mp_digit) 1 << ((mp_digit) DIGIT_BIT)) - x) & MP_MASK;
+  *rho = (((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
 
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_mul.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_mul.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_MUL_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,22 +14,20 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* high level multiplication (handles sign) */
 int mp_mul (mp_int * a, mp_int * b, mp_int * c)
 {
   int     res, neg;
   neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
-  
-#ifndef NO_LTM_TOOM
 
   /* use Toom-Cook? */
+#ifdef BN_MP_TOOM_MUL_C
   if (MIN (a->used, b->used) >= TOOM_MUL_CUTOFF) {
     res = mp_toom_mul(a, b, c);
   } else 
 #endif
-#ifndef NO_LTM_KARATSUBA
+#ifdef BN_MP_KARATSUBA_MUL_C
   /* use Karatsuba? */
   if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) {
     res = mp_karatsuba_mul (a, b, c);
@@ -42,14 +42,21 @@
      */
     int     digs = a->used + b->used + 1;
 
+#ifdef BN_FAST_S_MP_MUL_DIGS_C
     if ((digs < MP_WARRAY) &&
         MIN(a->used, b->used) <= 
         (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
       res = fast_s_mp_mul_digs (a, b, c, digs);
-    } else {
-      res = s_mp_mul (a, b, c);
-    }
+    } else 
+#endif
+#ifdef BN_S_MP_MUL_DIGS_C
+      res = s_mp_mul (a, b, c); /* uses s_mp_mul_digs */
+#else
+      res = MP_VAL;
+#endif
+
   }
-  c->sign = neg;
+  c->sign = (c->used > 0) ? neg : MP_ZPOS;
   return res;
 }
+#endif
--- a/bn_mp_mul_2.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_mul_2.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_MUL_2_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* b = a*2 */
 int mp_mul_2(mp_int * a, mp_int * b)
@@ -74,3 +75,4 @@
   b->sign = a->sign;
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_mul_2d.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_mul_2d.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_MUL_2D_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* shift left by a certain bit count */
 int mp_mul_2d (mp_int * a, int b, mp_int * c)
@@ -77,3 +78,4 @@
   mp_clamp (c);
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_mul_d.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_mul_d.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_MUL_D_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* multiply by a digit */
 int
@@ -70,3 +71,4 @@
 
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_mulmod.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_mulmod.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_MULMOD_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* d = a * b (mod c) */
 int
@@ -33,3 +34,4 @@
   mp_clear (&t);
   return res;
 }
+#endif
--- a/bn_mp_n_root.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_n_root.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_N_ROOT_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* find the n'th root of an integer 
  *
@@ -124,3 +125,4 @@
 __T1:mp_clear (&t1);
   return res;
 }
+#endif
--- a/bn_mp_neg.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_neg.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_NEG_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* b = -a */
 int mp_neg (mp_int * a, mp_int * b)
@@ -26,3 +27,4 @@
   }
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_or.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_or.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_OR_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* OR two ints together */
 int mp_or (mp_int * a, mp_int * b, mp_int * c)
@@ -42,3 +43,4 @@
   mp_clear (&t);
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_prime_fermat.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_prime_fermat.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_PRIME_FERMAT_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* performs one Fermat test.
  * 
@@ -54,3 +55,4 @@
 __T:mp_clear (&t);
   return err;
 }
+#endif
--- a/bn_mp_prime_is_divisible.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_prime_is_divisible.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_PRIME_IS_DIVISIBLE_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* determines if an integers is divisible by one 
  * of the first PRIME_SIZE primes or not
@@ -42,3 +43,4 @@
 
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_prime_is_prime.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_prime_is_prime.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_PRIME_IS_PRIME_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,12 +14,11 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* performs a variable number of rounds of Miller-Rabin
  *
  * Probability of error after t rounds is no more than
- * (1/4)^t when 1 <= t <= PRIME_SIZE
+ * 4^(-t), which holds for 1 <= t <= PRIME_SIZE
  *
  * Sets result to 1 if probably prime, 0 otherwise
  */
@@ -75,3 +76,4 @@
 __B:mp_clear (&b);
   return err;
 }
+#endif
--- a/bn_mp_prime_miller_rabin.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_prime_miller_rabin.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_PRIME_MILLER_RABIN_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* Miller-Rabin test of "a" to the base of "b" as described in 
  * HAC pp. 139 Algorithm 4.24
@@ -95,3 +96,4 @@
 __N1:mp_clear (&n1);
   return err;
 }
+#endif
--- a/bn_mp_prime_next_prime.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_prime_next_prime.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_PRIME_NEXT_PRIME_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* finds the next prime after the number "a" using "t" trials
  * of Miller-Rabin.
@@ -162,3 +163,4 @@
    return err;
 }
 
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bn_mp_prime_rabin_miller_trials.c	Sun Dec 19 15:57:19 2004 +0000
@@ -0,0 +1,48 @@
+#include <tommath.h>
+#ifdef BN_MP_PRIME_RABIN_MILLER_TRIALS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
+
+
+static const struct {   /* lookup table, rows kept in ascending order of k */
+   int k, t;            /* k: bit size of the candidate, t: # of RM trials for it */
+} sizes[] = {
+{   128,    28 },
+{   256,    16 },
+{   384,    10 },
+{   512,     7 },
+{   640,     6 },
+{   768,     5 },
+{   896,     4 },
+{  1024,     4 }
+};
+
+/* # of RM trials for a `size`-bit value: exact row, else the nearest lower row */
+int mp_prime_rabin_miller_trials(int size)
+{
+   const int rows = (int)(sizeof(sizes) / sizeof(sizes[0]));
+   int       idx  = 0;
+
+   while ((idx < rows) && (sizes[idx].k < size)) {   /* skip rows below the request */
+      ++idx;
+   }
+   if (idx == rows) {                                /* larger than every table row */
+      return sizes[rows - 1].t + 1;
+   }
+   return ((sizes[idx].k == size) || (idx == 0)) ? sizes[idx].t : sizes[idx - 1].t;
+}
+
+
+#endif
--- a/bn_mp_prime_random_ex.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_prime_random_ex.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_PRIME_RANDOM_EX_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* makes a truly random prime of a given size (bits),
  *
@@ -92,6 +93,9 @@
 
       /* is it prime? */
       if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY)           { goto error; }
+      if (res == MP_NO) {  
+         continue;
+      }
 
       if (flags & LTM_PRIME_SAFE) {
          /* see if (a-1)/2 is prime */
@@ -116,3 +120,4 @@
 }
 
 
+#endif
--- a/bn_mp_radix_size.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_radix_size.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_RADIX_SIZE_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* returns size of ASCII reprensentation */
 int mp_radix_size (mp_int * a, int radix, int *size)
@@ -63,3 +64,4 @@
   return MP_OKAY;
 }
 
+#endif
--- a/bn_mp_radix_smap.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_radix_smap.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_RADIX_SMAP_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,7 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* chars used in radix conversions */
 const char *mp_s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
+#endif
--- a/bn_mp_rand.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_rand.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_RAND_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* makes a pseudo-random int of a given size */
 int
@@ -47,3 +48,4 @@
 
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_read_radix.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_read_radix.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_READ_RADIX_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* read a string [ASCII] in a given radix */
 int mp_read_radix (mp_int * a, char *str, int radix)
@@ -74,3 +75,4 @@
   }
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_read_signed_bin.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_read_signed_bin.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_READ_SIGNED_BIN_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* read signed bin, big endian, first byte is 0==positive or 1==negative */
 int
@@ -34,3 +35,4 @@
 
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_read_unsigned_bin.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_read_unsigned_bin.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_READ_UNSIGNED_BIN_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* reads a unsigned char array, assumes the msb is stored first [big endian] */
 int
@@ -48,3 +49,4 @@
   mp_clamp (a);
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_reduce.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_reduce.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* reduces x mod m, assumes 0 < x < m**2, mu is 
  * precomputed via mp_reduce_setup.
@@ -38,9 +39,20 @@
       goto CLEANUP;
     }
   } else {
+#ifdef BN_S_MP_MUL_HIGH_DIGS_C
     if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
       goto CLEANUP;
     }
+#elif defined(BN_FAST_S_MP_MUL_HIGH_DIGS_C)
+    if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
+      goto CLEANUP;
+    }
+#else 
+    { 
+      res = MP_VAL;
+      goto CLEANUP;
+    }
+#endif
   }
 
   /* q3 = q2 / b**(k+1) */
@@ -82,3 +94,4 @@
 
   return res;
 }
+#endif
--- a/bn_mp_reduce_2k.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_reduce_2k.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_2K_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* reduces a modulo n where n is of the form 2**p - d */
 int
@@ -54,3 +55,4 @@
    return res;
 }
 
+#endif
--- a/bn_mp_reduce_2k_setup.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_reduce_2k_setup.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_2K_SETUP_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* determines the setup value */
 int 
@@ -40,3 +41,4 @@
    mp_clear(&tmp);
    return MP_OKAY;
 }
+#endif
--- a/bn_mp_reduce_is_2k.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_reduce_is_2k.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_IS_2K_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,12 +14,12 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* determines if mp_reduce_2k can be used */
 int mp_reduce_is_2k(mp_int *a)
 {
-   int ix, iy, iz, iw;
+   int ix, iy, iw;
+   mp_digit iz;
    
    if (a->used == 0) {
       return 0;
@@ -34,7 +36,7 @@
              return 0;
           }
           iz <<= 1;
-          if (iz > (int)MP_MASK) {
+          if (iz > (mp_digit)MP_MASK) {
              ++iw;
              iz = 1;
           }
@@ -43,3 +45,4 @@
    return 1;
 }
 
+#endif
--- a/bn_mp_reduce_setup.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_reduce_setup.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_SETUP_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,13 +14,11 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* pre-calculate the value required for Barrett reduction
  * For a given modulus "b" it calulates the value required in "a"
  */
-int
-mp_reduce_setup (mp_int * a, mp_int * b)
+int mp_reduce_setup (mp_int * a, mp_int * b)
 {
   int     res;
   
@@ -27,3 +27,4 @@
   }
   return mp_div (a, b, a, NULL);
 }
+#endif
--- a/bn_mp_rshd.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_rshd.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_RSHD_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* shift right a certain amount of digits */
 void mp_rshd (mp_int * a, int b)
@@ -64,3 +65,4 @@
   /* remove excess digits */
   a->used -= b;
 }
+#endif
--- a/bn_mp_set.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_set.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_SET_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* set to a digit */
 void mp_set (mp_int * a, mp_digit b)
@@ -21,3 +22,4 @@
   a->dp[0] = b & MP_MASK;
   a->used  = (a->dp[0] != 0) ? 1 : 0;
 }
+#endif
--- a/bn_mp_set_int.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_set_int.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_SET_INT_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* set a 32-bit const */
 int mp_set_int (mp_int * a, unsigned long b)
@@ -40,3 +41,4 @@
   mp_clamp (a);
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_shrink.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_shrink.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_SHRINK_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* shrink a bignum */
 int mp_shrink (mp_int * a)
@@ -27,3 +28,4 @@
   }
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_signed_bin_size.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_signed_bin_size.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_SIGNED_BIN_SIZE_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,10 +14,10 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* get the size for an signed equivalent */
 int mp_signed_bin_size (mp_int * a)
 {
   return 1 + mp_unsigned_bin_size (a);
 }
+#endif
--- a/bn_mp_sqr.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_sqr.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_SQR_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,35 +14,41 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* computes b = a*a */
 int
 mp_sqr (mp_int * a, mp_int * b)
 {
   int     res;
-#ifndef NO_LTM_TOOM
+
+#ifdef BN_MP_TOOM_SQR_C
   /* use Toom-Cook? */
   if (a->used >= TOOM_SQR_CUTOFF) {
     res = mp_toom_sqr(a, b);
-  } else
+  /* Karatsuba? */
+  } else 
 #endif
-#ifndef NO_LTM_KARATSUBA
-  /* Karatsuba? */
-  if (a->used >= KARATSUBA_SQR_CUTOFF) {
+#ifdef BN_MP_KARATSUBA_SQR_C
+if (a->used >= KARATSUBA_SQR_CUTOFF) {
     res = mp_karatsuba_sqr (a, b);
   } else 
 #endif
   {
+#ifdef BN_FAST_S_MP_SQR_C
     /* can we use the fast comba multiplier? */
     if ((a->used * 2 + 1) < MP_WARRAY && 
          a->used < 
          (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
       res = fast_s_mp_sqr (a, b);
-    } else {
+    } else
+#endif
+#ifdef BN_S_MP_SQR_C
       res = s_mp_sqr (a, b);
-    }
+#else
+      res = MP_VAL;
+#endif
   }
   b->sign = MP_ZPOS;
   return res;
 }
+#endif
--- a/bn_mp_sqrmod.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_sqrmod.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_SQRMOD_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* c = a * a (mod b) */
 int
@@ -33,3 +34,4 @@
   mp_clear (&t);
   return res;
 }
+#endif
--- a/bn_mp_sqrt.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_sqrt.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_SQRT_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* this function is less generic than mp_n_root, simpler and faster */
 int mp_sqrt(mp_int *arg, mp_int *ret) 
@@ -73,3 +74,4 @@
   return res;
 }
 
+#endif
--- a/bn_mp_sub.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_sub.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_SUB_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* high level subtraction (handles signs) */
 int
@@ -51,3 +52,4 @@
   return res;
 }
 
+#endif
--- a/bn_mp_sub_d.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_sub_d.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_SUB_D_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* single digit subtraction */
 int
@@ -81,3 +82,4 @@
   return MP_OKAY;
 }
 
+#endif
--- a/bn_mp_submod.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_submod.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_SUBMOD_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* d = a - b (mod c) */
 int
@@ -34,3 +35,4 @@
   mp_clear (&t);
   return res;
 }
+#endif
--- a/bn_mp_to_signed_bin.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_to_signed_bin.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_TO_SIGNED_BIN_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* store in signed [big endian] format */
 int
@@ -26,3 +27,4 @@
   b[0] = (unsigned char) ((a->sign == MP_ZPOS) ? 0 : 1);
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_to_unsigned_bin.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_to_unsigned_bin.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_TO_UNSIGNED_BIN_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* store in unsigned [big endian] format */
 int
@@ -41,3 +42,4 @@
   mp_clear (&t);
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_toom_mul.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_toom_mul.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_TOOM_MUL_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,9 +14,13 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
-/* multiplication using the Toom-Cook 3-way algorithm */
+/* multiplication using the Toom-Cook 3-way algorithm 
+ *
+ * Much more complicated than Karatsuba but has a lower asymptotic running time of 
+ * O(N**1.464).  This algorithm is only particularly useful on VERY large
+ * inputs (we're talking 1000s of digits here...).
+*/
 int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
 {
     mp_int w0, w1, w2, w3, w4, tmp1, tmp2, a0, a1, a2, b0, b1, b2;
@@ -270,3 +276,4 @@
      return res;
 }     
      
+#endif
--- a/bn_mp_toom_sqr.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_toom_sqr.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_TOOM_SQR_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* squaring using Toom-Cook 3-way algorithm */
 int
@@ -218,3 +219,4 @@
      return res;
 }
 
+#endif
--- a/bn_mp_toradix.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_toradix.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_TORADIX_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* stores a bignum as a ASCII string in a given radix (2..64) */
 int mp_toradix (mp_int * a, char *str, int radix)
@@ -67,3 +68,4 @@
   return MP_OKAY;
 }
 
+#endif
--- a/bn_mp_toradix_n.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_toradix_n.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_TORADIX_N_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* stores a bignum as a ASCII string in a given radix (2..64) 
  *
@@ -81,3 +82,4 @@
   return MP_OKAY;
 }
 
+#endif
--- a/bn_mp_unsigned_bin_size.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_unsigned_bin_size.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_UNSIGNED_BIN_SIZE_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* get the size for an unsigned equivalent */
 int
@@ -21,3 +22,4 @@
   int     size = mp_count_bits (a);
   return (size / 8 + ((size & 7) != 0 ? 1 : 0));
 }
+#endif
--- a/bn_mp_xor.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_xor.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_XOR_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* XOR two ints together */
 int
@@ -36,10 +37,11 @@
   }
 
   for (ix = 0; ix < px; ix++) {
-    t.dp[ix] ^= x->dp[ix];
+     t.dp[ix] ^= x->dp[ix];
   }
   mp_clamp (&t);
   mp_exch (c, &t);
   mp_clear (&t);
   return MP_OKAY;
 }
+#endif
--- a/bn_mp_zero.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_mp_zero.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_MP_ZERO_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* set to zero */
 void
@@ -22,3 +23,4 @@
   a->used = 0;
   memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
 }
+#endif
--- a/bn_prime_tab.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_prime_tab.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_PRIME_TAB_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 const mp_digit __prime_tab[] = {
   0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
   0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
@@ -53,3 +54,4 @@
   0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
 #endif
 };
+#endif
--- a/bn_reverse.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_reverse.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_REVERSE_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* reverse an array, used for radix code */
 void
@@ -31,3 +32,4 @@
     --iy;
   }
 }
+#endif
--- a/bn_s_mp_add.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_s_mp_add.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_S_MP_ADD_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* low level addition, based on HAC pp.594, Algorithm 14.7 */
 int
@@ -101,3 +102,4 @@
   mp_clamp (c);
   return MP_OKAY;
 }
+#endif
--- a/bn_s_mp_exptmod.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_s_mp_exptmod.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_S_MP_EXPTMOD_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 #ifdef MP_LOW_MEM
    #define TAB_SIZE 32
@@ -232,3 +233,4 @@
   }
   return err;
 }
+#endif
--- a/bn_s_mp_mul_digs.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_s_mp_mul_digs.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_S_MP_MUL_DIGS_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* multiplies |a| * |b| and only computes upto digs digits of result
  * HAC pp. 595, Algorithm 14.12  Modified so you can control how 
@@ -83,3 +84,4 @@
   mp_clear (&t);
   return MP_OKAY;
 }
+#endif
--- a/bn_s_mp_mul_high_digs.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_s_mp_mul_high_digs.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_S_MP_MUL_HIGH_DIGS_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* multiplies |a| * |b| and does not compute the lower digs digits
  * [meant to get the higher part of the product]
@@ -27,10 +28,12 @@
   mp_digit tmpx, *tmpt, *tmpy;
 
   /* can we use the fast multiplier? */
+#ifdef BN_FAST_S_MP_MUL_HIGH_DIGS_C
   if (((a->used + b->used + 1) < MP_WARRAY)
       && MIN (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
     return fast_s_mp_mul_high_digs (a, b, c, digs);
   }
+#endif
 
   if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) {
     return res;
@@ -71,3 +74,4 @@
   mp_clear (&t);
   return MP_OKAY;
 }
+#endif
--- a/bn_s_mp_sqr.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_s_mp_sqr.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_S_MP_SQR_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
 int
@@ -77,3 +78,4 @@
   mp_clear (&t);
   return MP_OKAY;
 }
+#endif
--- a/bn_s_mp_sub.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bn_s_mp_sub.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BN_S_MP_SUB_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,7 +14,6 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
 int
@@ -81,3 +82,4 @@
   return MP_OKAY;
 }
 
+#endif
--- a/bncore.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/bncore.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,5 @@
+#include <tommath.h>
+#ifdef BNCORE_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
@@ -12,32 +14,18 @@
  *
  * Tom St Denis, [email protected], http://math.libtomcrypt.org
  */
-#include <tommath.h>
 
 /* Known optimal configurations
 
  CPU                    /Compiler     /MUL CUTOFF/SQR CUTOFF
 -------------------------------------------------------------
- Intel P4               /GCC v3.2     /        70/       108
- AMD Athlon XP          /GCC v3.2     /       109/       127
- Intel Celeron          /GCC v3.2.1   /        97/       127
- Mendocino 366mhz (evil)
- Intel P3 750mhz        /GCC v3.2.1   /        95/       110
- Coppermine (mussel)
- Intel Celeron          /GCC v3.2.1   /        85/       125
- Coppermine 700mhz
- Alpha                  /compaq       /        54/        87
- Compaq C V6.4-014 on Compaq Tru64 UNIX V5.1A (Rev. 1885)
- AlphaServer 1000A 5/300
- morwong
- Pentium classic 75     /GCC v3.2.1   /        73/       127
- plod
-
+ Intel P4 Northwood     /GCC v3.4.1   /        88/       128/LTM 0.32 ;-)
+ 
 */
 
-/* configured for a AMD XP Thoroughbred core with etc/tune.c */
-int     KARATSUBA_MUL_CUTOFF = 109,      /* Min. number of digits before Karatsuba multiplication is used. */
-        KARATSUBA_SQR_CUTOFF = 127,      /* Min. number of digits before Karatsuba squaring is used. */
+int     KARATSUBA_MUL_CUTOFF = 88,      /* Min. number of digits before Karatsuba multiplication is used. */
+        KARATSUBA_SQR_CUTOFF = 128,     /* Min. number of digits before Karatsuba squaring is used. */
         
         TOOM_MUL_CUTOFF      = 350,      /* no optimal values of these are known yet so set em high */
         TOOM_SQR_CUTOFF      = 400; 
+#endif
--- a/booker.pl	Fri Dec 17 06:27:22 2004 +0000
+++ b/booker.pl	Sun Dec 19 15:57:19 2004 +0000
@@ -82,8 +82,9 @@
          # scan till next end of comment, e.g. skip license 
          while (<SRC>) {
             $text[$line++] = $_;
-            last if ($_ =~ /tommath\.h/);
+            last if ($_ =~ /math\.libtomcrypt\.org/);
          }
+         <SRC>;   
       }
       
       $inline = 0;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/callgraph.txt	Sun Dec 19 15:57:19 2004 +0000
@@ -0,0 +1,10168 @@
+BN_PRIME_TAB_C
+
+
+BN_MP_SQRT_C
++--->BN_MP_N_ROOT_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_SET_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_EXPT_D_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_SQR_C
+|   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MUL_C
+|   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_MUL_C
+|   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_MUL_D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   +--->BN_MP_SUB_D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_ZERO_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_RSHD_C
++--->BN_MP_DIV_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_SET_C
+|   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_MUL_2D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_MUL_D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_ADD_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_DIV_2_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CMP_MAG_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_CMP_D_C
+
+
+BN_MP_EXCH_C
+
+
+BN_MP_IS_SQUARE_C
++--->BN_MP_MOD_D_C
+|   +--->BN_MP_DIV_D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_INIT_SET_INT_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_SET_INT_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_MOD_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_GET_INT_C
++--->BN_MP_SQRT_C
+|   +--->BN_MP_N_ROOT_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_EXPT_D_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_SQR_C
+|   |   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_MUL_C
+|   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_SUB_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_DIV_2_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_SQR_C
+|   +--->BN_MP_TOOM_SQR_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_KARATSUBA_SQR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_SQR_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_SQR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_CMP_MAG_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_NEG_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
+
+
+BN_MP_EXPTMOD_C
++--->BN_MP_INIT_C
++--->BN_MP_INVMOD_C
+|   +--->BN_FAST_MP_INVMOD_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_CMP_D_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_INVMOD_SLOW_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_CMP_D_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
++--->BN_MP_CLEAR_C
++--->BN_MP_ABS_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_CLEAR_MULTI_C
++--->BN_MP_DR_IS_MODULUS_C
++--->BN_MP_REDUCE_IS_2K_C
+|   +--->BN_MP_REDUCE_2K_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_COUNT_BITS_C
++--->BN_MP_EXPTMOD_FAST_C
+|   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_MONTGOMERY_SETUP_C
+|   +--->BN_FAST_MP_MONTGOMERY_REDUCE_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   +--->BN_MP_MONTGOMERY_REDUCE_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   +--->BN_MP_DR_SETUP_C
+|   +--->BN_MP_DR_REDUCE_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   +--->BN_MP_REDUCE_2K_SETUP_C
+|   |   +--->BN_MP_2EXPT_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_REDUCE_2K_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+|   |   +--->BN_MP_2EXPT_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_MULMOD_C
+|   |   +--->BN_MP_MUL_C
+|   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_MOD_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_SET_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MOD_C
+|   |   +--->BN_MP_DIV_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_SQR_C
+|   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SQR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_MUL_C
+|   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_EXCH_C
++--->BN_S_MP_EXPTMOD_C
+|   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_REDUCE_SETUP_C
+|   |   +--->BN_MP_2EXPT_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_DIV_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_MOD_C
+|   |   +--->BN_MP_DIV_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_SQR_C
+|   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SQR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_REDUCE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_C
+|   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_D_C
+|   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_MUL_C
+|   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_SET_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_EXCH_C
+
+
+BN_MP_OR_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_ZERO_C
+
+
+BN_MP_GROW_C
+
+
+BN_MP_COUNT_BITS_C
+
+
+BN_MP_PRIME_FERMAT_C
++--->BN_MP_CMP_D_C
++--->BN_MP_INIT_C
++--->BN_MP_EXPTMOD_C
+|   +--->BN_MP_INVMOD_C
+|   |   +--->BN_FAST_MP_INVMOD_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ABS_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INVMOD_SLOW_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_DR_IS_MODULUS_C
+|   +--->BN_MP_REDUCE_IS_2K_C
+|   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_EXPTMOD_FAST_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_MONTGOMERY_SETUP_C
+|   |   +--->BN_FAST_MP_MONTGOMERY_REDUCE_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_MONTGOMERY_REDUCE_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_DR_SETUP_C
+|   |   +--->BN_MP_DR_REDUCE_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_REDUCE_2K_SETUP_C
+|   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+|   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MULMOD_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MOD_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_SQR_C
+|   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_MUL_C
+|   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_S_MP_EXPTMOD_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_REDUCE_SETUP_C
+|   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MOD_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_SQR_C
+|   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_REDUCE_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_C
+|   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_EXCH_C
++--->BN_MP_CMP_C
+|   +--->BN_MP_CMP_MAG_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_SUBMOD_C
++--->BN_MP_INIT_C
++--->BN_MP_SUB_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_CLEAR_C
++--->BN_MP_MOD_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+
+
+BN_MP_MOD_2D_C
++--->BN_MP_ZERO_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
+
+
+BN_MP_TORADIX_N_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_DIV_D_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_DIV_3_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_CMP_C
++--->BN_MP_CMP_MAG_C
+
+
+BNCORE_C
+
+
+BN_MP_TORADIX_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_DIV_D_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_DIV_3_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_ADD_D_C
++--->BN_MP_GROW_C
++--->BN_MP_SUB_D_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CLAMP_C
+
+
+BN_MP_DIV_3_C
++--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_INIT_C
++--->BN_MP_CLAMP_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_FAST_S_MP_MUL_DIGS_C
++--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
+
+
+BN_MP_SQRMOD_C
++--->BN_MP_INIT_C
++--->BN_MP_SQR_C
+|   +--->BN_MP_TOOM_SQR_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_KARATSUBA_SQR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_SQR_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_SQR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_CLEAR_C
++--->BN_MP_MOD_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+
+
+BN_MP_INVMOD_C
++--->BN_FAST_MP_INVMOD_C
+|   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ABS_C
+|   +--->BN_MP_SET_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_DIV_2_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   +--->BN_MP_CMP_D_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_INVMOD_SLOW_C
+|   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_SET_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_DIV_2_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   +--->BN_MP_CMP_D_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_CLEAR_C
+
+
+BN_MP_AND_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_MUL_D_C
++--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
+
+
+BN_FAST_MP_INVMOD_C
++--->BN_MP_INIT_MULTI_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_ABS_C
++--->BN_MP_SET_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_DIV_2_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_SUB_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_CMP_C
+|   +--->BN_MP_CMP_MAG_C
++--->BN_MP_CMP_D_C
++--->BN_MP_ADD_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_CLEAR_C
+
+
+BN_MP_FWRITE_C
++--->BN_MP_RADIX_SIZE_C
+|   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_DIV_D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_TORADIX_C
+|   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_DIV_D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
+
+
+BN_S_MP_SQR_C
++--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_INIT_C
++--->BN_MP_CLAMP_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_N_ROOT_C
++--->BN_MP_INIT_C
++--->BN_MP_SET_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_EXPT_D_C
+|   +--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_SQR_C
+|   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SQR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_MUL_C
+|   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
++--->BN_MP_MUL_C
+|   +--->BN_MP_TOOM_MUL_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_KARATSUBA_MUL_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_SUB_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_MUL_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_DIV_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_MUL_2D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_CMP_C
+|   +--->BN_MP_CMP_MAG_C
++--->BN_MP_SUB_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_ADD_D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_PRIME_RABIN_MILLER_TRIALS_C
+
+
+BN_MP_RADIX_SIZE_C
++--->BN_MP_COUNT_BITS_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_DIV_D_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_DIV_3_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_READ_SIGNED_BIN_C
++--->BN_MP_READ_UNSIGNED_BIN_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MUL_2D_C
+|   |   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
+
+
+BN_MP_PRIME_RANDOM_EX_C
++--->BN_MP_READ_UNSIGNED_BIN_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MUL_2D_C
+|   |   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_PRIME_IS_PRIME_C
+|   +--->BN_MP_CMP_D_C
+|   +--->BN_MP_PRIME_IS_DIVISIBLE_C
+|   |   +--->BN_MP_MOD_D_C
+|   |   |   +--->BN_MP_DIV_D_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_INIT_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_INIT_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_INIT_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_SET_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_PRIME_MILLER_RABIN_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_SUB_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CNT_LSB_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXPTMOD_C
+|   |   |   +--->BN_MP_INVMOD_C
+|   |   |   |   +--->BN_FAST_MP_INVMOD_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_INVMOD_SLOW_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_ABS_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_DR_IS_MODULUS_C
+|   |   |   +--->BN_MP_REDUCE_IS_2K_C
+|   |   |   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_EXPTMOD_FAST_C
+|   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   +--->BN_MP_MONTGOMERY_SETUP_C
+|   |   |   |   +--->BN_FAST_MP_MONTGOMERY_REDUCE_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_MONTGOMERY_REDUCE_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_DR_SETUP_C
+|   |   |   |   +--->BN_MP_DR_REDUCE_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_REDUCE_2K_SETUP_C
+|   |   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+|   |   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MULMOD_C
+|   |   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_S_MP_EXPTMOD_C
+|   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   +--->BN_MP_REDUCE_SETUP_C
+|   |   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_REDUCE_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_SQRMOD_C
+|   |   |   +--->BN_MP_SQR_C
+|   |   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_SUB_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_ADD_D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_DIV_2_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_MUL_2_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_ADD_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
+
+
+BN_MP_KARATSUBA_SQR_C
++--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_INIT_C
++--->BN_MP_CLAMP_C
++--->BN_MP_SQR_C
+|   +--->BN_MP_TOOM_SQR_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_SQR_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_S_MP_SQR_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_SUB_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
++--->BN_S_MP_ADD_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_LSHD_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_ZERO_C
++--->BN_MP_ADD_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_INIT_COPY_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
+
+
+BN_MP_CLAMP_C
+
+
+BN_MP_TOOM_SQR_C
++--->BN_MP_INIT_MULTI_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_MOD_2D_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_RSHD_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_SQR_C
+|   +--->BN_MP_KARATSUBA_SQR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_SQR_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_SQR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_MUL_2_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_ADD_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_SUB_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_DIV_2_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_MUL_2D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_LSHD_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_MUL_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_DIV_3_C
+|   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_LSHD_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_CLEAR_C
+
+
+BN_MP_MOD_C
++--->BN_MP_INIT_C
++--->BN_MP_DIV_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_SET_C
+|   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_MUL_2D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_MUL_D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_CLEAR_C
++--->BN_MP_ADD_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_EXCH_C
+
+
+BN_MP_INIT_C
+
+
+BN_MP_TOOM_MUL_C
++--->BN_MP_INIT_MULTI_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_MOD_2D_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_RSHD_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_MUL_C
+|   +--->BN_MP_KARATSUBA_MUL_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_MUL_2_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_ADD_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_SUB_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_DIV_2_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_MUL_2D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_LSHD_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_MUL_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_DIV_3_C
+|   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_LSHD_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_CLEAR_C
+
+
+BN_MP_PRIME_IS_PRIME_C
++--->BN_MP_CMP_D_C
++--->BN_MP_PRIME_IS_DIVISIBLE_C
+|   +--->BN_MP_MOD_D_C
+|   |   +--->BN_MP_DIV_D_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_INIT_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_INIT_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
++--->BN_MP_INIT_C
++--->BN_MP_SET_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_PRIME_MILLER_RABIN_C
+|   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_SUB_D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CNT_LSB_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_EXPTMOD_C
+|   |   +--->BN_MP_INVMOD_C
+|   |   |   +--->BN_FAST_MP_INVMOD_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_INVMOD_SLOW_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_ABS_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_DR_IS_MODULUS_C
+|   |   +--->BN_MP_REDUCE_IS_2K_C
+|   |   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_EXPTMOD_FAST_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_MONTGOMERY_SETUP_C
+|   |   |   +--->BN_FAST_MP_MONTGOMERY_REDUCE_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_MONTGOMERY_REDUCE_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_DR_SETUP_C
+|   |   |   +--->BN_MP_DR_REDUCE_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_REDUCE_2K_SETUP_C
+|   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+|   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MULMOD_C
+|   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_SQR_C
+|   |   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_S_MP_EXPTMOD_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_REDUCE_SETUP_C
+|   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_SQR_C
+|   |   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_REDUCE_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   +--->BN_MP_SQRMOD_C
+|   |   +--->BN_MP_SQR_C
+|   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_COPY_C
++--->BN_MP_GROW_C
+
+
+BN_S_MP_SUB_C
++--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
+
+
+BN_MP_READ_UNSIGNED_BIN_C
++--->BN_MP_GROW_C
++--->BN_MP_ZERO_C
++--->BN_MP_MUL_2D_C
+|   +--->BN_MP_COPY_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CLAMP_C
+
+
+BN_MP_EXPTMOD_FAST_C
++--->BN_MP_COUNT_BITS_C
++--->BN_MP_INIT_C
++--->BN_MP_CLEAR_C
++--->BN_MP_MONTGOMERY_SETUP_C
++--->BN_FAST_MP_MONTGOMERY_REDUCE_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
++--->BN_MP_MONTGOMERY_REDUCE_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
++--->BN_MP_DR_SETUP_C
++--->BN_MP_DR_REDUCE_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
++--->BN_MP_REDUCE_2K_SETUP_C
+|   +--->BN_MP_2EXPT_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_REDUCE_2K_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_MUL_D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+|   +--->BN_MP_2EXPT_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_SET_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MUL_2_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_MULMOD_C
+|   +--->BN_MP_MUL_C
+|   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_MOD_C
+|   |   +--->BN_MP_DIV_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
++--->BN_MP_SET_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_MOD_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_SQR_C
+|   +--->BN_MP_TOOM_SQR_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_KARATSUBA_SQR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   +--->BN_FAST_S_MP_SQR_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_SQR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
++--->BN_MP_MUL_C
+|   +--->BN_MP_TOOM_MUL_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_KARATSUBA_MUL_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
++--->BN_MP_EXCH_C
+
+
+BN_MP_TO_UNSIGNED_BIN_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_DIV_2D_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MOD_2D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_SET_INT_C
++--->BN_MP_ZERO_C
++--->BN_MP_MUL_2D_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CLAMP_C
+
+
+BN_MP_MOD_D_C
++--->BN_MP_DIV_D_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_DIV_3_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
+
+
+BN_MP_SQR_C
++--->BN_MP_TOOM_SQR_C
+|   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_MOD_2D_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MUL_2_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_DIV_2_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_MUL_2D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_MUL_D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_DIV_3_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_KARATSUBA_SQR_C
+|   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLEAR_C
++--->BN_FAST_S_MP_SQR_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_S_MP_SQR_C
+|   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
+
+
+BN_MP_MULMOD_C
++--->BN_MP_INIT_C
++--->BN_MP_MUL_C
+|   +--->BN_MP_TOOM_MUL_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_KARATSUBA_MUL_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_CLEAR_C
++--->BN_MP_MOD_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+
+
+BN_MP_DIV_2D_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_ZERO_C
++--->BN_MP_INIT_C
++--->BN_MP_MOD_2D_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CLEAR_C
++--->BN_MP_RSHD_C
++--->BN_MP_CLAMP_C
++--->BN_MP_EXCH_C
+
+
+BN_S_MP_ADD_C
++--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
+
+
+BN_FAST_S_MP_SQR_C
++--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
+
+
+BN_S_MP_MUL_DIGS_C
++--->BN_FAST_S_MP_MUL_DIGS_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_INIT_C
++--->BN_MP_CLAMP_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_XOR_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_RADIX_SMAP_C
+
+
+BN_MP_DR_IS_MODULUS_C
+
+
+BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
++--->BN_MP_COUNT_BITS_C
++--->BN_MP_2EXPT_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_SET_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_MUL_2_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_CMP_MAG_C
++--->BN_S_MP_SUB_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
+
+
+BN_MP_SUB_C
++--->BN_S_MP_ADD_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CMP_MAG_C
++--->BN_S_MP_SUB_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
+
+
+BN_MP_INIT_MULTI_C
++--->BN_MP_INIT_C
++--->BN_MP_CLEAR_C
+
+
+BN_S_MP_MUL_HIGH_DIGS_C
++--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_INIT_C
++--->BN_MP_CLAMP_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_PRIME_NEXT_PRIME_C
++--->BN_MP_CMP_D_C
++--->BN_MP_SET_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_SUB_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_ADD_D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_MOD_D_C
+|   +--->BN_MP_DIV_D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_INIT_C
++--->BN_MP_ADD_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_PRIME_MILLER_RABIN_C
+|   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CNT_LSB_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_EXPTMOD_C
+|   |   +--->BN_MP_INVMOD_C
+|   |   |   +--->BN_FAST_MP_INVMOD_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_INVMOD_SLOW_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_ABS_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_DR_IS_MODULUS_C
+|   |   +--->BN_MP_REDUCE_IS_2K_C
+|   |   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_EXPTMOD_FAST_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_MONTGOMERY_SETUP_C
+|   |   |   +--->BN_FAST_MP_MONTGOMERY_REDUCE_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_MONTGOMERY_REDUCE_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_DR_SETUP_C
+|   |   |   +--->BN_MP_DR_REDUCE_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_REDUCE_2K_SETUP_C
+|   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+|   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MULMOD_C
+|   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_SQR_C
+|   |   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_S_MP_EXPTMOD_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_REDUCE_SETUP_C
+|   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_SQR_C
+|   |   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_REDUCE_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   +--->BN_MP_SQRMOD_C
+|   |   +--->BN_MP_SQR_C
+|   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_SIGNED_BIN_SIZE_C
++--->BN_MP_UNSIGNED_BIN_SIZE_C
+|   +--->BN_MP_COUNT_BITS_C
+
+
+BN_MP_INVMOD_SLOW_C
++--->BN_MP_INIT_MULTI_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_SET_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_DIV_2_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_ADD_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_SUB_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_CMP_C
+|   +--->BN_MP_CMP_MAG_C
++--->BN_MP_CMP_D_C
++--->BN_MP_CMP_MAG_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_CLEAR_C
+
+
+BN_MP_LCM_C
++--->BN_MP_INIT_MULTI_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_GCD_C
+|   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CNT_LSB_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_MUL_2D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_CMP_MAG_C
++--->BN_MP_DIV_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_SET_C
+|   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_MUL_2D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_MUL_D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_MUL_C
+|   +--->BN_MP_TOOM_MUL_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_KARATSUBA_MUL_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_CLEAR_C
+
+
+BN_REVERSE_C
+
+
+BN_MP_PRIME_IS_DIVISIBLE_C
++--->BN_MP_MOD_D_C
+|   +--->BN_MP_DIV_D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
+
+
+BN_MP_SET_C
++--->BN_MP_ZERO_C
+
+
+BN_MP_GCD_C
++--->BN_MP_ABS_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_ZERO_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_CNT_LSB_C
++--->BN_MP_DIV_2D_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_MOD_2D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_CMP_MAG_C
++--->BN_MP_EXCH_C
++--->BN_S_MP_SUB_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_MUL_2D_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_READ_RADIX_C
++--->BN_MP_ZERO_C
++--->BN_MP_MUL_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_ADD_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_SUB_D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
+
+
+BN_FAST_S_MP_MUL_HIGH_DIGS_C
++--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
+
+
+BN_FAST_MP_MONTGOMERY_REDUCE_C
++--->BN_MP_GROW_C
++--->BN_MP_RSHD_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_CLAMP_C
++--->BN_MP_CMP_MAG_C
++--->BN_S_MP_SUB_C
+
+
+BN_MP_DIV_D_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_DIV_2D_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_MOD_2D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_DIV_3_C
+|   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_INIT_C
++--->BN_MP_CLAMP_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_REDUCE_2K_SETUP_C
++--->BN_MP_INIT_C
++--->BN_MP_COUNT_BITS_C
++--->BN_MP_2EXPT_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_CLEAR_C
++--->BN_S_MP_SUB_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
+
+
+BN_MP_INIT_SET_C
++--->BN_MP_INIT_C
++--->BN_MP_SET_C
+|   +--->BN_MP_ZERO_C
+
+
+BN_MP_REDUCE_2K_C
++--->BN_MP_INIT_C
++--->BN_MP_COUNT_BITS_C
++--->BN_MP_DIV_2D_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MOD_2D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_MUL_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_S_MP_ADD_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CMP_MAG_C
++--->BN_S_MP_SUB_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CLEAR_C
+
+
+BN_ERROR_C
+
+
+BN_MP_EXPT_D_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_SET_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_SQR_C
+|   +--->BN_MP_TOOM_SQR_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_KARATSUBA_SQR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_SQR_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_SQR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_CLEAR_C
++--->BN_MP_MUL_C
+|   +--->BN_MP_TOOM_MUL_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_KARATSUBA_MUL_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+
+
+BN_S_MP_EXPTMOD_C
++--->BN_MP_COUNT_BITS_C
++--->BN_MP_INIT_C
++--->BN_MP_CLEAR_C
++--->BN_MP_REDUCE_SETUP_C
+|   +--->BN_MP_2EXPT_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_MOD_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_SQR_C
+|   +--->BN_MP_TOOM_SQR_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_KARATSUBA_SQR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   +--->BN_FAST_S_MP_SQR_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_SQR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
++--->BN_MP_REDUCE_C
+|   +--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MUL_C
+|   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_MOD_2D_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_D_C
+|   +--->BN_MP_SET_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_MUL_C
+|   +--->BN_MP_TOOM_MUL_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_KARATSUBA_MUL_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
++--->BN_MP_SET_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_EXCH_C
+
+
+BN_MP_ABS_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
+
+
+BN_MP_INIT_SET_INT_C
++--->BN_MP_INIT_C
++--->BN_MP_SET_INT_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MUL_2D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
+
+
+BN_MP_SUB_D_C
++--->BN_MP_GROW_C
++--->BN_MP_ADD_D_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CLAMP_C
+
+
+BN_MP_TO_SIGNED_BIN_C
++--->BN_MP_TO_UNSIGNED_BIN_C
+|   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
+
+
+BN_MP_DIV_2_C
++--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
+
+
+BN_MP_REDUCE_IS_2K_C
++--->BN_MP_REDUCE_2K_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_MUL_D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_COUNT_BITS_C
+
+
+BN_MP_INIT_SIZE_C
++--->BN_MP_INIT_C
+
+
+BN_MP_DIV_C
++--->BN_MP_CMP_MAG_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_ZERO_C
++--->BN_MP_INIT_MULTI_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_SET_C
++--->BN_MP_COUNT_BITS_C
++--->BN_MP_MUL_2D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CMP_C
++--->BN_MP_SUB_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_ADD_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_DIV_2D_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_MOD_2D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_INIT_C
++--->BN_MP_INIT_C
++--->BN_MP_INIT_COPY_C
++--->BN_MP_LSHD_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_RSHD_C
++--->BN_MP_RSHD_C
++--->BN_MP_MUL_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CLAMP_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_CLEAR_C
+
+
+BN_MP_MONTGOMERY_REDUCE_C
++--->BN_FAST_MP_MONTGOMERY_REDUCE_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
++--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
++--->BN_MP_RSHD_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_CMP_MAG_C
++--->BN_S_MP_SUB_C
+
+
+BN_MP_MUL_2_C
++--->BN_MP_GROW_C
+
+
+BN_MP_UNSIGNED_BIN_SIZE_C
++--->BN_MP_COUNT_BITS_C
+
+
+BN_MP_ADDMOD_C
++--->BN_MP_INIT_C
++--->BN_MP_ADD_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_CLEAR_C
++--->BN_MP_MOD_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+
+
+BN_MP_ADD_C
++--->BN_S_MP_ADD_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CMP_MAG_C
++--->BN_S_MP_SUB_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
+
+
+BN_MP_RAND_C
++--->BN_MP_ZERO_C
++--->BN_MP_ADD_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_SUB_D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_LSHD_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_RSHD_C
+
+
+BN_MP_CNT_LSB_C
+
+
+BN_MP_2EXPT_C
++--->BN_MP_ZERO_C
++--->BN_MP_GROW_C
+
+
+BN_MP_RSHD_C
++--->BN_MP_ZERO_C
+
+
+BN_MP_SHRINK_C
+
+
+BN_MP_REDUCE_C
++--->BN_MP_REDUCE_SETUP_C
+|   +--->BN_MP_2EXPT_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_RSHD_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_MUL_C
+|   +--->BN_MP_TOOM_MUL_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_KARATSUBA_MUL_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_S_MP_MUL_HIGH_DIGS_C
+|   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
++--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_MOD_2D_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_S_MP_MUL_DIGS_C
+|   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_SUB_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_CMP_D_C
++--->BN_MP_SET_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_LSHD_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_ADD_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_CMP_C
+|   +--->BN_MP_CMP_MAG_C
++--->BN_S_MP_SUB_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_MUL_2D_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_GROW_C
++--->BN_MP_LSHD_C
+|   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_ZERO_C
++--->BN_MP_CLAMP_C
+
+
+BN_MP_GET_INT_C
+
+
+BN_MP_JACOBI_C
++--->BN_MP_CMP_D_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_CNT_LSB_C
++--->BN_MP_DIV_2D_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MOD_2D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_MOD_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_CLEAR_MULTI_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_MUL_C
++--->BN_MP_TOOM_MUL_C
+|   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_MOD_2D_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MUL_2_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_DIV_2_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_MUL_2D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_MUL_D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_DIV_3_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_KARATSUBA_MUL_C
+|   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_CLEAR_C
++--->BN_FAST_S_MP_MUL_DIGS_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_S_MP_MUL_DIGS_C
+|   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
+
+
+BN_MP_EXTEUCLID_C
++--->BN_MP_INIT_MULTI_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_SET_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_DIV_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_MUL_2D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_MUL_D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_MUL_C
+|   +--->BN_MP_TOOM_MUL_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_KARATSUBA_MUL_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_SUB_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_CLEAR_C
+
+
+BN_MP_DR_REDUCE_C
++--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
++--->BN_MP_CMP_MAG_C
++--->BN_S_MP_SUB_C
+
+
+BN_MP_FREAD_C
++--->BN_MP_ZERO_C
++--->BN_MP_MUL_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_ADD_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_SUB_D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CMP_D_C
+
+
+BN_MP_REDUCE_SETUP_C
++--->BN_MP_2EXPT_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_DIV_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_SET_C
+|   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_MUL_2D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_MUL_D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
+
+
+BN_MP_MONTGOMERY_SETUP_C
+
+
+BN_MP_KARATSUBA_MUL_C
++--->BN_MP_MUL_C
+|   +--->BN_MP_TOOM_MUL_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_INIT_C
++--->BN_MP_CLAMP_C
++--->BN_MP_SUB_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_ADD_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_LSHD_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_ZERO_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_LSHD_C
++--->BN_MP_GROW_C
++--->BN_MP_RSHD_C
+|   +--->BN_MP_ZERO_C
+
+
+BN_MP_PRIME_MILLER_RABIN_C
++--->BN_MP_CMP_D_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_SUB_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_ADD_D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CNT_LSB_C
++--->BN_MP_DIV_2D_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MOD_2D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_EXPTMOD_C
+|   +--->BN_MP_INVMOD_C
+|   |   +--->BN_FAST_MP_INVMOD_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ABS_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INVMOD_SLOW_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_DR_IS_MODULUS_C
+|   +--->BN_MP_REDUCE_IS_2K_C
+|   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_EXPTMOD_FAST_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_MONTGOMERY_SETUP_C
+|   |   +--->BN_FAST_MP_MONTGOMERY_REDUCE_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_MONTGOMERY_REDUCE_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_DR_SETUP_C
+|   |   +--->BN_MP_DR_REDUCE_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_REDUCE_2K_SETUP_C
+|   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+|   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MULMOD_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MOD_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_SQR_C
+|   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_MUL_C
+|   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_S_MP_EXPTMOD_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_REDUCE_SETUP_C
+|   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MOD_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_SQR_C
+|   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_REDUCE_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_C
+|   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_EXCH_C
++--->BN_MP_CMP_C
+|   +--->BN_MP_CMP_MAG_C
++--->BN_MP_SQRMOD_C
+|   +--->BN_MP_SQR_C
+|   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SQR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_MOD_C
+|   |   +--->BN_MP_DIV_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_DR_SETUP_C
+
+
+BN_MP_CMP_MAG_C
+
+
--- a/changes.txt	Fri Dec 17 06:27:22 2004 +0000
+++ b/changes.txt	Sun Dec 19 15:57:19 2004 +0000
@@ -1,3 +1,27 @@
+October 29th, 2004
+v0.32  -- Added "makefile.shared" for shared object support
+       -- Added more to the build options/configs in the manual
+       -- Started the Depends framework, wrote dep.pl to scan deps and 
+          produce "callgraph.txt" ;-)
+       -- Wrote SC_RSA_1 which will enable close to the minimum required to perform
+          RSA on 32-bit [or 64-bit] platforms with LibTomCrypt
+       -- Merged in the small/slower mp_div replacement.  You can now toggle which
+          you want to use as your mp_div() at build time.  Saves roughly 8KB or so.
+       -- Renamed a few files and changed some comments to make the depends system work better.
+          (No changes to function names)
+       -- Merged in new Combas that perform 2 reads per inner loop instead of the
+          3 reads/2 writes per inner loop of the old code.  Really though, if you want speed,
+          learn to use TomsFastMath ;-)
+
+August 9th, 2004
+v0.31  -- "profiled" builds now :-) new timings for Intel Northwoods
+       -- Added "pretty" build target
+       -- Update mp_init() to actually assign 0's instead of relying on calloc()
+       -- "Wolfgang Ehrhardt" <[email protected]> found a bug in mp_mul() where if
+          you multiply a negative by zero you get negative zero as the result.  Oops.
+       -- J Harper from PeerSec let me toy with his AMD64 and I got 60-bit digits working properly
+          [this also means that I fixed a bug that showed up when sizeof(int) < sizeof(mp_digit)]
+
 April 11th, 2004
 v0.30  -- Added "mp_toradix_n" which stores upto "n-1" least significant digits of an mp_int
        -- Johan Lindh sent a patch so MSVC wouldn't whine about redefining malloc [in weird dll modes]
--- a/demo/demo.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/demo/demo.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,7 +1,5 @@
 #include <time.h>
 
-#define TESTING
-
 #ifdef IOWNANATHLON
 #include <unistd.h>
 #define SLEEP sleep(4)
@@ -11,49 +9,6 @@
 
 #include "tommath.h"
 
-#ifdef TIMER
-ulong64 _tt;
-
-#if defined(__i386__) || defined(_M_IX86) || defined(_M_AMD64)
-/* RDTSC from Scott Duplichan */
-static ulong64 TIMFUNC (void)
-   {
-   #if defined __GNUC__
-      #ifdef __i386__
-         ulong64 a;
-         __asm__ __volatile__ ("rdtsc ":"=A" (a));
-         return a;
-      #else /* gcc-IA64 version */
-         unsigned long result;
-         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
-         while (__builtin_expect ((int) result == -1, 0))
-         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
-         return result;
-      #endif
-
-   // Microsoft and Intel Windows compilers
-   #elif defined _M_IX86
-     __asm rdtsc
-   #elif defined _M_AMD64
-     return __rdtsc ();
-   #elif defined _M_IA64
-     #if defined __INTEL_COMPILER
-       #include <ia64intrin.h>
-     #endif
-      return __getReg (3116);
-   #else
-     #error need rdtsc function for this build
-   #endif
-   }
-#else
-#define TIMFUNC clock
-#endif
-
-ulong64 rdtsc(void) { return TIMFUNC() - _tt; }
-void reset(void) { _tt = TIMFUNC(); }
-
-#endif
-
 void ndraw(mp_int *a, char *name)
 {
    char buf[4096];
@@ -89,10 +44,6 @@
 }
 
 
-#define DO2(x) x; x;
-#define DO4(x) DO2(x); DO2(x);
-#define DO8(x) DO4(x); DO4(x);
-#define DO(x)  DO8(x); DO8(x);
 
    char cmd[4096], buf[4096];
 int main(void)
@@ -103,10 +54,6 @@
    unsigned rr;
    int i, n, err, cnt, ix, old_kara_m, old_kara_s;
 
-#ifdef TIMER
-   ulong64 tt, CLK_PER_SEC;
-   FILE *log, *logb, *logc;
-#endif
 
    mp_init(&a);
    mp_init(&b);
@@ -117,11 +64,11 @@
 
    srand(time(NULL));
 
-#ifdef TESTING
+#if 0
   // test mp_get_int
   printf("Testing: mp_get_int\n");
   for(i=0;i<1000;++i) {
-    t = (unsigned long)rand()*rand()+1;
+    t = ((unsigned long)rand()*rand()+1)&0xFFFFFFFF;
     mp_set_int(&a,t);
     if (t!=mp_get_int(&a)) { 
       printf("mp_get_int() bad result!\n");
@@ -141,7 +88,7 @@
 
   // test mp_sqrt
   printf("Testing: mp_sqrt\n");
-  for (i=0;i<10000;++i) { 
+  for (i=0;i<1000;++i) { 
     printf("%6d\r", i); fflush(stdout);
     n = (rand()&15)+1;
     mp_rand(&a,n);
@@ -157,7 +104,7 @@
   }
 
   printf("\nTesting: mp_is_square\n");
-  for (i=0;i<100000;++i) {
+  for (i=0;i<1000;++i) {
     printf("%6d\r", i); fflush(stdout);
 
     /* test mp_is_square false negatives */
@@ -186,11 +133,9 @@
 
   }
   printf("\n\n");
-#endif
 
-#ifdef TESTING 
    /* test for size */
-   for (ix = 16; ix < 512; ix++) {
+   for (ix = 10; ix < 256; ix++) {
        printf("Testing (not safe-prime): %9d bits    \r", ix); fflush(stdout);
        err = mp_prime_random_ex(&a, 8, ix, (rand()&1)?LTM_PRIME_2MSB_OFF:LTM_PRIME_2MSB_ON, myrng, NULL);
        if (err != MP_OKAY) {
@@ -203,7 +148,7 @@
        }
    }
 
-   for (ix = 16; ix < 512; ix++) {
+   for (ix = 16; ix < 256; ix++) {
        printf("Testing (   safe-prime): %9d bits    \r", ix); fflush(stdout);
        err = mp_prime_random_ex(&a, 8, ix, ((rand()&1)?LTM_PRIME_2MSB_OFF:LTM_PRIME_2MSB_ON)|LTM_PRIME_SAFE, myrng, NULL);
        if (err != MP_OKAY) {
@@ -225,9 +170,7 @@
    }
 
    printf("\n\n");
-#endif
 
-#ifdef TESTING
    mp_read_radix(&a, "123456", 10);
    mp_toradix_n(&a, buf, 10, 3);
    printf("a == %s\n", buf);
@@ -235,7 +178,6 @@
    printf("a == %s\n", buf);
    mp_toradix_n(&a, buf, 10, 30);
    printf("a == %s\n", buf);
-#endif
 
 
 #if 0
@@ -248,22 +190,6 @@
    }
 #endif
 
-#if 0
-{
-   mp_word aa, bb;
-
-   for (;;) {
-       aa = abs(rand()) & MP_MASK;
-       bb = abs(rand()) & MP_MASK;
-      if (MULT(aa,bb) != (aa*bb)) {
-             printf("%llu * %llu == %llu or %llu?\n", aa, bb, (ulong64)MULT(aa,bb), (ulong64)(aa*bb));
-             return 0;
-          }
-   }
-}
-#endif
-
-#ifdef TESTING
    /* test mp_cnt_lsb */
    printf("testing mp_cnt_lsb...\n");
    mp_set(&a, 1);
@@ -274,12 +200,10 @@
        }
        mp_mul_2(&a, &a);
    }
-#endif
 
 /* test mp_reduce_2k */
-#ifdef TESTING
    printf("Testing mp_reduce_2k...\n");
-   for (cnt = 3; cnt <= 384; ++cnt) {
+   for (cnt = 3; cnt <= 128; ++cnt) {
        mp_digit tmp;
        mp_2expt(&a, cnt);
        mp_sub_d(&a, 2, &a);  /* a = 2**cnt - 2 */
@@ -289,7 +213,7 @@
        printf("(%d)", mp_reduce_is_2k(&a));
        mp_reduce_2k_setup(&a, &tmp);
        printf("(%d)", tmp);
-       for (ix = 0; ix < 10000; ix++) {
+       for (ix = 0; ix < 1000; ix++) {
            if (!(ix & 127)) {printf("."); fflush(stdout); }
            mp_rand(&b, (cnt/DIGIT_BIT  + 1) * 2);
            mp_copy(&c, &b);
@@ -301,14 +225,11 @@
            }
         }
     }
-#endif
-
 
 /* test mp_div_3  */
-#ifdef TESTING
    printf("Testing mp_div_3...\n");
    mp_set(&d, 3);
-   for (cnt = 0; cnt < 1000000; ) {
+   for (cnt = 0; cnt < 10000; ) {
       mp_digit r1, r2;
 
       if (!(++cnt & 127)) printf("%9d\r", cnt);
@@ -321,12 +242,10 @@
       }
    }
    printf("\n\nPassed div_3 testing\n");
-#endif
 
 /* test the DR reduction */
-#ifdef TESTING
    printf("testing mp_dr_reduce...\n");
-   for (cnt = 2; cnt < 128; cnt++) {
+   for (cnt = 2; cnt < 32; cnt++) {
        printf("%d digit modulus\n", cnt);
        mp_grow(&a, cnt);
        mp_zero(&a);
@@ -334,7 +253,7 @@
            a.dp[ix] = MP_MASK;
        }
        a.used = cnt;
-       mp_prime_next_prime(&a, 3, 0);
+       a.dp[0] = 3;
 
        mp_rand(&b, cnt - 1);
        mp_copy(&b, &c);
@@ -346,204 +265,15 @@
          mp_copy(&b, &c);
 
          mp_mod(&b, &a, &b);
-         mp_dr_reduce(&c, &a, (1<<DIGIT_BIT)-a.dp[0]);
+         mp_dr_reduce(&c, &a, (((mp_digit)1)<<DIGIT_BIT)-a.dp[0]);
 
          if (mp_cmp(&b, &c) != MP_EQ) {
             printf("Failed on trial %lu\n", rr); exit(-1);
 
          }
-      } while (++rr < 100000);
+      } while (++rr < 500);
       printf("Passed DR test for %d digits\n", cnt);
    }
-#endif
-
-#ifdef TIMER
-      /* temp. turn off TOOM */
-      TOOM_MUL_CUTOFF = TOOM_SQR_CUTOFF = 100000;
-
-      reset();
-      sleep(1);
-      CLK_PER_SEC = rdtsc();
-
-      printf("CLK_PER_SEC == %lu\n", CLK_PER_SEC);
-      
-
-      log = fopen("logs/add.log", "w");
-      for (cnt = 8; cnt <= 128; cnt += 8) {
-         SLEEP;
-         mp_rand(&a, cnt);
-         mp_rand(&b, cnt);
-         reset();
-         rr = 0;
-         do {
-            DO(mp_add(&a,&b,&c));
-            rr += 16;
-         } while (rdtsc() < (CLK_PER_SEC * 2));
-         tt = rdtsc();
-         printf("Adding\t\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt, tt);
-         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((ulong64)rr)*CLK_PER_SEC)/tt); fflush(log);
-      }
-      fclose(log);
-
-      log = fopen("logs/sub.log", "w");
-      for (cnt = 8; cnt <= 128; cnt += 8) {
-         SLEEP;
-         mp_rand(&a, cnt);
-         mp_rand(&b, cnt);
-         reset();
-         rr = 0;
-         do {
-            DO(mp_sub(&a,&b,&c));
-            rr += 16;
-         } while (rdtsc() < (CLK_PER_SEC * 2));
-         tt = rdtsc();
-         printf("Subtracting\t\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt, tt);
-         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((ulong64)rr)*CLK_PER_SEC)/tt);  fflush(log);
-      }
-      fclose(log);
-
-   /* do mult/square twice, first without karatsuba and second with */
-mult_test:   
-   old_kara_m = KARATSUBA_MUL_CUTOFF;
-   old_kara_s = KARATSUBA_SQR_CUTOFF;
-   for (ix = 0; ix < 2; ix++) {
-      printf("With%s Karatsuba\n", (ix==0)?"out":"");
-
-      KARATSUBA_MUL_CUTOFF = (ix==0)?9999:old_kara_m;
-      KARATSUBA_SQR_CUTOFF = (ix==0)?9999:old_kara_s;
-
-      log = fopen((ix==0)?"logs/mult.log":"logs/mult_kara.log", "w");
-      for (cnt = 32; cnt <= 288; cnt += 8) {
-         SLEEP;
-         mp_rand(&a, cnt);
-         mp_rand(&b, cnt);
-         reset();
-         rr = 0;
-         do {
-            DO(mp_mul(&a, &b, &c));
-            rr += 16;
-         } while (rdtsc() < (CLK_PER_SEC * 2));
-         tt = rdtsc();
-         printf("Multiplying\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt, tt);
-         fprintf(log, "%d %9llu\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt);  fflush(log);
-      }
-      fclose(log);
-
-      log = fopen((ix==0)?"logs/sqr.log":"logs/sqr_kara.log", "w");
-      for (cnt = 32; cnt <= 288; cnt += 8) {
-         SLEEP;
-         mp_rand(&a, cnt);
-         reset();
-         rr = 0;
-         do {
-            DO(mp_sqr(&a, &b));
-            rr += 16;
-         } while (rdtsc() < (CLK_PER_SEC * 2));
-         tt = rdtsc();
-         printf("Squaring\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt, tt);
-         fprintf(log, "%d %9llu\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt);  fflush(log);
-      }
-      fclose(log);
-
-   }
-expt_test:
-  {
-      char *primes[] = {
-         /* 2K moduli mersenne primes */
-         "6864797660130609714981900799081393217269435300143305409394463459185543183397656052122559640661454554977296311391480858037121987999716643812574028291115057151",
-         "531137992816767098689588206552468627329593117727031923199444138200403559860852242739162502265229285668889329486246501015346579337652707239409519978766587351943831270835393219031728127",
-         "10407932194664399081925240327364085538615262247266704805319112350403608059673360298012239441732324184842421613954281007791383566248323464908139906605677320762924129509389220345773183349661583550472959420547689811211693677147548478866962501384438260291732348885311160828538416585028255604666224831890918801847068222203140521026698435488732958028878050869736186900714720710555703168729087",
-         "1475979915214180235084898622737381736312066145333169775147771216478570297878078949377407337049389289382748507531496480477281264838760259191814463365330269540496961201113430156902396093989090226259326935025281409614983499388222831448598601834318536230923772641390209490231836446899608210795482963763094236630945410832793769905399982457186322944729636418890623372171723742105636440368218459649632948538696905872650486914434637457507280441823676813517852099348660847172579408422316678097670224011990280170474894487426924742108823536808485072502240519452587542875349976558572670229633962575212637477897785501552646522609988869914013540483809865681250419497686697771007",
-         "259117086013202627776246767922441530941818887553125427303974923161874019266586362086201209516800483406550695241733194177441689509238807017410377709597512042313066624082916353517952311186154862265604547691127595848775610568757931191017711408826252153849035830401185072116424747461823031471398340229288074545677907941037288235820705892351068433882986888616658650280927692080339605869308790500409503709875902119018371991620994002568935113136548829739112656797303241986517250116412703509705427773477972349821676443446668383119322540099648994051790241624056519054483690809616061625743042361721863339415852426431208737266591962061753535748892894599629195183082621860853400937932839420261866586142503251450773096274235376822938649407127700846077124211823080804139298087057504713825264571448379371125032081826126566649084251699453951887789613650248405739378594599444335231188280123660406262468609212150349937584782292237144339628858485938215738821232393687046160677362909315071",
-         "190797007524439073807468042969529173669356994749940177394741882673528979787005053706368049835514900244303495954950709725762186311224148828811920216904542206960744666169364221195289538436845390250168663932838805192055137154390912666527533007309292687539092257043362517857366624699975402375462954490293259233303137330643531556539739921926201438606439020075174723029056838272505051571967594608350063404495977660656269020823960825567012344189908927956646011998057988548630107637380993519826582389781888135705408653045219655801758081251164080554609057468028203308718724654081055323215860189611391296030471108443146745671967766308925858547271507311563765171008318248647110097614890313562856541784154881743146033909602737947385055355960331855614540900081456378659068370317267696980001187750995491090350108417050917991562167972281070161305972518044872048331306383715094854938415738549894606070722584737978176686422134354526989443028353644037187375385397838259511833166416134323695660367676897722287918773420968982326089026150031515424165462111337527431154890666327374921446276833564519776797633875503548665093914556482031482248883127023777039667707976559857333357013727342079099064400455741830654320379350833236245819348824064783585692924881021978332974949906122664421376034687815350484991",
-
-         /* DR moduli */
-         "14059105607947488696282932836518693308967803494693489478439861164411992439598399594747002144074658928593502845729752797260025831423419686528151609940203368612079",
-         "101745825697019260773923519755878567461315282017759829107608914364075275235254395622580447400994175578963163918967182013639660669771108475957692810857098847138903161308502419410142185759152435680068435915159402496058513611411688900243039",
-         "736335108039604595805923406147184530889923370574768772191969612422073040099331944991573923112581267542507986451953227192970402893063850485730703075899286013451337291468249027691733891486704001513279827771740183629161065194874727962517148100775228363421083691764065477590823919364012917984605619526140821797602431",
-         "38564998830736521417281865696453025806593491967131023221754800625044118265468851210705360385717536794615180260494208076605798671660719333199513807806252394423283413430106003596332513246682903994829528690198205120921557533726473585751382193953592127439965050261476810842071573684505878854588706623484573925925903505747545471088867712185004135201289273405614415899438276535626346098904241020877974002916168099951885406379295536200413493190419727789712076165162175783",
-         "542189391331696172661670440619180536749994166415993334151601745392193484590296600979602378676624808129613777993466242203025054573692562689251250471628358318743978285860720148446448885701001277560572526947619392551574490839286458454994488665744991822837769918095117129546414124448777033941223565831420390846864429504774477949153794689948747680362212954278693335653935890352619041936727463717926744868338358149568368643403037768649616778526013610493696186055899318268339432671541328195724261329606699831016666359440874843103020666106568222401047720269951530296879490444224546654729111504346660859907296364097126834834235287147",
-         "1487259134814709264092032648525971038895865645148901180585340454985524155135260217788758027400478312256339496385275012465661575576202252063145698732079880294664220579764848767704076761853197216563262660046602703973050798218246170835962005598561669706844469447435461092542265792444947706769615695252256130901271870341005768912974433684521436211263358097522726462083917939091760026658925757076733484173202927141441492573799914240222628795405623953109131594523623353044898339481494120112723445689647986475279242446083151413667587008191682564376412347964146113898565886683139407005941383669325997475076910488086663256335689181157957571445067490187939553165903773554290260531009121879044170766615232300936675369451260747671432073394867530820527479172464106442450727640226503746586340279816318821395210726268291535648506190714616083163403189943334431056876038286530365757187367147446004855912033137386225053275419626102417236133948503",
-         "1095121115716677802856811290392395128588168592409109494900178008967955253005183831872715423151551999734857184538199864469605657805519106717529655044054833197687459782636297255219742994736751541815269727940751860670268774903340296040006114013971309257028332849679096824800250742691718610670812374272414086863715763724622797509437062518082383056050144624962776302147890521249477060215148275163688301275847155316042279405557632639366066847442861422164832655874655824221577849928863023018366835675399949740429332468186340518172487073360822220449055340582568461568645259954873303616953776393853174845132081121976327462740354930744487429617202585015510744298530101547706821590188733515880733527449780963163909830077616357506845523215289297624086914545378511082534229620116563260168494523906566709418166011112754529766183554579321224940951177394088465596712620076240067370589036924024728375076210477267488679008016579588696191194060127319035195370137160936882402244399699172017835144537488486396906144217720028992863941288217185353914991583400421682751000603596655790990815525126154394344641336397793791497068253936771017031980867706707490224041075826337383538651825493679503771934836094655802776331664261631740148281763487765852746577808019633679",
-
-         /* generic unrestricted moduli */
-         "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
-         "2893527720709661239493896562339544088620375736490408468011883030469939904368086092336458298221245707898933583190713188177399401852627749210994595974791782790253946539043962213027074922559572312141181787434278708783207966459019479487",
-         "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319",
-         "47266428956356393164697365098120418976400602706072312735924071745438532218237979333351774907308168340693326687317443721193266215155735814510792148768576498491199122744351399489453533553203833318691678263241941706256996197460424029012419012634671862283532342656309677173602509498417976091509154360039893165037637034737020327399910409885798185771003505320583967737293415979917317338985837385734747478364242020380416892056650841470869294527543597349250299539682430605173321029026555546832473048600327036845781970289288898317888427517364945316709081173840186150794397479045034008257793436817683392375274635794835245695887",
-         "436463808505957768574894870394349739623346440601945961161254440072143298152040105676491048248110146278752857839930515766167441407021501229924721335644557342265864606569000117714935185566842453630868849121480179691838399545644365571106757731317371758557990781880691336695584799313313687287468894148823761785582982549586183756806449017542622267874275103877481475534991201849912222670102069951687572917937634467778042874315463238062009202992087620963771759666448266532858079402669920025224220613419441069718482837399612644978839925207109870840278194042158748845445131729137117098529028886770063736487420613144045836803985635654192482395882603511950547826439092832800532152534003936926017612446606135655146445620623395788978726744728503058670046885876251527122350275750995227",
-         "11424167473351836398078306042624362277956429440521137061889702611766348760692206243140413411077394583180726863277012016602279290144126785129569474909173584789822341986742719230331946072730319555984484911716797058875905400999504305877245849119687509023232790273637466821052576859232452982061831009770786031785669030271542286603956118755585683996118896215213488875253101894663403069677745948305893849505434201763745232895780711972432011344857521691017896316861403206449421332243658855453435784006517202894181640562433575390821384210960117518650374602256601091379644034244332285065935413233557998331562749140202965844219336298970011513882564935538704289446968322281451907487362046511461221329799897350993370560697505809686438782036235372137015731304779072430260986460269894522159103008260495503005267165927542949439526272736586626709581721032189532726389643625590680105784844246152702670169304203783072275089194754889511973916207",
-         "1214855636816562637502584060163403830270705000634713483015101384881871978446801224798536155406895823305035467591632531067547890948695117172076954220727075688048751022421198712032848890056357845974246560748347918630050853933697792254955890439720297560693579400297062396904306270145886830719309296352765295712183040773146419022875165382778007040109957609739589875590885701126197906063620133954893216612678838507540777138437797705602453719559017633986486649523611975865005712371194067612263330335590526176087004421363598470302731349138773205901447704682181517904064735636518462452242791676541725292378925568296858010151852326316777511935037531017413910506921922450666933202278489024521263798482237150056835746454842662048692127173834433089016107854491097456725016327709663199738238442164843147132789153725513257167915555162094970853584447993125488607696008169807374736711297007473812256272245489405898470297178738029484459690836250560495461579533254473316340608217876781986188705928270735695752830825527963838355419762516246028680280988020401914551825487349990306976304093109384451438813251211051597392127491464898797406789175453067960072008590614886532333015881171367104445044718144312416815712216611576221546455968770801413440778423979",
-         NULL
-      };
-   log = fopen("logs/expt.log", "w");
-   logb = fopen("logs/expt_dr.log", "w");
-   logc = fopen("logs/expt_2k.log", "w");
-   for (n = 0; primes[n]; n++) {
-      SLEEP;
-      mp_read_radix(&a, primes[n], 10);
-      mp_zero(&b);
-      for (rr = 0; rr < mp_count_bits(&a); rr++) {
-         mp_mul_2(&b, &b);
-         b.dp[0] |= lbit();
-         b.used  += 1;
-      }
-      mp_sub_d(&a, 1, &c);
-      mp_mod(&b, &c, &b);
-      mp_set(&c, 3);
-      reset();
-      rr = 0;
-      do {
-         DO(mp_exptmod(&c, &b, &a, &d));
-         rr += 16;
-      } while (rdtsc() < (CLK_PER_SEC * 2));
-      tt = rdtsc();
-      mp_sub_d(&a, 1, &e);
-      mp_sub(&e, &b, &b);
-      mp_exptmod(&c, &b, &a, &e);  /* c^(p-1-b) mod a */
-      mp_mulmod(&e, &d, &a, &d);   /* c^b * c^(p-1-b) == c^p-1 == 1 */
-      if (mp_cmp_d(&d, 1)) {
-         printf("Different (%d)!!!\n", mp_count_bits(&a));
-         draw(&d);
-         exit(0);
-      }
-      printf("Exponentiating\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt, tt);
-      fprintf((n < 6) ? logc : (n < 13) ? logb : log, "%d %9llu\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt);
-   }
-   }
-   fclose(log);
-   fclose(logb);
-   fclose(logc);
-
-   log = fopen("logs/invmod.log", "w");
-   for (cnt = 4; cnt <= 128; cnt += 4) {
-      SLEEP;
-      mp_rand(&a, cnt);
-      mp_rand(&b, cnt);
-
-      do {
-         mp_add_d(&b, 1, &b);
-         mp_gcd(&a, &b, &c);
-      } while (mp_cmp_d(&c, 1) != MP_EQ);
-
-      reset();
-      rr = 0;
-      do {
-         DO(mp_invmod(&b, &a, &c));
-         rr += 16;
-      } while (rdtsc() < (CLK_PER_SEC * 2));
-      tt = rdtsc();
-      mp_mulmod(&b, &c, &a, &d);
-      if (mp_cmp_d(&d, 1) != MP_EQ) {
-         printf("Failed to invert\n");
-         return 0;
-      }
-      printf("Inverting mod\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt, tt);
-      fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((ulong64)rr)*CLK_PER_SEC)/tt);
-   }
-   fclose(log);
-
-   return 0;
 
 #endif
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/demo/timing.c	Sun Dec 19 15:57:19 2004 +0000
@@ -0,0 +1,291 @@
+#include <tommath.h>
+#include <time.h>
+
+ulong64 _tt;
+
+#include <unistd.h> /* sleep(): main() calls sleep(1) in every build, so this must not depend on IOWNANATHLON */
+#ifdef IOWNANATHLON
+#define SLEEP sleep(4)   /* pause before each measurement when IOWNANATHLON is defined */
+#else
+#define SLEEP            /* expands to nothing otherwise */
+#endif
+
+
+void ndraw(mp_int *a, char *name)
+{
+   /* print "<name>: ", then a rendered by mp_toradix with radix 64, then a newline */
+   char text[4096];
+
+   mp_toradix(a, text, 64);
+   printf("%s: %s\n", name, text);
+}
+
+static void draw(mp_int *a)   /* print a through ndraw() without a label */
+{
+   ndraw(a, "");              /* empty name: the printed line starts with ": " */
+}
+
+
+unsigned long lfsr = 0xAAAAAAAAUL;
+
+int lbit(void)
+{
+   /* the returned bit is bit 31 of the state as it was before this step */
+   int msb = (lfsr & 0x80000000UL) != 0;
+   lfsr <<= 1;
+   if (msb) {
+      lfsr = (lfsr ^ 0x8000001BUL) & 0xFFFFFFFFUL;   /* apply the feedback constant, keep 32 bits */
+   }
+   return msb;
+}
+
+#if defined(__i386__) || defined(_M_IX86) || defined(_M_AMD64)
+/* RDTSC from Scott Duplichan: TIMFUNC() returns a hardware tick counter. Builds not matched by the #if above use clock() instead (see the #else at the bottom). */
+static ulong64 TIMFUNC (void)
+   {
+   #if defined __GNUC__
+      #ifdef __i386__
+         ulong64 a;
+         __asm__ __volatile__ ("rdtsc ":"=A" (a));   /* "=A": on 32-bit x86 the 64-bit output is the EDX:EAX register pair */
+         return a;
+      #else /* gcc-IA64 version; NOTE(review): the outer #if tests no IA-64 macro, so this branch looks unreachable as written -- confirm */
+         unsigned long result;
+         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
+         while (__builtin_expect ((int) result == -1, 0))   /* read again for as long as (int)result is -1 */
+         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
+         return result;
+      #endif
+
+   // Microsoft and Intel Windows compilers. NOTE(review): the _M_IX86 branch has no return statement -- presumably it relies on rdtsc leaving the count in EDX:EAX; confirm
+   #elif defined _M_IX86
+     __asm rdtsc
+   #elif defined _M_AMD64
+     return __rdtsc ();
+   #elif defined _M_IA64
+     #if defined __INTEL_COMPILER
+       #include <ia64intrin.h>
+     #endif
+      return __getReg (3116);   /* NOTE(review): 3116 presumably selects the IA-64 interval time counter (ar.itc) -- confirm */
+   #else
+     #error need rdtsc function for this build
+   #endif
+   }
+#else
+#define TIMFUNC clock   /* fallback timer: the C library clock() */
+#endif
+
+#define DO(x) x; x;   /* evaluate x twice back to back; the timing loops in main() halve the elapsed count (>>1) to match */
+//#define DO4(x) DO2(x); DO2(x);
+//#define DO8(x) DO4(x); DO4(x);
+//#define DO(x)  DO8(x); DO8(x);
+
+int main(void)
+{
+   ulong64 tt, gg, CLK_PER_SEC;
+   FILE *log, *logb, *logc;
+   mp_int a, b, c, d, e, f;
+   int n, cnt, ix, old_kara_m, old_kara_s;
+   unsigned rr;
+   /* Benchmark driver: times add, sub, mul, sqr, exptmod and invmod, prints each result and logs it under logs/. */
+   mp_init(&a);
+   mp_init(&b);
+   mp_init(&c);
+   mp_init(&d);
+   mp_init(&e);
+   mp_init(&f);
+
+   srand(time(NULL));
+ 
+
+      /* temp. turn off TOOM */
+      TOOM_MUL_CUTOFF = TOOM_SQR_CUTOFF = 100000;
+      /* calibrate: TIMFUNC ticks that elapse across a one second sleep */
+      CLK_PER_SEC = TIMFUNC();
+      sleep(1);
+      CLK_PER_SEC = TIMFUNC() - CLK_PER_SEC;
+
+      printf("CLK_PER_SEC == %llu\n", CLK_PER_SEC);
+      /* Each benchmark keeps in tt the smallest tick count per call (DO runs the call twice, hence >>1). A coarse timer can leave tt == 0, so the rate is computed with a divisor of at least 1. */
+      log = fopen("logs/add.log", "w");
+      for (cnt = 8; cnt <= 128; cnt += 8) {
+         SLEEP;
+         mp_rand(&a, cnt);
+         mp_rand(&b, cnt);
+         rr = 0;
+         tt = -1;   /* all ones: larger than any sample */
+         do {
+            gg = TIMFUNC();
+            DO(mp_add(&a,&b,&c));
+            gg = (TIMFUNC() - gg)>>1;
+            if (tt > gg) tt = gg;
+         } while (++rr < 100000);
+         printf("Adding\t\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/(tt ? tt : 1), tt);
+         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, tt); fflush(log);
+      }
+      fclose(log);
+
+      log = fopen("logs/sub.log", "w");
+      for (cnt = 8; cnt <= 128; cnt += 8) {
+         SLEEP;
+         mp_rand(&a, cnt);
+         mp_rand(&b, cnt);
+         rr = 0;
+         tt = -1;
+         do {
+            gg = TIMFUNC();
+            DO(mp_sub(&a,&b,&c));
+            gg = (TIMFUNC() - gg)>>1;
+            if (tt > gg) tt = gg;
+         } while (++rr < 100000);
+
+         printf("Subtracting\t\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/(tt ? tt : 1), tt);
+         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, tt);  fflush(log);
+      }
+      fclose(log);
+
+   /* do mult/square twice, first without karatsuba and second with */
+   old_kara_m = KARATSUBA_MUL_CUTOFF;
+   old_kara_s = KARATSUBA_SQR_CUTOFF;
+   for (ix = 0; ix < 2; ix++) {   /* ix == 0: cutoffs forced to 9999, ix == 1: saved cutoffs restored */
+      printf("With%s Karatsuba\n", (ix==0)?"out":"");
+
+      KARATSUBA_MUL_CUTOFF = (ix==0)?9999:old_kara_m;
+      KARATSUBA_SQR_CUTOFF = (ix==0)?9999:old_kara_s;
+
+      log = fopen((ix==0)?"logs/mult.log":"logs/mult_kara.log", "w");
+      for (cnt = 4; cnt <= 288; cnt += 2) {
+         SLEEP;
+         mp_rand(&a, cnt);
+         mp_rand(&b, cnt);
+         rr = 0;
+         tt = -1;
+         do {
+            gg = TIMFUNC();
+            DO(mp_mul(&a, &b, &c));
+            gg = (TIMFUNC() - gg)>>1;
+            if (tt > gg) tt = gg;
+         } while (++rr < 100);
+         printf("Multiplying\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/(tt ? tt : 1), tt);
+         fprintf(log, "%d %9llu\n", mp_count_bits(&a), tt);  fflush(log);
+      }
+      fclose(log);
+
+      log = fopen((ix==0)?"logs/sqr.log":"logs/sqr_kara.log", "w");
+      for (cnt = 4; cnt <= 288; cnt += 2) {
+         SLEEP;
+         mp_rand(&a, cnt);
+         rr = 0;
+         tt = -1;
+         do {
+            gg = TIMFUNC();
+            DO(mp_sqr(&a, &b));
+            gg = (TIMFUNC() - gg)>>1;
+            if (tt > gg) tt = gg;
+         } while (++rr < 100);
+         printf("Squaring\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/(tt ? tt : 1), tt);
+         fprintf(log, "%d %9llu\n", mp_count_bits(&a), tt);  fflush(log);
+      }
+      fclose(log);
+
+   }
+
+  {
+      char *primes[] = {
+         /* 2K moduli mersenne primes */
+         "6864797660130609714981900799081393217269435300143305409394463459185543183397656052122559640661454554977296311391480858037121987999716643812574028291115057151",
+         "531137992816767098689588206552468627329593117727031923199444138200403559860852242739162502265229285668889329486246501015346579337652707239409519978766587351943831270835393219031728127",
+         "10407932194664399081925240327364085538615262247266704805319112350403608059673360298012239441732324184842421613954281007791383566248323464908139906605677320762924129509389220345773183349661583550472959420547689811211693677147548478866962501384438260291732348885311160828538416585028255604666224831890918801847068222203140521026698435488732958028878050869736186900714720710555703168729087",
+         "1475979915214180235084898622737381736312066145333169775147771216478570297878078949377407337049389289382748507531496480477281264838760259191814463365330269540496961201113430156902396093989090226259326935025281409614983499388222831448598601834318536230923772641390209490231836446899608210795482963763094236630945410832793769905399982457186322944729636418890623372171723742105636440368218459649632948538696905872650486914434637457507280441823676813517852099348660847172579408422316678097670224011990280170474894487426924742108823536808485072502240519452587542875349976558572670229633962575212637477897785501552646522609988869914013540483809865681250419497686697771007",
+         "259117086013202627776246767922441530941818887553125427303974923161874019266586362086201209516800483406550695241733194177441689509238807017410377709597512042313066624082916353517952311186154862265604547691127595848775610568757931191017711408826252153849035830401185072116424747461823031471398340229288074545677907941037288235820705892351068433882986888616658650280927692080339605869308790500409503709875902119018371991620994002568935113136548829739112656797303241986517250116412703509705427773477972349821676443446668383119322540099648994051790241624056519054483690809616061625743042361721863339415852426431208737266591962061753535748892894599629195183082621860853400937932839420261866586142503251450773096274235376822938649407127700846077124211823080804139298087057504713825264571448379371125032081826126566649084251699453951887789613650248405739378594599444335231188280123660406262468609212150349937584782292237144339628858485938215738821232393687046160677362909315071",
+         "190797007524439073807468042969529173669356994749940177394741882673528979787005053706368049835514900244303495954950709725762186311224148828811920216904542206960744666169364221195289538436845390250168663932838805192055137154390912666527533007309292687539092257043362517857366624699975402375462954490293259233303137330643531556539739921926201438606439020075174723029056838272505051571967594608350063404495977660656269020823960825567012344189908927956646011998057988548630107637380993519826582389781888135705408653045219655801758081251164080554609057468028203308718724654081055323215860189611391296030471108443146745671967766308925858547271507311563765171008318248647110097614890313562856541784154881743146033909602737947385055355960331855614540900081456378659068370317267696980001187750995491090350108417050917991562167972281070161305972518044872048331306383715094854938415738549894606070722584737978176686422134354526989443028353644037187375385397838259511833166416134323695660367676897722287918773420968982326089026150031515424165462111337527431154890666327374921446276833564519776797633875503548665093914556482031482248883127023777039667707976559857333357013727342079099064400455741830654320379350833236245819348824064783585692924881021978332974949906122664421376034687815350484991",
+
+         /* DR moduli */
+         "14059105607947488696282932836518693308967803494693489478439861164411992439598399594747002144074658928593502845729752797260025831423419686528151609940203368612079",
+         "101745825697019260773923519755878567461315282017759829107608914364075275235254395622580447400994175578963163918967182013639660669771108475957692810857098847138903161308502419410142185759152435680068435915159402496058513611411688900243039",
+         "736335108039604595805923406147184530889923370574768772191969612422073040099331944991573923112581267542507986451953227192970402893063850485730703075899286013451337291468249027691733891486704001513279827771740183629161065194874727962517148100775228363421083691764065477590823919364012917984605619526140821797602431",
+         "38564998830736521417281865696453025806593491967131023221754800625044118265468851210705360385717536794615180260494208076605798671660719333199513807806252394423283413430106003596332513246682903994829528690198205120921557533726473585751382193953592127439965050261476810842071573684505878854588706623484573925925903505747545471088867712185004135201289273405614415899438276535626346098904241020877974002916168099951885406379295536200413493190419727789712076165162175783",
+         "542189391331696172661670440619180536749994166415993334151601745392193484590296600979602378676624808129613777993466242203025054573692562689251250471628358318743978285860720148446448885701001277560572526947619392551574490839286458454994488665744991822837769918095117129546414124448777033941223565831420390846864429504774477949153794689948747680362212954278693335653935890352619041936727463717926744868338358149568368643403037768649616778526013610493696186055899318268339432671541328195724261329606699831016666359440874843103020666106568222401047720269951530296879490444224546654729111504346660859907296364097126834834235287147",
+         "1487259134814709264092032648525971038895865645148901180585340454985524155135260217788758027400478312256339496385275012465661575576202252063145698732079880294664220579764848767704076761853197216563262660046602703973050798218246170835962005598561669706844469447435461092542265792444947706769615695252256130901271870341005768912974433684521436211263358097522726462083917939091760026658925757076733484173202927141441492573799914240222628795405623953109131594523623353044898339481494120112723445689647986475279242446083151413667587008191682564376412347964146113898565886683139407005941383669325997475076910488086663256335689181157957571445067490187939553165903773554290260531009121879044170766615232300936675369451260747671432073394867530820527479172464106442450727640226503746586340279816318821395210726268291535648506190714616083163403189943334431056876038286530365757187367147446004855912033137386225053275419626102417236133948503",
+         "1095121115716677802856811290392395128588168592409109494900178008967955253005183831872715423151551999734857184538199864469605657805519106717529655044054833197687459782636297255219742994736751541815269727940751860670268774903340296040006114013971309257028332849679096824800250742691718610670812374272414086863715763724622797509437062518082383056050144624962776302147890521249477060215148275163688301275847155316042279405557632639366066847442861422164832655874655824221577849928863023018366835675399949740429332468186340518172487073360822220449055340582568461568645259954873303616953776393853174845132081121976327462740354930744487429617202585015510744298530101547706821590188733515880733527449780963163909830077616357506845523215289297624086914545378511082534229620116563260168494523906566709418166011112754529766183554579321224940951177394088465596712620076240067370589036924024728375076210477267488679008016579588696191194060127319035195370137160936882402244399699172017835144537488486396906144217720028992863941288217185353914991583400421682751000603596655790990815525126154394344641336397793791497068253936771017031980867706707490224041075826337383538651825493679503771934836094655802776331664261631740148281763487765852746577808019633679",
+
+         /* generic unrestricted moduli */
+         "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
+         "2893527720709661239493896562339544088620375736490408468011883030469939904368086092336458298221245707898933583190713188177399401852627749210994595974791782790253946539043962213027074922559572312141181787434278708783207966459019479487",
+         "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319",
+         "47266428956356393164697365098120418976400602706072312735924071745438532218237979333351774907308168340693326687317443721193266215155735814510792148768576498491199122744351399489453533553203833318691678263241941706256996197460424029012419012634671862283532342656309677173602509498417976091509154360039893165037637034737020327399910409885798185771003505320583967737293415979917317338985837385734747478364242020380416892056650841470869294527543597349250299539682430605173321029026555546832473048600327036845781970289288898317888427517364945316709081173840186150794397479045034008257793436817683392375274635794835245695887",
+         "436463808505957768574894870394349739623346440601945961161254440072143298152040105676491048248110146278752857839930515766167441407021501229924721335644557342265864606569000117714935185566842453630868849121480179691838399545644365571106757731317371758557990781880691336695584799313313687287468894148823761785582982549586183756806449017542622267874275103877481475534991201849912222670102069951687572917937634467778042874315463238062009202992087620963771759666448266532858079402669920025224220613419441069718482837399612644978839925207109870840278194042158748845445131729137117098529028886770063736487420613144045836803985635654192482395882603511950547826439092832800532152534003936926017612446606135655146445620623395788978726744728503058670046885876251527122350275750995227",
+         "11424167473351836398078306042624362277956429440521137061889702611766348760692206243140413411077394583180726863277012016602279290144126785129569474909173584789822341986742719230331946072730319555984484911716797058875905400999504305877245849119687509023232790273637466821052576859232452982061831009770786031785669030271542286603956118755585683996118896215213488875253101894663403069677745948305893849505434201763745232895780711972432011344857521691017896316861403206449421332243658855453435784006517202894181640562433575390821384210960117518650374602256601091379644034244332285065935413233557998331562749140202965844219336298970011513882564935538704289446968322281451907487362046511461221329799897350993370560697505809686438782036235372137015731304779072430260986460269894522159103008260495503005267165927542949439526272736586626709581721032189532726389643625590680105784844246152702670169304203783072275089194754889511973916207",
+         "1214855636816562637502584060163403830270705000634713483015101384881871978446801224798536155406895823305035467591632531067547890948695117172076954220727075688048751022421198712032848890056357845974246560748347918630050853933697792254955890439720297560693579400297062396904306270145886830719309296352765295712183040773146419022875165382778007040109957609739589875590885701126197906063620133954893216612678838507540777138437797705602453719559017633986486649523611975865005712371194067612263330335590526176087004421363598470302731349138773205901447704682181517904064735636518462452242791676541725292378925568296858010151852326316777511935037531017413910506921922450666933202278489024521263798482237150056835746454842662048692127173834433089016107854491097456725016327709663199738238442164843147132789153725513257167915555162094970853584447993125488607696008169807374736711297007473812256272245489405898470297178738029484459690836250560495461579533254473316340608217876781986188705928270735695752830825527963838355419762516246028680280988020401914551825487349990306976304093109384451438813251211051597392127491464898797406789175453067960072008590614886532333015881171367104445044718144312416815712216611576221546455968770801413440778423979",
+         NULL
+      };
+   log = fopen("logs/expt.log", "w");
+   logb = fopen("logs/expt_dr.log", "w");
+   logc = fopen("logs/expt_2k.log", "w");
+   for (n = 0; primes[n]; n++) {
+      SLEEP;
+      mp_read_radix(&a, primes[n], 10);
+      mp_zero(&b);
+      for (rr = 0; rr < (unsigned)mp_count_bits(&a); rr++) {   /* build b one LFSR bit at a time */
+         mp_mul_2(&b, &b);
+         b.dp[0] |= lbit();
+         b.used  += 1;
+      }
+      mp_sub_d(&a, 1, &c);
+      mp_mod(&b, &c, &b);
+      mp_set(&c, 3);
+         rr = 0;
+         tt = -1;
+         do {
+            gg = TIMFUNC();
+            DO(mp_exptmod(&c, &b, &a, &d));
+            gg = (TIMFUNC() - gg)>>1;
+            if (tt > gg) tt = gg;
+         } while (++rr < 10);
+      mp_sub_d(&a, 1, &e);
+      mp_sub(&e, &b, &b);
+      mp_exptmod(&c, &b, &a, &e);  /* c^(p-1-b) mod a */
+      mp_mulmod(&e, &d, &a, &d);   /* c^b * c^(p-1-b) == c^p-1 == 1 */
+      if (mp_cmp_d(&d, 1)) {
+         printf("Different (%d)!!!\n", mp_count_bits(&a));
+         draw(&d);
+         exit(0);
+      }
+      printf("Exponentiating\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/(tt ? tt : 1), tt);
+      fprintf((n < 6) ? logc : (n < 13) ? logb : log, "%d %9llu\n", mp_count_bits(&a), tt);
+   }
+   }
+   fclose(log);
+   fclose(logb);
+   fclose(logc);
+
+   log = fopen("logs/invmod.log", "w");
+   for (cnt = 4; cnt <= 128; cnt += 4) {
+      SLEEP;
+      mp_rand(&a, cnt);
+      mp_rand(&b, cnt);
+
+      do {   /* step b upward until gcd(a, b) == 1 so that the inverse exists */
+         mp_add_d(&b, 1, &b);
+         mp_gcd(&a, &b, &c);
+      } while (mp_cmp_d(&c, 1) != MP_EQ);
+
+         rr = 0;
+         tt = -1;
+      do {
+         gg = TIMFUNC();
+         DO(mp_invmod(&b, &a, &c));
+         gg = (TIMFUNC() - gg)>>1;
+         if (tt > gg) tt = gg;
+      } while (++rr < 1000);
+      mp_mulmod(&b, &c, &a, &d);
+      if (mp_cmp_d(&d, 1) != MP_EQ) {
+         printf("Failed to invert\n");
+         return 0;
+      }
+      printf("Inverting mod\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/(tt ? tt : 1), tt);
+      fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, tt);
+   }
+   fclose(log);
+
+   return 0;
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dep.pl	Sun Dec 19 15:57:19 2004 +0000
@@ -0,0 +1,121 @@
+#!/usr/bin/perl 
+#
+# Walk through source, add labels and make classes
+#
+#use strict;
+
+my %deplist;
+
+# open class file and write preamble
+open(CLASS, ">tommath_class.h") or die "Couldn't open tommath_class.h for writing\n";
+print CLASS "#if !(defined(LTM1) && defined(LTM2) && defined(LTM3))\n#if defined(LTM2)\n#define LTM3\n#endif\n#if defined(LTM1)\n#define LTM2\n#endif\n#define LTM1\n\n#if defined(LTM_ALL)\n";
+
+foreach my $filename (glob "bn*.c") {
+   my $define = $filename;
+
+   # convert filename to upper case so we can use it as a define
+   $define =~ tr/a-z/A-Z/;
+   $define =~ tr/\./_/;
+   print CLASS "#define $define\n";
+
+   # now copy text and apply #ifdef as required
+   my $apply = 0;
+   open(SRC, "<$filename") or die "Couldn't open $filename for reading\n";
+   open(OUT, ">tmp") or die "Couldn't open tmp for writing\n";
+
+   # a first line that already mentions "include" is copied as is, otherwise the include/#ifdef pair is put in front of it
+   my $line = <SRC>;
+   if ($line =~ /include/) {
+      print OUT $line;
+   } else {
+      print OUT "#include <tommath.h>\n#ifdef $define\n$line";
+      $apply = 1;
+   }
+   while (<SRC>) {
+      if (!($_ =~ /tommath\.h/)) {
+         print OUT $_;
+      }
+   }
+   if ($apply == 1) {
+      print OUT "#endif\n";
+   }
+   close SRC;
+   close OUT;
+
+   unlink($filename);
+   rename("tmp", $filename) or die "Couldn't replace $filename with tmp\n";
+}
+print CLASS "#endif\n\n";
+
+# now do classes 
+
+foreach my $filename (glob "bn*.c") {
+   open(SRC, "<$filename") or die "Can't open source file!\n"; 
+
+   # convert filename to upper case so we can use it as a define 
+   $filename =~ tr/a-z/A-Z/;
+   $filename =~ tr/\./_/;
+
+   print CLASS "#if defined($filename)\n";
+   my $list = $filename;
+
+   # scan for mp_* and make classes
+   while (<SRC>) {
+      my $line = $_;
+      while ($line =~ m/(fast_)*(s_)*mp\_[a-z_0-9]*/) {
+          $line = $';
+          # now $& is the match, we want to skip over LTM keywords like
+          # mp_int, mp_word, mp_digit
+          if (!($& eq "mp_digit") && !($& eq "mp_word") && !($& eq "mp_int")) {
+             my $sym = $&;
+             $sym =~ tr/a-z/A-Z/;
+             $sym = "BN_" . $sym . "_C";
+             if (!($list =~ /(?:\A|,)\Q$sym\E(?:,|\z)/)) {   # whole entry only: BN_MP_INIT_C is not BN_MP_INIT_COPY_C
+                print CLASS "   #define $sym\n";
+             }
+             $list = $list . "," . $sym;
+          }
+      }
+   }
+   $deplist{$filename} = $list;
+
+   print CLASS "#endif\n\n";
+   close SRC;
+}
+
+print CLASS "#ifdef LTM3\n#define LTM_LAST\n#endif\n#include <tommath_superclass.h>\n#include <tommath_class.h>\n#else\n#define LTM_LAST\n#endif\n";
+close CLASS;
+
+#now let's make a cool call graph... 
+
+open(OUT,">callgraph.txt") or die "Couldn't open callgraph.txt for writing\n";
+$indent = 0;
+foreach (keys %deplist) {
+   $list = "";
+   draw_func($deplist{$_});
+   print OUT "\n\n";
+}
+close(OUT);
+
+sub draw_func
+{
+   # $_[0]: comma separated list -- a function followed by everything it uses
+   my @funcs = split(",", defined($_[0]) ? $_[0] : "");
+   my $name  = shift @funcs;
+   # nothing recorded for this name, or it was already drawn on this branch
+   return if (!defined($name) || $list =~ /(?:\A|,)\Q$name\E(?:,|\z)/);
+   $list = $list . "," . $name;
+
+   # one "|   " per enclosing level, then the arrow that introduces this entry
+   if ($indent >= 1) { print OUT "|   " x ($indent - 1) . "+--->"; }
+   print OUT $name . "\n";
+   my $temp = $list;
+   foreach my $i (@funcs) {
+      ++$indent;
+      draw_func($deplist{$i});
+      --$indent;
+   }
+   $list = $temp;   # forget the names that were added while drawing the children
+}
+
+
--- a/etc/makefile	Fri Dec 17 06:27:22 2004 +0000
+++ b/etc/makefile	Sun Dec 19 15:57:19 2004 +0000
@@ -46,4 +46,5 @@
 
         
 clean:
-	rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont 2kprime pprime.dat
+	rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont 2kprime pprime.dat \
+         *.da *.dyn *.dpi *~
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/etc/makefile.icc	Sun Dec 19 15:57:19 2004 +0000
@@ -0,0 +1,67 @@
+CC = icc
+
+CFLAGS += -I../
+
+# optimize for SPEED
+#
+# -mcpu= can be pentium, pentiumpro (covers PII through PIII) or pentium4
+# -ax?   specifies make code specifically for ? but compatible with IA-32
+# -x?    specifies compile solely for ? [not specifically IA-32 compatible]
+#
+# where ? is 
+#   K - PIII
+#   W - first P4 [Willamette]
+#   N - P4 Northwood
+#   P - P4 Prescott
+#   B - Blend of P4 and PM [mobile]
+#
+# Default to just generic max opts
+CFLAGS += -O3 -xN -ip
+
+# default lib name (requires install with root)
+# LIBNAME=-ltommath
+
+# libname when you can't install the lib with install
+LIBNAME=../libtommath.a
+
+#provable primes
+pprime: pprime.o
+	$(CC) pprime.o $(LIBNAME) -o pprime
+
+# portable [well requires clock()] tuning app
+tune: tune.o
+	$(CC) tune.o $(LIBNAME) -o tune
+	
+# same app but using RDTSC for higher precision [requires 80586+], coff based gcc installs [e.g. ming, cygwin, djgpp]
+tune86: tune.c
+	nasm -f coff timer.asm
+	$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o  $(LIBNAME) -o tune86
+	
+# for cygwin
+tune86c: tune.c
+	nasm -f gnuwin32 timer.asm
+	$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o  $(LIBNAME) -o tune86
+
+#make tune86 for linux or any ELF format
+tune86l: tune.c
+	nasm -f elf -DUSE_ELF timer.asm
+	$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o $(LIBNAME) -o tune86l
+        
+# spits out mersenne primes
+mersenne: mersenne.o
+	$(CC) mersenne.o $(LIBNAME) -o mersenne
+
+# finds DR safe primes for the given config
+drprime: drprime.o
+	$(CC) drprime.o $(LIBNAME) -o drprime
+	
+# finds 2k safe primes for the given config
+2kprime: 2kprime.o
+	$(CC) 2kprime.o $(LIBNAME) -o 2kprime
+
+mont: mont.o
+	$(CC) mont.o $(LIBNAME) -o mont
+
+        
+clean:
+	rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont 2kprime pprime.dat *.il
--- a/etc/tune.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/etc/tune.c	Sun Dec 19 15:57:19 2004 +0000
@@ -8,7 +8,7 @@
 /* how many times todo each size mult.  Depends on your computer.  For slow computers
  * this can be low like 5 or 10.  For fast [re: Athlon] should be 25 - 50 or so 
  */
-#define TIMES 50
+#define TIMES (1UL<<14UL)
 
 
 #ifndef X86_TIMER
@@ -23,154 +23,85 @@
 extern ulong64 t_read(void);
 #endif
 
-ulong64
-time_mult (int max)
+ulong64 time_mult(int size, int s)
 {
-  int     x, y;
+  unsigned long     x;
   mp_int  a, b, c;
+  ulong64 t1;
 
   mp_init (&a);
   mp_init (&b);
   mp_init (&c);
 
+  mp_rand (&a, size);
+  mp_rand (&b, size);
+
+  if (s == 1) { 
+      KARATSUBA_MUL_CUTOFF = size;
+  } else {
+      KARATSUBA_MUL_CUTOFF = 100000;
+  }
+
   t_start();
-  for (x = 32; x <= max; x += 4) {
-    mp_rand (&a, x);
-    mp_rand (&b, x);
-    for (y = 0; y < TIMES; y++) {
-      mp_mul (&a, &b, &c);
-    }
+  for (x = 0; x < TIMES; x++) {
+      mp_mul(&a,&b,&c);
   }
+  t1 = t_read();
   mp_clear (&a);
   mp_clear (&b);
   mp_clear (&c);
-  return t_read();
+  return t1;
 }
 
-ulong64
-time_sqr (int max)
+ulong64 time_sqr(int size, int s)
 {
-  int     x, y;
+  unsigned long     x;
   mp_int  a, b;
+  ulong64 t1;
 
   mp_init (&a);
   mp_init (&b);
 
+  mp_rand (&a, size);
+
+  if (s == 1) { 
+      KARATSUBA_SQR_CUTOFF = size;
+  } else {
+      KARATSUBA_SQR_CUTOFF = 100000;
+  }
+
   t_start();
-  for (x = 32; x <= max; x += 4) {
-    mp_rand (&a, x);
-    for (y = 0; y < TIMES; y++) {
-      mp_sqr (&a, &b);
-    }
+  for (x = 0; x < TIMES; x++) {
+      mp_sqr(&a,&b);
   }
+  t1 = t_read();
   mp_clear (&a);
   mp_clear (&b);
-  return t_read();
+  return t1;
 }
 
 int
 main (void)
 {
-  int     best_kmult, best_tmult, best_ksquare, best_tsquare, counter;
-  ulong64 best, ti;
-  FILE   *log;
+  ulong64 t1, t2;
+  int x, y;
 
-  best_kmult = best_ksquare = best_tmult = best_tsquare = 0;
-  /* tune multiplication first */
-  
-  /* effectively turn TOOM off */
-  TOOM_SQR_CUTOFF = TOOM_MUL_CUTOFF = 100000;
-    
-  log = fopen ("mult.log", "w");
-  best = -1;
-  counter = 16;
-  for (KARATSUBA_MUL_CUTOFF = 8; KARATSUBA_MUL_CUTOFF <= 200; KARATSUBA_MUL_CUTOFF++) {
-    ti = time_mult (300);
-    printf ("%4d : %9llu            \r", KARATSUBA_MUL_CUTOFF, ti);
-    fprintf (log, "%d, %llu\n", KARATSUBA_MUL_CUTOFF, ti);
-    fflush (stdout);
-    if (ti < best) {
-      printf ("New best: %llu, %d         \r", ti, KARATSUBA_MUL_CUTOFF);
-      best = ti;
-      best_kmult = KARATSUBA_MUL_CUTOFF;
-      counter = 16;
-    } else if (--counter == 0) {
-       printf("No better found in 16 trials.\n");
-       break;
-    }
-  }
-  fclose (log);
-  printf("Karatsuba Multiplier Cutoff (KARATSUBA_MUL_CUTOFF) == %d\n", best_kmult);
-  
-  /* tune squaring */
-  log = fopen ("sqr.log", "w");
-  best = -1;
-  counter = 16;
-  for (KARATSUBA_SQR_CUTOFF = 8; KARATSUBA_SQR_CUTOFF <= 200; KARATSUBA_SQR_CUTOFF++) {
-    ti = time_sqr (300);
-    printf ("%4d : %9llu             \r", KARATSUBA_SQR_CUTOFF, ti);
-    fprintf (log, "%d, %llu\n", KARATSUBA_SQR_CUTOFF, ti);
-    fflush (stdout);
-    if (ti < best) {
-      printf ("New best: %llu, %d         \r", ti, KARATSUBA_SQR_CUTOFF);
-      best = ti;
-      best_ksquare = KARATSUBA_SQR_CUTOFF;
-      counter = 16;
-    } else if (--counter == 0) {
-       printf("No better found in 16 trials.\n");
-       break;
-    }
+  for (x = 8; ; x += 2) { 
+     t1 = time_mult(x, 0);
+     t2 = time_mult(x, 1);
+     printf("%d: %9llu %9llu, %9llu\n", x, t1, t2, t2 - t1);
+     if (t2 < t1) break;
   }
-  fclose (log);
-  printf("Karatsuba Squaring Cutoff (KARATSUBA_SQR_CUTOFF) == %d\n", best_ksquare);
-  
-  KARATSUBA_MUL_CUTOFF = best_kmult;
-  KARATSUBA_SQR_CUTOFF = best_ksquare;
-    
-  /* tune TOOM mult */
-  counter = 16;
-  log = fopen ("tmult.log", "w");
-  best = -1;
-  for (TOOM_MUL_CUTOFF = best_kmult*5; TOOM_MUL_CUTOFF <= 800; TOOM_MUL_CUTOFF++) {
-    ti = time_mult (1200);
-    printf ("%4d : %9llu          \r", TOOM_MUL_CUTOFF, ti);
-    fprintf (log, "%d, %llu\n", TOOM_MUL_CUTOFF, ti);
-    fflush (stdout);
-    if (ti < best) {
-      printf ("New best: %llu, %d         \r", ti, TOOM_MUL_CUTOFF);
-      best = ti;
-      best_tmult = TOOM_MUL_CUTOFF;
-      counter = 16;
-    } else if (--counter == 0) {
-       printf("No better found in 16 trials.\n");
-       break;
-    }
+  y = x;
+
+  for (x = 8; ; x += 2) { 
+     t1 = time_sqr(x, 0);
+     t2 = time_sqr(x, 1);
+     printf("%d: %9llu %9llu, %9llu\n", x, t1, t2, t2 - t1);
+     if (t2 < t1) break;
   }
-  fclose (log);   
-  printf("Toom-Cook Multiplier Cutoff (TOOM_MUL_CUTOFF) == %d\n", best_tmult);
-  
-  /* tune TOOM sqr */
-  log = fopen ("tsqr.log", "w");
-  best = -1;
-  counter = 16;
-  for (TOOM_SQR_CUTOFF = best_ksquare*3; TOOM_SQR_CUTOFF <= 800; TOOM_SQR_CUTOFF++) {
-    ti = time_sqr (1200);
-    printf ("%4d : %9llu           \r", TOOM_SQR_CUTOFF, ti);
-    fprintf (log, "%d, %llu\n", TOOM_SQR_CUTOFF, ti);
-    fflush (stdout);
-    if (ti < best) {
-      printf ("New best: %llu, %d         \r", ti, TOOM_SQR_CUTOFF);
-      best = ti;
-      best_tsquare = TOOM_SQR_CUTOFF;
-      counter = 16;
-    } else if (--counter == 0) {
-       printf("No better found in 16 trials.\n");
-       break;
-    }
-  }
-  fclose (log);   
-  printf("Toom-Cook Squaring Cutoff (TOOM_SQR_CUTOFF) == %d\n", best_tsquare);
-
+  printf("KARATSUBA_MUL_CUTOFF = %d\n", y);
+  printf("KARATSUBA_SQR_CUTOFF = %d\n", x);
 
   return 0;
 }
--- a/logs/add.log	Fri Dec 17 06:27:22 2004 +0000
+++ b/logs/add.log	Sun Dec 19 15:57:19 2004 +0000
@@ -1,16 +1,16 @@
-224  20297071
-448  15151383
-672  13088682
-896  11111587
-1120   9240621
-1344   8221878
-1568   7227434
-1792   6718051
-2016   6042524
-2240   5685200
-2464   5240465
-2688   4818032
-2912   4412794
-3136   4155883
-3360   3927078
-3584   3722138
+224       222
+448       330
+672       436
+896       520
+1120       612
+1344       696
+1568       810
+1792       912
+2016      1006
+2240      1116
+2464      1152
+2688      1284
+2912      1348
+3136      1486
+3360      1580
+3584      1636
Binary file logs/addsub.png has changed
--- a/logs/expt.log	Fri Dec 17 06:27:22 2004 +0000
+++ b/logs/expt.log	Sun Dec 19 15:57:19 2004 +0000
@@ -1,7 +0,0 @@
-513       745
-769       282
-1025       130
-2049        20
-2561        11
-3073         6
-4097         2
Binary file logs/expt.png has changed
--- a/logs/expt_2k.log	Fri Dec 17 06:27:22 2004 +0000
+++ b/logs/expt_2k.log	Sun Dec 19 15:57:19 2004 +0000
@@ -1,6 +0,0 @@
-521       783
-607       585
-1279       138
-2203        39
-3217        15
-4253         6
--- a/logs/expt_dr.log	Fri Dec 17 06:27:22 2004 +0000
+++ b/logs/expt_dr.log	Sun Dec 19 15:57:19 2004 +0000
@@ -1,7 +0,0 @@
-532      1296
-784       551
-1036       283
-1540       109
-2072        52
-3080        18
-4116         7
--- a/logs/graphs.dem	Fri Dec 17 06:27:22 2004 +0000
+++ b/logs/graphs.dem	Sun Dec 19 15:57:19 2004 +0000
@@ -1,17 +1,17 @@
-set terminal png
-set size 1.75
-set ylabel "Operations per Second"
-set xlabel "Operand size (bits)"
-
-set output "addsub.png"
-plot 'add.log' smooth bezier title "Addition", 'sub.log' smooth bezier title "Subtraction"
-
-set output "mult.png"
-plot 'sqr.log' smooth bezier title "Squaring (without Karatsuba)", 'sqr_kara.log' smooth bezier title "Squaring (Karatsuba)", 'mult.log' smooth bezier title "Multiplication (without Karatsuba)", 'mult_kara.log' smooth bezier title "Multiplication (Karatsuba)"
-
-set output "expt.png"
-plot 'expt.log' smooth bezier title "Exptmod (Montgomery)", 'expt_dr.log' smooth bezier title "Exptmod (Dimminished Radix)", 'expt_2k.log' smooth bezier title "Exptmod (2k Reduction)"
-
-set output "invmod.png"
-plot 'invmod.log' smooth bezier title "Modular Inverse"
-
+set terminal png
+set size 1.75
+set ylabel "Cycles per Operation"
+set xlabel "Operand size (bits)"
+
+set output "addsub.png"
+plot 'add.log' smooth bezier title "Addition", 'sub.log' smooth bezier title "Subtraction"
+
+set output "mult.png"
+plot 'sqr.log' smooth bezier title "Squaring (without Karatsuba)", 'sqr_kara.log' smooth bezier title "Squaring (Karatsuba)", 'mult.log' smooth bezier title "Multiplication (without Karatsuba)", 'mult_kara.log' smooth bezier title "Multiplication (Karatsuba)"
+
+set output "expt.png"
+plot 'expt.log' smooth bezier title "Exptmod (Montgomery)", 'expt_dr.log' smooth bezier title "Exptmod (Dimminished Radix)", 'expt_2k.log' smooth bezier title "Exptmod (2k Reduction)"
+
+set output "invmod.png"
+plot 'invmod.log' smooth bezier title "Modular Inverse"
+
--- a/logs/invmod.log	Fri Dec 17 06:27:22 2004 +0000
+++ b/logs/invmod.log	Sun Dec 19 15:57:19 2004 +0000
@@ -1,32 +0,0 @@
-112     17364
-224      8643
-336      8867
-448      6228
-560      4737
-672      2259
-784      2899
-896      1497
-1008      1238
-1120      1010
-1232       870
-1344      1265
-1456      1102
-1568       981
-1680       539
-1792       484
-1904       722
-2016       392
-2128       604
-2240       551
-2352       511
-2464       469
-2576       263
-2688       247
-2800       227
-2912       354
-3024       336
-3136       312
-3248       296
-3360       166
-3472       155
-3584       248
Binary file logs/invmod.png has changed
--- a/logs/mult.log	Fri Dec 17 06:27:22 2004 +0000
+++ b/logs/mult.log	Sun Dec 19 15:57:19 2004 +0000
@@ -1,33 +1,143 @@
-920    374785
-1142    242737
-1371    176704
-1596    134341
-1816    105537
-2044     85089
-2268     70051
-2490     58671
-2716     49851
-2937     42881
-3162     37288
-3387     32697
-3608     28915
-3836     25759
-4057     23088
-4284     20800
-4508     18827
-4730     17164
-4956     15689
-5180     14397
-5398     13260
-5628     12249
-5852     11346
-6071     10537
-6298      9812
-6522      9161
-6742      8572
-6971      8038
-7195      2915
-7419      2744
-7644      2587
-7866      2444
-8090      2311
+140      1272
+195      1428
+252      1996
+307      2586
+364      3464
+420      4420
+476      5260
+532      6430
+588      7692
+644      8704
+699     10226
+755     11670
+812     13190
+865     14834
+924     16738
+979     18362
+1036     20660
+1092     22776
+1148     24848
+1204     27168
+1260     29930
+1316     32258
+1370     35172
+1422     37534
+1482     40390
+1537     43990
+1589     46946
+1652     50438
+1703     52902
+1764     56646
+1820     59892
+1876     63248
+1932     66872
+1988     72596
+2042     74662
+2100     78512
+2156     82944
+2211     87444
+2268     92170
+2324     95534
+2380    100484
+2435    105024
+2491    109460
+2546    114154
+2603    118946
+2660    124110
+2716    129300
+2771    134274
+2828    139594
+2883    145234
+2939    150332
+2996    155750
+3048    161718
+3108    167492
+3162    173882
+3219    179766
+3276    185560
+3330    191826
+3388    197822
+3442    204176
+3500    210682
+3556    217236
+3612    223484
+3666    230714
+3724    237744
+3779    244080
+3835    250970
+3890    257914
+3947    265162
+4001    272128
+4060    279108
+4116    287606
+4171    294716
+4227    302806
+4284    310260
+4340    318564
+4395    326164
+4443    334034
+4508    342108
+4561    351810
+4618    358828
+4675    367332
+4732    376140
+4787    384172
+4841    393308
+4899    402036
+4955    411286
+5010    420290
+5067    429688
+5124    438810
+5180    448130
+5235    457264
+5290    467390
+5348    476586
+5404    486120
+5459    496512
+5516    506624
+5569    516346
+5628    526604
+5684    536544
+5740    546936
+5796    557284
+5852    568106
+5907    578824
+5963    589204
+6019    600176
+6076    610564
+6127    621972
+6188    633564
+6244    644730
+6300    655288
+6354    667402
+6412    678824
+6467    690594
+6522    702718
+6580    714148
+6636    725608
+6690    737834
+6747    750100
+6804    762202
+6860    774184
+6916    787298
+6971    798734
+7028    811162
+7083    824570
+7139    837738
+7196   2579488
+7245   2626714
+7308   2643582
+7364   2698746
+7416   2734106
+7476   2773372
+7530   2816738
+7588   2859204
+7643   2938596
+7698   2919716
+7754   2988542
+7812   3026520
+7867   3058304
+7924   3115790
+7977   3161450
+8035   3203138
+8092   3244056
Binary file logs/mult.png has changed
--- a/logs/mult_kara.log	Fri Dec 17 06:27:22 2004 +0000
+++ b/logs/mult_kara.log	Sun Dec 19 15:57:19 2004 +0000
@@ -1,33 +1,33 @@
-924    374171
-1147    243163
-1371    177111
-1596    134465
-1819    105619
-2044     85145
-2266     70086
-2488     58717
-2715     49869
-2939     42894
-3164     37389
-3387     33510
-3610     29993
-3836     27205
-4060     24751
-4281     22576
-4508     20670
-4732     19019
-4954     17527
-5180     16217
-5404     15044
-5624     14003
-5849     13051
-6076     12067
-6300     11438
-6524     10772
-6748     10298
-6972      9715
-7195      9330
-7416      8836
-7644      8465
-7864      8042
-8091      7735
+924     16686
+1146     25334
+1371     35304
+1591     47122
+1820     61500
+2044     75254
+2266     91732
+2492    111656
+2716    129428
+2937    147508
+3164    167758
+3388    188248
+3612    210826
+3836    233814
+4059    256898
+4284    280210
+4508    310372
+4731    333902
+4955    376502
+5179    402854
+5404    432004
+5626    459010
+5849    491868
+6076    520550
+6300    547400
+6524    575968
+6747    608482
+6971    642850
+7196    673670
+7419    710680
+7644    743942
+7868    780394
+8092    817342
--- a/logs/sqr.log	Fri Dec 17 06:27:22 2004 +0000
+++ b/logs/sqr.log	Sun Dec 19 15:57:19 2004 +0000
@@ -1,33 +1,143 @@
-922    471095
-1147    337137
-1366    254327
-1596    199732
-1819    161225
-2044    132852
-2268    111493
-2490     94864
-2715     81745
-2940     71187
-3162     62575
-3387     55418
-3612     14540
-3836     12944
-4060     11627
-4281     10546
-4508      9502
-4730      8688
-4954      7937
-5180      7273
-5402      6701
-5627      6189
-5850      5733
-6076      5310
-6300      4933
-6522      4631
-6748      4313
-6971      4064
-7196      3801
-7420      3576
-7642      3388
-7868      3191
-8092      3020
+139       806
+195      1212
+252      1604
+307      2260
+364      2892
+420      3308
+476      4152
+532      4814
+588      5754
+644      6684
+700      7226
+756      8324
+808      9092
+866     10068
+924     11204
+976     12918
+1036     13656
+1092     15248
+1148     15956
+1204     17270
+1260     19894
+1316     20516
+1370     21864
+1428     25554
+1483     26138
+1540     27086
+1596     29246
+1652     32210
+1707     32704
+1764     35142
+1820     39050
+1876     39256
+1931     41574
+1985     45070
+2044     46352
+2099     48114
+2155     51332
+2212     53268
+2267     55890
+2324     59054
+2380     60206
+2434     63540
+2491     66084
+2547     68590
+2604     74332
+2660     74784
+2715     77974
+2772     79924
+2826     82914
+2884     87210
+2929     89076
+2996     92480
+3052     96814
+3108     99990
+3162    102550
+3219    105396
+3276    109284
+3332    113752
+3387    116628
+3444    120782
+3500    122938
+3556    127940
+3612    303656
+3667    312212
+3724    324376
+3779    329204
+3833    340910
+3892    353850
+3943    362348
+4003    367780
+4056    380448
+4114    393616
+4172    404104
+4227    415148
+4284    409770
+4339    436648
+4394    442970
+4451    463096
+4507    472056
+4564    485780
+4616    496286
+4675    507612
+4732    519524
+4788    536768
+4843    542754
+4899    553090
+4956    571986
+5012    586340
+5068    599606
+5124    613670
+5179    624256
+5235    636266
+5292    655518
+5348    668142
+5403    677266
+5460    696040
+5516    712772
+5570    723942
+5628    739052
+5684    755350
+5739    769962
+5790    775258
+5851    790128
+5908    814536
+5962    827278
+6018    844510
+6076    851606
+6130    865748
+6188    894752
+6244    900474
+6300    928174
+6356    928440
+6410    957758
+6468    981134
+6524    994088
+6580   1011124
+6636   1027178
+6692   1045466
+6747   1056910
+6804   1083784
+6860   1104706
+6915   1116450
+6972   1137894
+7028   1154670
+7084   1158064
+7138   1188734
+7196   1214218
+7249   1226822
+7307   1247528
+7363   1255338
+7420   1291104
+7475   1297940
+7532   1324994
+7587   1340274
+7644   1342596
+7698   1381418
+7756   1382904
+7812   1432588
+7867   1443632
+7922   1465092
+7979   1496804
+8036   1520142
+8092   1539566
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/logs/sqr.old	Sun Dec 19 15:57:19 2004 +0000
@@ -0,0 +1,17 @@
+896    382617
+1344    207161
+1792    131522
+2240     90775
+2688     66652
+3136     50955
+3584     11678
+4032      9342
+4480      7684
+4928      6382
+5376      5399
+5824      4545
+6272      3994
+6720      3490
+7168      3075
+7616      2733
+8064      2428
--- a/logs/sqr_kara.log	Fri Dec 17 06:27:22 2004 +0000
+++ b/logs/sqr_kara.log	Sun Dec 19 15:57:19 2004 +0000
@@ -1,33 +1,33 @@
-922    470930
-1148    337217
-1372    254433
-1596    199827
-1820    161204
-2043    132871
-2267    111522
-2488     94932
-2714     81814
-2939     71231
-3164     62616
-3385     55467
-3611     44426
-3836     40695
-4060     37391
-4283     34371
-4508     31779
-4732     29499
-4956     27426
-5177     25598
-5403     23944
-5628     22416
-5851     21052
-6076     19781
-6299     18588
-6523     17539
-6746     16618
-6972     15705
-7196     13582
-7420     13004
-7643     12496
-7868     11963
-8092     11497
+922     11272
+1148     16004
+1370     21958
+1596     28684
+1817     37832
+2044     46386
+2262     56218
+2492     66388
+2716     77478
+2940     89380
+3163    103680
+3385    116274
+3612    135334
+3836    151332
+4057    164938
+4284    183178
+4508    198864
+4731    215222
+4954    231986
+5180    251660
+5404    269414
+5626    288454
+5850    307806
+6076    329458
+6299    347726
+6523    369864
+6748    387832
+6971    413010
+7194    453310
+7415    476936
+7643    497118
+7867    521394
+8091    540224
--- a/logs/sub.log	Fri Dec 17 06:27:22 2004 +0000
+++ b/logs/sub.log	Sun Dec 19 15:57:19 2004 +0000
@@ -1,16 +1,16 @@
-224  16370431
-448  13327848
-672  11009401
-896   9125342
-1120   7930419
-1344   7114040
-1568   6506998
-1792   5899346
-2016   5435327
-2240   5038931
-2464   4696364
-2688   4425678
-2912   4134476
-3136   3913280
-3360   3692536
-3584   3505219
+224       216
+448       324
+672       428
+896       532
+1120       648
+1344       766
+1568       862
+1792       928
+2016      1070
+2240      1128
+2464      1250
+2688      1344
+2912      1436
+3136      1542
+3360      1628
+3584      1696
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/makefile	Sun Dec 19 15:57:19 2004 +0000
@@ -0,0 +1,152 @@
+#Makefile for GCC
+#
+#Tom St Denis
+CFLAGS  +=  -I./ -Wall -W -Wshadow -Wsign-compare
+
+#for speed 
+CFLAGS += -O3 -funroll-loops
+
+#for size 
+#CFLAGS += -Os
+
+#x86 optimizations [should be valid for any GCC install though]
+CFLAGS  += -fomit-frame-pointer
+
+#debug
+#CFLAGS += -g3
+
+VERSION=0.32
+
+default: libtommath.a
+
+#default files to install
+LIBNAME=libtommath.a
+HEADERS=tommath.h
+
+#LIBPATH-The directory for libtommath to be installed to.
+#INCPATH-The directory to install the header files for libtommath.
+#DATAPATH-The directory to install the pdf docs.
+DESTDIR=
+LIBPATH=/usr/lib
+INCPATH=/usr/include
+DATAPATH=/usr/share/doc/libtommath/pdf
+
+OBJECTS=bncore.o bn_mp_init.o bn_mp_clear.o bn_mp_exch.o bn_mp_grow.o bn_mp_shrink.o \
+bn_mp_clamp.o bn_mp_zero.o  bn_mp_set.o bn_mp_set_int.o bn_mp_init_size.o bn_mp_copy.o \
+bn_mp_init_copy.o bn_mp_abs.o bn_mp_neg.o bn_mp_cmp_mag.o bn_mp_cmp.o bn_mp_cmp_d.o \
+bn_mp_rshd.o bn_mp_lshd.o bn_mp_mod_2d.o bn_mp_div_2d.o bn_mp_mul_2d.o bn_mp_div_2.o \
+bn_mp_mul_2.o bn_s_mp_add.o bn_s_mp_sub.o bn_fast_s_mp_mul_digs.o bn_s_mp_mul_digs.o \
+bn_fast_s_mp_mul_high_digs.o bn_s_mp_mul_high_digs.o bn_fast_s_mp_sqr.o bn_s_mp_sqr.o \
+bn_mp_add.o bn_mp_sub.o bn_mp_karatsuba_mul.o bn_mp_mul.o bn_mp_karatsuba_sqr.o \
+bn_mp_sqr.o bn_mp_div.o bn_mp_mod.o bn_mp_add_d.o bn_mp_sub_d.o bn_mp_mul_d.o \
+bn_mp_div_d.o bn_mp_mod_d.o bn_mp_expt_d.o bn_mp_addmod.o bn_mp_submod.o \
+bn_mp_mulmod.o bn_mp_sqrmod.o bn_mp_gcd.o bn_mp_lcm.o bn_fast_mp_invmod.o bn_mp_invmod.o \
+bn_mp_reduce.o bn_mp_montgomery_setup.o bn_fast_mp_montgomery_reduce.o bn_mp_montgomery_reduce.o \
+bn_mp_exptmod_fast.o bn_mp_exptmod.o bn_mp_2expt.o bn_mp_n_root.o bn_mp_jacobi.o bn_reverse.o \
+bn_mp_count_bits.o bn_mp_read_unsigned_bin.o bn_mp_read_signed_bin.o bn_mp_to_unsigned_bin.o \
+bn_mp_to_signed_bin.o bn_mp_unsigned_bin_size.o bn_mp_signed_bin_size.o  \
+bn_mp_xor.o bn_mp_and.o bn_mp_or.o bn_mp_rand.o bn_mp_montgomery_calc_normalization.o \
+bn_mp_prime_is_divisible.o bn_prime_tab.o bn_mp_prime_fermat.o bn_mp_prime_miller_rabin.o \
+bn_mp_prime_is_prime.o bn_mp_prime_next_prime.o bn_mp_dr_reduce.o \
+bn_mp_dr_is_modulus.o bn_mp_dr_setup.o bn_mp_reduce_setup.o \
+bn_mp_toom_mul.o bn_mp_toom_sqr.o bn_mp_div_3.o bn_s_mp_exptmod.o \
+bn_mp_reduce_2k.o bn_mp_reduce_is_2k.o bn_mp_reduce_2k_setup.o \
+bn_mp_radix_smap.o bn_mp_read_radix.o bn_mp_toradix.o bn_mp_radix_size.o \
+bn_mp_fread.o bn_mp_fwrite.o bn_mp_cnt_lsb.o bn_error.o \
+bn_mp_init_multi.o bn_mp_clear_multi.o bn_mp_exteuclid.o bn_mp_toradix_n.o \
+bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o bn_mp_init_set.o \
+bn_mp_init_set_int.o bn_mp_invmod_slow.o bn_mp_prime_rabin_miller_trials.o
+
+libtommath.a:  $(OBJECTS)
+	$(AR) $(ARFLAGS) libtommath.a $(OBJECTS)
+	ranlib libtommath.a
+
+
+#make a profiled library (takes a while!!!)
+#
+# Builds the "timing" target with profile generation enabled, runs the
+# resulting ./ltmtest to collect profile data, then rebuilds the library
+# using that data.  $(MAKE) is used so options like -j/-n reach the sub-make.
+# So far I've seen improvements in the MP math
+profiled:
+	$(MAKE) CFLAGS="$(CFLAGS) -fprofile-arcs -DTESTING" timing
+	./ltmtest
+	rm -f *.a *.o ltmtest
+	$(MAKE) CFLAGS="$(CFLAGS) -fbranch-probabilities"
+
+#make a single object profiled library 
+profiled_single:
+	perl gen.pl
+	$(CC) $(CFLAGS) -fprofile-arcs -DTESTING -c mpi.c -o mpi.o
+	$(CC) $(CFLAGS) -DTESTING -DTIMER demo/timing.c mpi.o -o ltmtest
+	./ltmtest
+	rm -f *.o ltmtest
+	$(CC) $(CFLAGS) -fbranch-probabilities -DTESTING -c mpi.c -o mpi.o
+	$(AR) $(ARFLAGS) libtommath.a mpi.o
+	ranlib libtommath.a	
+
+install: libtommath.a
+	install -d -g root -o root $(DESTDIR)$(LIBPATH)
+	install -d -g root -o root $(DESTDIR)$(INCPATH)
+	install -g root -o root $(LIBNAME) $(DESTDIR)$(LIBPATH)
+	install -g root -o root $(HEADERS) $(DESTDIR)$(INCPATH)
+
+test: libtommath.a demo/demo.o
+	$(CC) demo/demo.o libtommath.a -o test
+	
+mtest: test	
+	cd mtest ; $(CC) $(CFLAGS) mtest.c -o mtest -s
+        
+timing: libtommath.a
+	$(CC) $(CFLAGS) -DTIMER demo/timing.c libtommath.a -o ltmtest -s
+
+# makes the LTM book DVI file, requires tetex, perl and makeindex [part of tetex I think]; pics/ is built first, and only if the cd succeeds
+docdvi: tommath.src
+	cd pics && $(MAKE)
+	echo "hello" > tommath.ind
+	perl booker.pl
+	latex tommath > /dev/null
+	latex tommath > /dev/null
+	makeindex tommath
+	latex tommath > /dev/null
+
+# poster, makes the single page PDF poster
+poster: poster.tex
+	pdflatex poster
+	rm -f poster.aux poster.log 
+
+# makes the LTM book PDF file, requires tetex, cleans up the LaTeX temp files and pics/ (&& keeps a failed cd from running "clean" in this directory)
+docs:   docdvi
+	dvipdf tommath
+	rm -f tommath.log tommath.aux tommath.dvi tommath.idx tommath.toc tommath.lof tommath.ind tommath.ilg
+	cd pics && $(MAKE) clean
+	
+#LTM user manual
+mandvi: bn.tex
+	echo "hello" > bn.ind
+	latex bn > /dev/null
+	latex bn > /dev/null
+	makeindex bn
+	latex bn > /dev/null
+
+#LTM user manual [pdf]
+manual:	mandvi
+	pdflatex bn >/dev/null
+	rm -f bn.aux bn.dvi bn.log bn.idx bn.lof bn.out bn.toc
+
+pretty: 
+	perl pretty.build
+
+# removes build products, LaTeX temporaries and files whose path contains '~', then cleans etc/ and pics/ (&& stops a failed cd from re-running this rule here)
+clean:
+	rm -f *.bat *.pdf *.o *.a *.obj *.lib *.exe *.dll etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest mtest/mtest.exe *.idx *.toc *.log *.aux *.dvi *.lof *.ind *.ilg *.ps *.s mpi.c *.da *.dyn *.dpi tommath.tex `find . -type f | grep '[~]' | xargs` *.lo *.la
+	rm -rf .libs
+	cd etc && $(MAKE) clean
+	cd pics && $(MAKE) clean
+
+zipup: clean manual poster docs
+	perl gen.pl ; mv mpi.c pre_gen/ ; \
+	cd .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \
+	cp -R ./libtommath/* ./libtommath-$(VERSION)/ ; \
+	tar -c libtommath-$(VERSION)/* | bzip2 -9vvc > ltm-$(VERSION).tar.bz2 ; \
+	zip -9 -r ltm-$(VERSION).zip libtommath-$(VERSION)/*
--- a/makefile.bcc	Fri Dec 17 06:27:22 2004 +0000
+++ b/makefile.bcc	Sun Dec 19 15:57:19 2004 +0000
@@ -29,8 +29,9 @@
 bn_mp_reduce_2k.obj bn_mp_reduce_is_2k.obj bn_mp_reduce_2k_setup.obj \
 bn_mp_radix_smap.obj bn_mp_read_radix.obj bn_mp_toradix.obj bn_mp_radix_size.obj \
 bn_mp_fread.obj bn_mp_fwrite.obj bn_mp_cnt_lsb.obj bn_error.obj \
-bn_mp_init_multi.obj bn_mp_clear_multi.obj bn_prime_sizes_tab.obj bn_mp_exteuclid.obj bn_mp_toradix_n.obj \
-bn_mp_prime_random_ex.obj bn_mp_get_int.obj bn_mp_sqrt.obj bn_mp_is_square.obj
+bn_mp_init_multi.obj bn_mp_clear_multi.obj bn_mp_exteuclid.obj bn_mp_toradix_n.obj \
+bn_mp_prime_random_ex.obj bn_mp_get_int.obj bn_mp_sqrt.obj bn_mp_is_square.obj \
+bn_mp_init_set.obj bn_mp_init_set_int.obj bn_mp_invmod_slow.obj bn_mp_prime_rabin_miller_trials.obj
 
 TARGET = libtommath.lib
 
--- a/makefile.cygwin_dll	Fri Dec 17 06:27:22 2004 +0000
+++ b/makefile.cygwin_dll	Sun Dec 19 15:57:19 2004 +0000
@@ -34,8 +34,9 @@
 bn_mp_reduce_2k.o bn_mp_reduce_is_2k.o bn_mp_reduce_2k_setup.o \
 bn_mp_radix_smap.o bn_mp_read_radix.o bn_mp_toradix.o bn_mp_radix_size.o \
 bn_mp_fread.o bn_mp_fwrite.o bn_mp_cnt_lsb.o bn_error.o \
-bn_mp_init_multi.o bn_mp_clear_multi.o bn_prime_sizes_tab.o bn_mp_exteuclid.o bn_mp_toradix_n.o \
-bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o
+bn_mp_init_multi.o bn_mp_clear_multi.o bn_mp_exteuclid.o bn_mp_toradix_n.o \
+bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o bn_mp_init_set.o \
+bn_mp_init_set_int.o bn_mp_invmod_slow.o bn_mp_prime_rabin_miller_trials.o
 
 # make a Windows DLL via Cygwin
 windll:  $(OBJECTS)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/makefile.icc	Sun Dec 19 15:57:19 2004 +0000
@@ -0,0 +1,110 @@
+#Makefile for ICC
+#
+#Tom St Denis
+CC=icc
+
+CFLAGS  +=  -I./
+
+# optimize for SPEED
+#
+# -mcpu= can be pentium, pentiumpro (covers PII through PIII) or pentium4
+# -ax?   specifies make code specifically for ? but compatible with IA-32
+# -x?    specifies compile solely for ? [not specifically IA-32 compatible]
+#
+# where ? is 
+#   K - PIII
+#   W - first P4 [Willamette]
+#   N - P4 Northwood
+#   P - P4 Prescott
+#   B - Blend of P4 and PM [mobile]
+#
+# Default to max opts targeting the P4 Northwood (-xN); change the letter for other CPUs
+CFLAGS += -O3 -xN
+
+default: libtommath.a
+
+#default files to install
+LIBNAME=libtommath.a
+HEADERS=tommath.h
+
+#LIBPATH-The directory for libtommath to be installed to.
+#INCPATH-The directory to install the header files for libtommath.
+#DATAPATH-The directory to install the pdf docs.
+DESTDIR=
+LIBPATH=/usr/lib
+INCPATH=/usr/include
+DATAPATH=/usr/share/doc/libtommath/pdf
+
+OBJECTS=bncore.o bn_mp_init.o bn_mp_clear.o bn_mp_exch.o bn_mp_grow.o bn_mp_shrink.o \
+bn_mp_clamp.o bn_mp_zero.o  bn_mp_set.o bn_mp_set_int.o bn_mp_init_size.o bn_mp_copy.o \
+bn_mp_init_copy.o bn_mp_abs.o bn_mp_neg.o bn_mp_cmp_mag.o bn_mp_cmp.o bn_mp_cmp_d.o \
+bn_mp_rshd.o bn_mp_lshd.o bn_mp_mod_2d.o bn_mp_div_2d.o bn_mp_mul_2d.o bn_mp_div_2.o \
+bn_mp_mul_2.o bn_s_mp_add.o bn_s_mp_sub.o bn_fast_s_mp_mul_digs.o bn_s_mp_mul_digs.o \
+bn_fast_s_mp_mul_high_digs.o bn_s_mp_mul_high_digs.o bn_fast_s_mp_sqr.o bn_s_mp_sqr.o \
+bn_mp_add.o bn_mp_sub.o bn_mp_karatsuba_mul.o bn_mp_mul.o bn_mp_karatsuba_sqr.o \
+bn_mp_sqr.o bn_mp_div.o bn_mp_mod.o bn_mp_add_d.o bn_mp_sub_d.o bn_mp_mul_d.o \
+bn_mp_div_d.o bn_mp_mod_d.o bn_mp_expt_d.o bn_mp_addmod.o bn_mp_submod.o \
+bn_mp_mulmod.o bn_mp_sqrmod.o bn_mp_gcd.o bn_mp_lcm.o bn_fast_mp_invmod.o bn_mp_invmod.o \
+bn_mp_reduce.o bn_mp_montgomery_setup.o bn_fast_mp_montgomery_reduce.o bn_mp_montgomery_reduce.o \
+bn_mp_exptmod_fast.o bn_mp_exptmod.o bn_mp_2expt.o bn_mp_n_root.o bn_mp_jacobi.o bn_reverse.o \
+bn_mp_count_bits.o bn_mp_read_unsigned_bin.o bn_mp_read_signed_bin.o bn_mp_to_unsigned_bin.o \
+bn_mp_to_signed_bin.o bn_mp_unsigned_bin_size.o bn_mp_signed_bin_size.o  \
+bn_mp_xor.o bn_mp_and.o bn_mp_or.o bn_mp_rand.o bn_mp_montgomery_calc_normalization.o \
+bn_mp_prime_is_divisible.o bn_prime_tab.o bn_mp_prime_fermat.o bn_mp_prime_miller_rabin.o \
+bn_mp_prime_is_prime.o bn_mp_prime_next_prime.o bn_mp_dr_reduce.o \
+bn_mp_dr_is_modulus.o bn_mp_dr_setup.o bn_mp_reduce_setup.o \
+bn_mp_toom_mul.o bn_mp_toom_sqr.o bn_mp_div_3.o bn_s_mp_exptmod.o \
+bn_mp_reduce_2k.o bn_mp_reduce_is_2k.o bn_mp_reduce_2k_setup.o \
+bn_mp_radix_smap.o bn_mp_read_radix.o bn_mp_toradix.o bn_mp_radix_size.o \
+bn_mp_fread.o bn_mp_fwrite.o bn_mp_cnt_lsb.o bn_error.o \
+bn_mp_init_multi.o bn_mp_clear_multi.o bn_mp_exteuclid.o bn_mp_toradix_n.o \
+bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o bn_mp_init_set.o \
+bn_mp_init_set_int.o bn_mp_invmod_slow.o bn_mp_prime_rabin_miller_trials.o
+
+libtommath.a:  $(OBJECTS)
+	$(AR) $(ARFLAGS) libtommath.a $(OBJECTS)
+	ranlib libtommath.a
+
+#make a profiled library (takes a while!!!)
+#
+# Builds the "timing" target with -prof_gen, runs the resulting ./ltmtest
+# to collect profile data, then rebuilds the library with -prof_use.
+# $(MAKE) is used so options like -j/-n reach the sub-make.
+# So far I've seen improvements in the MP math
+profiled:
+	$(MAKE) -f makefile.icc CFLAGS="$(CFLAGS) -prof_gen -DTESTING" timing
+	./ltmtest
+	rm -f *.a *.o ltmtest
+	$(MAKE) -f makefile.icc CFLAGS="$(CFLAGS) -prof_use"
+
+#make a single object profiled library [mpi.c, presumably written by gen.pl, is built instrumented, run through the timing demo as in the "timing" target, then rebuilt with the profile]
+profiled_single:
+	perl gen.pl
+	$(CC) $(CFLAGS) -prof_gen -DTESTING -c mpi.c -o mpi.o
+	$(CC) $(CFLAGS) -DTESTING -DTIMER demo/timing.c mpi.o -o ltmtest
+	./ltmtest
+	rm -f *.o ltmtest
+	$(CC) $(CFLAGS) -prof_use -ip -DTESTING -c mpi.c -o mpi.o
+	$(AR) $(ARFLAGS) libtommath.a mpi.o
+	ranlib libtommath.a
+
+install: libtommath.a
+	install -d -g root -o root $(DESTDIR)$(LIBPATH)
+	install -d -g root -o root $(DESTDIR)$(INCPATH)
+	install -g root -o root $(LIBNAME) $(DESTDIR)$(LIBPATH)
+	install -g root -o root $(HEADERS) $(DESTDIR)$(INCPATH)
+
+test: libtommath.a demo/demo.o
+	$(CC) demo/demo.o libtommath.a -o test
+	
+mtest: test	
+	cd mtest ; $(CC) $(CFLAGS) mtest.c -o mtest
+        
+timing: libtommath.a
+	$(CC) $(CFLAGS) -DTIMER demo/timing.c libtommath.a -o ltmtest
+
+# removes build products, LaTeX temporaries and *.il/*.dyn files (presumably ICC by-products), then cleans etc/ and pics/ (&& stops a failed cd from re-running this rule here)
+clean:
+	rm -f *.bat *.pdf *.o *.a *.obj *.lib *.exe *.dll etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest mtest/mtest.exe *.idx *.toc *.log *.aux *.dvi *.lof *.ind *.ilg *.ps *.s mpi.c *.il etc/*.il *.dyn
+	cd etc && $(MAKE) clean
+	cd pics && $(MAKE) clean
--- a/makefile.msvc	Fri Dec 17 06:27:22 2004 +0000
+++ b/makefile.msvc	Sun Dec 19 15:57:19 2004 +0000
@@ -28,8 +28,9 @@
 bn_mp_reduce_2k.obj bn_mp_reduce_is_2k.obj bn_mp_reduce_2k_setup.obj \
 bn_mp_radix_smap.obj bn_mp_read_radix.obj bn_mp_toradix.obj bn_mp_radix_size.obj \
 bn_mp_fread.obj bn_mp_fwrite.obj bn_mp_cnt_lsb.obj bn_error.obj \
-bn_mp_init_multi.obj bn_mp_clear_multi.obj bn_prime_sizes_tab.obj bn_mp_exteuclid.obj bn_mp_toradix_n.obj \
-bn_mp_prime_random_ex.obj bn_mp_get_int.obj bn_mp_sqrt.obj bn_mp_is_square.obj
+bn_mp_init_multi.obj bn_mp_clear_multi.obj bn_mp_exteuclid.obj bn_mp_toradix_n.obj \
+bn_mp_prime_random_ex.obj bn_mp_get_int.obj bn_mp_sqrt.obj bn_mp_is_square.obj \
+bn_mp_init_set.obj bn_mp_init_set_int.obj bn_mp_invmod_slow.obj bn_mp_prime_rabin_miller_trials.obj
 
 library: $(OBJECTS)
 	lib /out:tommath.lib $(OBJECTS)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/makefile.shared	Sun Dec 19 15:57:19 2004 +0000
@@ -0,0 +1,82 @@
+#Makefile for GCC (shared library build through libtool)
+#
+#Tom St Denis
+#VERSION is handed to libtool as the -version-info value of the shared library
+VERSION=0:32
+
+#objects are compiled through libtool so the .lo files needed by the link step exist
+CC = libtool --mode=compile gcc
+
+CFLAGS  +=  -I./ -Wall -W -Wshadow -Wsign-compare
+
+#for speed 
+CFLAGS += -O3 -funroll-loops
+
+#for size 
+#CFLAGS += -Os
+
+#x86 optimizations [should be valid for any GCC install though]
+CFLAGS  += -fomit-frame-pointer
+
+default: libtommath.la
+
+#default files to install
+LIBNAME=libtommath.la
+HEADERS=tommath.h
+
+#LIBPATH-The directory for libtommath to be installed to.
+#INCPATH-The directory to install the header files for libtommath.
+#DATAPATH-The directory to install the pdf docs.
+DESTDIR=
+LIBPATH=/usr/lib
+INCPATH=/usr/include
+DATAPATH=/usr/share/doc/libtommath/pdf
+
+OBJECTS=bncore.o bn_mp_init.o bn_mp_clear.o bn_mp_exch.o bn_mp_grow.o bn_mp_shrink.o \
+bn_mp_clamp.o bn_mp_zero.o  bn_mp_set.o bn_mp_set_int.o bn_mp_init_size.o bn_mp_copy.o \
+bn_mp_init_copy.o bn_mp_abs.o bn_mp_neg.o bn_mp_cmp_mag.o bn_mp_cmp.o bn_mp_cmp_d.o \
+bn_mp_rshd.o bn_mp_lshd.o bn_mp_mod_2d.o bn_mp_div_2d.o bn_mp_mul_2d.o bn_mp_div_2.o \
+bn_mp_mul_2.o bn_s_mp_add.o bn_s_mp_sub.o bn_fast_s_mp_mul_digs.o bn_s_mp_mul_digs.o \
+bn_fast_s_mp_mul_high_digs.o bn_s_mp_mul_high_digs.o bn_fast_s_mp_sqr.o bn_s_mp_sqr.o \
+bn_mp_add.o bn_mp_sub.o bn_mp_karatsuba_mul.o bn_mp_mul.o bn_mp_karatsuba_sqr.o \
+bn_mp_sqr.o bn_mp_div.o bn_mp_mod.o bn_mp_add_d.o bn_mp_sub_d.o bn_mp_mul_d.o \
+bn_mp_div_d.o bn_mp_mod_d.o bn_mp_expt_d.o bn_mp_addmod.o bn_mp_submod.o \
+bn_mp_mulmod.o bn_mp_sqrmod.o bn_mp_gcd.o bn_mp_lcm.o bn_fast_mp_invmod.o bn_mp_invmod.o \
+bn_mp_reduce.o bn_mp_montgomery_setup.o bn_fast_mp_montgomery_reduce.o bn_mp_montgomery_reduce.o \
+bn_mp_exptmod_fast.o bn_mp_exptmod.o bn_mp_2expt.o bn_mp_n_root.o bn_mp_jacobi.o bn_reverse.o \
+bn_mp_count_bits.o bn_mp_read_unsigned_bin.o bn_mp_read_signed_bin.o bn_mp_to_unsigned_bin.o \
+bn_mp_to_signed_bin.o bn_mp_unsigned_bin_size.o bn_mp_signed_bin_size.o  \
+bn_mp_xor.o bn_mp_and.o bn_mp_or.o bn_mp_rand.o bn_mp_montgomery_calc_normalization.o \
+bn_mp_prime_is_divisible.o bn_prime_tab.o bn_mp_prime_fermat.o bn_mp_prime_miller_rabin.o \
+bn_mp_prime_is_prime.o bn_mp_prime_next_prime.o bn_mp_dr_reduce.o \
+bn_mp_dr_is_modulus.o bn_mp_dr_setup.o bn_mp_reduce_setup.o \
+bn_mp_toom_mul.o bn_mp_toom_sqr.o bn_mp_div_3.o bn_s_mp_exptmod.o \
+bn_mp_reduce_2k.o bn_mp_reduce_is_2k.o bn_mp_reduce_2k_setup.o \
+bn_mp_radix_smap.o bn_mp_read_radix.o bn_mp_toradix.o bn_mp_radix_size.o \
+bn_mp_fread.o bn_mp_fwrite.o bn_mp_cnt_lsb.o bn_error.o \
+bn_mp_init_multi.o bn_mp_clear_multi.o bn_mp_exteuclid.o bn_mp_toradix_n.o \
+bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o bn_mp_init_set.o \
+bn_mp_init_set_int.o bn_mp_invmod_slow.o bn_mp_prime_rabin_miller_trials.o
+
+#Links the libtool archive (plus a static libtommath.a beside it) and then
+#installs the library and the headers.  Every installed path is prefixed with
+#$(DESTDIR), and the library directory is created first, so staged installs work.
+libtommath.la:  $(OBJECTS)
+	libtool --mode=link gcc *.lo -o libtommath.la -rpath $(LIBPATH) -version-info $(VERSION)
+	libtool --mode=link gcc *.o -o libtommath.a 
+	install -d -g root -o root $(DESTDIR)$(LIBPATH)
+	libtool --mode=install install -c libtommath.la $(DESTDIR)$(LIBPATH)/libtommath.la
+	install -d -g root -o root $(DESTDIR)$(INCPATH)
+	install -g root -o root $(HEADERS) $(DESTDIR)$(INCPATH)
+
+#libtommath.a has no rule of its own here [it is a by-product of libtommath.la],
+#so depend on the target that make actually knows how to build.
+test: libtommath.la demo/demo.o
+	gcc $(CFLAGS) -c demo/demo.c -o demo/demo.o
+	libtool --mode=link gcc -o test demo/demo.o libtommath.la
+	
+mtest: test	
+	cd mtest ; gcc $(CFLAGS) mtest.c -o mtest -s
+        
+timing: libtommath.la
+	gcc $(CFLAGS) -DTIMER demo/timing.c libtommath.a -o ltmtest -s
--- a/mtest/mpi-config.h	Fri Dec 17 06:27:22 2004 +0000
+++ b/mtest/mpi-config.h	Sun Dec 19 15:57:19 2004 +0000
@@ -1,5 +1,5 @@
 /* Default configuration for MPI library */
-/* $Id: mpi-config.h,v 1.1.1.1 2003/05/19 04:01:19 matt Exp $ */
+/* $Id: mpi-config.h,v 1.8 2000/07/11 04:28:14 sting Exp sting $ */
 
 #ifndef MPI_CONFIG_H_
 #define MPI_CONFIG_H_
--- a/mtest/mpi.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/mtest/mpi.c	Sun Dec 19 15:57:19 2004 +0000
@@ -6,7 +6,7 @@
 
     Arbitrary precision integer arithmetic library
 
-    $Id: mpi.c,v 1.1.1.1 2003/05/19 04:01:19 matt Exp $
+    $Id: mpi.c,v 1.22 2001/09/14 15:11:20 sting Exp sting $
  */
 
 #include "mpi.h"
--- a/mtest/mpi.h	Fri Dec 17 06:27:22 2004 +0000
+++ b/mtest/mpi.h	Sun Dec 19 15:57:19 2004 +0000
@@ -6,7 +6,7 @@
 
     Arbitrary precision integer arithmetic library
 
-    $Id: mpi.h,v 1.1.1.1 2003/05/19 04:01:19 matt Exp $
+    $Id: mpi.h,v 1.15 2001/09/17 14:16:22 sting Exp $
  */
 
 #ifndef _H_MPI_
--- a/mtest/mtest.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/mtest/mtest.c	Sun Dec 19 15:57:19 2004 +0000
@@ -58,7 +58,7 @@
    int n, size;
    unsigned char buf[2048];
 
-   size = 1 + ((fgetc(rng)<<8) + fgetc(rng)) % 97;
+   size = 10 + ((fgetc(rng)<<8) + fgetc(rng)) % 97;
    buf[0] = (fgetc(rng)&1)?1:0;
    fread(buf+1, 1, size, rng);
    while (buf[1] == 0) buf[1] = fgetc(rng);
@@ -109,11 +109,12 @@
 
    t1 = clock();
    for (;;) {
+#if 0
       if (clock() - t1 > CLOCKS_PER_SEC) {
          sleep(2);
          t1 = clock();
       }
-
+#endif
        n = fgetc(rng) % 15;
 
    if (n == 0) {
--- a/pics/makefile	Fri Dec 17 06:27:22 2004 +0000
+++ b/pics/makefile	Sun Dec 19 15:57:19 2004 +0000
@@ -32,3 +32,4 @@
 
 clean:
 	rm -rf *.ps *.pdf .xvpics
+   
\ No newline at end of file
Binary file pics/sliding_window.tif has changed
Binary file poster.pdf has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/poster.tex	Sun Dec 19 15:57:19 2004 +0000
@@ -0,0 +1,35 @@
+\documentclass[landscape,11pt]{article}
+\usepackage{amsmath, amssymb}
+\usepackage{hyperref}
+\begin{document}
+\hspace*{-3in}
+\begin{tabular}{llllll}
+$c = a + b$  & {\tt mp\_add(\&a, \&b, \&c)} & $b = 2a$  & {\tt mp\_mul\_2(\&a, \&b)} & \\
+$c = a - b$  & {\tt mp\_sub(\&a, \&b, \&c)} & $b = a/2$ & {\tt mp\_div\_2(\&a, \&b)} & \\
+$c = ab $   & {\tt mp\_mul(\&a, \&b, \&c)}  & $c = 2^ba$  & {\tt mp\_mul\_2d(\&a, b, \&c)}  \\
+$b = a^2 $  & {\tt mp\_sqr(\&a, \&b)}       & $c = a/2^b, d = a \mod 2^b$ & {\tt mp\_div\_2d(\&a, b, \&c, \&d)} \\
+$c = \lfloor a/b \rfloor, d = a \mod b$ & {\tt mp\_div(\&a, \&b, \&c, \&d)} & $c = a \mod 2^b $  & {\tt mp\_mod\_2d(\&a, b, \&c)}  \\
+ && \\
+$a = b $  & {\tt mp\_set\_int(\&a, b)}  & $c = a \vee b$  & {\tt mp\_or(\&a, \&b, \&c)}  \\
+$b = a $  & {\tt mp\_copy(\&a, \&b)} & $c = a \wedge b$  & {\tt mp\_and(\&a, \&b, \&c)}  \\
+ && $c = a \oplus b$  & {\tt mp\_xor(\&a, \&b, \&c)}  \\
+ & \\
+$b = -a $  & {\tt mp\_neg(\&a, \&b)}  & $d = a + b \mod c$  & {\tt mp\_addmod(\&a, \&b, \&c, \&d)}  \\
+$b = |a| $  & {\tt mp\_abs(\&a, \&b)} & $d = a - b \mod c$  & {\tt mp\_submod(\&a, \&b, \&c, \&d)}  \\
+ && $d = ab \mod c$  & {\tt mp\_mulmod(\&a, \&b, \&c, \&d)}  \\
+Compare $a$ and $b$ & {\tt mp\_cmp(\&a, \&b)} & $c = a^2 \mod b$  & {\tt mp\_sqrmod(\&a, \&b, \&c)}  \\
+Is Zero? & {\tt mp\_iszero(\&a)} & $c = a^{-1} \mod b$  & {\tt mp\_invmod(\&a, \&b, \&c)} \\
+Is Even? & {\tt mp\_iseven(\&a)} & $d = a^b \mod c$ & {\tt mp\_exptmod(\&a, \&b, \&c, \&d)} \\
+Is Odd? & {\tt mp\_isodd(\&a)} \\
+&\\
+$\vert \vert a \vert \vert$ & {\tt mp\_unsigned\_bin\_size(\&a)} & $res$ = 1 if $a$ prime to $t$ rounds? & {\tt mp\_prime\_is\_prime(\&a, t, \&res)} \\
+$buf \leftarrow a$          & {\tt mp\_to\_unsigned\_bin(\&a, buf)} & Next prime after $a$ to $t$ rounds. & {\tt mp\_prime\_next\_prime(\&a, t, bbs\_style)} \\
+$a \leftarrow buf[0..len-1]$          & {\tt mp\_read\_unsigned\_bin(\&a, buf, len)} \\
+&\\
+$b = \sqrt{a}$ & {\tt mp\_sqrt(\&a, \&b)}  & $c = \mbox{gcd}(a, b)$ & {\tt mp\_gcd(\&a, \&b, \&c)} \\
+$c = a^{1/b}$ & {\tt mp\_n\_root(\&a, b, \&c)} & $c = \mbox{lcm}(a, b)$ & {\tt mp\_lcm(\&a, \&b, \&c)} \\
+&\\
+Greater Than & MP\_GT & Equal To & MP\_EQ \\
+Less Than & MP\_LT & Bits per digit & DIGIT\_BIT \\
+\end{tabular}
+\end{document}
--- a/pre_gen/mpi.c	Fri Dec 17 06:27:22 2004 +0000
+++ b/pre_gen/mpi.c	Sun Dec 19 15:57:19 2004 +0000
@@ -1,19 +1,20 @@
 /* Start: bn_error.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_ERROR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 static const struct {
      int code;
@@ -40,30 +41,32 @@
    return "Invalid error code";
 }
 
+#endif
 
 /* End: bn_error.c */
 
 /* Start: bn_fast_mp_invmod.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_FAST_MP_INVMOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* computes the modular inverse via binary extended euclidean algorithm, 
  * that is c = 1/a mod b 
  *
- * Based on mp_invmod except this is optimized for the case where b is 
+ * Based on slow invmod except this is optimized for the case where b is 
  * odd as per HAC Note 14.64 on pp. 610
  */
 int
@@ -187,29 +190,31 @@
 __ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
   return res;
 }
+#endif
 
 /* End: bn_fast_mp_invmod.c */
 
 /* Start: bn_fast_mp_montgomery_reduce.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_FAST_MP_MONTGOMERY_REDUCE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* computes xR**-1 == x (mod N) via Montgomery Reduction
  *
- * This is an optimized implementation of mp_montgomery_reduce
+ * This is an optimized implementation of montgomery_reduce
  * which uses the comba method to quickly calculate the columns of the
  * reduction.
  *
@@ -358,25 +363,27 @@
   }
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_fast_mp_montgomery_reduce.c */
 
 /* Start: bn_fast_s_mp_mul_digs.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_FAST_S_MP_MUL_DIGS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* Fast (comba) multiplier
  *
@@ -397,8 +404,9 @@
 int
 fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
-  int     olduse, res, pa, ix;
-  mp_word W[MP_WARRAY];
+  int     olduse, res, pa, ix, iz;
+  mp_digit W[MP_WARRAY];
+  register mp_word  _W;
 
   /* grow the destination as required */
   if (c->alloc < digs) {
@@ -407,82 +415,52 @@
     }
   }
 
-  /* clear temp buf (the columns) */
-  memset (W, 0, sizeof (mp_word) * digs);
-
-  /* calculate the columns */
-  pa = a->used;
-  for (ix = 0; ix < pa; ix++) {
-    /* this multiplier has been modified to allow you to 
-     * control how many digits of output are produced.  
-     * So at most we want to make upto "digs" digits of output.
-     *
-     * this adds products to distinct columns (at ix+iy) of W
-     * note that each step through the loop is not dependent on
-     * the previous which means the compiler can easily unroll
-     * the loop without scheduling problems
-     */
-    {
-      register mp_digit tmpx, *tmpy;
-      register mp_word *_W;
-      register int iy, pb;
-
-      /* alias for the the word on the left e.g. A[ix] * A[iy] */
-      tmpx = a->dp[ix];
-
-      /* alias for the right side */
-      tmpy = b->dp;
-
-      /* alias for the columns, each step through the loop adds a new
-         term to each column
+  /* number of output digits to produce */
+  pa = MIN(digs, a->used + b->used);
+
+  /* clear the carry */
+  _W = 0;
+  for (ix = 0; ix <= pa; ix++) { 
+      int      tx, ty;
+      int      iy;
+      mp_digit *tmpx, *tmpy;
+
+      /* get offsets into the two bignums */
+      ty = MIN(b->used-1, ix);
+      tx = ix - ty;
+
+      /* setup temp aliases */
+      tmpx = a->dp + tx;
+      tmpy = b->dp + ty;
+
+      /* this is the number of times the loop will iterate, essentially it's
+         while (tx++ < a->used && ty-- >= 0) { ... }
        */
-      _W = W + ix;
-
-      /* the number of digits is limited by their placement.  E.g.
-         we avoid multiplying digits that will end up above the # of
-         digits of precision requested
-       */
-      pb = MIN (b->used, digs - ix);
-
-      for (iy = 0; iy < pb; iy++) {
-        *_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++);
+      iy = MIN(a->used-tx, ty+1);
+
+      /* execute loop */
+      for (iz = 0; iz < iy; ++iz) {
+         _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
       }
-    }
-
+
+      /* store term */
+      W[ix] = ((mp_digit)_W) & MP_MASK;
+
+      /* make next carry */
+      _W = _W >> ((mp_word)DIGIT_BIT);
   }
 
   /* setup dest */
-  olduse = c->used;
+  olduse  = c->used;
   c->used = digs;
 
   {
     register mp_digit *tmpc;
-
-    /* At this point W[] contains the sums of each column.  To get the
-     * correct result we must take the extra bits from each column and
-     * carry them down
-     *
-     * Note that while this adds extra code to the multiplier it 
-     * saves time since the carry propagation is removed from the 
-     * above nested loop.This has the effect of reducing the work 
-     * from N*(N+N*c)==N**2 + c*N**2 to N**2 + N*c where c is the 
-     * cost of the shifting.  On very small numbers this is slower 
-     * but on most cryptographic size numbers it is faster.
-     *
-     * In this particular implementation we feed the carries from
-     * behind which means when the loop terminates we still have one
-     * last digit to copy
-     */
     tmpc = c->dp;
-    for (ix = 1; ix < digs; ix++) {
-      /* forward the carry from the previous temp */
-      W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
-
+    for (ix = 0; ix < digs; ix++) {
       /* now extract the previous digit [below the carry] */
-      *tmpc++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
-    }
-    /* fetch the last digit */
-    *tmpc++ = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK));
+      *tmpc++ = W[ix];
+    }
 
     /* clear unused digits [that existed in the old copy of c] */
     for (; ix < olduse; ix++) {
@@ -492,28 +470,30 @@
   mp_clamp (c);
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_fast_s_mp_mul_digs.c */
 
 /* Start: bn_fast_s_mp_mul_high_digs.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
- #include <tommath.h>
-
-/* this is a modified version of fast_s_mp_mul_digs that only produces
- * output digits *above* digs.  See the comments for fast_s_mp_mul_digs
+#include <tommath.h>
+#ifdef BN_FAST_S_MP_MUL_HIGH_DIGS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
+
+/* this is a modified version of fast_s_mul_digs that only produces
+ * output digits *above* digs.  See the comments for fast_s_mul_digs
  * to see how it works.
  *
  * This is used in the Barrett reduction since for one of the multiplications
@@ -524,95 +504,92 @@
 int
 fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
-  int     oldused, newused, res, pa, pb, ix;
-  mp_word W[MP_WARRAY];
-
-  /* calculate size of product and allocate more space if required */
-  newused = a->used + b->used + 1;
-  if (c->alloc < newused) {
-    if ((res = mp_grow (c, newused)) != MP_OKAY) {
+  int     olduse, res, pa, ix, iz;
+  mp_digit W[MP_WARRAY];
+  mp_word  _W;
+
+  /* grow the destination as required */
+  pa = a->used + b->used;
+  if (c->alloc < pa) {
+    if ((res = mp_grow (c, pa)) != MP_OKAY) {
       return res;
     }
   }
 
-  /* like the other comba method we compute the columns first */
-  pa = a->used;
-  pb = b->used;
-  memset (W + digs, 0, (pa + pb + 1 - digs) * sizeof (mp_word));
-  for (ix = 0; ix < pa; ix++) {
-    {
-      register mp_digit tmpx, *tmpy;
-      register int iy;
-      register mp_word *_W;
-
-      /* work todo, that is we only calculate digits that are at "digs" or above  */
-      iy = digs - ix;
-
-      /* copy of word on the left of A[ix] * B[iy] */
-      tmpx = a->dp[ix];
-
-      /* alias for right side */
-      tmpy = b->dp + iy;
-     
-      /* alias for the columns of output.  Offset to be equal to or above the 
-       * smallest digit place requested 
+  /* number of output digits to produce */
+  pa = a->used + b->used;
+  _W = 0;
+  for (ix = digs; ix <= pa; ix++) { 
+      int      tx, ty, iy;
+      mp_digit *tmpx, *tmpy;
+
+      /* get offsets into the two bignums */
+      ty = MIN(b->used-1, ix);
+      tx = ix - ty;
+
+      /* setup temp aliases */
+      tmpx = a->dp + tx;
+      tmpy = b->dp + ty;
+
+      /* this is the number of times the loop will iterate, essentially it's
+         while (tx++ < a->used && ty-- >= 0) { ... }
        */
-      _W = W + digs;     
-      
-      /* skip cases below zero where ix > digs */
-      if (iy < 0) {
-         iy    = abs(iy);
-         tmpy += iy;
-         _W   += iy;
-         iy    = 0;
+      iy = MIN(a->used-tx, ty+1);
+
+      /* execute loop */
+      for (iz = 0; iz < iy; iz++) {
+         _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
       }
 
-      /* compute column products for digits above the minimum */
-      for (; iy < pb; iy++) {
-         *_W++ += ((mp_word) tmpx) * ((mp_word)*tmpy++);
-      }
-    }
+      /* store term */
+      W[ix] = ((mp_digit)_W) & MP_MASK;
+
+      /* make next carry */
+      _W = _W >> ((mp_word)DIGIT_BIT);
   }
 
   /* setup dest */
-  oldused = c->used;
-  c->used = newused;
-
-  /* now convert the array W downto what we need
-   *
-   * See comments in bn_fast_s_mp_mul_digs.c
-   */
-  for (ix = digs + 1; ix < newused; ix++) {
-    W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
-    c->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
-  }
-  c->dp[newused - 1] = (mp_digit) (W[newused - 1] & ((mp_word) MP_MASK));
-
-  for (; ix < oldused; ix++) {
-    c->dp[ix] = 0;
+  olduse  = c->used;
+  c->used = pa;
+
+  {
+    register mp_digit *tmpc;
+
+    tmpc = c->dp + digs;
+    for (ix = digs; ix <= pa; ix++) {
+      /* copy the finished digit of this column into the destination */
+      *tmpc++ = W[ix];
+    }
+
+    /* clear unused digits [that existed in the old copy of c] */
+    for (; ix < olduse; ix++) {
+      *tmpc++ = 0;
+    }
   }
   mp_clamp (c);
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_fast_s_mp_mul_high_digs.c */
 
 /* Start: bn_fast_s_mp_sqr.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_FAST_S_MP_SQR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* fast squaring
  *
@@ -631,131 +608,121 @@
  * Based on Algorithm 14.16 on pp.597 of HAC.
  *
  */
+/* the gist of squaring...
+
+You proceed as in multiplication, except that the offset of tmpx [the one that starts
+closer to zero] can't equal the offset of tmpy.  So you set up iy like before and then min it
+with (ty-tx+1)>>1 so that it never happens.  The sum from the inner loop is then doubled.
+
+After that loop you add in the square term [on even columns only].
+
+Remove W2 and don't memset W
+
+*/
+
 int fast_s_mp_sqr (mp_int * a, mp_int * b)
 {
-  int     olduse, newused, res, ix, pa;
-  mp_word W2[MP_WARRAY], W[MP_WARRAY];
-
-  /* calculate size of product and allocate as required */
-  pa = a->used;
-  newused = pa + pa + 1;
-  if (b->alloc < newused) {
-    if ((res = mp_grow (b, newused)) != MP_OKAY) {
+  int       olduse, res, pa, ix, iz;
+  mp_digit   W[MP_WARRAY], *tmpx;
+  mp_word   W1;
+
+  /* grow the destination as required */
+  pa = a->used + a->used;
+  if (b->alloc < pa) {
+    if ((res = mp_grow (b, pa)) != MP_OKAY) {
       return res;
     }
   }
 
-  /* zero temp buffer (columns)
-   * Note that there are two buffers.  Since squaring requires
-   * a outer and inner product and the inner product requires
-   * computing a product and doubling it (a relatively expensive
-   * op to perform n**2 times if you don't have to) the inner and
-   * outer products are computed in different buffers.  This way
-   * the inner product can be doubled using n doublings instead of
-   * n**2
-   */
-  memset (W,  0, newused * sizeof (mp_word));
-  memset (W2, 0, newused * sizeof (mp_word));
-
-  /* This computes the inner product.  To simplify the inner N**2 loop
-   * the multiplication by two is done afterwards in the N loop.
-   */
-  for (ix = 0; ix < pa; ix++) {
-    /* compute the outer product
-     *
-     * Note that every outer product is computed
-     * for a particular column only once which means that
-     * there is no need todo a double precision addition
-     * into the W2[] array.
-     */
-    W2[ix + ix] = ((mp_word)a->dp[ix]) * ((mp_word)a->dp[ix]);
-
-    {
-      register mp_digit tmpx, *tmpy;
-      register mp_word *_W;
-      register int iy;
-
-      /* copy of left side */
-      tmpx = a->dp[ix];
-
-      /* alias for right side */
-      tmpy = a->dp + (ix + 1);
-
-      /* the column to store the result in */
-      _W = W + (ix + ix + 1);
-
-      /* inner products */
-      for (iy = ix + 1; iy < pa; iy++) {
-          *_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++);
+  /* number of output digits to produce */
+  W1 = 0;
+  for (ix = 0; ix <= pa; ix++) { 
+      int      tx, ty, iy;
+      mp_word  _W;
+      mp_digit *tmpy;
+
+      /* clear counter */
+      _W = 0;
+
+      /* get offsets into the two bignums */
+      ty = MIN(a->used-1, ix);
+      tx = ix - ty;
+
+      /* setup temp aliases */
+      tmpx = a->dp + tx;
+      tmpy = a->dp + ty;
+
+      /* this is the number of times the loop will iterate, essentially it's
+         while (tx++ < a->used && ty-- >= 0) { ... }
+       */
+      iy = MIN(a->used-tx, ty+1);
+
+      /* now for squaring tx can never equal ty 
+       * we halve the distance since they approach at a rate of 2x
+       * and we have to round because odd cases need to be executed
+       */
+      iy = MIN(iy, (ty-tx+1)>>1);
+
+      /* execute loop */
+      for (iz = 0; iz < iy; iz++) {
+         _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
       }
-    }
+
+      /* double the inner product and add carry */
+      _W = _W + _W + W1;
+
+      /* even columns have the square term in them */
+      if ((ix&1) == 0) {
+         _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
+      }
+
+      /* store it */
+      W[ix] = _W;
+
+      /* make next carry */
+      W1 = _W >> ((mp_word)DIGIT_BIT);
   }
 
   /* setup dest */
   olduse  = b->used;
-  b->used = newused;
-
-  /* now compute digits
-   *
-   * We have to double the inner product sums, add in the
-   * outer product sums, propagate carries and convert
-   * to single precision.
-   */
+  b->used = a->used+a->used;
+
   {
-    register mp_digit *tmpb;
-
-    /* double first value, since the inner products are
-     * half of what they should be
-     */
-    W[0] += W[0] + W2[0];
-
+    mp_digit *tmpb;
     tmpb = b->dp;
-    for (ix = 1; ix < newused; ix++) {
-      /* double/add next digit */
-      W[ix] += W[ix] + W2[ix];
-
-      /* propagate carry forwards [from the previous digit] */
-      W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT));
-
-      /* store the current digit now that the carry isn't
-       * needed
-       */
-      *tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
-    }
-    /* set the last value.  Note even if the carry is zero
-     * this is required since the next step will not zero
-     * it if b originally had a value at b->dp[2*a.used]
-     */
-    *tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK));
-
-    /* clear high digits of b if there were any originally */
+    for (ix = 0; ix < pa; ix++) {
+      *tmpb++ = W[ix] & MP_MASK;
+    }
+
+    /* clear unused digits [that existed in the old copy of b] */
     for (; ix < olduse; ix++) {
       *tmpb++ = 0;
     }
   }
-
   mp_clamp (b);
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_fast_s_mp_sqr.c */
 
 /* Start: bn_mp_2expt.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_2EXPT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* computes a = 2**b 
  *
@@ -779,29 +746,31 @@
   a->used = b / DIGIT_BIT + 1;
 
   /* put the single bit in its place */
-  a->dp[b / DIGIT_BIT] = 1 << (b % DIGIT_BIT);
+  a->dp[b / DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);
 
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_2expt.c */
 
 /* Start: bn_mp_abs.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_ABS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* b = |a| 
  *
@@ -824,25 +793,27 @@
 
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_abs.c */
 
 /* Start: bn_mp_add.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_ADD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* high level addition (handles signs) */
 int mp_add (mp_int * a, mp_int * b, mp_int * c)
@@ -875,25 +846,27 @@
   return res;
 }
 
+#endif
 
 /* End: bn_mp_add.c */
 
 /* Start: bn_mp_add_d.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_ADD_D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* single digit addition */
 int
@@ -982,25 +955,27 @@
   return MP_OKAY;
 }
 
+#endif
 
 /* End: bn_mp_add_d.c */
 
 /* Start: bn_mp_addmod.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_ADDMOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* d = a + b (mod c) */
 int
@@ -1021,25 +996,27 @@
   mp_clear (&t);
   return res;
 }
+#endif
 
 /* End: bn_mp_addmod.c */
 
 /* Start: bn_mp_and.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_AND_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* AND two ints together */
 int
@@ -1076,25 +1053,27 @@
   mp_clear (&t);
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_and.c */
 
 /* Start: bn_mp_clamp.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_CLAMP_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* trim unused digits 
  *
@@ -1118,34 +1097,40 @@
     a->sign = MP_ZPOS;
   }
 }
+#endif
 
 /* End: bn_mp_clamp.c */
 
 /* Start: bn_mp_clear.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_CLEAR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* clear one (frees)  */
 void
 mp_clear (mp_int * a)
 {
+  int i;
+
   /* only do anything if a hasn't been freed previously */
   if (a->dp != NULL) {
     /* first zero the digits */
-    memset (a->dp, 0, sizeof (mp_digit) * a->used);
+    for (i = 0; i < a->used; i++) {
+        a->dp[i] = 0;
+    }
 
     /* free ram */
     XFREE(a->dp);
@@ -1156,25 +1141,27 @@
     a->sign  = MP_ZPOS;
   }
 }
+#endif
 
 /* End: bn_mp_clear.c */
 
 /* Start: bn_mp_clear_multi.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_CLEAR_MULTI_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 #include <stdarg.h>
 
 void mp_clear_multi(mp_int *mp, ...) 
@@ -1188,25 +1175,27 @@
     }
     va_end(args);
 }
+#endif
 
 /* End: bn_mp_clear_multi.c */
 
 /* Start: bn_mp_cmp.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_CMP_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* compare two ints (signed)*/
 int
@@ -1229,25 +1218,27 @@
      return mp_cmp_mag(a, b);
   }
 }
+#endif
 
 /* End: bn_mp_cmp.c */
 
 /* Start: bn_mp_cmp_d.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_CMP_D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* compare a digit */
 int mp_cmp_d(mp_int * a, mp_digit b)
@@ -1271,25 +1262,27 @@
     return MP_EQ;
   }
 }
+#endif
 
 /* End: bn_mp_cmp_d.c */
 
 /* Start: bn_mp_cmp_mag.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_CMP_MAG_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* compare maginitude of two ints (unsigned) */
 int mp_cmp_mag (mp_int * a, mp_int * b)
@@ -1324,25 +1317,27 @@
   }
   return MP_EQ;
 }
+#endif
 
 /* End: bn_mp_cmp_mag.c */
 
 /* Start: bn_mp_cnt_lsb.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_CNT_LSB_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 static const int lnz[16] = { 
    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
@@ -1375,25 +1370,27 @@
    return x;
 }
 
+#endif
 
 /* End: bn_mp_cnt_lsb.c */
 
 /* Start: bn_mp_copy.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_COPY_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* copy, b = a */
 int
@@ -1441,25 +1438,27 @@
   b->sign = a->sign;
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_copy.c */
 
 /* Start: bn_mp_count_bits.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_COUNT_BITS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* returns the number of bits in an int */
 int
@@ -1484,25 +1483,99 @@
   }
   return r;
 }
+#endif
 
 /* End: bn_mp_count_bits.c */
 
 /* Start: bn_mp_div.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_DIV_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
+
+#ifdef BN_MP_DIV_SMALL
+
+/* slower bit-bang (shift-and-subtract) division... also smaller
+ * c*b + d == a [c=quotient, d=remainder, either may be NULL]; d takes the
+ * sign of a.  Returns MP_VAL if b == 0, else MP_OKAY or a helper's error.
+ */
+int mp_div(mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+{
+   mp_int ta, tb, tq, q;
+   int    res, n, n2;
+
+  /* is divisor zero ? */
+  if (mp_iszero (b) == 1) {
+    return MP_VAL;
+  }
+
+  /* if |a| < |b| then q=0, r = a */
+  if (mp_cmp_mag (a, b) == MP_LT) {
+    res = (d != NULL) ? mp_copy (a, d) : MP_OKAY;
+    if (c != NULL) {
+      mp_zero (c);
+    }
+    return res;
+  }
+
+  /* init our temps; res must hold the error code, not the comparison */
+  if ((res = mp_init_multi(&ta, &tb, &tq, &q, NULL)) != MP_OKAY) {
+     return res;
+  }
+
+  mp_set(&tq, 1);
+  n = mp_count_bits(a) - mp_count_bits(b);
+  if (((res = mp_copy(a, &ta)) != MP_OKAY) ||
+      ((res = mp_copy(b, &tb)) != MP_OKAY) ||
+      ((res = mp_mul_2d(&tb, n, &tb)) != MP_OKAY) ||
+      ((res = mp_mul_2d(&tq, n, &tq)) != MP_OKAY)) {
+      goto LBL_ERR;
+  }
+  ta.sign = tb.sign = MP_ZPOS;  /* divide magnitudes; signs applied below */
+
+  /* one quotient bit per pass: subtract tb if it fits, then halve tb, tq */
+  while (n-- >= 0) {
+     if (mp_cmp(&tb, &ta) != MP_GT) {
+        if (((res = mp_sub(&ta, &tb, &ta)) != MP_OKAY) ||
+            ((res = mp_add(&q, &tq, &q)) != MP_OKAY)) {
+           goto LBL_ERR;
+        }
+     }
+     if (((res = mp_div_2d(&tb, 1, &tb, NULL)) != MP_OKAY) ||
+         ((res = mp_div_2d(&tq, 1, &tq, NULL)) != MP_OKAY)) {
+           goto LBL_ERR;
+     }
+  }
+
+  /* now q == |quotient| and ta == |remainder|; zero is never negative */
+  n  = a->sign;
+  n2 = (a->sign == b->sign ? MP_ZPOS : MP_NEG);
+  if (c != NULL) {
+     mp_exch(c, &q);
+     c->sign  = n2;
+  }
+  if (d != NULL) {
+     mp_exch(d, &ta);
+     d->sign = (mp_iszero (d) == 1) ? MP_ZPOS : n;
+  }
+LBL_ERR:
+   mp_clear_multi(&ta, &tb, &tq, &q, NULL);
+   return res;
+}
+
+#else
 
 /* integer signed division. 
  * c*b + d == a [e.g. a/b, c=quotient, d=remainder]
@@ -1677,7 +1750,7 @@
    */
   
   /* get sign before writing to c */
-  x.sign = a->sign;
+  x.sign = x.used == 0 ? MP_ZPOS : a->sign;
 
   if (c != NULL) {
     mp_clamp (&q);
@@ -1700,24 +1773,29 @@
   return res;
 }
 
+#endif
+
+#endif
+
 /* End: bn_mp_div.c */
 
 /* Start: bn_mp_div_2.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_DIV_2_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* b = a/2 */
 int mp_div_2(mp_int * a, mp_int * b)
@@ -1765,25 +1843,27 @@
   mp_clamp (b);
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_div_2.c */
 
 /* Start: bn_mp_div_2d.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_DIV_2D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* shift right by a certain bit count (store quotient in c, optional remainder in d) */
 int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
@@ -1860,25 +1940,27 @@
   mp_clear (&t);
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_div_2d.c */
 
 /* Start: bn_mp_div_3.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_DIV_3_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* divide by three (based on routine from MPI and the GMP manual) */
 int
@@ -1937,25 +2019,27 @@
   return res;
 }
 
+#endif
 
 /* End: bn_mp_div_3.c */
 
 /* Start: bn_mp_div_d.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_DIV_D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 static int s_is_power_of_two(mp_digit b, int *p)
 {
@@ -1997,7 +2081,7 @@
   /* power of two ? */
   if (s_is_power_of_two(b, &ix) == 1) {
      if (d != NULL) {
-        *d = a->dp[0] & ((1<<ix) - 1);
+        *d = a->dp[0] & ((((mp_digit)1)<<ix) - 1);
      }
      if (c != NULL) {
         return mp_div_2d(a, ix, c, NULL);
@@ -2005,10 +2089,12 @@
      return MP_OKAY;
   }
 
+#ifdef BN_MP_DIV_3_C
   /* three? */
   if (b == 3) {
      return mp_div_3(a, c, d);
   }
+#endif
 
   /* no easy answer [c'est la vie].  Just division */
   if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
@@ -2043,25 +2129,27 @@
   return res;
 }
 
+#endif
 
 /* End: bn_mp_div_d.c */
 
 /* Start: bn_mp_dr_is_modulus.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_DR_IS_MODULUS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* determines if a number is a valid DR modulus */
 int mp_dr_is_modulus(mp_int *a)
@@ -2084,25 +2172,27 @@
    return 1;
 }
 
+#endif
 
 /* End: bn_mp_dr_is_modulus.c */
 
 /* Start: bn_mp_dr_reduce.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_DR_REDUCE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* reduce "x" in place modulo "n" using the Diminished Radix algorithm.
  *
@@ -2176,25 +2266,27 @@
   }
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_dr_reduce.c */
 
 /* Start: bn_mp_dr_setup.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_DR_SETUP_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* determines the setup value */
 void mp_dr_setup(mp_int *a, mp_digit *d)
@@ -2206,25 +2298,27 @@
         ((mp_word)a->dp[0]));
 }
 
+#endif
 
 /* End: bn_mp_dr_setup.c */
 
 /* Start: bn_mp_exch.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_EXCH_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* swap the elements of two integers, for cases where you can't simply swap the 
  * mp_int pointers around
@@ -2238,25 +2332,27 @@
   *a = *b;
   *b = t;
 }
+#endif
 
 /* End: bn_mp_exch.c */
 
 /* Start: bn_mp_expt_d.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_EXPT_D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* calculate c = a**b  using a square-multiply algorithm */
 int mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
@@ -2293,25 +2389,27 @@
   mp_clear (&g);
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_expt_d.c */
 
 /* Start: bn_mp_exptmod.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_EXPTMOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 
 /* this is a shell function that calls either the normal or Montgomery
@@ -2330,6 +2428,7 @@
 
   /* if exponent X is negative we have to recurse */
   if (X->sign == MP_NEG) {
+#ifdef BN_MP_INVMOD_C
      mp_int tmpG, tmpX;
      int err;
 
@@ -2356,44 +2455,65 @@
      err = mp_exptmod(&tmpG, &tmpX, P, Y);
      mp_clear_multi(&tmpG, &tmpX, NULL);
      return err;
-  }
-
+#else
+     /* no invmod available, so a negative exponent is rejected */
+     return MP_VAL;
+#endif
+  }
+
+#ifdef BN_MP_DR_IS_MODULUS_C
   /* is it a DR modulus? */
   dr = mp_dr_is_modulus(P);
-
+#else
+  dr = 0;
+#endif
+
+#ifdef BN_MP_REDUCE_IS_2K_C
   /* if not, is it a uDR modulus? */
   if (dr == 0) {
      dr = mp_reduce_is_2k(P) << 1;
   }
+#endif
     
   /* if the modulus is odd or dr != 0 use the fast method */
+#ifdef BN_MP_EXPTMOD_FAST_C
   if (mp_isodd (P) == 1 || dr !=  0) {
     return mp_exptmod_fast (G, X, P, Y, dr);
   } else {
+#endif
+#ifdef BN_S_MP_EXPTMOD_C
     /* otherwise use the generic Barrett reduction technique */
     return s_mp_exptmod (G, X, P, Y);
-  }
+#else
+    /* no exptmod for evens */
+    return MP_VAL;
+#endif
+#ifdef BN_MP_EXPTMOD_FAST_C
+  }
+#endif
 }
 
+#endif
 
 /* End: bn_mp_exptmod.c */
 
 /* Start: bn_mp_exptmod_fast.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_EXPTMOD_FAST_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
  *
@@ -2465,29 +2585,52 @@
 
   /* determine and setup reduction code */
   if (redmode == 0) {
+#ifdef BN_MP_MONTGOMERY_SETUP_C     
      /* now setup montgomery  */
      if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
         goto __M;
      }
+#else
+     err = MP_VAL;
+     goto __M;
+#endif
 
      /* automatically pick the comba one if available (saves quite a few calls/ifs) */
+#ifdef BN_FAST_MP_MONTGOMERY_REDUCE_C
      if (((P->used * 2 + 1) < MP_WARRAY) &&
           P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
         redux = fast_mp_montgomery_reduce;
-     } else {
+     } else 
+#endif
+     {
+#ifdef BN_MP_MONTGOMERY_REDUCE_C
         /* use slower baseline Montgomery method */
         redux = mp_montgomery_reduce;
+#else
+        err = MP_VAL;
+        goto __M;
+#endif
      }
   } else if (redmode == 1) {
+#if defined(BN_MP_DR_SETUP_C) && defined(BN_MP_DR_REDUCE_C)
      /* setup DR reduction for moduli of the form B**k - b */
      mp_dr_setup(P, &mp);
      redux = mp_dr_reduce;
+#else
+     err = MP_VAL;
+     goto __M;
+#endif
   } else {
+#if defined(BN_MP_REDUCE_2K_SETUP_C) && defined(BN_MP_REDUCE_2K_C)
      /* setup DR reduction for moduli of the form 2**k - b */
      if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
         goto __M;
      }
      redux = mp_reduce_2k;
+#else
+     err = MP_VAL;
+     goto __M;
+#endif
   }
 
   /* setup result */
@@ -2497,16 +2640,21 @@
 
   /* create M table
    *
-   * The M table contains powers of the input base, e.g. M[x] = G^x mod P
+   * The M table contains powers of the input base, e.g. M[x] = G^x mod P
    *
    * The first half of the table is not computed though accept for M[0] and M[1]
    */
 
   if (redmode == 0) {
+#ifdef BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
      /* now we need R mod m */
      if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
        goto __RES;
      }
+#else 
+     err = MP_VAL;
+     goto __RES;
+#endif
 
      /* now set M[1] to G * R mod m */
      if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
@@ -2650,7 +2798,7 @@
       * to reduce one more time to cancel out the factor
       * of R.
       */
-     if ((err = mp_montgomery_reduce (&res, P, mp)) != MP_OKAY) {
+     if ((err = redux(&res, P, mp)) != MP_OKAY) {
        goto __RES;
      }
   }
@@ -2666,25 +2814,28 @@
   }
   return err;
 }
+#endif
+
 
 /* End: bn_mp_exptmod_fast.c */
 
 /* Start: bn_mp_exteuclid.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_EXTEUCLID_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* Extended euclidean algorithm of (a, b) produces 
    a*u1 + b*u2 = u3
@@ -2739,25 +2890,27 @@
 _ERR: mp_clear_multi(&u1, &u2, &u3, &v1, &v2, &v3, &t1, &t2, &t3, &q, &tmp, NULL);
    return err;
 }
+#endif
 
 /* End: bn_mp_exteuclid.c */
 
 /* Start: bn_mp_fread.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_FREAD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* read a bigint from a file stream in ASCII */
 int mp_fread(mp_int *a, int radix, FILE *stream)
@@ -2804,25 +2957,27 @@
    return MP_OKAY;
 }
 
+#endif
 
 /* End: bn_mp_fread.c */
 
 /* Start: bn_mp_fwrite.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_FWRITE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 int mp_fwrite(mp_int *a, int radix, FILE *stream)
 {
@@ -2854,25 +3009,27 @@
    return MP_OKAY;
 }
 
+#endif
 
 /* End: bn_mp_fwrite.c */
 
 /* Start: bn_mp_gcd.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_GCD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* Greatest Common Divisor using the binary method */
 int mp_gcd (mp_int * a, mp_int * b, mp_int * c)
@@ -2965,25 +3122,27 @@
 __U:mp_clear (&v);
   return res;
 }
+#endif
 
 /* End: bn_mp_gcd.c */
 
 /* Start: bn_mp_get_int.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_GET_INT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* get the lower 32-bits of an mp_int */
 unsigned long mp_get_int(mp_int * a) 
@@ -3008,25 +3167,27 @@
   /* force result to 32-bits always so it is consistent on non 32-bit platforms */
   return res & 0xFFFFFFFFUL;
 }
+#endif
 
 /* End: bn_mp_get_int.c */
 
 /* Start: bn_mp_grow.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_GROW_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* grow as required */
 int mp_grow (mp_int * a, int size)
@@ -3063,35 +3224,44 @@
   }
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_grow.c */
 
 /* Start: bn_mp_init.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* init a new bigint */
+#include <tommath.h>
+#ifdef BN_MP_INIT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
+
+/* init a new mp_int */
 int mp_init (mp_int * a)
 {
+  int i;
+
   /* allocate memory required and clear it */
-  a->dp = OPT_CAST(mp_digit) XCALLOC (sizeof (mp_digit), MP_PREC);
+  a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * MP_PREC);
   if (a->dp == NULL) {
     return MP_MEM;
   }
 
+  /* set the digits to zero */
+  for (i = 0; i < MP_PREC; i++) {
+      a->dp[i] = 0;
+  }
+
   /* set the used to zero, allocated digits to the default precision
    * and sign to positive */
   a->used  = 0;
@@ -3100,25 +3270,27 @@
 
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_init.c */
 
 /* Start: bn_mp_init_copy.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_INIT_COPY_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* creates "a" then copies b into it */
 int mp_init_copy (mp_int * a, mp_int * b)
@@ -3130,25 +3302,27 @@
   }
   return mp_copy (b, a);
 }
+#endif
 
 /* End: bn_mp_init_copy.c */
 
 /* Start: bn_mp_init_multi.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_INIT_MULTI_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 #include <stdarg.h>
 
 int mp_init_multi(mp_int *mp, ...) 
@@ -3187,25 +3361,27 @@
     return res;                /* Assumed ok, if error flagged above. */
 }
 
+#endif
 
 /* End: bn_mp_init_multi.c */
 
 /* Start: bn_mp_init_set.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_INIT_SET_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* initialize and set a digit */
 int mp_init_set (mp_int * a, mp_digit b)
@@ -3217,25 +3393,27 @@
   mp_set(a, b);
   return err;
 }
+#endif
 
 /* End: bn_mp_init_set.c */
 
 /* Start: bn_mp_init_set_int.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_INIT_SET_INT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* initialize and set a digit */
 int mp_init_set_int (mp_int * a, unsigned long b)
@@ -3246,66 +3424,122 @@
   }
   return mp_set_int(a, b);
 }
+#endif
 
 /* End: bn_mp_init_set_int.c */
 
 /* Start: bn_mp_init_size.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_INIT_SIZE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* init an mp_init for a given size */
 int mp_init_size (mp_int * a, int size)
 {
+  int x;
+
   /* pad size so there are always extra digits */
   size += (MP_PREC * 2) - (size % MP_PREC);	
   
   /* alloc mem */
-  a->dp = OPT_CAST(mp_digit) XCALLOC (sizeof (mp_digit), size);
+  a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * size);
   if (a->dp == NULL) {
     return MP_MEM;
   }
+
+  /* set the members: no digits in use, 'size' (padded) digits allocated, sign MP_ZPOS */
   a->used  = 0;
   a->alloc = size;
   a->sign  = MP_ZPOS;
 
+  /* zero the digits explicitly, since the buffer now comes from XMALLOC instead of XCALLOC */
+  for (x = 0; x < size; x++) {
+      a->dp[x] = 0;
+  }
+
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_init_size.c */
 
 /* Start: bn_mp_invmod.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_INVMOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* hac 14.61, pp608 */
 int mp_invmod (mp_int * a, mp_int * b, mp_int * c)
 {
+  /* the modulus b must be strictly positive: reject negative or zero */
+  if (b->sign == MP_NEG || mp_iszero(b) == 1) {
+    return MP_VAL;
+  }
+
+#ifdef BN_FAST_MP_INVMOD_C
+  /* if the modulus is odd we can use a faster routine instead */
+  if (mp_isodd (b) == 1) {
+    return fast_mp_invmod (a, b, c);
+  }
+#endif
+
+#ifdef BN_MP_INVMOD_SLOW_C
+  return mp_invmod_slow(a, b, c);
+#else
+  return MP_VAL;   /* generic routine compiled out: nothing left to try */
+#endif
+}
+#endif
+
+/* End: bn_mp_invmod.c */
+
+/* Start: bn_mp_invmod_slow.c */
+#include <tommath.h>
+#ifdef BN_MP_INVMOD_SLOW_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
+
+/* hac 14.61, pp608 */
+int mp_invmod_slow (mp_int * a, mp_int * b, mp_int * c)
+{
   mp_int  x, y, u, v, A, B, C, D;
   int     res;
 
@@ -3314,11 +3548,6 @@
     return MP_VAL;
   }
 
-  /* if the modulus is odd we can use a faster routine instead */
-  if (mp_isodd (b) == 1) {
-    return fast_mp_invmod (a, b, c);
-  }
-  
   /* init temps */
   if ((res = mp_init_multi(&x, &y, &u, &v, 
                            &A, &B, &C, &D, NULL)) != MP_OKAY) {
@@ -3461,25 +3690,27 @@
 __ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
   return res;
 }
-
-/* End: bn_mp_invmod.c */
+#endif
+
+/* End: bn_mp_invmod_slow.c */
 
 /* Start: bn_mp_is_square.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_IS_SQUARE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* Check if remainders are possible squares - fast exclude non-squares */
 static const char rem_128[128] = {
@@ -3536,7 +3767,7 @@
      return MP_OKAY;
   }
 
-  /* product of primes less than 2^31 */
+  /* product of the primes 11..31, which is less than 2^31 */
   if ((res = mp_init_set_int(&t,11L*13L*17L*19L*23L*29L*31L)) != MP_OKAY) {
      return res;
   }
@@ -3568,25 +3799,27 @@
 ERR:mp_clear(&t);
   return res;
 }
+#endif
 
 /* End: bn_mp_is_square.c */
 
 /* Start: bn_mp_jacobi.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_JACOBI_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* computes the jacobi c = (a | n) (or Legendre if n is prime)
  * HAC pp. 73 Algorithm 2.149
@@ -3671,25 +3904,27 @@
 __A1:mp_clear (&a1);
   return res;
 }
+#endif
 
 /* End: bn_mp_jacobi.c */
 
 /* Start: bn_mp_karatsuba_mul.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_KARATSUBA_MUL_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* c = |a| * |b| using Karatsuba Multiplication using 
  * three half size multiplications
@@ -3753,9 +3988,6 @@
     goto X0Y0;
 
   /* now shift the digits */
-  x0.sign = x1.sign = a->sign;
-  y0.sign = y1.sign = b->sign;
-
   x0.used = y0.used = B;
   x1.used = a->used - B;
   y1.used = b->used - B;
@@ -3839,30 +4071,32 @@
 ERR:
   return err;
 }
+#endif
 
 /* End: bn_mp_karatsuba_mul.c */
 
 /* Start: bn_mp_karatsuba_sqr.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_KARATSUBA_SQR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* Karatsuba squaring, computes b = a*a using three 
  * half size squarings
  *
- * See comments of mp_karatsuba_mul for details.  It 
+ * See comments of mp_karatsuba_mul for details.  It 
  * is essentially the same algorithm but merely 
  * tuned to perform recursive squarings.
  */
@@ -3958,25 +4192,27 @@
 ERR:
   return err;
 }
+#endif
 
 /* End: bn_mp_karatsuba_sqr.c */
 
 /* Start: bn_mp_lcm.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_LCM_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* computes least common multiple as |a*b|/(a, b) */
 int mp_lcm (mp_int * a, mp_int * b, mp_int * c)
@@ -4016,25 +4252,27 @@
   mp_clear_multi (&t1, &t2, NULL);
   return res;
 }
+#endif
 
 /* End: bn_mp_lcm.c */
 
 /* Start: bn_mp_lshd.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_LSHD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* shift left a certain amount of digits */
 int mp_lshd (mp_int * a, int b)
@@ -4081,25 +4319,27 @@
   }
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_lshd.c */
 
 /* Start: bn_mp_mod.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_MOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* c = a mod b, 0 <= c < b */
 int
@@ -4127,25 +4367,27 @@
   mp_clear (&t);
   return res;
 }
+#endif
 
 /* End: bn_mp_mod.c */
 
 /* Start: bn_mp_mod_2d.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_MOD_2D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* calc a value mod 2**b */
 int
@@ -4180,73 +4422,78 @@
   mp_clamp (c);
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_mod_2d.c */
 
 /* Start: bn_mp_mod_d.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_MOD_D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 int
 mp_mod_d (mp_int * a, mp_digit b, mp_digit * c)
 {
   return mp_div_d(a, b, NULL, c);
 }
+#endif
 
 /* End: bn_mp_mod_d.c */
 
 /* Start: bn_mp_montgomery_calc_normalization.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* calculates a = B^n mod b for Montgomery reduction
- * Where B is the base [e.g. 2^DIGIT_BIT].
- * B^n mod b is computed by first computing
- * A = B^(n-1) which doesn't require a reduction but a simple OR.
- * then C = A * B = B^n is computed by performing upto DIGIT_BIT
+#include <tommath.h>
+#ifdef BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
+
+/* calculates a = B^n mod b for Montgomery reduction (B = 2^DIGIT_BIT) using
  * shifts with subtractions when the result is greater than b.
  *
  * The method is slightly modified to shift B unconditionally upto just under
  * the leading bit of b.  This saves alot of multiple precision shifting.
  */
-int
-mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
+int mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
 {
   int     x, bits, res;
 
   /* how many bits of last digit does b use */
   bits = mp_count_bits (b) % DIGIT_BIT;
 
-  /* compute A = B^(n-1) * 2^(bits-1) */
-  if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
-    return res;
-  }
+
+  if (b->used > 1) {
+     if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
+        return res;
+     }
+  } else {
+     mp_set(a, 1);
+     bits = 1;
+  }
+
 
   /* now compute C = A * B mod b */
   for (x = bits - 1; x < (int)DIGIT_BIT; x++) {
@@ -4262,25 +4509,27 @@
 
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_montgomery_calc_normalization.c */
 
 /* Start: bn_mp_montgomery_reduce.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_MONTGOMERY_REDUCE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* computes xR**-1 == x (mod N) via Montgomery Reduction */
 int
@@ -4291,7 +4540,7 @@
 
   /* can the fast reduction [comba] method be used?
    *
-   * Note that unlike in mp_mul you're safely allowed *less*
+   * Note that unlike in mp_mul you're safely allowed *less*
    * than the available columns [255 per default] since carries
    * are fixed up in the inner loop.
    */
@@ -4314,7 +4563,7 @@
     /* mu = ai * rho mod b
      *
      * The value of rho must be precalculated via
-     * bn_mp_montgomery_setup() such that
+     * mp_montgomery_setup() such that
      * it equals -1/n0 mod b this allows the
      * following inner loop to reduce the
      * input one digit at a time
@@ -4378,25 +4627,27 @@
 
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_montgomery_reduce.c */
 
 /* Start: bn_mp_montgomery_setup.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_MONTGOMERY_SETUP_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* setups the montgomery reduction stuff */
 int
@@ -4431,29 +4682,31 @@
 #endif
 
   /* rho = -1/m mod b */
-  *rho = (((mp_digit) 1 << ((mp_digit) DIGIT_BIT)) - x) & MP_MASK;
+  *rho = (((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
 
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_montgomery_setup.c */
 
 /* Start: bn_mp_mul.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_MUL_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* high level multiplication (handles sign) */
 int mp_mul (mp_int * a, mp_int * b, mp_int * c)
@@ -4462,12 +4715,18 @@
   neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
 
   /* use Toom-Cook? */
+#ifdef BN_MP_TOOM_MUL_C
   if (MIN (a->used, b->used) >= TOOM_MUL_CUTOFF) {
     res = mp_toom_mul(a, b, c);
+  } else 
+#endif
+#ifdef BN_MP_KARATSUBA_MUL_C
   /* use Karatsuba? */
-  } else if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) {
+  if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) {
     res = mp_karatsuba_mul (a, b, c);
-  } else {
+  } else 
+#endif
+  {
     /* can we use the fast multiplier?
      *
      * The fast multiplier can be used if the output will 
@@ -4476,36 +4735,44 @@
      */
     int     digs = a->used + b->used + 1;
 
+#ifdef BN_FAST_S_MP_MUL_DIGS_C
     if ((digs < MP_WARRAY) &&
         MIN(a->used, b->used) <= 
         (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
       res = fast_s_mp_mul_digs (a, b, c, digs);
-    } else {
-      res = s_mp_mul (a, b, c);
-    }
-  }
-  c->sign = neg;
+    } else 
+#endif
+#ifdef BN_S_MP_MUL_DIGS_C
+      res = s_mp_mul (a, b, c); /* uses s_mp_mul_digs */
+#else
+      res = MP_VAL;
+#endif
+
+  }
+  c->sign = (c->used > 0) ? neg : MP_ZPOS;
   return res;
 }
+#endif
 
 /* End: bn_mp_mul.c */
 
 /* Start: bn_mp_mul_2.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_MUL_2_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* b = a*2 */
 int mp_mul_2(mp_int * a, mp_int * b)
@@ -4567,25 +4834,27 @@
   b->sign = a->sign;
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_mul_2.c */
 
 /* Start: bn_mp_mul_2d.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_MUL_2D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* shift left by a certain bit count */
 int mp_mul_2d (mp_int * a, int b, mp_int * c)
@@ -4650,25 +4919,27 @@
   mp_clamp (c);
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_mul_2d.c */
 
 /* Start: bn_mp_mul_d.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_MUL_D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* multiply by a digit */
 int
@@ -4726,25 +4997,27 @@
 
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_mul_d.c */
 
 /* Start: bn_mp_mulmod.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_MULMOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* d = a * b (mod c) */
 int
@@ -4765,25 +5038,27 @@
   mp_clear (&t);
   return res;
 }
+#endif
 
 /* End: bn_mp_mulmod.c */
 
 /* Start: bn_mp_n_root.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_N_ROOT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* find the n'th root of an integer 
  *
@@ -4895,25 +5170,27 @@
 __T1:mp_clear (&t1);
   return res;
 }
+#endif
 
 /* End: bn_mp_n_root.c */
 
 /* Start: bn_mp_neg.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_NEG_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* b = -a */
 int mp_neg (mp_int * a, mp_int * b)
@@ -4927,25 +5204,27 @@
   }
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_neg.c */
 
 /* Start: bn_mp_or.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_OR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* OR two ints together */
 int mp_or (mp_int * a, mp_int * b, mp_int * c)
@@ -4975,25 +5254,27 @@
   mp_clear (&t);
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_or.c */
 
 /* Start: bn_mp_prime_fermat.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_PRIME_FERMAT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* performs one Fermat test.
  * 
@@ -5035,25 +5316,27 @@
 __T:mp_clear (&t);
   return err;
 }
+#endif
 
 /* End: bn_mp_prime_fermat.c */
 
 /* Start: bn_mp_prime_is_divisible.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_PRIME_IS_DIVISIBLE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* determines if an integers is divisible by one 
  * of the first PRIME_SIZE primes or not
@@ -5083,30 +5366,32 @@
 
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_prime_is_divisible.c */
 
 /* Start: bn_mp_prime_is_prime.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_PRIME_IS_PRIME_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* performs a variable number of rounds of Miller-Rabin
  *
  * Probability of error after t rounds is no more than
- * (1/4)^t when 1 <= t <= PRIME_SIZE
+ * (1/4)^t when 1 <= t <= PRIME_SIZE
  *
  * Sets result to 1 if probably prime, 0 otherwise
  */
@@ -5164,25 +5449,27 @@
 __B:mp_clear (&b);
   return err;
 }
+#endif
 
 /* End: bn_mp_prime_is_prime.c */
 
 /* Start: bn_mp_prime_miller_rabin.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_PRIME_MILLER_RABIN_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* Miller-Rabin test of "a" to the base of "b" as described in 
  * HAC pp. 139 Algorithm 4.24
@@ -5265,25 +5552,27 @@
 __N1:mp_clear (&n1);
   return err;
 }
+#endif
 
 /* End: bn_mp_prime_miller_rabin.c */
 
 /* Start: bn_mp_prime_next_prime.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_PRIME_NEXT_PRIME_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* finds the next prime after the number "a" using "t" trials
  * of Miller-Rabin.
@@ -5433,25 +5722,79 @@
    return err;
 }
 
+#endif
 
 /* End: bn_mp_prime_next_prime.c */
 
+/* Start: bn_mp_prime_rabin_miller_trials.c */
+#include <tommath.h>
+#ifdef BN_MP_PRIME_RABIN_MILLER_TRIALS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
+
+
+static const struct {
+   int k, t;           /* k = bit size, t = number of Rabin-Miller trials returned for it */
+} sizes[] = {          /* rows are in ascending k; the lookup below depends on that order */
+{   128,    28 },
+{   256,    16 },
+{   384,    10 },
+{   512,     7 },
+{   640,     6 },
+{   768,     5 },
+{   896,     4 },
+{  1024,     4 }
+};
+
+/* returns # of RM trials required for a given bit size, looked up in sizes[]; unlisted sizes fall back as noted inline */
+int mp_prime_rabin_miller_trials(int size)
+{
+   int x;
+
+   for (x = 0; x < (int)(sizeof(sizes)/(sizeof(sizes[0]))); x++) {
+       if (sizes[x].k == size) {
+          return sizes[x].t;                               /* exact match on a listed size */
+       } else if (sizes[x].k > size) {
+          return (x == 0) ? sizes[0].t : sizes[x - 1].t;   /* below the first row: first row; otherwise the next smaller row */
+       }
+   }
+   return sizes[x-1].t + 1;   /* larger than every listed size: last row's count plus one */
+}
+
+
+#endif
+
+/* End: bn_mp_prime_rabin_miller_trials.c */
+
 /* Start: bn_mp_prime_random_ex.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_PRIME_RANDOM_EX_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* makes a truly random prime of a given size (bits),
  *
@@ -5531,6 +5874,9 @@
 
       /* is it prime? */
       if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY)           { goto error; }
+      if (res == MP_NO) {  
+         continue;
+      }
 
       if (flags & LTM_PRIME_SAFE) {
          /* see if (a-1)/2 is prime */
@@ -5555,25 +5901,27 @@
 }
 
 
+#endif
 
 /* End: bn_mp_prime_random_ex.c */
 
 /* Start: bn_mp_radix_size.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_RADIX_SIZE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* returns size of ASCII representation */
 int mp_radix_size (mp_int * a, int radix, int *size)
@@ -5624,47 +5972,51 @@
   return MP_OKAY;
 }
 
+#endif
 
 /* End: bn_mp_radix_size.c */
 
 /* Start: bn_mp_radix_smap.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_RADIX_SMAP_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* chars used in radix conversions: 64 symbols ordered 0-9, A-Z, a-z, then '+' and '/' */
 const char *mp_s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
+#endif
 
 /* End: bn_mp_radix_smap.c */
 
 /* Start: bn_mp_rand.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_RAND_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* makes a pseudo-random int of a given size */
 int
@@ -5699,25 +6051,27 @@
 
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_rand.c */
 
 /* Start: bn_mp_read_radix.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_READ_RADIX_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* read a string [ASCII] in a given radix */
 int mp_read_radix (mp_int * a, char *str, int radix)
@@ -5779,25 +6133,27 @@
   }
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_read_radix.c */
 
 /* Start: bn_mp_read_signed_bin.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_READ_SIGNED_BIN_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* read signed bin, big endian, first byte is 0==positive or 1==negative */
 int
@@ -5819,25 +6175,27 @@
 
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_read_signed_bin.c */
 
 /* Start: bn_mp_read_unsigned_bin.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_READ_UNSIGNED_BIN_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* reads an unsigned char array, assumes the msb is stored first [big endian] */
 int
@@ -5873,25 +6231,27 @@
   mp_clamp (a);
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_read_unsigned_bin.c */
 
 /* Start: bn_mp_reduce.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* reduces x mod m, assumes 0 < x < m**2, mu is 
  * precomputed via mp_reduce_setup.
@@ -5917,9 +6277,20 @@
       goto CLEANUP;
     }
   } else {
+#ifdef BN_S_MP_MUL_HIGH_DIGS_C
     if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
       goto CLEANUP;
     }
+#elif defined(BN_FAST_S_MP_MUL_HIGH_DIGS_C)
+    if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
+      goto CLEANUP;
+    }
+#else 
+    { 
+      res = MP_VAL;
+      goto CLEANUP;
+    }
+#endif
   }
 
   /* q3 = q2 / b**(k+1) */
@@ -5961,25 +6332,27 @@
 
   return res;
 }
+#endif
 
 /* End: bn_mp_reduce.c */
 
 /* Start: bn_mp_reduce_2k.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_2K_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* reduces a modulo n where n is of the form 2**p - d */
 int
@@ -6021,25 +6394,27 @@
    return res;
 }
 
+#endif
 
 /* End: bn_mp_reduce_2k.c */
 
 /* Start: bn_mp_reduce_2k_setup.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_2K_SETUP_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* determines the setup value */
 int 
@@ -6067,30 +6442,33 @@
    mp_clear(&tmp);
    return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_reduce_2k_setup.c */
 
 /* Start: bn_mp_reduce_is_2k.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_IS_2K_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* determines if mp_reduce_2k can be used */
 int mp_reduce_is_2k(mp_int *a)
 {
-   int ix, iy, iz, iw;
+   int ix, iy, iw;
+   mp_digit iz;
    
    if (a->used == 0) {
       return 0;
@@ -6107,7 +6485,7 @@
              return 0;
           }
           iz <<= 1;
-          if (iz > (int)MP_MASK) {
+          if (iz > (mp_digit)MP_MASK) {
              ++iw;
              iz = 1;
           }
@@ -6116,31 +6494,32 @@
    return 1;
 }
 
+#endif
 
 /* End: bn_mp_reduce_is_2k.c */
 
 /* Start: bn_mp_reduce_setup.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_SETUP_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* pre-calculate the value required for Barrett reduction
  * For a given modulus "b" it calculates the value required in "a"
  */
-int
-mp_reduce_setup (mp_int * a, mp_int * b)
+int mp_reduce_setup (mp_int * a, mp_int * b)
 {
   int     res;
   
@@ -6149,25 +6528,27 @@
   }
   return mp_div (a, b, a, NULL);
 }
+#endif
 
 /* End: bn_mp_reduce_setup.c */
 
 /* Start: bn_mp_rshd.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_RSHD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* shift right a certain amount of digits */
 void mp_rshd (mp_int * a, int b)
@@ -6219,25 +6600,27 @@
   /* remove excess digits */
   a->used -= b;
 }
+#endif
 
 /* End: bn_mp_rshd.c */
 
 /* Start: bn_mp_set.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_SET_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* set to a digit */
 void mp_set (mp_int * a, mp_digit b)
@@ -6246,25 +6629,27 @@
   a->dp[0] = b & MP_MASK;
   a->used  = (a->dp[0] != 0) ? 1 : 0;
 }
+#endif
 
 /* End: bn_mp_set.c */
 
 /* Start: bn_mp_set_int.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_SET_INT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* set a 32-bit const */
 int mp_set_int (mp_int * a, unsigned long b)
@@ -6292,25 +6677,27 @@
   mp_clamp (a);
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_set_int.c */
 
 /* Start: bn_mp_shrink.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_SHRINK_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* shrink a bignum */
 int mp_shrink (mp_int * a)
@@ -6325,50 +6712,54 @@
   }
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_shrink.c */
 
 /* Start: bn_mp_signed_bin_size.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_SIGNED_BIN_SIZE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* get the size for a signed equivalent: one more than mp_unsigned_bin_size(a) */
 int mp_signed_bin_size (mp_int * a)
 {
   return 1 + mp_unsigned_bin_size (a);
 }
+#endif
 
 /* End: bn_mp_signed_bin_size.c */
 
 /* Start: bn_mp_sqr.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_SQR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* computes b = a*a */
 int
@@ -6376,44 +6767,57 @@
 {
   int     res;
 
+#ifdef BN_MP_TOOM_SQR_C
   /* use Toom-Cook? */
   if (a->used >= TOOM_SQR_CUTOFF) {
     res = mp_toom_sqr(a, b);
   /* Karatsuba? */
-  } else if (a->used >= KARATSUBA_SQR_CUTOFF) {
+  } else 
+#endif
+#ifdef BN_MP_KARATSUBA_SQR_C
+if (a->used >= KARATSUBA_SQR_CUTOFF) {
     res = mp_karatsuba_sqr (a, b);
-  } else {
+  } else 
+#endif
+  {
+#ifdef BN_FAST_S_MP_SQR_C
     /* can we use the fast comba multiplier? */
     if ((a->used * 2 + 1) < MP_WARRAY && 
          a->used < 
          (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
       res = fast_s_mp_sqr (a, b);
-    } else {
+    } else
+#endif
+#ifdef BN_S_MP_SQR_C
       res = s_mp_sqr (a, b);
-    }
+#else
+      res = MP_VAL;
+#endif
   }
   b->sign = MP_ZPOS;
   return res;
 }
+#endif
 
 /* End: bn_mp_sqr.c */
 
 /* Start: bn_mp_sqrmod.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_SQRMOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* c = a * a (mod b) */
 int
@@ -6434,25 +6838,27 @@
   mp_clear (&t);
   return res;
 }
+#endif
 
 /* End: bn_mp_sqrmod.c */
 
 /* Start: bn_mp_sqrt.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_SQRT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* this function is less generic than mp_n_root, simpler and faster */
 int mp_sqrt(mp_int *arg, mp_int *ret) 
@@ -6513,25 +6919,27 @@
   return res;
 }
 
+#endif
 
 /* End: bn_mp_sqrt.c */
 
 /* Start: bn_mp_sub.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_SUB_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* high level subtraction (handles signs) */
 int
@@ -6570,25 +6978,27 @@
   return res;
 }
 
+#endif
 
 /* End: bn_mp_sub.c */
 
 /* Start: bn_mp_sub_d.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_SUB_D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* single digit subtraction */
 int
@@ -6657,25 +7067,27 @@
   return MP_OKAY;
 }
 
+#endif
 
 /* End: bn_mp_sub_d.c */
 
 /* Start: bn_mp_submod.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_SUBMOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* d = a - b (mod c) */
 int
@@ -6697,25 +7109,27 @@
   mp_clear (&t);
   return res;
 }
+#endif
 
 /* End: bn_mp_submod.c */
 
 /* Start: bn_mp_to_signed_bin.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_TO_SIGNED_BIN_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* store in signed [big endian] format */
 int
@@ -6729,25 +7143,27 @@
   b[0] = (unsigned char) ((a->sign == MP_ZPOS) ? 0 : 1);
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_to_signed_bin.c */
 
 /* Start: bn_mp_to_unsigned_bin.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_TO_UNSIGNED_BIN_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* store in unsigned [big endian] format */
 int
@@ -6776,27 +7192,34 @@
   mp_clear (&t);
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_to_unsigned_bin.c */
 
 /* Start: bn_mp_toom_mul.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* multiplication using the Toom-Cook 3-way algorithm */
+#include <tommath.h>
+#ifdef BN_MP_TOOM_MUL_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
+
+/* multiplication using the Toom-Cook 3-way algorithm 
+ *
+ * Much more complicated than Karatsuba but has a lower asymptotic running time of 
+ * O(N**1.464).  This algorithm is only particularly useful on VERY large
+ * inputs (we're talking 1000s of digits here...).
+*/
 int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
 {
     mp_int w0, w1, w2, w3, w4, tmp1, tmp2, a0, a1, a2, b0, b1, b2;
@@ -7052,25 +7475,27 @@
      return res;
 }     
      
+#endif
 
 /* End: bn_mp_toom_mul.c */
 
 /* Start: bn_mp_toom_sqr.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_TOOM_SQR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* squaring using Toom-Cook 3-way algorithm */
 int
@@ -7276,25 +7701,27 @@
      return res;
 }
 
+#endif
 
 /* End: bn_mp_toom_sqr.c */
 
 /* Start: bn_mp_toradix.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_TORADIX_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* stores a bignum as a ASCII string in a given radix (2..64) */
 int mp_toradix (mp_int * a, char *str, int radix)
@@ -7349,25 +7776,27 @@
   return MP_OKAY;
 }
 
+#endif
 
 /* End: bn_mp_toradix.c */
 
 /* Start: bn_mp_toradix_n.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_TORADIX_N_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* stores a bignum as a ASCII string in a given radix (2..64) 
  *
@@ -7436,25 +7865,27 @@
   return MP_OKAY;
 }
 
+#endif
 
 /* End: bn_mp_toradix_n.c */
 
 /* Start: bn_mp_unsigned_bin_size.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_UNSIGNED_BIN_SIZE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* get the size for an unsigned equivalent */
 int
@@ -7463,25 +7894,27 @@
   int     size = mp_count_bits (a);
   return (size / 8 + ((size & 7) != 0 ? 1 : 0));
 }
+#endif
 
 /* End: bn_mp_unsigned_bin_size.c */
 
 /* Start: bn_mp_xor.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_XOR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* XOR two ints together */
 int
@@ -7505,32 +7938,34 @@
   }
 
   for (ix = 0; ix < px; ix++) {
-    t.dp[ix] ^= x->dp[ix];
+     t.dp[ix] ^= x->dp[ix];
   }
   mp_clamp (&t);
   mp_exch (c, &t);
   mp_clear (&t);
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_mp_xor.c */
 
 /* Start: bn_mp_zero.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_MP_ZERO_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* set to zero */
 void
@@ -7540,80 +7975,27 @@
   a->used = 0;
   memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
 }
+#endif
 
 /* End: bn_mp_zero.c */
 
-/* Start: bn_prime_sizes_tab.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* this table gives the # of rabin miller trials for a prob of failure lower than 2^-96 */
-static const struct {
-   int k, t;
-} sizes[] = {
-{   128,    28 },
-{   256,    16 },
-{   384,    10 },
-{   512,     7 },
-{   640,     6 },
-{   768,     5 },
-{   896,     4 },
-{  1024,     4 },
-{  1152,     3 },
-{  1280,     3 },
-{  1408,     3 },
-{  1536,     3 },
-{  1664,     3 },
-{  1792,     2 } };
-
-/* returns # of RM trials required for a given bit size */
-int mp_prime_rabin_miller_trials(int size)
-{
-   int x;
-
-   for (x = 0; x < (int)(sizeof(sizes)/(sizeof(sizes[0]))); x++) {
-       if (sizes[x].k == size) {
-          return sizes[x].t;
-       } else if (sizes[x].k > size) {
-          return (x == 0) ? sizes[0].t : sizes[x - 1].t;
-       }
-   }
-   return 1;
-}
-
-
-
-/* End: bn_prime_sizes_tab.c */
-
 /* Start: bn_prime_tab.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_PRIME_TAB_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 const mp_digit __prime_tab[] = {
   0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
   0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
@@ -7654,25 +8036,27 @@
   0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
 #endif
 };
+#endif
 
 /* End: bn_prime_tab.c */
 
 /* Start: bn_reverse.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_REVERSE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* reverse an array, used for radix code */
 void
@@ -7691,25 +8075,27 @@
     --iy;
   }
 }
+#endif
 
 /* End: bn_reverse.c */
 
 /* Start: bn_s_mp_add.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_S_MP_ADD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* low level addition, based on HAC pp.594, Algorithm 14.7 */
 int
@@ -7798,25 +8184,27 @@
   mp_clamp (c);
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_s_mp_add.c */
 
 /* Start: bn_s_mp_exptmod.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_S_MP_EXPTMOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 #ifdef MP_LOW_MEM
    #define TAB_SIZE 32
@@ -8036,25 +8424,27 @@
   }
   return err;
 }
+#endif
 
 /* End: bn_s_mp_exptmod.c */
 
 /* Start: bn_s_mp_mul_digs.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_S_MP_MUL_DIGS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* multiplies |a| * |b| and only computes upto digs digits of result
  * HAC pp. 595, Algorithm 14.12  Modified so you can control how 
@@ -8125,25 +8515,27 @@
   mp_clear (&t);
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_s_mp_mul_digs.c */
 
 /* Start: bn_s_mp_mul_high_digs.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_S_MP_MUL_HIGH_DIGS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* multiplies |a| * |b| and does not compute the lower digs digits
  * [meant to get the higher part of the product]
@@ -8158,10 +8550,12 @@
   mp_digit tmpx, *tmpt, *tmpy;
 
   /* can we use the fast multiplier? */
+#ifdef BN_FAST_S_MP_MUL_HIGH_DIGS_C
   if (((a->used + b->used + 1) < MP_WARRAY)
       && MIN (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
     return fast_s_mp_mul_high_digs (a, b, c, digs);
   }
+#endif
 
   if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) {
     return res;
@@ -8202,25 +8596,27 @@
   mp_clear (&t);
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_s_mp_mul_high_digs.c */
 
 /* Start: bn_s_mp_sqr.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_S_MP_SQR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
 int
@@ -8285,25 +8681,27 @@
   mp_clear (&t);
   return MP_OKAY;
 }
+#endif
 
 /* End: bn_s_mp_sqr.c */
 
 /* Start: bn_s_mp_sub.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BN_S_MP_SUB_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
 int
@@ -8372,41 +8770,42 @@
   return MP_OKAY;
 }
 
+#endif
 
 /* End: bn_s_mp_sub.c */
 
 /* Start: bncore.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, [email protected], http://math.libtomcrypt.org
- */
-#include <tommath.h>
+#include <tommath.h>
+#ifdef BNCORE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, [email protected], http://math.libtomcrypt.org
+ */
 
 /* Known optimal configurations
 
  CPU                    /Compiler     /MUL CUTOFF/SQR CUTOFF
 -------------------------------------------------------------
- Intel P4               /GCC v3.2     /        70/       108
- AMD Athlon XP          /GCC v3.2     /       109/       127
-
+ Intel P4 Northwood     /GCC v3.4.1   /        88/       128/LTM 0.32 ;-)
+ 
 */
 
-/* configured for a AMD XP Thoroughbred core with etc/tune.c */
-int     KARATSUBA_MUL_CUTOFF = 109,      /* Min. number of digits before Karatsuba multiplication is used. */
-        KARATSUBA_SQR_CUTOFF = 127,      /* Min. number of digits before Karatsuba squaring is used. */
+int     KARATSUBA_MUL_CUTOFF = 88,      /* Min. number of digits before Karatsuba multiplication is used. */
+        KARATSUBA_SQR_CUTOFF = 128,     /* Min. number of digits before Karatsuba squaring is used. */
         
         TOOM_MUL_CUTOFF      = 350,      /* no optimal values of these are known yet so set em high */
         TOOM_SQR_CUTOFF      = 400; 
+#endif
 
 /* End: bncore.c */
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pretty.build	Sun Dec 19 15:57:19 2004 +0000
@@ -0,0 +1,66 @@
+#!/bin/perl -w
+#
+# Cute little builder for perl 
+# Total waste of development time...
+#
+# This will build all the object files and then the archive .a file
+# requires GCC, GNU make and a sense of humour.
+#
+# Tom St Denis
+use strict;
+
+my $count = 0;
+my $starttime = time;
+my $rate  = 0;
+print "Scanning for source files...\n";
+foreach my $filename (glob "*.c") {
+       ++$count;
+}
+print "Source files to build: $count\nBuilding...\n";
+my $i = 0;
+my $lines = 0;
+my $filesbuilt = 0;
+foreach my $filename (glob "*.c") {
+       printf("Building %3.2f%%, ", (++$i/$count)*100.0);
+       if ($i % 4 == 0) { print "/, "; }
+       if ($i % 4 == 1) { print "-, "; }
+       if ($i % 4 == 2) { print "\\, "; }
+       if ($i % 4 == 3) { print "|, "; }
+       if ($rate > 0) {
+           my $tleft = ($count - $i) / $rate;
+           my $tsec  = $tleft%60;
+           my $tmin  = ($tleft/60)%60;
+           my $thour = ($tleft/3600)%60;
+           printf("%2d:%02d:%02d left, ", $thour, $tmin, $tsec);
+       }
+       my $cnt = ($i/$count)*30.0;
+       my $x   = 0;
+       print "[";
+       for (; $x < $cnt; $x++) { print "#"; }
+       for (; $x < 30; $x++)   { print " "; }
+       print "]\r";
+       my $tmp = $filename;
+       $tmp =~ s/\.c/".o"/ge;
+       if (open(SRC, "<$tmp")) {
+          close SRC;
+       } else {
+          !system("make $tmp > /dev/null 2>/dev/null") or die "\nERROR: Failed to make $tmp!!!\n";
+          open( SRC, "<$filename" ) or die "Couldn't open $filename for reading: $!";
+          ++$lines while (<SRC>);
+          close SRC or die "Error closing $filename after reading: $!";
+          ++$filesbuilt;
+       }      
+
+       # update timer 
+       if (time != $starttime) {
+          my $delay = time - $starttime;
+          $rate = $i/$delay;
+       }
+}
+
+# finish building the library 
+printf("\nFinished building source (%d seconds, %3.2f files per second).\n", time - $starttime, $rate);
+print "Compiled approximately $filesbuilt files and $lines lines of code.\n";
+print "Doing final make (building archive...)\n";
+!system("make > /dev/null 2>/dev/null") or die "\nERROR: Failed to perform last make command!!!\n";
+print "done.\n";
\ No newline at end of file
--- a/tommath.h	Fri Dec 17 06:27:22 2004 +0000
+++ b/tommath.h	Sun Dec 19 15:57:19 2004 +0000
@@ -21,8 +21,7 @@
 #include <ctype.h>
 #include <limits.h>
 
-#define NO_LTM_TOOM 1
-#define NO_LTM_KARATSUBA 1
+#include <tommath_class.h>
 
 #undef MIN
 #define MIN(x,y) ((x)<(y)?(x):(y))
@@ -42,6 +41,14 @@
 
 #endif
 
+
+/* detect 64-bit mode if possible */
+#if defined(__x86_64__) 
+   #if !(defined(MP_64BIT) && defined(MP_16BIT) && defined(MP_8BIT))
+      #define MP_64BIT
+   #endif
+#endif
+
 /* some default configurations.
  *
  * A "mp_digit" must be able to hold DIGIT_BIT + 1 bits
@@ -63,7 +70,7 @@
    typedef signed long long   long64;
 #endif
 
-   typedef ulong64            mp_digit;
+   typedef unsigned long      mp_digit;
    typedef unsigned long      mp_word __attribute__ ((mode(TI)));
 
    #define DIGIT_BIT          60
@@ -156,7 +163,7 @@
 
 /* default precision */
 #ifndef MP_PREC
-   #ifdef MP_LOW_MEM
+   #ifndef MP_LOW_MEM
       #define MP_PREC                 64     /* default digits of precision */
    #else
       #define MP_PREC                 8      /* default digits of precision */
@@ -544,6 +551,7 @@
 int mp_karatsuba_sqr(mp_int *a, mp_int *b);
 int mp_toom_sqr(mp_int *a, mp_int *b);
 int fast_mp_invmod(mp_int *a, mp_int *b, mp_int *c);
+int mp_invmod_slow (mp_int * a, mp_int * b, mp_int * c);
 int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);
 int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y, int mode);
 int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tommath.out	Sun Dec 19 15:57:19 2004 +0000
@@ -0,0 +1,139 @@
+\BOOKMARK [0][-]{chapter.1}{Introduction}{}
+\BOOKMARK [1][-]{section.1.1}{Multiple Precision Arithmetic}{chapter.1}
+\BOOKMARK [2][-]{subsection.1.1.1}{What is Multiple Precision Arithmetic?}{section.1.1}
+\BOOKMARK [2][-]{subsection.1.1.2}{The Need for Multiple Precision Arithmetic}{section.1.1}
+\BOOKMARK [2][-]{subsection.1.1.3}{Benefits of Multiple Precision Arithmetic}{section.1.1}
+\BOOKMARK [1][-]{section.1.2}{Purpose of This Text}{chapter.1}
+\BOOKMARK [1][-]{section.1.3}{Discussion and Notation}{chapter.1}
+\BOOKMARK [2][-]{subsection.1.3.1}{Notation}{section.1.3}
+\BOOKMARK [2][-]{subsection.1.3.2}{Precision Notation}{section.1.3}
+\BOOKMARK [2][-]{subsection.1.3.3}{Algorithm Inputs and Outputs}{section.1.3}
+\BOOKMARK [2][-]{subsection.1.3.4}{Mathematical Expressions}{section.1.3}
+\BOOKMARK [2][-]{subsection.1.3.5}{Work Effort}{section.1.3}
+\BOOKMARK [1][-]{section.1.4}{Exercises}{chapter.1}
+\BOOKMARK [1][-]{section.1.5}{Introduction to LibTomMath}{chapter.1}
+\BOOKMARK [2][-]{subsection.1.5.1}{What is LibTomMath?}{section.1.5}
+\BOOKMARK [2][-]{subsection.1.5.2}{Goals of LibTomMath}{section.1.5}
+\BOOKMARK [1][-]{section.1.6}{Choice of LibTomMath}{chapter.1}
+\BOOKMARK [2][-]{subsection.1.6.1}{Code Base}{section.1.6}
+\BOOKMARK [2][-]{subsection.1.6.2}{API Simplicity}{section.1.6}
+\BOOKMARK [2][-]{subsection.1.6.3}{Optimizations}{section.1.6}
+\BOOKMARK [2][-]{subsection.1.6.4}{Portability and Stability}{section.1.6}
+\BOOKMARK [2][-]{subsection.1.6.5}{Choice}{section.1.6}
+\BOOKMARK [0][-]{chapter.2}{Getting Started}{}
+\BOOKMARK [1][-]{section.2.1}{Library Basics}{chapter.2}
+\BOOKMARK [1][-]{section.2.2}{What is a Multiple Precision Integer?}{chapter.2}
+\BOOKMARK [2][-]{subsection.2.2.1}{The mp\137int Structure}{section.2.2}
+\BOOKMARK [1][-]{section.2.3}{Argument Passing}{chapter.2}
+\BOOKMARK [1][-]{section.2.4}{Return Values}{chapter.2}
+\BOOKMARK [1][-]{section.2.5}{Initialization and Clearing}{chapter.2}
+\BOOKMARK [2][-]{subsection.2.5.1}{Initializing an mp\137int}{section.2.5}
+\BOOKMARK [2][-]{subsection.2.5.2}{Clearing an mp\137int}{section.2.5}
+\BOOKMARK [1][-]{section.2.6}{Maintenance Algorithms}{chapter.2}
+\BOOKMARK [2][-]{subsection.2.6.1}{Augmenting an mp\137int's Precision}{section.2.6}
+\BOOKMARK [2][-]{subsection.2.6.2}{Initializing Variable Precision mp\137ints}{section.2.6}
+\BOOKMARK [2][-]{subsection.2.6.3}{Multiple Integer Initializations and Clearings}{section.2.6}
+\BOOKMARK [2][-]{subsection.2.6.4}{Clamping Excess Digits}{section.2.6}
+\BOOKMARK [0][-]{chapter.3}{Basic Operations}{}
+\BOOKMARK [1][-]{section.3.1}{Introduction}{chapter.3}
+\BOOKMARK [1][-]{section.3.2}{Assigning Values to mp\137int Structures}{chapter.3}
+\BOOKMARK [2][-]{subsection.3.2.1}{Copying an mp\137int}{section.3.2}
+\BOOKMARK [2][-]{subsection.3.2.2}{Creating a Clone}{section.3.2}
+\BOOKMARK [1][-]{section.3.3}{Zeroing an Integer}{chapter.3}
+\BOOKMARK [1][-]{section.3.4}{Sign Manipulation}{chapter.3}
+\BOOKMARK [2][-]{subsection.3.4.1}{Absolute Value}{section.3.4}
+\BOOKMARK [2][-]{subsection.3.4.2}{Integer Negation}{section.3.4}
+\BOOKMARK [1][-]{section.3.5}{Small Constants}{chapter.3}
+\BOOKMARK [2][-]{subsection.3.5.1}{Setting Small Constants}{section.3.5}
+\BOOKMARK [2][-]{subsection.3.5.2}{Setting Large Constants}{section.3.5}
+\BOOKMARK [1][-]{section.3.6}{Comparisons}{chapter.3}
+\BOOKMARK [2][-]{subsection.3.6.1}{Unsigned Comparisions}{section.3.6}
+\BOOKMARK [2][-]{subsection.3.6.2}{Signed Comparisons}{section.3.6}
+\BOOKMARK [0][-]{chapter.4}{Basic Arithmetic}{}
+\BOOKMARK [1][-]{section.4.1}{Introduction}{chapter.4}
+\BOOKMARK [1][-]{section.4.2}{Addition and Subtraction}{chapter.4}
+\BOOKMARK [2][-]{subsection.4.2.1}{Low Level Addition}{section.4.2}
+\BOOKMARK [2][-]{subsection.4.2.2}{Low Level Subtraction}{section.4.2}
+\BOOKMARK [2][-]{subsection.4.2.3}{High Level Addition}{section.4.2}
+\BOOKMARK [2][-]{subsection.4.2.4}{High Level Subtraction}{section.4.2}
+\BOOKMARK [1][-]{section.4.3}{Bit and Digit Shifting}{chapter.4}
+\BOOKMARK [2][-]{subsection.4.3.1}{Multiplication by Two}{section.4.3}
+\BOOKMARK [2][-]{subsection.4.3.2}{Division by Two}{section.4.3}
+\BOOKMARK [1][-]{section.4.4}{Polynomial Basis Operations}{chapter.4}
+\BOOKMARK [2][-]{subsection.4.4.1}{Multiplication by x}{section.4.4}
+\BOOKMARK [2][-]{subsection.4.4.2}{Division by x}{section.4.4}
+\BOOKMARK [1][-]{section.4.5}{Powers of Two}{chapter.4}
+\BOOKMARK [2][-]{subsection.4.5.1}{Multiplication by Power of Two}{section.4.5}
+\BOOKMARK [2][-]{subsection.4.5.2}{Division by Power of Two}{section.4.5}
+\BOOKMARK [2][-]{subsection.4.5.3}{Remainder of Division by Power of Two}{section.4.5}
+\BOOKMARK [0][-]{chapter.5}{Multiplication and Squaring}{}
+\BOOKMARK [1][-]{section.5.1}{The Multipliers}{chapter.5}
+\BOOKMARK [1][-]{section.5.2}{Multiplication}{chapter.5}
+\BOOKMARK [2][-]{subsection.5.2.1}{The Baseline Multiplication}{section.5.2}
+\BOOKMARK [2][-]{subsection.5.2.2}{Faster Multiplication by the ``Comba'' Method}{section.5.2}
+\BOOKMARK [2][-]{subsection.5.2.3}{Polynomial Basis Multiplication}{section.5.2}
+\BOOKMARK [2][-]{subsection.5.2.4}{Karatsuba Multiplication}{section.5.2}
+\BOOKMARK [2][-]{subsection.5.2.5}{Toom-Cook 3-Way Multiplication}{section.5.2}
+\BOOKMARK [2][-]{subsection.5.2.6}{Signed Multiplication}{section.5.2}
+\BOOKMARK [1][-]{section.5.3}{Squaring}{chapter.5}
+\BOOKMARK [2][-]{subsection.5.3.1}{The Baseline Squaring Algorithm}{section.5.3}
+\BOOKMARK [2][-]{subsection.5.3.2}{Faster Squaring by the ``Comba'' Method}{section.5.3}
+\BOOKMARK [2][-]{subsection.5.3.3}{Polynomial Basis Squaring}{section.5.3}
+\BOOKMARK [2][-]{subsection.5.3.4}{Karatsuba Squaring}{section.5.3}
+\BOOKMARK [2][-]{subsection.5.3.5}{Toom-Cook Squaring}{section.5.3}
+\BOOKMARK [2][-]{subsection.5.3.6}{High Level Squaring}{section.5.3}
+\BOOKMARK [0][-]{chapter.6}{Modular Reduction}{}
+\BOOKMARK [1][-]{section.6.1}{Basics of Modular Reduction}{chapter.6}
+\BOOKMARK [1][-]{section.6.2}{The Barrett Reduction}{chapter.6}
+\BOOKMARK [2][-]{subsection.6.2.1}{Fixed Point Arithmetic}{section.6.2}
+\BOOKMARK [2][-]{subsection.6.2.2}{Choosing a Radix Point}{section.6.2}
+\BOOKMARK [2][-]{subsection.6.2.3}{Trimming the Quotient}{section.6.2}
+\BOOKMARK [2][-]{subsection.6.2.4}{Trimming the Residue}{section.6.2}
+\BOOKMARK [2][-]{subsection.6.2.5}{The Barrett Algorithm}{section.6.2}
+\BOOKMARK [2][-]{subsection.6.2.6}{The Barrett Setup Algorithm}{section.6.2}
+\BOOKMARK [1][-]{section.6.3}{The Montgomery Reduction}{chapter.6}
+\BOOKMARK [2][-]{subsection.6.3.1}{Digit Based Montgomery Reduction}{section.6.3}
+\BOOKMARK [2][-]{subsection.6.3.2}{Baseline Montgomery Reduction}{section.6.3}
+\BOOKMARK [2][-]{subsection.6.3.3}{Faster ``Comba'' Montgomery Reduction}{section.6.3}
+\BOOKMARK [2][-]{subsection.6.3.4}{Montgomery Setup}{section.6.3}
+\BOOKMARK [1][-]{section.6.4}{The Diminished Radix Algorithm}{chapter.6}
+\BOOKMARK [2][-]{subsection.6.4.1}{Choice of Moduli}{section.6.4}
+\BOOKMARK [2][-]{subsection.6.4.2}{Choice of k}{section.6.4}
+\BOOKMARK [2][-]{subsection.6.4.3}{Restricted Diminished Radix Reduction}{section.6.4}
+\BOOKMARK [2][-]{subsection.6.4.4}{Unrestricted Diminished Radix Reduction}{section.6.4}
+\BOOKMARK [1][-]{section.6.5}{Algorithm Comparison}{chapter.6}
+\BOOKMARK [0][-]{chapter.7}{Exponentiation}{}
+\BOOKMARK [1][-]{section.7.1}{Exponentiation Basics}{chapter.7}
+\BOOKMARK [2][-]{subsection.7.1.1}{Single Digit Exponentiation}{section.7.1}
+\BOOKMARK [1][-]{section.7.2}{k-ary Exponentiation}{chapter.7}
+\BOOKMARK [2][-]{subsection.7.2.1}{Optimal Values of k}{section.7.2}
+\BOOKMARK [2][-]{subsection.7.2.2}{Sliding-Window Exponentiation}{section.7.2}
+\BOOKMARK [1][-]{section.7.3}{Modular Exponentiation}{chapter.7}
+\BOOKMARK [2][-]{subsection.7.3.1}{Barrett Modular Exponentiation}{section.7.3}
+\BOOKMARK [1][-]{section.7.4}{Quick Power of Two}{chapter.7}
+\BOOKMARK [0][-]{chapter.8}{Higher Level Algorithms}{}
+\BOOKMARK [1][-]{section.8.1}{Integer Division with Remainder}{chapter.8}
+\BOOKMARK [2][-]{subsection.8.1.1}{Quotient Estimation}{section.8.1}
+\BOOKMARK [2][-]{subsection.8.1.2}{Normalized Integers}{section.8.1}
+\BOOKMARK [2][-]{subsection.8.1.3}{Radix- Division with Remainder}{section.8.1}
+\BOOKMARK [1][-]{section.8.2}{Single Digit Helpers}{chapter.8}
+\BOOKMARK [2][-]{subsection.8.2.1}{Single Digit Addition and Subtraction}{section.8.2}
+\BOOKMARK [2][-]{subsection.8.2.2}{Single Digit Multiplication}{section.8.2}
+\BOOKMARK [2][-]{subsection.8.2.3}{Single Digit Division}{section.8.2}
+\BOOKMARK [2][-]{subsection.8.2.4}{Single Digit Root Extraction}{section.8.2}
+\BOOKMARK [1][-]{section.8.3}{Random Number Generation}{chapter.8}
+\BOOKMARK [1][-]{section.8.4}{Formatted Representations}{chapter.8}
+\BOOKMARK [2][-]{subsection.8.4.1}{Reading Radix-n Input}{section.8.4}
+\BOOKMARK [2][-]{subsection.8.4.2}{Generating Radix-n Output}{section.8.4}
+\BOOKMARK [0][-]{chapter.9}{Number Theoretic Algorithms}{}
+\BOOKMARK [1][-]{section.9.1}{Greatest Common Divisor}{chapter.9}
+\BOOKMARK [2][-]{subsection.9.1.1}{Complete Greatest Common Divisor}{section.9.1}
+\BOOKMARK [1][-]{section.9.2}{Least Common Multiple}{chapter.9}
+\BOOKMARK [1][-]{section.9.3}{Jacobi Symbol Computation}{chapter.9}
+\BOOKMARK [2][-]{subsection.9.3.1}{Jacobi Symbol}{section.9.3}
+\BOOKMARK [1][-]{section.9.4}{Modular Inverse}{chapter.9}
+\BOOKMARK [2][-]{subsection.9.4.1}{General Case}{section.9.4}
+\BOOKMARK [1][-]{section.9.5}{Primality Tests}{chapter.9}
+\BOOKMARK [2][-]{subsection.9.5.1}{Trial Division}{section.9.5}
+\BOOKMARK [2][-]{subsection.9.5.2}{The Fermat Test}{section.9.5}
+\BOOKMARK [2][-]{subsection.9.5.3}{The Miller-Rabin Test}{section.9.5}
Binary file tommath.pdf has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tommath.src	Sun Dec 19 15:57:19 2004 +0000
@@ -0,0 +1,6314 @@
+\documentclass[b5paper]{book}
+\usepackage{hyperref}
+\usepackage{makeidx}
+\usepackage{amssymb}
+\usepackage{color}
+\usepackage{alltt}
+\usepackage{graphicx}
+\usepackage{layout}
+\def\union{\cup}
+\def\intersect{\cap}
+\def\getsrandom{\stackrel{\rm R}{\gets}}
+\def\cross{\times}
+\def\cat{\hspace{0.5em} \| \hspace{0.5em}}
+\def\catn{$\|$}
+\def\divides{\hspace{0.3em} | \hspace{0.3em}}
+\def\nequiv{\not\equiv}
+\def\approx{\raisebox{0.2ex}{\mbox{\small $\sim$}}}
+\def\lcm{{\rm lcm}}
+\def\gcd{{\rm gcd}}
+\def\log{{\rm log}}
+\def\ord{{\rm ord}}
+\def\abs{{\mathit abs}}
+\def\rep{{\mathit rep}}
+\def\mod{{\mathit\ mod\ }}
+\renewcommand{\pmod}[1]{\ ({\rm mod\ }{#1})}
+\newcommand{\floor}[1]{\left\lfloor{#1}\right\rfloor}
+\newcommand{\ceil}[1]{\left\lceil{#1}\right\rceil}
+\def\Or{{\rm\ or\ }}
+\def\And{{\rm\ and\ }}
+\def\iff{\hspace{1em}\Longleftrightarrow\hspace{1em}}
+\def\implies{\Rightarrow}
+\def\undefined{{\rm ``undefined"}}
+\def\Proof{\vspace{1ex}\noindent {\bf Proof:}\hspace{1em}}
+\let\oldphi\phi
+\def\phi{\varphi}
+\def\Pr{{\rm Pr}}
+\newcommand{\str}[1]{{\mathbf{#1}}}
+\def\F{{\mathbb F}}
+\def\N{{\mathbb N}}
+\def\Z{{\mathbb Z}}
+\def\R{{\mathbb R}}
+\def\C{{\mathbb C}}
+\def\Q{{\mathbb Q}}
+\definecolor{DGray}{gray}{0.5}
+\newcommand{\emailaddr}[1]{\mbox{$<${#1}$>$}}
+\def\twiddle{\raisebox{0.3ex}{\mbox{\tiny $\sim$}}}
+\def\gap{\vspace{0.5ex}}
+\makeindex
+\begin{document}
+\frontmatter
+\pagestyle{empty}
+\title{Implementing Multiple Precision Arithmetic \\ ~ \\ Draft Edition }
+\author{\mbox{
+%\begin{small}
+\begin{tabular}{c}
+Tom St Denis \\
+Algonquin College \\
+\\
+Mads Rasmussen \\
+Open Communications Security \\
+\\
+Greg Rose \\
+QUALCOMM Australia \\
+\end{tabular}
+%\end{small}
+}
+}
+\maketitle
+This text has been placed in the public domain.  This text corresponds to the v0.30 release of the 
+LibTomMath project.
+
+\begin{alltt}
+Tom St Denis
+111 Banning Rd
+Ottawa, Ontario
+K2L 1C3
+Canada
+
+Phone: 1-613-836-3160
+Email: [email protected]
+\end{alltt}
+
+This text is formatted to the international B5 paper size of 176mm wide by 250mm tall using the \LaTeX{} 
+{\em book} macro package and the Perl {\em booker} package.
+
+\tableofcontents
+\listoffigures
+\chapter*{Prefaces to the Draft Edition}
+I started this text in April 2003 to complement my LibTomMath library.  That is, explain how to implement the functions
+contained in LibTomMath.  The goal is to have a textbook that any Computer Science student can use when implementing their
+own multiple precision arithmetic.  The plan I wanted to follow was to flesh out all the
+ideas and concepts I had floating around in my head and then work on it afterwards refining a little bit at a time.  Chance
+would have it that I ended up with my summer off from Algonquin College and I was given four months solid to work on the
+text.  
+
+Choosing to not waste any time I dove right into the project even before my spring semester was finished.  I wrote a bit
+off and on at first.  The moment my exams were finished I jumped into long 12 to 16 hour days.  The result after only
+a couple of months was a ten chapter, three hundred page draft that I quickly had distributed to anyone who wanted
+to read it.  I had Jean-Luc Cooke print copies for me and I brought them to Crypto'03 in Santa Barbara.  So far I have
+managed to grab a certain level of attention; having people from around the world ask me for copies of the text was certainly
+rewarding.
+
+Now we are past December 2003.  By this time I had pictured that I would have at least finished my second draft of the text.  
+Currently I am far off from this goal.  I've done partial re-writes of chapters one, two and three but they are not even
+finished yet.  I haven't given up on the project, only had some setbacks.  First O'Reilly declined to publish the text, then
+Addison-Wesley did too, and Greg has tried another publisher whose name I don't know.  However, at this point I want to focus my energy
+on finishing the book, not securing a contract.
+
+So why am I writing this text?  It seems like a lot of work right?  Most certainly it is a lot of work writing a textbook.  
+Even the simplest introductory material has to be lined with references and figures.  A lot of the text has to be re-written
+from point form to prose form to ensure an easier read.  Why am I doing all this work for free then?  Simple. My philosophy
+is quite simply ``Open Source.  Open Academia.  Open Minds'' which means that to achieve a goal of open minds, that is,
+people willing to accept new ideas and explore the unknown you have to make available material they can access freely 
+without hindrance.  
+
+I've been writing free software since I was about sixteen but only recently have I hit upon software that people have come
+to depend upon.  I started LibTomCrypt in December 2001 and now several major companies use it as integral portions of their
+software.  Several educational institutions use it as a matter of course and many freelance developers use it as
+part of their projects.  To further my contributions I started the LibTomMath project in December 2002 aimed at providing
+multiple precision arithmetic routines that students could learn from.  That is, to write routines that are not only easy
+to understand and follow but provide quite impressive performance considering they are all in standard portable ISO C.  
+
+The second leg of my philosophy is ``Open Academia'' which is where this textbook comes in.  In the end, when all is
+said and done the text will be useable by educational institutions as a reference on multiple precision arithmetic.  
+
+At this time I feel I should share a little information about myself.  The most common question I was asked at 
+Crypto'03, perhaps just out of professional courtesy, was which school I either taught at or attended.  The unfortunate
+truth is that I neither teach at nor attend a school of academic reputation.  I'm currently at Algonquin College which 
+is what I'd like to call a ``somewhat academic but mostly vocational'' college.  In other words, job training.
+
+I'm a 21 year old computer science student mostly self-taught in the areas I am aware of (which includes a half-dozen
+computer science fields, a few fields of mathematics and some English).  I look forward to teaching someday but I am
+still far off from that goal.  
+
+Now it would be improper for me to not introduce the rest of the text's co-authors.  While they are only contributing 
+corrections and editorial feedback their support has been tremendously helpful in presenting the concepts laid out
+in the text so far.  Greg has always been there for me.  He has tracked my LibTom projects since their inception and even
+sent cheques to help pay tuition from time to time.  His background has provided a wonderful source to bounce ideas off
+of and improve the quality of my writing.  Mads is another fellow who has just ``been there''.  I don't even recall what
+his interest in the LibTom projects is but I'm definitely glad he has been around.  His ability to catch logical errors
+in my written English has saved me on several occasions to say the least.
+
+What to expect next?  Well this is still a rough draft.  I've only had the chance to update a few chapters.  However, I've
+been getting the feeling that people are starting to use my text and I owe them some updated material.  My current tentative
+plan is to edit one chapter every two weeks starting January 4th.  It seems insane but my lower course load at college
+should provide ample time.  By Crypto'04 I plan to have a 2nd draft of the text polished and ready to hand out to as many
+people as will take it.
+
+\begin{flushright} Tom St Denis \end{flushright}
+
+\newpage
+I found the opportunity to work with Tom appealing for several reasons, not only could I broaden my own horizons, but also 
+contribute to educate others facing the problem of having to handle big number mathematical calculations.
+
+This book is Tom's child and he has been caring and fostering the project ever since the beginning with a clear mind of 
+how he wanted the project to turn out. I have helped by proofreading the text and we have had several discussions about 
+the layout and language used.
+
+I hold a master's degree in cryptography from the University of Southern Denmark and have always been interested in the 
+practical aspects of cryptography. 
+
+Having worked in the security consultancy business for several years in S\~{a}o Paulo, Brazil, I have been in touch with a 
+great deal of work in which multiple precision mathematics was needed. Understanding the possibilities for speeding up 
+multiple precision calculations is often very important since we deal with outdated machine architecture where modular 
+reductions, for example, become painfully slow.
+
+This text is for people who stop and wonder when examining algorithms such as RSA for the first time and ask 
+themselves, ``You tell me this is only secure for large numbers, fine; but how do you implement these numbers?''
+
+\begin{flushright}
+Mads Rasmussen
+
+S\~{a}o Paulo - SP
+
+Brazil
+\end{flushright}
+
+\newpage
+It's all because I broke my leg. That just happened to be at about the same time that Tom asked for someone to review the section of the book about 
+Karatsuba multiplication. I was laid up, alone and immobile, and thought ``Why not?'' I vaguely knew what Karatsuba multiplication was, but not 
+really, so I thought I could help, learn, and stop myself from watching daytime cable TV, all at once.
+
+At the time of writing this, I've still not met Tom or Mads in meatspace. I've been following Tom's progress since his first splash on the 
+sci.crypt Usenet news group. I watched him go from a clueless newbie, to the cryptographic equivalent of a reformed smoker, to a real
+contributor to the field, over a period of about two years. I've been impressed with his obvious intelligence, and astounded by his productivity. 
+Of course, he's young enough to be my own child, so he doesn't have my problems with staying awake.
+
+When I reviewed that single section of the book, in its very earliest form, I was very pleasantly surprised. So I decided to collaborate more fully, 
+and at least review all of it, and perhaps write some bits too. There's still a long way to go with it, and I have watched a number of close 
+friends go through the mill of publication, so I think that the way to go is longer than Tom thinks it is. Nevertheless, it's a good effort, 
+and I'm pleased to be involved with it.
+
+\begin{flushright}
+Greg Rose, Sydney, Australia, June 2003. 
+\end{flushright}
+
+\mainmatter
+\pagestyle{headings}
+\chapter{Introduction}
+\section{Multiple Precision Arithmetic}
+
+\subsection{What is Multiple Precision Arithmetic?}
+When we think of long-hand arithmetic such as addition or multiplication we rarely consider the fact that we instinctively
+raise or lower the precision of the numbers we are dealing with.  For example, in decimal we can almost immediately 
+reason that $7$ times $6$ is $42$.  However, $42$ has two digits of precision as opposed to the one digit we started with.  
+A further multiplication by, say, $3$ results in a larger precision result, $126$.  In these few examples we have multiple 
+precisions for the numbers we are working with.  Despite the various levels of precision a single subset\footnote{With the occasional optimization.}
+ of algorithms can be designed to accommodate them.  
+
+By way of comparison a fixed or single precision operation would lose precision on various operations.  For example, in
+the decimal system with fixed precision $6 \cdot 7 = 2$.
+
+Essentially at the heart of computer based multiple precision arithmetic are the same long-hand algorithms taught in
+schools to manually add, subtract, multiply and divide.  
+
+\subsection{The Need for Multiple Precision Arithmetic}
+The most prevalent need for multiple precision arithmetic, often referred to as ``bignum'' math, is within the implementation
+of public-key cryptography algorithms.   Algorithms such as RSA \cite{RSAREF} and Diffie-Hellman \cite{DHREF} require 
+integers of significant magnitude to resist known cryptanalytic attacks.  For example, at the time of this writing a 
+typical RSA modulus would be at least greater than $10^{309}$.  However, modern programming languages such as ISO C \cite{ISOC} and 
+Java \cite{JAVA} only provide intrinsic support for integers which are relatively small and single precision.
+
+\begin{figure}[!here]
+\begin{center}
+\begin{tabular}{|r|c|}
+\hline \textbf{Data Type} & \textbf{Range} \\
+\hline char  & $-128 \ldots 127$ \\
+\hline short & $-32768 \ldots 32767$ \\
+\hline long  & $-2147483648 \ldots 2147483647$ \\
+\hline long long & $-9223372036854775808 \ldots 9223372036854775807$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Typical Data Types for the C Programming Language}
+\label{fig:ISOC}
+\end{figure}
+
+The largest data type guaranteed to be provided by the ISO C programming 
+language\footnote{As per the ISO C standard.  However, each compiler vendor is allowed to augment the precision as they 
+see fit.}  can only represent values up to $10^{19}$ as shown in figure \ref{fig:ISOC}. On its own the C language is 
+insufficient to accommodate the magnitude required for the problem at hand.  An RSA modulus of magnitude $10^{19}$ could be 
+trivially factored\footnote{A Pollard-Rho factoring would take only $2^{16}$ time.} on the average desktop computer, 
+rendering any protocol based on the algorithm insecure.  Multiple precision algorithms solve this very problem by 
+extending the range of representable integers while using single precision data types.
+
+Most advancements in fast multiple precision arithmetic stem from the need for faster and more efficient cryptographic 
+primitives.  Faster modular reduction and exponentiation algorithms such as Barrett's algorithm, which have appeared in 
+various cryptographic journals, can render algorithms such as RSA and Diffie-Hellman more efficient.  In fact, several 
+major companies such as RSA Security, Certicom and Entrust have built entire product lines on the implementation and 
+deployment of efficient algorithms.
+
+However, cryptography is not the only field of study that can benefit from fast multiple precision integer routines.  
+Another auxiliary use of multiple precision integers is high precision floating point data types.  
+The basic IEEE \cite{IEEE} standard floating point type is made up of an integer mantissa $q$, an exponent $e$ and a sign bit $s$.  
+Numbers are given in the form $n = q \cdot b^e \cdot (-1)^s$ where $b = 2$ is the most common base for IEEE.  Since IEEE 
+floating point is meant to be implemented in hardware the precision of the mantissa is often fairly small 
+(\textit{23, 52 and 64 bits}).  The mantissa is merely an integer and a multiple precision integer could be used to create
+a mantissa of much larger precision than hardware alone can efficiently support.  This approach could be useful where 
+scientific applications must minimize the total output error over long calculations.
+
+Yet another use for large integers is within arithmetic on polynomials of large characteristic (i.e. $GF(p)[x]$ for large $p$).
+In fact the library discussed within this text has already been used to form a polynomial basis library\footnote{See \url{http://poly.libtomcrypt.org} for more details.}.
+
+\subsection{Benefits of Multiple Precision Arithmetic}
+\index{precision}
+The benefit of multiple precision representations over single or fixed precision representations is that 
+no precision is lost while representing the result of an operation which requires excess precision.  For example, 
+the product of two $n$-bit integers requires at least $2n$ bits of precision to be represented faithfully.  A multiple 
+precision algorithm would augment the precision of the destination to accommodate the result while a single precision system 
+would truncate excess bits to maintain a fixed level of precision.
+
+It is possible to implement algorithms which require large integers with fixed precision algorithms.  For example, elliptic
+curve cryptography (\textit{ECC}) is often implemented on smartcards by fixing the precision of the integers to the maximum 
+size the system will ever need.  Such an approach can lead to vastly simpler algorithms which can accommodate the 
+integers required even if the host platform cannot natively accommodate them\footnote{For example, the average smartcard 
+processor has an 8 bit accumulator.}.  However, as efficient as such an approach may be, the resulting source code is not
+normally very flexible.  It cannot, at runtime, accommodate inputs of higher magnitude than the designer anticipated.
+
+Multiple precision algorithms have the most overhead of any style of arithmetic.  For the most part the 
+overhead can be kept to a minimum with careful planning, but overall, it is not well suited for most memory starved
+platforms.  However, multiple precision algorithms do offer the most flexibility in terms of the magnitude of the 
+inputs.  That is, the same algorithms based on multiple precision integers can accommodate any reasonable size input 
+without the designer's explicit forethought.  This leads to lower cost of ownership for the code as it only has to 
+be written and tested once.
+
+\section{Purpose of This Text}
+The purpose of this text is to instruct the reader regarding how to implement efficient multiple precision algorithms.  
+That is, to explain not only a limited subset of the core theory behind the algorithms but also the various ``house keeping'' 
+elements that are neglected by authors of other texts on the subject.  Several well-renowned texts \cite{TAOCPV2,HAC} 
+give considerably detailed explanations of the theoretical aspects of algorithms and often very little information 
+regarding the practical implementation aspects.  
+
+In most cases how an algorithm is explained and how it is actually implemented are two very different concepts.  For 
+example, the Handbook of Applied Cryptography (\textit{HAC}), algorithm 14.7 on page 594, gives a relatively simple 
+algorithm for performing multiple precision integer addition.  However, the description lacks any discussion concerning 
+the fact that the two integer inputs may be of differing magnitudes.  As a result the implementation is not as simple
+as the text would lead people to believe.  Similarly the division routine (\textit{algorithm 14.20, pp. 598}) does not 
+discuss how to handle sign or handle the dividend's decreasing magnitude in the main loop (\textit{step \#3}).
+
+Both texts also do not discuss several key optimal algorithms required such as ``Comba'' and Karatsuba multipliers 
+and fast modular inversion, which we consider practical oversights.  These optimal algorithms are vital to achieve 
+any form of useful performance in non-trivial applications.  
+
+To solve this problem the focus of this text is on the practical aspects of implementing a multiple precision integer
+package.  As a case study the ``LibTomMath''\footnote{Available at \url{http://math.libtomcrypt.org}} package is used 
+to demonstrate algorithms with real implementations\footnote{In the ISO C programming language.} that have been field 
+tested and work very well.  The LibTomMath library is freely available on the Internet for all uses and this text 
+discusses a very large portion of the inner workings of the library.
+
+The algorithms that are presented will always include at least one ``pseudo-code'' description followed 
+by the actual C source code that implements the algorithm.  The pseudo-code can be used to implement the same 
+algorithm in other programming languages as the reader sees fit.  
+
+This text shall also serve as a walkthrough of the creation of multiple precision algorithms from scratch, showing
+the reader how the algorithms fit together as well as where to start on various tasks.  
+
+\section{Discussion and Notation}
+\subsection{Notation}
+A multiple precision integer of $n$-digits shall be denoted as $x = (x_{n-1}, \ldots, x_1, x_0)_{ \beta }$ and represent
+the integer $x \equiv \sum_{i=0}^{n-1} x_i\beta^i$.  The elements of the array $x$ are said to be the radix $\beta$ digits 
+of the integer.  For example, $x = (1,2,3)_{10}$ would represent the integer 
+$1\cdot 10^2 + 2\cdot10^1 + 3\cdot10^0 = 123$.  
+
+\index{mp\_int}
+The term ``mp\_int'' shall refer to a composite structure which contains the digits of the integer it represents, as well 
+as auxiliary data required to manipulate the data.  These additional members are discussed further in section 
+\ref{sec:MPINT}.  For the purposes of this text a ``multiple precision integer'' and an ``mp\_int'' are assumed to be 
+synonymous.  When an algorithm is specified to accept an mp\_int variable it is assumed the various auxiliary data members 
+are present as well.  An expression of the type \textit{variablename.item} implies that it should evaluate to the 
+member named ``item'' of the variable.  For example, a string of characters may have a member ``length'' which would 
+evaluate to the number of characters in the string.  If the string $a$ equals ``hello'' then it follows that 
+$a.length = 5$.  
+
+For certain discussions more generic algorithms are presented to help the reader understand the final algorithm used
+to solve a given problem.  When an algorithm is described as accepting an integer input it is assumed the input is 
+a plain integer with no additional multiple-precision members.  That is, algorithms that use integers as opposed to 
+mp\_ints as inputs do not concern themselves with the housekeeping operations required such as memory management.  These 
+algorithms will be used to establish the relevant theory which will subsequently be used to describe a multiple
+precision algorithm to solve the same problem.  
+
+\subsection{Precision Notation}
+The variable $\beta$ represents the radix of a single digit of a multiple precision integer and 
+must be of the form $q^p$ for $q, p \in \Z^+$.  A single precision variable must be able to represent integers in 
+the range $0 \le x < q \beta$ while a double precision variable must be able to represent integers in the range 
+$0 \le x < q \beta^2$.  The extra radix-$q$ factor allows additions and subtractions to proceed without truncation of the 
+carry.  Since all modern computers are binary, it is assumed that $q$ is two.
+
+\index{mp\_digit} \index{mp\_word}
+Within the source code that will be presented for each algorithm, the data type \textbf{mp\_digit} will represent 
+a single precision integer type, while the data type \textbf{mp\_word} will represent a double precision integer type.  In 
+several algorithms (notably the Comba routines) temporary results will be stored in arrays of double precision mp\_words.  
+For the purposes of this text $x_j$ will refer to the $j$'th digit of a single precision array and $\hat x_j$ will refer to 
+the $j$'th digit of a double precision array.  Whenever an expression is to be assigned to a double precision
+variable it is assumed that all single precision variables are promoted to double precision during the evaluation.  
+Expressions that are assigned to a single precision variable are truncated to fit within the precision of a single
+precision data type.
+
+For example, if $\beta = 10^2$ a single precision data type may represent a value in the 
+range $0 \le x < 10^3$, while a double precision data type may represent a value in the range $0 \le x < 10^5$.  Let
+$a = 23$ and $b = 49$ represent two single precision variables.  The single precision product shall be written
+as $c \leftarrow a \cdot b$ while the double precision product shall be written as $\hat c \leftarrow a \cdot b$.
+In this particular case, $\hat c = 1127$ and $c = 127$.  The most significant digit of the product would not fit 
+in a single precision data type and as a result $c \ne \hat c$.  
+
+\subsection{Algorithm Inputs and Outputs}
+Within the algorithm descriptions all variables are assumed to be scalars of either single or double precision
+as indicated.  The only exception to this rule is when variables have been indicated to be of type mp\_int.  This 
+distinction is important as scalars are often used as array indices and various other counters.  
+
+\subsection{Mathematical Expressions}
+The $\lfloor \mbox{ } \rfloor$ brackets imply an expression truncated to an integer not greater than the expression 
+itself.  For example, $\lfloor 5.7 \rfloor = 5$.  Similarly the $\lceil \mbox{ } \rceil$ brackets imply an expression
+rounded to an integer not less than the expression itself.  For example, $\lceil 5.1 \rceil = 6$.  Typically when 
+the $/$ division symbol is used the intention is to perform an integer division with truncation.  For example, 
+$5/2 = 2$ which will often be written as $\lfloor 5/2 \rfloor = 2$ for clarity.  When an expression is written as a 
+fraction a real value division is implied, for example ${5 \over 2} = 2.5$.  
+
+The norm of a multiple precision integer, for example $\vert \vert x \vert \vert$, will be used to represent the number of digits in the representation
+of the integer.  For example, $\vert \vert 123 \vert \vert = 3$ and $\vert \vert 79452 \vert \vert = 5$.  
+
+\subsection{Work Effort}
+\index{big-Oh}
+To measure the efficiency of the specified algorithms, a modified big-Oh notation is used.  In this system all 
+single precision operations are considered to have the same cost\footnote{Except where explicitly noted.}.  
+That is, a single precision addition, multiplication and division are assumed to take the same time to 
+complete.  While this is generally not true in practice, it will simplify the discussions considerably.
+
+Some algorithms have slight advantages over others which is why some constants will not be removed in 
+the notation.  For example, a normal baseline multiplication (section \ref{sec:basemult}) requires $O(n^2)$ work while a 
+baseline squaring (section \ref{sec:basesquare}) requires $O({{n^2 + n}\over 2})$ work.  In standard big-Oh notation these 
+would both be said to be equivalent to $O(n^2)$.  However, 
+in the context of this text this is not the case as the magnitude of the inputs will typically be rather small.  As a 
+result small constant factors in the work effort will make an observable difference in algorithm efficiency.
+
+All of the algorithms presented in this text have a polynomial time work level.  That is, of the form 
+$O(n^k)$ for $n, k \in \Z^{+}$.  This will help make useful comparisons in terms of the speed of the algorithms and how 
+various optimizations will help pay off in the long run.
+
+\section{Exercises}
+Within the more advanced chapters a section will be set aside to give the reader some challenging exercises related to
+the discussion at hand.  These exercises are not designed to be prize winning problems, but instead to be thought 
+provoking.  Wherever possible the problems are forward minded, stating problems that will be answered in subsequent 
+chapters.  The reader is encouraged to finish the exercises as they appear to get a better understanding of the 
+subject material.  
+
+That being said, the problems are designed to affirm knowledge of a particular subject matter.  Students in particular
+are encouraged to verify they can answer the problems correctly before moving on.
+
+Similar to the exercises of \cite[pp. ix]{TAOCPV2} these exercises are given a scoring system based on the difficulty of
+the problem.  However, unlike \cite{TAOCPV2} the problems do not get nearly as hard.  The scoring of these 
+exercises ranges from one (the easiest) to five (the hardest).  The following table summarizes the 
+scoring system used.
+
+\begin{figure}[here]
+\begin{center}
+\begin{small}
+\begin{tabular}{|c|l|}
+\hline $\left [ 1 \right ]$ & An easy problem that should only take the reader a matter of \\
+                            & minutes to solve.  Usually does not involve much computer time \\
+                            & to solve. \\
+\hline $\left [ 2 \right ]$ & An easy problem that involves a marginal amount of computer \\
+                     & time usage.  Usually requires a program to be written to \\
+                     & solve the problem. \\
+\hline $\left [ 3 \right ]$ & A moderately hard problem that requires a non-trivial amount \\
+                     & of work.  Usually involves trivial research and development of \\
+                     & new theory from the perspective of a student. \\
+\hline $\left [ 4 \right ]$ & A moderately hard problem that involves a non-trivial amount \\
+                     & of work and research, the solution to which will demonstrate \\
+                     & a higher mastery of the subject matter. \\
+\hline $\left [ 5 \right ]$ & A hard problem that involves concepts that are difficult for a \\
+                     & novice to solve.  Solutions to these problems will demonstrate a \\
+                     & complete mastery of the given subject. \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Exercise Scoring System}
+\end{figure}
+
+Problems at the first level are meant to be simple questions that the reader can answer quickly without programming a solution or
+devising new theory.  These problems are quick tests to see if the material is understood.  Problems at the second level 
+are also designed to be easy but will require a program or algorithm to be implemented to arrive at the answer.  These
+two levels are essentially entry level questions.  
+
+Problems at the third level are meant to be a bit more difficult than the first two levels.  The answer is often 
+fairly obvious but arriving at an exacting solution requires some thought and skill.  These problems will almost always 
+involve devising a new algorithm or implementing a variation of another algorithm previously presented.  Readers who can
+answer these questions will feel comfortable with the concepts behind the topic at hand.
+
+Problems at the fourth level are meant to be similar to those of the level three questions except they will require 
+additional research to be completed.  The reader will most likely not know the answer right away, nor will the text provide 
+the exact details of the answer until a subsequent chapter.  
+
+Problems at the fifth level are meant to be the hardest 
+problems relative to all the other problems in the chapter.  People who can correctly answer fifth level problems have a 
+mastery of the subject matter at hand.
+
+Often problems will be tied together.  The purpose of this is to start a chain of thought that will be discussed in future chapters.  The reader
+is encouraged to answer the follow-up problems and try to draw the relevance of problems.
+
+\section{Introduction to LibTomMath}
+
+\subsection{What is LibTomMath?}
+LibTomMath is a free and open source multiple precision integer library written entirely in portable ISO C.  By portable it 
+is meant that the library does not contain any code that is computer platform dependent or otherwise problematic to use on 
+any given platform.  
+
+The library has been successfully tested under numerous operating systems including Unix\footnote{All of these
+trademarks belong to their respective rightful owners.}, MacOS, Windows, Linux, PalmOS and on standalone hardware such 
+as the Gameboy Advance.  The library is designed to contain enough functionality to be able to develop applications such 
+as public key cryptosystems and still maintain a relatively small footprint.
+
+\subsection{Goals of LibTomMath}
+
+Libraries which obtain the most efficiency are rarely written in a high level programming language such as C.  However, 
+even though this library is written entirely in ISO C, considerable care has been taken to optimize the algorithm implementations within the 
+library.  Specifically the code has been written to work well with the GNU C Compiler (\textit{GCC}) on both x86 and ARM 
+processors.  Wherever possible, highly efficient algorithms, such as Karatsuba multiplication, sliding window 
+exponentiation and Montgomery reduction have been provided to make the library more efficient.  
+
+Even with the nearly optimal and specialized algorithms that have been included the Application Programming Interface 
+(\textit{API}) has been kept as simple as possible.  Often generic place holder routines will make use of specialized 
+algorithms automatically without the developer's specific attention.  One such example is the generic multiplication 
+algorithm \textbf{mp\_mul()} which will automatically use Toom--Cook, Karatsuba, Comba or baseline multiplication 
+based on the magnitude of the inputs and the configuration of the library.  
+
+Making LibTomMath as efficient as possible is not the only goal of the LibTomMath project.  Ideally the library should 
+be source compatible with another popular library which makes it more attractive for developers to use.  In this case the
+MPI library was used as an API template for all the basic functions.  MPI was chosen because it is another library that fits 
+in the same niche as LibTomMath.  Even though LibTomMath uses MPI as the template for the function names and argument 
+passing conventions, it has been written from scratch by Tom St Denis.
+
+The project is also meant to act as a learning tool for students, the logic being that no easy-to-follow ``bignum'' 
+library exists which can be used to teach computer science students how to perform fast and reliable multiple precision 
+integer arithmetic.  To this end the source code has been given quite a few comments and algorithm discussion points.  
+
+\section{Choice of LibTomMath}
+LibTomMath was chosen as the case study of this text not only because the author of both projects is one and the same but
+for more worthy reasons.  Other libraries such as GMP \cite{GMP}, MPI \cite{MPI}, LIP \cite{LIP} and OpenSSL 
+\cite{OPENSSL} have multiple precision integer arithmetic routines but would not be ideal for this text for 
+reasons that will be explained in the following sub-sections.
+
+\subsection{Code Base}
+The LibTomMath code base is all portable ISO C source code.  This means that there are no platform dependent conditional
+segments of code littered throughout the source.  This clean and uncluttered approach to the library means that a
+developer can more readily discern the true intent of a given section of source code without trying to keep track of
+what conditional code will be used.
+
+The code base of LibTomMath is well organized.  Each function is in its own separate source code file 
+which allows the reader to find a given function very quickly.  On average there are $76$ lines of code per source
+file which makes the source very easy to follow.  By comparison MPI and LIP are single file projects making code tracing
+very hard.  GMP has many conditional code segments which also hinder tracing.  
+
+When compiled with GCC for the x86 processor and optimized for speed the entire library is approximately $100$KiB\footnote{The notation ``KiB'' means $2^{10}$ octets, similarly ``MiB'' means $2^{20}$ octets.}
+ which is fairly small compared to GMP (over $250$KiB).  LibTomMath is slightly larger than MPI (which compiles to about 
+$50$KiB) but LibTomMath is also much faster and more complete than MPI.
+
+\subsection{API Simplicity}
+LibTomMath is designed after the MPI library and shares the API design.  Quite often programs that use MPI will build 
+with LibTomMath without change. The function names correlate directly to the action they perform.  Almost all of the 
+functions share the same parameter passing convention.  The learning curve is fairly shallow with the API provided 
+which is an extremely valuable benefit for the student and developer alike.  
+
+The LIP library is an example of a library with an API that is awkward to work with.  LIP uses function names that are often ``compressed'' to 
+illegible short hand.  LibTomMath does not share this characteristic.  
+
+The GMP library also does not return error codes.  Instead it uses a POSIX.1 \cite{POSIX1} signal system where errors
+are signaled to the host application.  This happens to be the fastest approach but definitely not the most versatile.  In
+effect a math error (e.g. invalid input, heap error, etc.) can cause a program to stop functioning which is definitely 
+undesirable in many situations.
+
+\subsection{Optimizations}
+While LibTomMath is certainly not the fastest library (GMP often beats LibTomMath by a factor of two) it does
+feature a set of optimal algorithms for tasks such as modular reduction, exponentiation, multiplication and squaring.  GMP 
+and LIP also feature such optimizations while MPI only uses baseline algorithms with no optimizations.  GMP lacks a few
+of the additional modular reduction optimizations that LibTomMath features\footnote{At the time of this writing GMP
+only had Barrett and Montgomery modular reduction algorithms.}.  
+
+LibTomMath is almost always an order of magnitude faster than the MPI library at computationally expensive tasks such as modular
+exponentiation.  In the grand scheme of ``bignum'' libraries LibTomMath is faster than the average library and usually  
+slower than the best libraries such as GMP and OpenSSL by only a small factor.
+
+\subsection{Portability and Stability}
+LibTomMath will build ``out of the box'' on any platform equipped with a modern version of the GNU C Compiler 
+(\textit{GCC}).  This means that without changes the library will build without configuration or setting up any 
+variables.  LIP and MPI will build ``out of the box'' as well but have numerous known bugs.  Most notably the author of 
+MPI has recently stopped working on his library and LIP has long since been discontinued.  
+
+GMP requires a configuration script to run and will not build out of the box.   GMP and LibTomMath are still in active
+development and are very stable across a variety of platforms.
+
+\subsection{Choice}
+LibTomMath is a relatively compact, well documented, highly optimized and portable library which seems only natural for
+the case study of this text.  Various source files from the LibTomMath project will be included within the text.  However, 
+the reader is encouraged to download their own copy of the library to actually be able to work with the library.  
+
+\chapter{Getting Started}
+\section{Library Basics}
+The trick to writing any useful library of source code is to build a solid foundation and work outwards from it.  First, 
+a problem along with allowable solution parameters should be identified and analyzed.  In this particular case the 
+inability to accommodate multiple precision integers is the problem.  Furthermore, the solution must be written
+as portable source code that is reasonably efficient across several different computer platforms.
+
+After a foundation is formed the remainder of the library can be designed and implemented in a hierarchical fashion.  
+That is, to implement the lowest level dependencies first and work towards the most abstract functions last.  For example, 
+before implementing a modular exponentiation algorithm one would implement a modular reduction algorithm.
+By building outwards from a base foundation instead of using a parallel design methodology the resulting project is 
+highly modular.  Being highly modular is a desirable property of any project as it often means the resulting product
+has a small footprint and updates are easy to perform.  
+
+Usually when I start a project I will begin with the header files.  I define the data types I think I will need and 
+prototype the initial functions that are not dependent on other functions (within the library).  After I 
+implement these base functions I prototype more dependent functions and implement them.   The process repeats until
+I implement all of the functions I require.  For example, in the case of LibTomMath I implemented functions such as 
+mp\_init() well before I implemented mp\_mul() and even further before I implemented mp\_exptmod().  As an example as to 
+why this design works note that the Karatsuba and Toom-Cook multipliers were written \textit{after} the 
+dependent function mp\_exptmod() was written.  Adding the new multiplication algorithms did not require changes to the 
+mp\_exptmod() function itself and lowered the total cost of ownership (\textit{so to speak}) and of development 
+for new algorithms.  This methodology allows new algorithms to be tested in a complete framework with relative ease.
+
+FIGU,design_process,Design Flow of the First Few Original LibTomMath Functions.
+
+Only after the majority of the functions were in place did I pursue a less hierarchical approach to auditing and optimizing
+the source code.  For example, one day I may audit the multipliers and the next day the polynomial basis functions.  
+
+It only makes sense to begin the text with the preliminary data types and support algorithms required as well.  
+This chapter discusses the core algorithms of the library which are the dependencies of every other algorithm.
+
+\section{What is a Multiple Precision Integer?}
+Recall that most programming languages, in particular ISO C \cite{ISOC}, only have fixed precision data types that on their own cannot 
+be used to represent values larger than their precision will allow. The purpose of multiple precision algorithms is 
+to use fixed precision data types to create and manipulate multiple precision integers which may represent values 
+that are very large.  
+
+As a well known analogy, school children are taught how to form numbers larger than nine by prepending more radix ten digits.  In the decimal system
+the largest single digit value is $9$.  However, by concatenating digits together larger numbers may be represented.  Newly prepended digits 
+(\textit{to the left}) are said to be in a different power of ten column.  That is, the number $123$ can be described as having a $1$ in the hundreds 
+column, $2$ in the tens column and $3$ in the ones column.  Or more formally $123 = 1 \cdot 10^2 + 2 \cdot 10^1 + 3 \cdot 10^0$.  Computer based 
+multiple precision arithmetic is essentially the same concept.  Larger integers are represented by adjoining fixed 
+precision computer words with the exception that a different radix is used.
+
+What most people probably do not think about explicitly are the various other attributes that describe a multiple precision 
+integer.  For example, the integer $154_{10}$ has two immediately obvious properties.  First, the integer is positive, 
+that is the sign of this particular integer is positive as opposed to negative.  Second, the integer has three digits in 
+its representation.  There is an additional property that the integer possesses that does not concern pencil-and-paper 
+arithmetic.  The third property is how many digit placeholders are available to hold the integer.  
+
+The human analogy of this third property is ensuring there is enough space on the paper to write the integer.  For example,
+if one starts writing a large number too far to the right on a piece of paper they will have to erase it and move left.  
+Similarly, computer algorithms must maintain strict control over memory usage to ensure that the digits of an integer
+will not exceed the allowed boundaries.  These three properties make up what is known as a multiple precision 
+integer or mp\_int for short.  
+
+\subsection{The mp\_int Structure}
+\label{sec:MPINT}
+The mp\_int structure is the ISO C based manifestation of what represents a multiple precision integer.  The ISO C standard does not provide for 
+any such data type but it does provide for making composite data types known as structures.  The following is the structure definition 
+used within LibTomMath.
+
+\index{mp\_int}
+\begin{figure}[here]
+\begin{center}
+\begin{small}
+%\begin{verbatim}
+\begin{tabular}{|l|}
+\hline
+typedef struct \{ \\
+\hspace{3mm}int used, alloc, sign;\\
+\hspace{3mm}mp\_digit *dp;\\
+\} \textbf{mp\_int}; \\
+\hline
+\end{tabular}
+%\end{verbatim}
+\end{small}
+\caption{The mp\_int Structure}
+\label{fig:mpint}
+\end{center}
+\end{figure}
+
+The mp\_int structure (fig. \ref{fig:mpint}) can be broken down as follows.
+
+\begin{enumerate}
+\item The \textbf{used} parameter denotes how many digits of the array \textbf{dp} contain the digits used to represent
+a given integer.  The \textbf{used} count must be positive (or zero) and may not exceed the \textbf{alloc} count.  
+
+\item The \textbf{alloc} parameter denotes how 
+many digits are available in the array to use by functions before it has to increase in size.  When the \textbf{used} count 
+of a result would exceed the \textbf{alloc} count all of the algorithms will automatically increase the size of the 
+array to accommodate the precision of the result.  
+
+\item The pointer \textbf{dp} points to a dynamically allocated array of digits that represent the given multiple 
+precision integer.  It is padded with $(\textbf{alloc} - \textbf{used})$ zero digits.  The array is maintained in a least 
+significant digit order.  As a pencil and paper analogy the array is organized such that the right most digits are stored
+first starting at the location indexed by zero\footnote{In C all arrays begin at zero.} in the array.  For example, 
+if \textbf{dp} contains $\lbrace a, b, c, \ldots \rbrace$ where \textbf{dp}$_0 = a$, \textbf{dp}$_1 = b$, \textbf{dp}$_2 = c$, $\ldots$ then 
+it would represent the integer $a + b\beta + c\beta^2 + \ldots$  
+
+\index{MP\_ZPOS} \index{MP\_NEG}
+\item The \textbf{sign} parameter denotes the sign as either zero/positive (\textbf{MP\_ZPOS}) or negative (\textbf{MP\_NEG}).  
+\end{enumerate}
+
+\subsubsection{Valid mp\_int Structures}
+Several rules are placed on the state of an mp\_int structure and are assumed to be followed for reasons of efficiency.  
+The only exceptions are when the structure is passed to initialization functions such as mp\_init() and mp\_init\_copy().
+
+\begin{enumerate}
+\item The value of \textbf{alloc} may not be less than one.  That is \textbf{dp} always points to a previously allocated
+array of digits.
+\item The value of \textbf{used} may not exceed \textbf{alloc} and must be greater than or equal to zero.
+\item The value of \textbf{used} implies the digit at index $(used - 1)$ of the \textbf{dp} array is non-zero.  That is, 
+leading zero digits in the most significant positions must be trimmed.
+   \begin{enumerate}
+   \item Digits in the \textbf{dp} array at and above the \textbf{used} location must be zero.
+   \end{enumerate}
+\item The value of \textbf{sign} must be \textbf{MP\_ZPOS} if \textbf{used} is zero; 
+this represents the mp\_int value of zero.
+\end{enumerate}
+
+\section{Argument Passing}
+A convention of argument passing must be adopted early on in the development of any library.  Making the function 
+prototypes consistent will help eliminate many headaches in the future as the library grows to significant complexity.  
+In LibTomMath the multiple precision integer functions accept parameters from left to right as pointers to mp\_int 
+structures.  That means that the source (input) operands are placed on the left and the destination (output) on the right.   
+Consider the following examples.
+
+\begin{verbatim}
+   mp_mul(&a, &b, &c);   /* c = a * b */
+   mp_add(&a, &b, &a);   /* a = a + b */
+   mp_sqr(&a, &b);       /* b = a * a */
+\end{verbatim}
+
+The left to right order is a fairly natural way to implement the functions since it lets the developer read aloud the
+functions and make sense of them.  For example, the first function would read ``multiply a and b and store in c''.
+
+Certain libraries (\textit{LIP by Lenstra for instance}) accept parameters the other way around, to mimic the order
+of assignment expressions.  That is, the destination (output) is on the left and arguments (inputs) are on the right.  In 
+truth, it is entirely a matter of preference.  In the case of LibTomMath the convention from the MPI library has been 
+adopted.  
+
+Another very useful design consideration, provided for in LibTomMath, is whether to allow argument sources to also be a 
+destination.  For example, the second example (\textit{mp\_add}) adds $a$ to $b$ and stores in $a$.  This is an important 
+feature to implement since it allows the calling functions to cut down on the number of variables it must maintain.  
+However, to implement this feature specific care has to be given to ensure the destination is not modified before the 
+source is fully read.
+
+\section{Return Values}
+A well implemented application, no matter what its purpose, should trap as many runtime errors as possible and return them 
+to the caller.  By catching runtime errors a library can be guaranteed to prevent undefined behaviour.  However, the end 
+developer can still manage to cause a library to crash.  For example, by passing an invalid pointer an application may
+fault by dereferencing memory not owned by the application.
+
+In the case of LibTomMath the only errors that are checked for are related to inappropriate inputs (division by zero for 
+instance) and memory allocation errors.  It will not check that the mp\_int passed to any function is valid nor 
+will it check pointers for validity.  Any function that can cause a runtime error will return an error code as an 
+\textbf{int} data type with one of the following values (fig \ref{fig:errcodes}).
+
+\index{MP\_OKAY} \index{MP\_VAL} \index{MP\_MEM}
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{|l|l|}
+\hline \textbf{Value} & \textbf{Meaning} \\
+\hline \textbf{MP\_OKAY} & The function was successful \\
+\hline \textbf{MP\_VAL}  & One of the input value(s) was invalid \\
+\hline \textbf{MP\_MEM}  & The function ran out of heap memory \\
+\hline
+\end{tabular}
+\end{center}
+\caption{LibTomMath Error Codes}
+\label{fig:errcodes}
+\end{figure}
+
+When an error is detected within a function it should free any memory it allocated, often during the initialization of
+temporary mp\_ints, and return as soon as possible.  The goal is to leave the system in the same state it was when the 
+function was called.  Error checking with this style of API is fairly simple.
+
+\begin{verbatim}
+   int err;
+   if ((err = mp_add(&a, &b, &c)) != MP_OKAY) {
+      printf("Error: %s\n", mp_error_to_string(err));
+      exit(EXIT_FAILURE);
+   }
+\end{verbatim}
+
+The GMP \cite{GMP} library uses C style \textit{signals} to flag errors which is of questionable use.  Not all errors are fatal 
+and it was not deemed ideal by the author of LibTomMath to force developers to have signal handlers for such cases.
+
+\section{Initialization and Clearing}
+The logical starting point when actually writing multiple precision integer functions is the initialization and 
+clearing of the mp\_int structures.  These two algorithms will be used by the majority of the higher level algorithms.
+
+Given the basic mp\_int structure an initialization routine must first allocate memory to hold the digits of
+the integer.  Often it is optimal to allocate a sufficiently large pre-set number of digits even though
+the initial integer will represent zero.  If only a single digit were allocated quite a few subsequent re-allocations
+would occur when operations are performed on the integers.  There is a tradeoff between how many default digits to allocate
+and how many re-allocations are tolerable.  Obviously allocating an excessive amount of digits initially will waste 
+memory and become unmanageable.  
+
+If the memory for the digits has been successfully allocated then the rest of the members of the structure must
+be initialized.  Since the initial state of an mp\_int is to represent the zero integer, the allocated digits must be set
+to zero.  The \textbf{used} count is set to zero and \textbf{sign} is set to \textbf{MP\_ZPOS}.
+
+\subsection{Initializing an mp\_int}
+An mp\_int is said to be initialized if it is set to a valid, preferably default, state such that all of the members of the
+structure are set to valid values.  The mp\_init algorithm will perform such an action.
+
+\index{mp\_init}
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Allocate memory and initialize $a$ to a known valid mp\_int state.  \\
+\hline \\
+1.  Allocate memory for \textbf{MP\_PREC} digits. \\
+2.  If the allocation failed return(\textit{MP\_MEM}) \\
+3.  for $n$ from $0$ to $MP\_PREC - 1$ do  \\
+\hspace{3mm}3.1  $a_n \leftarrow 0$\\
+4.  $a.sign \leftarrow MP\_ZPOS$\\
+5.  $a.used \leftarrow 0$\\
+6.  $a.alloc \leftarrow MP\_PREC$\\
+7.  Return(\textit{MP\_OKAY})\\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_init}
+\end{figure}
+
+\textbf{Algorithm mp\_init.}
+The purpose of this function is to initialize an mp\_int structure so that the rest of the library can properly
+manipulate it.  It is assumed that the input may not have had any of its members previously initialized which is certainly
+a valid assumption if the input resides on the stack.  
+
+Before any of the members such as \textbf{sign}, \textbf{used} or \textbf{alloc} are initialized the memory for
+the digits is allocated.  If this fails the function returns before setting any of the other members.  The \textbf{MP\_PREC} 
+name represents a constant\footnote{Defined in the ``tommath.h'' header file within LibTomMath.} 
+used to dictate the minimum precision of newly initialized mp\_int integers.  Ideally, it is at least equal to the smallest
+precision number you'll be working with.
+
+Allocating a block of digits at first instead of a single digit has the benefit of lowering the number of usually slow
+heap operations later functions will have to perform in the future.  If \textbf{MP\_PREC} is set correctly the slack 
+memory and the number of heap operations will be trivial.
+
+Once the allocation has been made the digits have to be set to zero as well as the \textbf{used}, \textbf{sign} and
+\textbf{alloc} members initialized.  This ensures that the mp\_int will always represent the default state of zero regardless
+of the original condition of the input.
+
+\textbf{Remark.}
+This function introduces the idiosyncrasy that all iterative loops, commonly initiated with the ``for'' keyword, iterate incrementally
+when the ``to'' keyword is placed between two expressions.  For example, ``for $a$ from $b$ to $c$ do'' means that
+a subsequent expression (or body of expressions) is to be evaluated up to $c - b + 1$ times so long as $b \le c$.  In each
+iteration the variable $a$ is substituted for a new integer that lies inclusively between $b$ and $c$.  If $b > c$ occurred
+the loop would not iterate.  By contrast if the ``downto'' keyword were used in place of ``to'' the loop would iterate 
+decrementally.
+
+EXAM,bn_mp_init.c
+
+One immediate observation of this initialization function is that it does not return a pointer to an mp\_int structure.  It 
+is assumed that the caller has already allocated memory for the mp\_int structure, typically on the application stack.  The 
+call to mp\_init() is used only to initialize the members of the structure to a known default state.  
+
+Here we see (line @23,XMALLOC@) the memory allocation is performed first.  This allows us to exit cleanly and quickly
+if there is an error.  If the allocation fails the routine will return \textbf{MP\_MEM} to the caller to indicate there
+was a memory error.  The function XMALLOC is what actually allocates the memory.  Technically XMALLOC is not a function
+but a macro defined in ``tommath.h''.  By default, XMALLOC will evaluate to malloc() which is the C library's built--in
+memory allocation routine.
+
+In order to assure the mp\_int is in a known state the digits must be set to zero.  On most platforms this could have been
+accomplished by using calloc() instead of malloc().  However, to correctly initialize an integer type to a given value in a 
+portable fashion you have to actually assign the value.  The for loop (line @28,for@) performs this required
+operation.
+
+After the memory has been successfully initialized the remainder of the members are initialized 
+(lines @29,used@ through @31,sign@) to their respective default states.  At this point the algorithm has succeeded and
+a success code is returned to the calling function.  If this function returns \textbf{MP\_OKAY} it is safe to assume the 
+mp\_int structure has been properly initialized and is safe to use with other functions within the library.  
+
+\subsection{Clearing an mp\_int}
+When an mp\_int is no longer required by the application, the memory that has been allocated for its digits must be 
+returned to the application's memory pool with the mp\_clear algorithm.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_clear}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  The memory for $a$ shall be deallocated.  \\
+\hline \\
+1.  If $a$ has been previously freed then return(\textit{MP\_OKAY}). \\
+2.  for $n$ from 0 to $a.used - 1$ do \\
+\hspace{3mm}2.1  $a_n \leftarrow 0$ \\
+3.  Free the memory allocated for the digits of $a$. \\
+4.  $a.used \leftarrow 0$ \\
+5.  $a.alloc \leftarrow 0$ \\
+6.  $a.sign \leftarrow MP\_ZPOS$ \\
+7.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_clear}
+\end{figure}
+
+\textbf{Algorithm mp\_clear.}
+This algorithm accomplishes two goals.  First, it clears the digits and the other mp\_int members.  This ensures that 
+if a developer accidentally re-uses a cleared structure it is less likely to cause problems.  The second goal
+is to free the allocated memory.
+
+The logic behind the algorithm is extended by marking cleared mp\_int structures so that subsequent calls to this
+algorithm will not try to free the memory multiple times.  Cleared mp\_ints are detectable by having a pre-defined invalid 
+digit pointer \textbf{dp} setting.
+
+Once an mp\_int has been cleared the mp\_int structure is no longer in a valid state for any other algorithm
+with the exception of algorithms mp\_init, mp\_init\_copy, mp\_init\_size and mp\_clear.
+
+EXAM,bn_mp_clear.c
+
+The algorithm only operates on the mp\_int if it hasn't been previously cleared.  The if statement (line @23,a->dp != NULL@)
+checks to see if the \textbf{dp} member is not \textbf{NULL}.  If the mp\_int is a valid mp\_int then \textbf{dp} cannot be
+\textbf{NULL} in which case the if statement will evaluate to true.
+
+The digits of the mp\_int are cleared by the for loop (line @25,for@) which assigns a zero to every digit.  Similar to mp\_init()
+the digits are assigned zero instead of using block memory operations (such as memset()) since this is more portable.  
+
+The digits are deallocated off the heap via the XFREE macro.  Similar to XMALLOC the XFREE macro actually evaluates to
+a standard C library function.  In this case the free() function.  Since free() only deallocates the memory the pointer
+still has to be reset to \textbf{NULL} manually (line @33,NULL@).  
+
+Now that the digits have been cleared and deallocated the other members are set to their final values (lines @34,= 0@ and @35,ZPOS@).
+
+\section{Maintenance Algorithms}
+
+The previous sections describe how to initialize and clear an mp\_int structure.  To further support operations
+that are to be performed on mp\_int structures (such as addition and multiplication) the dependent algorithms must be
+able to augment the precision of an mp\_int and 
+initialize mp\_ints with differing initial conditions.  
+
+These algorithms complete the set of low level algorithms required to work with mp\_int structures in the higher level
+algorithms such as addition, multiplication and modular exponentiation.
+
+\subsection{Augmenting an mp\_int's Precision}
+When storing a value in an mp\_int structure, a sufficient number of digits must be available to accommodate the entire 
+result of an operation without loss of precision.  Quite often the size of the array given by the \textbf{alloc} member 
+is large enough to simply increase the \textbf{used} digit count.  However, when the size of the array is too small it 
+must be re-sized appropriately to accommodate the result.  The mp\_grow algorithm will provide this functionality.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_grow}. \\
+\textbf{Input}.   An mp\_int $a$ and an integer $b$. \\
+\textbf{Output}.  $a$ is expanded to accommodate $b$ digits. \\
+\hline \\
+1.  if $a.alloc \ge b$ then return(\textit{MP\_OKAY}) \\
+2.  $u \leftarrow b\mbox{ (mod }MP\_PREC\mbox{)}$ \\
+3.  $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\
+4.  Re-allocate the array of digits $a$ to size $v$ \\
+5.  If the allocation failed then return(\textit{MP\_MEM}). \\
+6.  for n from a.alloc to $v - 1$ do  \\
+\hspace{+3mm}6.1  $a_n \leftarrow 0$ \\
+7.  $a.alloc \leftarrow v$ \\
+8.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_grow}
+\end{figure}
+
+\textbf{Algorithm mp\_grow.}
+It is ideal to prevent re-allocations from being performed if they are not required (step one).  This is useful to 
+prevent mp\_ints from growing excessively in code that erroneously calls mp\_grow.  
+
+The requested digit count is padded up to the next multiple of \textbf{MP\_PREC} plus an additional \textbf{MP\_PREC} (steps two and three).  
+This helps prevent many trivial reallocations that would grow an mp\_int by trivially small values.  
+
+It is assumed that the reallocation (step four) leaves the lower $a.alloc$ digits of the mp\_int intact.  This is much 
+akin to how the \textit{realloc} function from the standard C library works.  Since the newly allocated digits are 
+assumed to contain undefined values they are initially set to zero.
+
+EXAM,bn_mp_grow.c
+
+A quick optimization is to first determine if a memory re-allocation is required at all.  The if statement (line @23,if@) checks
+if the \textbf{alloc} member of the mp\_int is smaller than the requested digit count.  If the count is not larger than \textbf{alloc}
+the function skips the re-allocation part thus saving time.
+
+When a re-allocation is performed it is turned into an optimal request to save time in the future.  The requested digit count is
+padded upwards to the 2nd multiple of \textbf{MP\_PREC} larger than the requested count (line @25, size@).  The XREALLOC function is used
+to re-allocate the memory.  As per the other functions XREALLOC is actually a macro which evaluates to realloc by default.  The realloc
+function leaves the base of the allocation intact which means the first \textbf{alloc} digits of the mp\_int are the same as before
+the re-allocation.  All that is left is to clear the newly allocated digits and return.
+
+Note that the re-allocation result is actually stored in a temporary pointer $tmp$.  This is to allow this function to return
+an error with a valid pointer.  Earlier releases of the library stored the result of XREALLOC into the mp\_int $a$.  That would
+result in a memory leak if XREALLOC ever failed.  
+
+\subsection{Initializing Variable Precision mp\_ints}
+Occasionally the number of digits required will be known in advance of an initialization, based on, for example, the size 
+of input mp\_ints to a given algorithm.  The purpose of algorithm mp\_init\_size is similar to mp\_init except that it 
+will allocate \textit{at least} a specified number of digits.  
+
+\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init\_size}. \\
+\textbf{Input}.   An mp\_int $a$ and the requested number of digits $b$. \\
+\textbf{Output}.  $a$ is initialized to hold at least $b$ digits. \\
+\hline \\
+1.  $u \leftarrow b \mbox{ (mod }MP\_PREC\mbox{)}$ \\
+2.  $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\
+3.  Allocate $v$ digits. \\
+4.  for $n$ from $0$ to $v - 1$ do \\
+\hspace{3mm}4.1  $a_n \leftarrow 0$ \\
+5.  $a.sign \leftarrow MP\_ZPOS$\\
+6.  $a.used \leftarrow 0$\\
+7.  $a.alloc \leftarrow v$\\
+8.  Return(\textit{MP\_OKAY})\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_init\_size}
+\end{figure}
+
+\textbf{Algorithm mp\_init\_size.}
+This algorithm will initialize an mp\_int structure $a$ like algorithm mp\_init with the exception that the number of 
+digits allocated can be controlled by the second input argument $b$.  The input size is padded upwards so it is a 
+multiple of \textbf{MP\_PREC} plus an additional \textbf{MP\_PREC} digits.  This padding is used to prevent trivial 
+allocations from becoming a bottleneck in the rest of the algorithms.
+
+Like algorithm mp\_init, the mp\_int structure is initialized to a default state representing the integer zero.  This 
+particular algorithm is useful if it is known ahead of time the approximate size of the input.  If the approximation is
+correct no further memory re-allocations are required to work with the mp\_int.
+
+EXAM,bn_mp_init_size.c
+
+The number of digits $b$ requested is padded (line @22,MP_PREC@) by first augmenting it to the next multiple of 
+\textbf{MP\_PREC} and then adding \textbf{MP\_PREC} to the result.  If the memory can be successfully allocated the 
+mp\_int is placed in a default state representing the integer zero.  Otherwise, the error code \textbf{MP\_MEM} will be 
+returned (line @27,return@).  
+
+The digits are allocated and set to zero at the same time with the calloc() function (line @25,XCALLOC@).  The 
+\textbf{used} count is set to zero, the \textbf{alloc} count set to the padded digit count and the \textbf{sign} flag set 
+to \textbf{MP\_ZPOS} to achieve a default valid mp\_int state (lines @29,used@, @30,alloc@ and @31,sign@).  If the function 
+returns successfully then it is correct to assume that the mp\_int structure is in a valid state for the remainder of the 
+functions to work with.
+
+\subsection{Multiple Integer Initializations and Clearings}
+Occasionally a function will require a series of mp\_int data types to be made available simultaneously.  
+The purpose of algorithm mp\_init\_multi is to initialize a variable length array of mp\_int structures in a single
+statement.  It is essentially a shortcut to multiple initializations.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init\_multi}. \\
+\textbf{Input}.   Variable length array $V_k$ of mp\_int variables of length $k$. \\
+\textbf{Output}.  The array is initialized such that each mp\_int of $V_k$ is ready to use. \\
+\hline \\
+1.  for $n$ from 0 to $k - 1$ do \\
+\hspace{+3mm}1.1.  Initialize the mp\_int $V_n$ (\textit{mp\_init}) \\
+\hspace{+3mm}1.2.  If initialization failed then do \\
+\hspace{+6mm}1.2.1.  for $j$ from $0$ to $n - 1$ do \\
+\hspace{+9mm}1.2.1.1.  Free the mp\_int $V_j$ (\textit{mp\_clear}) \\
+\hspace{+6mm}1.2.2.   Return(\textit{MP\_MEM}) \\
+2.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_init\_multi}
+\end{figure}
+
+\textbf{Algorithm mp\_init\_multi.}
+The algorithm will initialize the array of mp\_int variables one at a time.  If a runtime error has been detected 
+(\textit{step 1.2}) all of the previously initialized variables are cleared.  The goal is an ``all or nothing'' 
+initialization which allows for quick recovery from runtime errors.
+
+EXAM,bn_mp_init_multi.c
+
+This function initializes a variable length list of mp\_int structure pointers.  However, instead of having the mp\_int
+structures in an actual C array they are simply passed as arguments to the function.  This function makes use of the 
+``...'' argument syntax of the C programming language.  The list is terminated with a final \textbf{NULL} argument 
+appended on the right.  
+
+The function uses the ``stdarg.h'' \textit{va} functions to step portably through the arguments to the function.  A count
+$n$ of successfully initialized mp\_int structures is maintained (line @47,n++@) such that if a failure does occur,
+the algorithm can backtrack and free the previously initialized structures (lines @27,if@ to @46,}@).  
+
+
+\subsection{Clamping Excess Digits}
+When a function anticipates a result will be $n$ digits it is simpler to assume this is true within the body of 
+the function instead of checking during the computation.  For example, a multiplication of an $i$ digit number by a 
+$j$ digit number produces a result of at most $i + j$ digits.  It is entirely possible that the result is $i + j - 1$ 
+though, with no final carry into the last position.  However, suppose the destination had to be first expanded 
+(\textit{via mp\_grow}) to accommodate $i + j - 1$ digits then further expanded to accommodate the final carry.  
+That would be a considerable waste of time since heap operations are relatively slow.
+
+The ideal solution is to always assume the result is $i + j$ and fix up the \textbf{used} count after the function
+terminates.  This way a single heap operation (\textit{at most}) is required.  However, if the result was not checked
+there would be an excess high order zero digit.  
+
+For example, suppose the product of two integers was $x_n = (0x_{n-1}x_{n-2}...x_0)_{\beta}$.  The leading zero digit 
+will not contribute to the precision of the result.  In fact, through subsequent operations more leading zero digits would
+accumulate to the point the size of the integer would be prohibitive.  As a result even though the precision is very 
+low the representation is excessively large.  
+
+The mp\_clamp algorithm is designed to solve this very problem.  It will trim high-order zeros by decrementing the 
+\textbf{used} count until a non-zero most significant digit is found.  Also in this system, zero is considered to be a 
+positive number which means that if the \textbf{used} count is decremented to zero, the sign must be set to 
+\textbf{MP\_ZPOS}.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_clamp}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Any excess leading zero digits of $a$ are removed \\
+\hline \\
+1.  while $a.used > 0$ and $a_{a.used - 1} = 0$ do \\
+\hspace{+3mm}1.1  $a.used \leftarrow a.used - 1$ \\
+2.  if $a.used = 0$ then do \\
+\hspace{+3mm}2.1  $a.sign \leftarrow MP\_ZPOS$ \\
+\hline \\
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_clamp}
+\end{figure}
+
+\textbf{Algorithm mp\_clamp.}
+As can be expected this algorithm is very simple.  The loop on step one is expected to iterate only once or twice at
+the most.  For example, this will happen in cases where there is not a carry to fill the last position.  Step two fixes the sign for 
+when all of the digits are zero to ensure that the mp\_int is valid at all times.
+
+EXAM,bn_mp_clamp.c
+
+Note on line @27,while@ how the test for the \textbf{used} count is made on the left of the \&\& operator.  In the C programming
+language the terms to \&\& are evaluated left to right with a boolean short-circuit if any condition fails.  This is 
+important since if the \textbf{used} count is zero the test on the right would fetch below the array.  That is obviously 
+undesirable.  The parentheses on line @28,a->used@ are used to make sure the \textbf{used} count is decremented and not
+the pointer ``a''.  
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 1 \right ]$ & Discuss the relevance of the \textbf{used} member of the mp\_int structure. \\
+                     & \\
+$\left [ 1 \right ]$ & Discuss the consequences of not using padding when performing allocations.  \\
+                     & \\
+$\left [ 2 \right ]$ & Estimate an ideal value for \textbf{MP\_PREC} when performing 1024-bit RSA \\
+                     & encryption when $\beta = 2^{28}$.  \\
+                     & \\
+$\left [ 1 \right ]$ & Discuss the relevance of the algorithm mp\_clamp.  What does it prevent? \\
+                     & \\
+$\left [ 1 \right ]$ & Give an example of when the algorithm  mp\_init\_copy might be useful. \\
+                     & \\
+\end{tabular}
+
+
+%%%
+% CHAPTER FOUR
+%%%
+
+\chapter{Basic Operations}
+
+\section{Introduction}
+In the previous chapter a series of low level algorithms were established that dealt with initializing and maintaining
+mp\_int structures.  This chapter will discuss another set of seemingly non-algebraic algorithms which will form the low 
+level basis of the entire library.  While these algorithms are relatively trivial it is important to understand how they
+work before proceeding since these algorithms will be used almost intrinsically in the following chapters.
+
+The algorithms in this chapter deal primarily with more ``programmer'' related tasks such as creating copies of
+mp\_int structures, assigning small values to mp\_int structures and comparisons of the values mp\_int structures
+represent.   
+
+\section{Assigning Values to mp\_int Structures}
+\subsection{Copying an mp\_int}
+Assigning the value that a given mp\_int structure represents to another mp\_int structure shall be known as making
+a copy for the purposes of this text.  The copy of the mp\_int will be a separate entity that represents the same
+value as the mp\_int it was copied from.  The mp\_copy algorithm provides this functionality. 
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_copy}. \\
+\textbf{Input}.  An mp\_int $a$ and $b$. \\
+\textbf{Output}.  Store a copy of $a$ in $b$. \\
+\hline \\
+1.  If $b.alloc < a.used$ then grow $b$ to $a.used$ digits.  (\textit{mp\_grow}) \\
+2.  for $n$ from 0 to $a.used - 1$ do \\
+\hspace{3mm}2.1  $b_{n} \leftarrow a_{n}$ \\
+3.  for $n$ from $a.used$ to $b.used - 1$ do \\
+\hspace{3mm}3.1  $b_{n} \leftarrow 0$ \\
+4.  $b.used \leftarrow a.used$ \\
+5.  $b.sign \leftarrow a.sign$ \\
+6.  return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_copy}
+\end{figure}
+
+\textbf{Algorithm mp\_copy.}
+This algorithm copies the mp\_int $a$ such that upon successful termination of the algorithm the mp\_int $b$ will
+represent the same integer as the mp\_int $a$.  The mp\_int $b$ shall be a complete and distinct copy of the 
+mp\_int $a$ meaning that the mp\_int $a$ can be modified and it shall not affect the value of the mp\_int $b$.
+
+If $b$ does not have enough room for the digits of $a$ it must first have its precision augmented via the mp\_grow 
+algorithm.  The digits of $a$ are copied over the digits of $b$ and any excess digits of $b$ are set to zero (steps two
+and three).  The \textbf{used} and \textbf{sign} members of $a$ are finally copied over the respective members of
+$b$.
+
+\textbf{Remark.}  This algorithm also introduces a new idiosyncrasy that will be used throughout the rest of the
+text.  The error return codes of other algorithms are not explicitly checked in the pseudo-code presented.  For example, in 
+step one of the mp\_copy algorithm the return of mp\_grow is not explicitly checked to ensure it succeeded.  Text space is 
+limited so it is assumed that if an algorithm fails it will clear all temporarily allocated mp\_ints and return
+the error code itself.  However, the C code presented will demonstrate all of the error handling logic required to 
+implement the pseudo-code.
+
+EXAM,bn_mp_copy.c
+
+Occasionally a dependent algorithm may copy an mp\_int effectively into itself such as when the input and output
+mp\_int structures passed to a function are one and the same.  For this case it is optimal to return immediately without 
+copying digits (line @24,a == b@).  
+
+The mp\_int $b$ must have enough digits to accommodate the used digits of the mp\_int $a$.  If $b.alloc$ is less than
+$a.used$ the algorithm mp\_grow is used to augment the precision of $b$ (lines @29,alloc@ to @33,}@).  In order to
+simplify the inner loop that copies the digits from $a$ to $b$, two aliases $tmpa$ and $tmpb$ point directly at the digits
+of the mp\_ints $a$ and $b$ respectively.  These aliases (lines @42,tmpa@ and @45,tmpb@) allow the compiler to access the digits without first dereferencing the
+mp\_int pointers and then subsequently the pointer to the digits.  
+
+After the aliases are established the digits from $a$ are copied into $b$ (lines @48,for@ to @50,}@) and then the excess 
+digits of $b$ are set to zero (lines @53,for@ to @55,}@).  Both ``for'' loops make use of the pointer aliases and in 
+fact the alias for $b$ is carried through into the second ``for'' loop to clear the excess digits.  This optimization 
+allows the alias to stay in a machine register fairly easily between the two loops.
+
+\textbf{Remarks.}  The use of pointer aliases is an implementation methodology first introduced in this function that will
+be used considerably in other functions.  Technically, a pointer alias is simply a short hand alias used to lower the 
+number of pointer dereferencing operations required to access data.  For example, a for loop may resemble
+
+\begin{alltt}
+for (x = 0; x < 100; x++) \{
+    a->num[4]->dp[x] = 0;
+\}
+\end{alltt}
+
+This could be re-written using aliases as 
+
+\begin{alltt}
+mp_digit *tmpa;
+tmpa = a->num[4]->dp;
+for (x = 0; x < 100; x++) \{
+    *tmpa++ = 0;
+\}
+\end{alltt}
+
+In this case an alias is used to access the 
+array of digits within an mp\_int structure directly.  It may seem that a pointer alias is strictly not required 
+as a compiler may optimize out the redundant pointer operations.  However, there are two dominant reasons to use aliases.
+
+The first reason is that most compilers will not effectively optimize pointer arithmetic.  For example, some optimizations 
+may work for the Microsoft Visual C++ compiler (MSVC) and not for the GNU C Compiler (GCC).  Also some optimizations may 
+work for GCC and not MSVC.  As such it is ideal to find a common ground for as many compilers as possible.  Pointer 
+aliases optimize the code considerably before the compiler even reads the source code which means the end compiled code 
+stands a better chance of being faster.
+
+The second reason is that pointer aliases often can make an algorithm simpler to read.  Consider the first ``for'' 
+loop of the function mp\_copy() re-written to not use pointer aliases.
+
+\begin{alltt}
+    /* copy all the digits */
+    for (n = 0; n < a->used; n++) \{
+      b->dp[n] = a->dp[n];
+    \}
+\end{alltt}
+
+Whether this code is harder to read depends strongly on the individual.  However, it is quantifiably slightly more 
+complicated as there are four variables within the statement instead of just two.
+
+\subsubsection{Nested Statements}
+Another commonly used technique in the source routines is that certain sections of code are nested.  This is used in
+particular with the pointer aliases to highlight code phases.  For example, a Comba multiplier (discussed in chapter six)
+will typically have three different phases.  First the temporaries are initialized, then the columns calculated and 
+finally the carries are propagated.  In this example the middle column production phase will typically be nested as it
+uses temporary variables and aliases the most.
+
+The nesting also simplifies the source code as variables that are nested are only valid for their scope.  As a result
+the various temporary variables required do not propagate into other sections of code.
+
+
+\subsection{Creating a Clone}
+Another common operation is to make a local temporary copy of an mp\_int argument.  To initialize an mp\_int 
+and then copy another existing mp\_int into the newly initialized mp\_int will be known as creating a clone.  This is 
+useful within functions that need to modify an argument but do not wish to actually modify the original copy.  The 
+mp\_init\_copy algorithm has been designed to help perform this task.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init\_copy}. \\
+\textbf{Input}.   An mp\_int $a$ and $b$\\
+\textbf{Output}.  $a$ is initialized to be a copy of $b$. \\
+\hline \\
+1.  Init $a$.  (\textit{mp\_init}) \\
+2.  Copy $b$ to $a$.  (\textit{mp\_copy}) \\
+3.  Return the status of the copy operation. \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_init\_copy}
+\end{figure}
+
+\textbf{Algorithm mp\_init\_copy.}
+This algorithm will initialize an mp\_int variable and copy another previously initialized mp\_int variable into it.  As 
+such this algorithm will perform two operations in one step.  
+
+EXAM,bn_mp_init_copy.c
+
+This will initialize \textbf{a} and make it a verbatim copy of the contents of \textbf{b}.  Note that 
+\textbf{a} will have its own memory allocated which means that \textbf{b} may be cleared after the call
+and \textbf{a} will be left intact.  
+
+\section{Zeroing an Integer}
+Resetting an mp\_int to the default state is a common step in many algorithms.  The mp\_zero algorithm will be the algorithm used to
+perform this task.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_zero}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Zero the contents of $a$ \\
+\hline \\
+1.  $a.used \leftarrow 0$ \\
+2.  $a.sign \leftarrow$ MP\_ZPOS \\
+3.  for $n$ from 0 to $a.alloc - 1$ do \\
+\hspace{3mm}3.1  $a_n \leftarrow 0$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_zero}
+\end{figure}
+
+\textbf{Algorithm mp\_zero.}
+This algorithm simply resets an mp\_int to the default state.  
+
+EXAM,bn_mp_zero.c
+
+After the function is completed, all of the digits are zeroed, the \textbf{used} count is zeroed and the 
+\textbf{sign} variable is set to \textbf{MP\_ZPOS}.
+
+\section{Sign Manipulation}
+\subsection{Absolute Value}
+With the mp\_int representation of an integer, calculating the absolute value is trivial.  The mp\_abs algorithm will compute
+the absolute value of an mp\_int.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_abs}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Computes $b = \vert a \vert$ \\
+\hline \\
+1.  Copy $a$ to $b$.  (\textit{mp\_copy}) \\
+2.  If the copy failed return(\textit{MP\_MEM}). \\
+3.  $b.sign \leftarrow MP\_ZPOS$ \\
+4.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_abs}
+\end{figure}
+
+\textbf{Algorithm mp\_abs.}
+This algorithm computes the absolute value of an mp\_int input.  First it copies $a$ over $b$.  This is an example of an
+algorithm where the check in mp\_copy that determines if the source and destination are equal proves useful.  This allows,
+for instance, the developer to pass the same mp\_int as the source and destination to this function without additional 
+logic to handle it.
+
+EXAM,bn_mp_abs.c
+
+\subsection{Integer Negation}
+With the mp\_int representation of an integer, calculating the negation is also trivial.  The mp\_neg algorithm will compute
+the negative of an mp\_int input.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_neg}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Computes $b = -a$ \\
+\hline \\
+1.  Copy $a$ to $b$.  (\textit{mp\_copy}) \\
+2.  If the copy failed return(\textit{MP\_MEM}). \\
+3.  If $a.used = 0$ then return(\textit{MP\_OKAY}). \\
+4.  If $a.sign = MP\_ZPOS$ then do \\
+\hspace{3mm}4.1  $b.sign = MP\_NEG$. \\
+5.  else do \\
+\hspace{3mm}5.1  $b.sign = MP\_ZPOS$. \\
+6.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_neg}
+\end{figure}
+
+\textbf{Algorithm mp\_neg.}
+This algorithm computes the negation of an input.  First it copies $a$ over $b$.  If $a$ has no used digits then
+the algorithm returns immediately.  Otherwise it flips the sign flag and stores the result in $b$.  Note that if 
+$a$ had no digits then it must be positive by definition.  Had step three been omitted then the algorithm would return
+zero as negative.
+
+EXAM,bn_mp_neg.c
+
+\section{Small Constants}
+\subsection{Setting Small Constants}
+Often an mp\_int must be set to a relatively small value such as $1$ or $2$.  For these cases the mp\_set algorithm is useful.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_set}. \\
+\textbf{Input}.   An mp\_int $a$ and a digit $b$ \\
+\textbf{Output}.  Make $a$ equivalent to $b$ \\
+\hline \\
+1.  Zero $a$ (\textit{mp\_zero}). \\
+2.  $a_0 \leftarrow b \mbox{ (mod }\beta\mbox{)}$ \\
+3.  $a.used \leftarrow  \left \lbrace \begin{array}{ll}
+                              1 &  \mbox{if }a_0 > 0 \\
+                              0 &  \mbox{if }a_0 = 0 
+                              \end{array} \right .$ \\
+\hline                              
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_set}
+\end{figure}
+
+\textbf{Algorithm mp\_set.}
+This algorithm sets an mp\_int to a small single digit value.  Step number 1 ensures that the integer is reset to the default state.  The
+single digit is set (\textit{modulo $\beta$}) and the \textbf{used} count is adjusted accordingly.
+
+EXAM,bn_mp_set.c
+
+Line @21,mp_zero@ calls mp\_zero() to clear the mp\_int and reset the sign.  Line @22,MP_MASK@ copies the digit 
+into the least significant location.  Note the usage of a new constant \textbf{MP\_MASK}.  This constant is used to quickly
+reduce an integer modulo $\beta$.  Since $\beta$ is of the form $2^k$ for any suitable $k$ it suffices to perform a binary AND with 
+$MP\_MASK = 2^k - 1$ to perform the reduction.  Finally line @23,a->used@ will set the \textbf{used} member with respect to the 
+digit actually set. This function will always make the integer positive.
+
+One important limitation of this function is that it will only set one digit.  The size of a digit is not fixed, meaning source that uses 
+this function should take that into account.  Only trivially small constants can be set using this function.
+
+\subsection{Setting Large Constants}
+To overcome the limitations of the mp\_set algorithm the mp\_set\_int algorithm is ideal.  It accepts a ``long''
+data type as input and will always treat it as a 32-bit integer.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_set\_int}. \\
+\textbf{Input}.   An mp\_int $a$ and a ``long'' integer $b$ \\
+\textbf{Output}.  Make $a$ equivalent to $b$ \\
+\hline \\
+1.  Zero $a$ (\textit{mp\_zero}) \\
+2.  for $n$ from 0 to 7 do \\
+\hspace{3mm}2.1  $a \leftarrow a \cdot 16$ (\textit{mp\_mul2d}) \\
+\hspace{3mm}2.2  $u \leftarrow \lfloor b / 2^{4(7 - n)} \rfloor \mbox{ (mod }16\mbox{)}$\\
+\hspace{3mm}2.3  $a_0 \leftarrow a_0 + u$ \\
+\hspace{3mm}2.4  $a.used \leftarrow a.used + 1$ \\
+3.  Clamp excess used digits (\textit{mp\_clamp}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_set\_int}
+\end{figure}
+
+\textbf{Algorithm mp\_set\_int.}
+The algorithm performs eight iterations of a simple loop where in each iteration four bits from the source are added to the 
+mp\_int.  Step 2.1 will multiply the current result by sixteen making room for four more bits in the less significant positions.  In steps 2.2 and 2.3 the
+next four bits from the source are extracted and added to the mp\_int. The \textbf{used} digit count is 
+incremented to reflect the addition.  The \textbf{used} digit counter is incremented since if any of the leading digits were zero the mp\_int would have
+zero digits used and the newly added four bits would be ignored.
+
+Excess zero digits are trimmed in steps 2.1 and 3 by using higher level algorithms mp\_mul2d and mp\_clamp.
+
+EXAM,bn_mp_set_int.c
+
+This function sets four bits of the number at a time to handle all practical \textbf{DIGIT\_BIT} sizes.  The weird
+addition on line @38,a->used@ ensures that the newly added in bits are added to the number of digits.  While it may not 
+seem obvious as to why the digit counter does not grow exceedingly large it is because of the shift on line @27,mp_mul_2d@ 
+as well as the  call to mp\_clamp() on line @40,mp_clamp@.  Both functions will clamp excess leading digits which keeps 
+the number of used digits low.
+
+\section{Comparisons}
+\subsection{Unsigned Comparisons}
+Comparing a multiple precision integer is performed with the exact same algorithm used to compare two decimal numbers.  For example,
+to compare $1,234$ to $1,264$ the digits are extracted by their positions.  That is we compare $1 \cdot 10^3 + 2 \cdot 10^2 + 3 \cdot 10^1 + 4 \cdot 10^0$
+to $1 \cdot 10^3 + 2 \cdot 10^2 + 6 \cdot 10^1 + 4 \cdot 10^0$ by comparing single digits at a time starting with the highest magnitude 
+positions.  If any leading digit of one integer is greater than a digit in the same position of another integer then obviously it must be greater.  
+
+The first comparison routine that will be developed is the unsigned magnitude compare which will perform a comparison based on the digits of two
+mp\_int variables alone.  It will ignore the sign of the two inputs.  Such a function is useful when an absolute comparison is required or if the 
+signs are known to agree in advance.
+
+To facilitate working with the results of the comparison functions three constants are required.  
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{|r|l|}
+\hline \textbf{Constant} & \textbf{Meaning} \\
+\hline \textbf{MP\_GT} & Greater Than \\
+\hline \textbf{MP\_EQ} & Equal To \\
+\hline \textbf{MP\_LT} & Less Than \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Comparison Return Codes}
+\end{figure}
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_cmp\_mag}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$.  \\
+\textbf{Output}.  Unsigned comparison results ($a$ to the left of $b$). \\
+\hline \\
+1.  If $a.used > b.used$ then return(\textit{MP\_GT}) \\
+2.  If $a.used < b.used$ then return(\textit{MP\_LT}) \\
+3.  for n from $a.used - 1$ to 0 do \\
+\hspace{+3mm}3.1  if $a_n > b_n$ then return(\textit{MP\_GT}) \\
+\hspace{+3mm}3.2  if $a_n < b_n$ then return(\textit{MP\_LT}) \\
+4.  Return(\textit{MP\_EQ}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_cmp\_mag}
+\end{figure}
+
+\textbf{Algorithm mp\_cmp\_mag.}
+By saying ``$a$ to the left of $b$'' it is meant that the comparison is with respect to $a$, that is if $a$ is greater than $b$ it will return
+\textbf{MP\_GT} and similar with respect to when $a = b$ and $a < b$.  The first two steps compare the number of digits used in both $a$ and $b$.  
+Obviously if the digit counts differ there would be an imaginary zero digit in the smaller number where the leading digit of the larger number is.  
+If both have the same number of digits then the actual digits themselves must be compared starting at the leading digit.  
+
+By step three both inputs must have the same number of digits so it's safe to start from either $a.used - 1$ or $b.used - 1$ and count down to
+the zero'th digit.  If after all of the digits have been compared, no difference is found, the algorithm returns \textbf{MP\_EQ}.
+
+EXAM,bn_mp_cmp_mag.c
+
+The two if statements on lines @24,if@ and @28,if@ compare the number of digits in the two inputs.  These two are performed before all of the digits
+are compared since it is a very cheap test to perform and can potentially save considerable time.  The implementation given is also not valid 
+without those two statements.  $b.alloc$ may be smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the 
+array of digits.
+
+\subsection{Signed Comparisons}
+Comparing with sign considerations is also fairly critical in several routines (\textit{division for example}).  Based on an unsigned magnitude 
+comparison a trivial signed comparison algorithm can be written.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_cmp}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$ \\
+\textbf{Output}.  Signed Comparison Results ($a$ to the left of $b$) \\
+\hline \\
+1.  if $a.sign = MP\_NEG$ and $b.sign = MP\_ZPOS$ then return(\textit{MP\_LT}) \\
+2.  if $a.sign = MP\_ZPOS$ and $b.sign = MP\_NEG$ then return(\textit{MP\_GT}) \\
+3.  if $a.sign = MP\_NEG$ then \\
+\hspace{+3mm}3.1  Return the unsigned comparison of $b$ and $a$ (\textit{mp\_cmp\_mag}) \\
+4.  Otherwise \\
+\hspace{+3mm}4.1  Return the unsigned comparison of $a$ and $b$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_cmp}
+\end{figure}
+
+\textbf{Algorithm mp\_cmp.}
+The first two steps compare the signs of the two inputs.  If the signs do not agree then it can return right away with the appropriate 
+comparison code.  When the signs are equal the digits of the inputs must be compared to determine the correct result.  In step 
+three the unsigned comparison flips the order of the arguments since they are both negative.  For instance, if $-a > -b$ then 
+$\vert a \vert < \vert b \vert$.  Step number four will compare the two when they are both positive.
+
+EXAM,bn_mp_cmp.c
+
+The two if statements on lines @22,if@ and @26,if@ perform the initial sign comparison.  If the signs are not equal then whichever
+has the positive sign is larger.   At line @30,if@, the inputs are compared based on magnitudes.  If the signs were both negative then 
+the unsigned comparison is performed in the opposite direction (\textit{line @31,mp_cmp_mag@}).  Otherwise, the signs are assumed to 
+be both positive and a forward direction unsigned comparison is performed.
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 2 \right ]$ & Modify algorithm mp\_set\_int to accept as input a variable length array of bits. \\
+                     & \\
+$\left [ 3 \right ]$ & Give the probability that algorithm mp\_cmp\_mag will have to compare $k$ digits  \\
+                     & of two random integers (of equal magnitude) before a difference is found. \\
+                     & \\
+$\left [ 1 \right ]$ & Suggest a simple method to speed up the implementation of mp\_cmp\_mag based  \\
+                     & on the observations made in the previous problem. \\
+                     &
+\end{tabular}
+
+\chapter{Basic Arithmetic}
+\section{Introduction}
+At this point algorithms for initialization, clearing, zeroing, copying, comparing and setting small constants have been 
+established.  The next logical set of algorithms to develop are addition, subtraction and digit shifting algorithms.  These 
+algorithms make use of the lower level algorithms and are the crucial building block for the multiplication algorithms.  It is very important 
+that these algorithms are highly optimized.  On their own they are simple $O(n)$ algorithms but they can be called from higher level algorithms 
+which easily places them at $O(n^2)$ or even $O(n^3)$ work levels.  
+
+MARK,SHIFTS
+All of the algorithms within this chapter make use of the logical bit shift operations denoted by $<<$ and $>>$ for left and right 
+logical shifts respectively.  A logical shift is analogous to sliding the decimal point of radix-10 representations.  For example, the real 
+number $0.9345$ is equivalent to $93.45\%$ which is found by sliding the decimal point two places to the right (\textit{multiplying by $\beta^2 = 10^2$}).  
+Algebraically a binary logical shift is equivalent to a division or multiplication by a power of two.  
+For example, $a << k = a \cdot 2^k$ while $a >> k = \lfloor a/2^k \rfloor$.
+
+One significant difference between a logical shift and the way decimals are shifted is that digits below the zero'th position are removed
+from the number.  For example, consider $1101_2 >> 1$ using decimal notation this would produce $110.1_2$.  However, with a logical shift the 
+result is $110_2$.  
+
+\section{Addition and Subtraction}
+In common two's complement fixed precision arithmetic negative numbers are easily represented by subtraction from the modulus.  For example, with 32-bit integers
+$a - b\mbox{ (mod }2^{32}\mbox{)}$ is the same as $a + (2^{32} - b) \mbox{ (mod }2^{32}\mbox{)}$  since $2^{32} \equiv 0 \mbox{ (mod }2^{32}\mbox{)}$.  
+As a result subtraction can be performed with a trivial series of logical operations and an addition.
+
+However, in multiple precision arithmetic negative numbers are not represented in the same way.  Instead a sign flag is used to keep track of the
+sign of the integer.  As a result signed addition and subtraction are actually implemented as conditional usage of lower level addition or 
+subtraction algorithms with the sign fixed up appropriately.
+
+The lower level algorithms will add or subtract integers without regard to the sign flag.  That is they will add or subtract the magnitude of
+the integers respectively.
+
+\subsection{Low Level Addition}
+An unsigned addition of multiple precision integers is performed with the same long-hand algorithm used to add decimal numbers.  That is to add the 
+trailing digits first and propagate the resulting carry upwards.  Since this is a lower level algorithm the name will have a ``s\_'' prefix.  
+Historically that convention stems from the MPI library where ``s\_'' stood for static functions that were hidden from the developer entirely.
+
+\newpage
+\begin{figure}[!here]
+\begin{center}
+\begin{small}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_add}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$ \\
+\textbf{Output}.  The unsigned addition $c = \vert a \vert + \vert b \vert$. \\
+\hline \\
+1.  if $a.used > b.used$ then \\
+\hspace{+3mm}1.1  $min \leftarrow b.used$ \\
+\hspace{+3mm}1.2  $max \leftarrow a.used$ \\
+\hspace{+3mm}1.3  $x   \leftarrow a$ \\
+2.  else  \\
+\hspace{+3mm}2.1  $min \leftarrow a.used$ \\
+\hspace{+3mm}2.2  $max \leftarrow b.used$ \\
+\hspace{+3mm}2.3  $x   \leftarrow b$ \\
+3.  If $c.alloc < max + 1$ then grow $c$ to hold at least $max + 1$ digits (\textit{mp\_grow}) \\
+4.  $oldused \leftarrow c.used$ \\
+5.  $c.used \leftarrow max + 1$ \\
+6.  $u \leftarrow 0$ \\
+7.  for $n$ from $0$ to $min - 1$ do \\
+\hspace{+3mm}7.1  $c_n \leftarrow a_n + b_n + u$ \\
+\hspace{+3mm}7.2  $u \leftarrow c_n >> lg(\beta)$ \\
+\hspace{+3mm}7.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+8.  if $min \ne max$ then do \\
+\hspace{+3mm}8.1  for $n$ from $min$ to $max - 1$ do \\
+\hspace{+6mm}8.1.1  $c_n \leftarrow x_n + u$ \\
+\hspace{+6mm}8.1.2  $u \leftarrow c_n >> lg(\beta)$ \\
+\hspace{+6mm}8.1.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+9.  $c_{max} \leftarrow u$ \\
+10.  if $oldused > max$ then \\
+\hspace{+3mm}10.1  for $n$ from $max + 1$ to $oldused - 1$ do \\
+\hspace{+6mm}10.1.1  $c_n \leftarrow 0$ \\
+11.  Clamp excess digits in $c$.  (\textit{mp\_clamp}) \\
+12.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Algorithm s\_mp\_add}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_add.}
+This algorithm is loosely based on algorithm 14.7 of HAC \cite[pp. 594]{HAC} but has been extended to allow the inputs to have different magnitudes.  
+Coincidentally the description of algorithm A in Knuth \cite[pp. 266]{TAOCPV2} shares the same deficiency as the algorithm from \cite{HAC}.  Even the 
+MIX pseudo  machine code presented by Knuth \cite[pp. 266-267]{TAOCPV2} is incapable of handling inputs which are of different magnitudes.
+
+The first thing that has to be accomplished is to sort out which of the two inputs is the largest.  The addition logic
+will simply add all of the smallest input to the largest input and store that first part of the result in the
+destination.  Then it will apply a simpler addition loop to excess digits of the larger input.
+
+The first two steps will handle sorting the inputs such that $min$ and $max$ hold the digit counts of the two 
+inputs.  The variable $x$ will be an mp\_int alias for the largest input or the second input $b$ if they have the
+same number of digits.  After the inputs are sorted the destination $c$ is grown as required to accommodate the sum 
+of the two inputs.  The original \textbf{used} count of $c$ is copied and set to the new used count.  
+
+At this point the first addition loop will go through as many digit positions that both inputs have.  The carry
+variable $u$ is set to zero outside the loop.  Inside the loop an ``addition'' step requires three statements to produce
+one digit of the sum.  First
+two digits from $a$ and $b$ are added together along with the carry $u$.  The carry of this step is extracted and stored
+in $u$ and finally the digit of the result $c_n$ is truncated within the range $0 \le c_n < \beta$.
+
+Now all of the digit positions that both inputs have in common have been exhausted.  If $min \ne max$ then $x$ is an alias
+for one of the inputs that has more digits.  A simplified addition loop is then used to essentially copy the remaining digits
+and the carry to the destination.
+
+The final carry is stored in $c_{max}$ and digits above $max$ up to $oldused$ are zeroed which completes the addition.
+
+
+EXAM,bn_s_mp_add.c
+
+Lines @27,if@ to @35,}@ perform the initial sorting of the inputs and determine the $min$ and $max$ variables.  Note that $x$ is a pointer to a 
+mp\_int assigned to the largest input, in effect it is a local alias.  Lines @37,init@ to @42,}@ ensure that the destination is grown to 
+accommodate the result of the addition. 
+
+Similar to the implementation of mp\_copy this function uses the braced code and local aliases coding style.  The three aliases that are on 
+lines @56,tmpa@, @59,tmpb@ and @62,tmpc@ represent the two inputs and destination variables respectively.  These aliases are used to ensure the
+compiler does not have to dereference $a$, $b$ or $c$ (respectively) to access the digits of the respective mp\_int.
+
+The initial carry $u$ is cleared on line @65,u = 0@, note that $u$ is of type mp\_digit which ensures type compatibility within the 
+implementation.  The initial addition loop begins on line @66,for@ and ends on line @75,}@.  Similarly the conditional addition loop
+begins on line @81,for@ and ends on line @90,}@.  The addition is finished with the final carry being stored in $tmpc$ on line @94,tmpc++@.  
+Note the ``++'' operator on the same line.  After line @94,tmpc++@ $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$.  This is useful
+for the next loop on lines @97,for@ to @99,}@ which set any old upper digits to zero.
+
+\subsection{Low Level Subtraction}
+The low level unsigned subtraction algorithm is very similar to the low level unsigned addition algorithm.  The principal difference is that the
+unsigned subtraction algorithm requires the result to be positive.  That is when computing $a - b$ the condition $\vert a \vert \ge \vert b\vert$ must 
+be met for this algorithm to function properly.  Keep in mind this low level algorithm is not meant to be used in higher level algorithms directly.  
+This algorithm as will be shown can be used to create functional signed addition and subtraction algorithms.
+
+MARK,GAMMA
+
+For this algorithm a new variable is required to make the description simpler.  Recall from section 1.3.1 that a mp\_digit must be able to represent
+the range $0 \le x < 2\beta$ for the algorithms to work correctly.  However, it is allowable that a mp\_digit represent a larger range of values.  For 
+this algorithm we will assume that the variable $\gamma$ represents the number of bits available in a 
+mp\_digit (\textit{this implies $2^{\gamma} > \beta$}).  
+
+For example, the default for LibTomMath is to use a ``unsigned long'' for the mp\_digit ``type'' while $\beta = 2^{28}$.  In ISO C an ``unsigned long''
+data type must be able to represent $0 \le x < 2^{32}$ meaning that in this case $\gamma = 32$.
+
+\newpage\begin{figure}[!here]
+\begin{center}
+\begin{small}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_sub}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$ ($\vert a \vert \ge \vert b \vert$) \\
+\textbf{Output}.  The unsigned subtraction $c = \vert a \vert - \vert b \vert$. \\
+\hline \\
+1.  $min \leftarrow b.used$ \\
+2.  $max \leftarrow a.used$ \\
+3.  If $c.alloc < max$ then grow $c$ to hold at least $max$ digits.  (\textit{mp\_grow}) \\
+4.  $oldused \leftarrow c.used$ \\ 
+5.  $c.used \leftarrow max$ \\
+6.  $u \leftarrow 0$ \\
+7.  for $n$ from $0$ to $min - 1$ do \\
+\hspace{3mm}7.1  $c_n \leftarrow a_n - b_n - u$ \\
+\hspace{3mm}7.2  $u   \leftarrow c_n >> (\gamma - 1)$ \\
+\hspace{3mm}7.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+8.  if $min < max$ then do \\
+\hspace{3mm}8.1  for $n$ from $min$ to $max - 1$ do \\
+\hspace{6mm}8.1.1  $c_n \leftarrow a_n - u$ \\
+\hspace{6mm}8.1.2  $u   \leftarrow c_n >> (\gamma - 1)$ \\
+\hspace{6mm}8.1.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+9. if $oldused > max$ then do \\
+\hspace{3mm}9.1  for $n$ from $max$ to $oldused - 1$ do \\
+\hspace{6mm}9.1.1  $c_n \leftarrow 0$ \\
+10. Clamp excess digits of $c$.  (\textit{mp\_clamp}). \\
+11. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Algorithm s\_mp\_sub}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_sub.}
+This algorithm performs the unsigned subtraction of two mp\_int variables under the restriction that the result must be positive.  That is when
+passing variables $a$ and $b$ the condition that $\vert a \vert \ge \vert b \vert$ must be met for the algorithm to function correctly.  This
+algorithm is loosely based on algorithm 14.9 \cite[pp. 595]{HAC} and is similar to algorithm S in \cite[pp. 267]{TAOCPV2} as well.  As was the case
+of the algorithm s\_mp\_add both other references lack discussion concerning various practical details such as when the inputs differ in magnitude.
+
+The initial sorting of the inputs is trivial in this algorithm since $a$ is guaranteed to have at least the same magnitude of $b$.  Steps 1 and 2 
+set the $min$ and $max$ variables.  Unlike the addition routine there is guaranteed to be no carry which means that the final result can be at 
+most $max$ digits in length as opposed to $max + 1$.  Similar to the addition algorithm the \textbf{used} count of $c$ is copied locally and 
+set to the maximal count for the operation.
+
+The subtraction loop that begins on step seven is essentially the same as the addition loop of algorithm s\_mp\_add except single precision 
+subtraction is used instead.  Note the use of the $\gamma$ variable to extract the carry (\textit{also known as the borrow}) within the subtraction 
+loops.  Under the assumption that two's complement single precision arithmetic is used this will successfully extract the desired carry.  
+
+For example, consider subtracting $0101_2$ from $0100_2$ where $\gamma = 4$ and $\beta = 2$.  The least significant bit will force a carry upwards to 
+the third bit which will be set to zero after the borrow.  After the very first bit has been subtracted $4 - 1 \equiv 0011_2$ will remain.  When the 
+third bit of $0101_2$ is subtracted from the result it will cause another carry.  In this case though the carry will be forced to propagate all the 
+way to the most significant bit.  
+
+Recall that $\beta < 2^{\gamma}$.  This means that if a carry does occur just before the $lg(\beta)$'th bit it will propagate all the way to the most 
+significant bit.  Thus, the high order bits of the mp\_digit that are not part of the actual digit will either be all zero, or all one. All that
+is needed is a single zero or one bit for the carry.  Therefore a single logical shift right by $\gamma - 1$ positions is sufficient to extract the 
+carry.  This method of carry extraction may seem awkward but the reason for it becomes apparent when the implementation is discussed.  
+
+If $b$ has a smaller magnitude than $a$ then step 8 will force the carry and copy operation to propagate through the larger input $a$ into $c$.  Step
+9 will ensure that any leading digits of $c$ above the $max$'th position are zeroed.
+
+EXAM,bn_s_mp_sub.c
+
+Line @24,min@ and @25,max@ perform the initial hardcoded sorting of the inputs.  In reality the $min$ and $max$ variables are only aliases and are only 
+used to make the source code easier to read.  Again the pointer alias optimization is used within this algorithm.  Lines @42,tmpa@, @43,tmpb@ and @44,tmpc@ initialize the aliases for 
+$a$, $b$ and $c$ respectively.
+
+The first subtraction loop occurs on lines @47,u = 0@ through @61,}@.  The theory behind the subtraction loop is exactly the same as that for
+the addition loop.  As remarked earlier there is an implementation reason for using the ``awkward'' method of extracting the carry 
+(\textit{see line @57, >>@}).  The traditional method for extracting the carry would be to shift by $lg(\beta)$ positions and logically AND 
+the least significant bit.  The AND operation is required because all of the bits above the $\lg(\beta)$'th bit will be set to one after a carry
+occurs from subtraction.  This carry extraction requires two relatively cheap operations to extract the carry.  The other method is to simply 
+shift the most significant bit to the least significant bit thus extracting the carry with a single cheap operation.  This optimization only works on
+two's complement machines which is a safe assumption to make.
+
+If $a$ has a larger magnitude than $b$ an additional loop (\textit{see lines @64,for@ through @73,}@}) is required to propagate the carry through
+$a$ and copy the result to $c$.  
+
+\subsection{High Level Addition}
+Now that both lower level addition and subtraction algorithms have been established an effective high level signed addition algorithm can be
+established.  This high level addition algorithm will be what other algorithms and developers will use to perform addition of mp\_int data 
+types.  
+
+Recall from section 5.2 that an mp\_int represents an integer with an unsigned mantissa (\textit{the array of digits}) and a \textbf{sign} 
+flag.  A high level addition is actually performed as a series of eight separate cases which can be optimized down to three unique cases.
+
+\begin{figure}[!here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_add}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$  \\
+\textbf{Output}.  The signed addition $c = a + b$. \\
+\hline \\
+1.  if $a.sign = b.sign$ then do \\
+\hspace{3mm}1.1  $c.sign \leftarrow a.sign$  \\
+\hspace{3mm}1.2  $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{s\_mp\_add})\\
+2.  else do \\
+\hspace{3mm}2.1  if $\vert a \vert < \vert b \vert$ then do (\textit{mp\_cmp\_mag})  \\
+\hspace{6mm}2.1.1  $c.sign \leftarrow b.sign$ \\
+\hspace{6mm}2.1.2  $c \leftarrow \vert b \vert - \vert a \vert$ (\textit{s\_mp\_sub}) \\
+\hspace{3mm}2.2  else do \\
+\hspace{6mm}2.2.1  $c.sign \leftarrow a.sign$ \\
+\hspace{6mm}2.2.2  $c \leftarrow \vert a \vert - \vert b \vert$ \\
+3.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_add}
+\end{figure}
+
+\textbf{Algorithm mp\_add.}
+This algorithm performs the signed addition of two mp\_int variables.  There is no reference algorithm to draw upon from 
+either \cite{TAOCPV2} or \cite{HAC} since they both only provide unsigned operations.  The algorithm is fairly 
+straightforward but restricted since subtraction can only produce positive results.
+
+\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|}
+\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert > \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\
+\hline $+$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $+$ & $+$ & No  & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $-$ & No  & $c = a + b$ & $a.sign$ \\
+\hline &&&&\\
+
+\hline $+$ & $-$ & No  & $c = b - a$ & $b.sign$ \\
+\hline $-$ & $+$ & No  & $c = b - a$ & $b.sign$ \\
+
+\hline &&&&\\
+
+\hline $+$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\
+\hline $-$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\
+
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Addition Guide Chart}
+\label{fig:AddChart}
+\end{figure}
+
+Figure~\ref{fig:AddChart} lists all of the eight possible input combinations and is sorted to show that only three 
+specific cases need to be handled.  The return code of the unsigned operations at step 1.2, 2.1.2 and 2.2.2 are 
+forwarded to step three to check for errors.  This simplifies the description of the algorithm considerably and best 
+follows how the implementation actually was achieved.
+
+Also note how the \textbf{sign} is set before the unsigned addition or subtraction is performed.  Recall from the descriptions of algorithms
+s\_mp\_add and s\_mp\_sub that the mp\_clamp function is used at the end to trim excess digits.  The mp\_clamp algorithm will set the \textbf{sign}
+to \textbf{MP\_ZPOS} when the \textbf{used} digit count reaches zero.
+
+For example, consider performing $-a + a$ with algorithm mp\_add.  By the description of the algorithm the sign is set to \textbf{MP\_NEG} which would
+produce a result of $-0$.  However, since the sign is set first then the unsigned subtraction is performed the subsequent usage of algorithm mp\_clamp 
+within algorithm s\_mp\_sub will force $-0$ to become $0$.  
+
+EXAM,bn_mp_add.c
+
+The source code follows the algorithm fairly closely.  The most notable new source code addition is the usage of the $res$ integer variable which
+is used to pass result of the unsigned operations forward.  Unlike in the algorithm, the variable $res$ is merely returned as is without
+explicitly checking it and returning the constant \textbf{MP\_OKAY}.  The observation is this algorithm will succeed or fail only if the lower
+level functions do so.  Returning their return code is sufficient.
+
+\subsection{High Level Subtraction}
+The high level signed subtraction algorithm is essentially the same as the high level signed addition algorithm.  
+
+\newpage\begin{figure}[!here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_sub}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$  \\
+\textbf{Output}.  The signed subtraction $c = a - b$. \\
+\hline \\
+1.  if $a.sign \ne b.sign$ then do \\
+\hspace{3mm}1.1  $c.sign \leftarrow a.sign$ \\
+\hspace{3mm}1.2  $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{s\_mp\_add}) \\
+2.  else do \\
+\hspace{3mm}2.1  if $\vert a \vert \ge \vert b \vert$ then do (\textit{mp\_cmp\_mag}) \\
+\hspace{6mm}2.1.1  $c.sign \leftarrow a.sign$ \\
+\hspace{6mm}2.1.2  $c \leftarrow \vert a \vert  - \vert b \vert$ (\textit{s\_mp\_sub}) \\
+\hspace{3mm}2.2  else do \\
+\hspace{6mm}2.2.1  $c.sign \leftarrow  \left \lbrace \begin{array}{ll}
+                              MP\_ZPOS &  \mbox{if }a.sign = MP\_NEG \\
+                              MP\_NEG  &  \mbox{otherwise} \\
+                              \end{array} \right .$ \\
+\hspace{6mm}2.2.2  $c \leftarrow \vert b \vert  - \vert a \vert$ \\
+3.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_sub}
+\end{figure}
+
+\textbf{Algorithm mp\_sub.}
+This algorithm performs the signed subtraction of two inputs.  Similar to algorithm mp\_add there is no reference in either \cite{TAOCPV2} or 
+\cite{HAC}.  Also this algorithm is restricted by algorithm s\_mp\_sub.  Chart \ref{fig:SubChart} lists the eight possible inputs and
+the operations required.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|}
+\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert \ge \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\
+\hline $+$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $+$ & $-$ & No  & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $+$ & No  & $c = a + b$ & $a.sign$ \\
+\hline &&&& \\
+\hline $+$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\
+\hline $-$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\
+\hline &&&& \\
+\hline $+$ & $+$ & No  & $c = b - a$ & $\mbox{opposite of }a.sign$ \\
+\hline $-$ & $-$ & No  & $c = b - a$ & $\mbox{opposite of }a.sign$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Subtraction Guide Chart}
+\label{fig:SubChart}
+\end{figure}
+
+Similar to the case of algorithm mp\_add the \textbf{sign} is set first before the unsigned addition or subtraction.  That is to prevent the 
+algorithm from producing $-a - -a = -0$ as a result.  
+
+EXAM,bn_mp_sub.c
+
+Much like the implementation of algorithm mp\_add the variable $res$ is used to catch the return code of the unsigned addition or subtraction operations
+and forward it to the end of the function.  On line @38, != MP_LT@ the ``not equal to'' \textbf{MP\_LT} expression is used to emulate a 
+``greater than or equal to'' comparison.  
+
+\section{Bit and Digit Shifting}
+MARK,POLY
+It is quite common to think of a multiple precision integer as a polynomial in $x$, that is $y = f(\beta)$ where $f(x) = \sum_{i=0}^{n-1} a_i x^i$.  
+This notation arises within discussion of Montgomery and Diminished Radix Reduction as well as Karatsuba multiplication and squaring.  
+
+In order to facilitate operations on polynomials in $x$ as above a series of simple ``digit'' algorithms have to be established.  That is to shift
+the digits left or right as well as to shift individual bits of the digits left and right.  It is important to note that not all ``shift'' operations
+are on radix-$\beta$ digits.  
+
+\subsection{Multiplication by Two}
+
+In a binary system where the radix is a power of two, multiplication by two not only arises often in other algorithms, it is also a fairly efficient 
+operation to perform.  A single precision logical shift left is sufficient to multiply a single digit by two.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul\_2}. \\
+\textbf{Input}.   One mp\_int $a$ \\
+\textbf{Output}.  $b = 2a$. \\
+\hline \\
+1.  If $b.alloc < a.used + 1$ then grow $b$ to hold $a.used + 1$ digits.  (\textit{mp\_grow}) \\
+2.  $oldused \leftarrow b.used$ \\
+3.  $b.used \leftarrow a.used$ \\
+4.  $r \leftarrow 0$ \\
+5.  for $n$ from 0 to $a.used - 1$ do \\
+\hspace{3mm}5.1  $rr \leftarrow a_n >> (lg(\beta) - 1)$ \\
+\hspace{3mm}5.2  $b_n \leftarrow (a_n << 1) + r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}5.3  $r \leftarrow rr$ \\
+6.  If $r \ne 0$ then do \\
+\hspace{3mm}6.1  $b_{n + 1} \leftarrow r$ \\
+\hspace{3mm}6.2  $b.used \leftarrow b.used + 1$ \\
+7.  If $b.used < oldused - 1$ then do \\
+\hspace{3mm}7.1  for $n$ from $b.used$ to $oldused - 1$ do \\
+\hspace{6mm}7.1.1  $b_n \leftarrow 0$ \\
+8.  $b.sign \leftarrow a.sign$ \\
+9.  Return(\textit{MP\_OKAY}).\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul\_2}
+\end{figure}
+
+\textbf{Algorithm mp\_mul\_2.}
+This algorithm will quickly multiply a mp\_int by two provided $\beta$ is a power of two.  Neither \cite{TAOCPV2} nor \cite{HAC} describe such 
+an algorithm despite the fact it arises often in other algorithms.  The algorithm is setup much like the lower level algorithm s\_mp\_add since 
+it is for all intents and purposes equivalent to the operation $b = \vert a \vert + \vert a \vert$.  
+
+Step 1 grows the destination $b$ as required to accommodate the maximum number of \textbf{used} digits in the result.  The initial \textbf{used} count
+is set to $a.used$ at step 3.  Only if there is a final carry will the \textbf{used} count require adjustment.
+
+Step 5 is an optimized implementation of the addition loop for this specific case.  That is since the two values being added together 
+are the same there is no need to perform two reads from the digits of $a$.  Step 5.1 performs a single precision shift on the current digit $a_n$ to
+obtain what will be the carry for the next iteration.  Step 5.2 calculates the $n$'th digit of the result as single precision shift of $a_n$ plus
+the previous carry.  Recall from ~SHIFTS~ that $a_n << 1$ is equivalent to $a_n \cdot 2$.  An iteration of the addition loop is finished with 
+forwarding the carry to the next iteration.
+
+Step 6 takes care of any final carry by setting the $a.used$'th digit of the result to the carry and augmenting the \textbf{used} count of $b$.  
+Step 7 clears any leading digits of $b$ in case it originally had a larger magnitude than $a$.
+
+EXAM,bn_mp_mul_2.c
+
+This implementation is essentially an optimized implementation of s\_mp\_add for the case of doubling an input.  The only noteworthy difference
+is the use of the logical shift operator on line @52,<<@ to perform a single precision doubling.  
+
+\subsection{Division by Two}
+A division by two can just as easily be accomplished with a logical shift right as multiplication by two can be with a logical shift left.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div\_2}. \\
+\textbf{Input}.   One mp\_int $a$ \\
+\textbf{Output}.  $b = a/2$. \\
+\hline \\
+1.  If $b.alloc < a.used$ then grow $b$ to hold $a.used$ digits.  (\textit{mp\_grow}) \\
+2.  If the reallocation failed return(\textit{MP\_MEM}). \\
+3.  $oldused \leftarrow b.used$ \\
+4.  $b.used \leftarrow a.used$ \\
+5.  $r \leftarrow 0$ \\
+6.  for $n$ from $b.used - 1$ to $0$ do \\
+\hspace{3mm}6.1  $rr \leftarrow a_n \mbox{ (mod }2\mbox{)}$\\
+\hspace{3mm}6.2  $b_n \leftarrow (a_n >> 1) + (r << (lg(\beta) - 1)) \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}6.3  $r \leftarrow rr$ \\
+7.  If $b.used < oldused - 1$ then do \\
+\hspace{3mm}7.1  for $n$ from $b.used$ to $oldused - 1$ do \\
+\hspace{6mm}7.1.1  $b_n \leftarrow 0$ \\
+8.  $b.sign \leftarrow a.sign$ \\
+9.  Clamp excess digits of $b$.  (\textit{mp\_clamp}) \\
+10.  Return(\textit{MP\_OKAY}).\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div\_2}
+\end{figure}
+
+\textbf{Algorithm mp\_div\_2.}
+This algorithm will divide an mp\_int by two using logical shifts to the right.  Like mp\_mul\_2 it uses a modified low level addition
+core as the basis of the algorithm.  Unlike mp\_mul\_2 the shift operations work from the leading digit to the trailing digit.  The algorithm
+could be written to work from the trailing digit to the leading digit however, it would have to stop one short of $a.used - 1$ digits to prevent
+reading past the end of the array of digits.
+
+Essentially the loop at step 6 is similar to that of mp\_mul\_2 except the logical shifts go in the opposite direction and the carry is at the 
+least significant bit not the most significant bit.  
+
+EXAM,bn_mp_div_2.c
+
+\section{Polynomial Basis Operations}
+Recall from ~POLY~ that any integer can be represented as a polynomial in $x$ as $y = f(\beta)$.  Such a representation is also known as
+the polynomial basis \cite[pp. 48]{ROSE}. Given such a notation a multiplication or division by $x$ amounts to shifting whole digits a single 
+place.  The need for such operations arises in several other higher level algorithms such as Barrett and Montgomery reduction, integer
+division and Karatsuba multiplication.  
+
+Converting from an array of digits to polynomial basis is very simple.  Consider the integer $y \equiv (a_2, a_1, a_0)_{\beta}$ and recall that
+$y = \sum_{i=0}^{2} a_i \beta^i$.  Simply replace $\beta$ with $x$ and the expression is in polynomial basis.  For example, $f(x) = 8x + 9$ is the
+polynomial basis representation for $89$ using radix ten.  That is, $f(10) = 8(10) + 9 = 89$.  
+
+\subsection{Multiplication by $x$}
+
+Given a polynomial in $x$ such as $f(x) = a_n x^n + a_{n-1} x^{n-1} + ... + a_0$ multiplying by $x$ amounts to shifting the coefficients up one 
+degree.  In this case $f(x) \cdot x = a_n x^{n+1} + a_{n-1} x^n + ... + a_0 x$.  From a scalar basis point of view multiplying by $x$ is equivalent to
+multiplying by the integer $\beta$.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_lshd}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $a \leftarrow a \cdot \beta^b$ (equivalent to multiplication by $x^b$). \\
+\hline \\
+1.  If $b \le 0$ then return(\textit{MP\_OKAY}). \\
+2.  If $a.alloc < a.used + b$ then grow $a$ to at least $a.used + b$ digits.  (\textit{mp\_grow}). \\
+3.  If the reallocation failed return(\textit{MP\_MEM}). \\
+4.  $a.used \leftarrow a.used + b$ \\
+5.  $i \leftarrow a.used - 1$ \\
+6.  $j \leftarrow a.used - 1 - b$ \\
+7.  for $n$ from $a.used - 1$ to $b$ do \\
+\hspace{3mm}7.1  $a_{i} \leftarrow a_{j}$ \\
+\hspace{3mm}7.2  $i \leftarrow i - 1$ \\
+\hspace{3mm}7.3  $j \leftarrow j - 1$ \\
+8.  for $n$ from 0 to $b - 1$ do \\
+\hspace{3mm}8.1  $a_n \leftarrow 0$ \\
+9.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_lshd}
+\end{figure}
+
+\textbf{Algorithm mp\_lshd.}
+This algorithm multiplies an mp\_int by the $b$'th power of $x$.  This is equivalent to multiplying by $\beta^b$.  The algorithm differs 
+from the other algorithms presented so far as it performs the operation in place instead of storing the result in a separate location.  The
+motivation behind this change is due to the way this function is typically used.  Algorithms such as mp\_add store the result in an optionally
+different third mp\_int because the original inputs are often still required.  Algorithm mp\_lshd (\textit{and similarly algorithm mp\_rshd}) is
+typically used on values where the original value is no longer required.  The algorithm will return success immediately if 
+$b \le 0$ since the rest of the algorithm is only valid when $b > 0$.  
+
+First the destination $a$ is grown as required to accommodate the result.  The counters $i$ and $j$ are used to form a \textit{sliding window} over
+the digits of $a$ of length $b$.  The head of the sliding window is at $i$ (\textit{the leading digit}) and the tail at $j$ (\textit{the trailing digit}).  
+The loop on step 7 copies the digit from the tail to the head.  In each iteration the window is moved down one digit.   The last loop on 
+step 8 sets the lower $b$ digits to zero.
+
+\newpage
+FIGU,sliding_window,Sliding Window Movement
+
+EXAM,bn_mp_lshd.c
+
+The if statement on line @24,if@ ensures that the $b$ variable is greater than zero.  The \textbf{used} count is incremented by $b$ before
+the copy loop begins.  This eliminates the need for an additional variable in the for loop.  The variable $top$ on line @42,top@ is an alias
+for the leading digit while $bottom$ on line @45,bottom@ is an alias for the trailing edge.  The aliases form a window of exactly $b$ digits
+over the input.  
+
+\subsection{Division by $x$}
+
+Division by powers of $x$ is easily achieved by shifting the digits right and removing any that will end up to the right of the zero'th digit.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_rshd}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $a \leftarrow a / \beta^b$ (Divide by $x^b$). \\
+\hline \\
+1.  If $b \le 0$ then return. \\
+2.  If $a.used \le b$ then do \\
+\hspace{3mm}2.1  Zero $a$.  (\textit{mp\_zero}). \\
+\hspace{3mm}2.2  Return. \\
+3.  $i \leftarrow 0$ \\
+4.  $j \leftarrow b$ \\
+5.  for $n$ from 0 to $a.used - b - 1$ do \\
+\hspace{3mm}5.1  $a_i \leftarrow a_j$ \\
+\hspace{3mm}5.2  $i \leftarrow i + 1$ \\
+\hspace{3mm}5.3  $j \leftarrow j + 1$ \\
+6.  for $n$ from $a.used - b$ to $a.used - 1$ do \\
+\hspace{3mm}6.1  $a_n \leftarrow 0$ \\
+7.  $a.used \leftarrow a.used - b$ \\
+8.  Return. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_rshd}
+\end{figure}
+
+\textbf{Algorithm mp\_rshd.}
+This algorithm divides the input in place by the $b$'th power of $x$.  It is analogous to dividing by $\beta^b$ but much quicker since
+it does not require single precision division.  This algorithm does not actually return an error code as it cannot fail.  
+
+If the input $b$ is less than one the algorithm quickly returns without performing any work.  If the \textbf{used} count is less than or equal
+to the shift count $b$ then it will simply zero the input and return.
+
+After the trivial cases of inputs have been handled the sliding window is setup.  Much like the case of algorithm mp\_lshd a sliding window that
+is $b$ digits wide is used to copy the digits.  Unlike mp\_lshd the window slides in the opposite direction from the trailing to the leading digit.  
+Also the digits are copied from the leading to the trailing edge.
+
+Once the window copy is complete the upper digits must be zeroed and the \textbf{used} count decremented.
+
+EXAM,bn_mp_rshd.c
+
+The only noteworthy element of this routine is the lack of a return type.  
+
+-- Will update later to give it a return type...Tom
+
+\section{Powers of Two}
+
+Now that algorithms for moving single bits as well as whole digits exist algorithms for moving the ``in between'' distances are required.  For 
+example, to quickly multiply by $2^k$ for any $k$ without using a full multiplier algorithm would prove useful.  Instead of performing single
+shifts $k$ times to achieve a multiplication by $2^{\pm k}$ a mixture of whole digit shifting and partial digit shifting is employed.  
+
+\subsection{Multiplication by Power of Two}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul\_2d}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $c \leftarrow a \cdot 2^b$. \\
+\hline \\
+1.  $c \leftarrow a$.  (\textit{mp\_copy}) \\
+2.  If $c.alloc < c.used + \lfloor b / lg(\beta) \rfloor + 2$ then grow $c$ accordingly. \\
+3.  If the reallocation failed return(\textit{MP\_MEM}). \\
+4.  If $b \ge lg(\beta)$ then \\
+\hspace{3mm}4.1  $c \leftarrow c \cdot \beta^{\lfloor b / lg(\beta) \rfloor}$ (\textit{mp\_lshd}). \\
+\hspace{3mm}4.2  If step 4.1 failed return(\textit{MP\_MEM}). \\
+5.  $d \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
+6.  If $d \ne 0$ then do \\
+\hspace{3mm}6.1  $mask \leftarrow 2^d$ \\
+\hspace{3mm}6.2  $r \leftarrow 0$ \\
+\hspace{3mm}6.3  for $n$ from $0$ to $c.used - 1$ do \\
+\hspace{6mm}6.3.1  $rr \leftarrow c_n >> (lg(\beta) - d) \mbox{ (mod }mask\mbox{)}$ \\
+\hspace{6mm}6.3.2  $c_n \leftarrow (c_n << d) + r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}6.3.3  $r \leftarrow rr$ \\
+\hspace{3mm}6.4  If $r > 0$ then do \\
+\hspace{6mm}6.4.1  $c_{c.used} \leftarrow r$ \\
+\hspace{6mm}6.4.2  $c.used \leftarrow c.used + 1$ \\
+7.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul\_2d}
+\end{figure}
+
+\textbf{Algorithm mp\_mul\_2d.}
+This algorithm multiplies $a$ by $2^b$ and stores the result in $c$.  The algorithm uses algorithm mp\_lshd and a derivative of algorithm mp\_mul\_2 to
+quickly compute the product.
+
+First the algorithm will multiply $a$ by $x^{\lfloor b / lg(\beta) \rfloor}$ which will ensure that the remainder multiplicand is less than 
+$\beta$.  For example, if $b = 37$ and $\beta = 2^{28}$ then this step will multiply by $x$ leaving a multiplication by $2^{37 - 28} = 2^{9}$ 
+left.
+
+After the digits have been shifted appropriately at most $lg(\beta) - 1$ shifts are left to perform.  Step 5 calculates the number of remaining shifts 
+required.  If it is non-zero a modified shift loop is used to calculate the remaining product.  
+Essentially the loop is a generic version of algorithm mp\_mul\_2 designed to handle any shift count in the range $1 \le d < lg(\beta)$.  The $mask$
+variable is used to extract the upper $d$ bits to form the carry for the next iteration.  
+
+This algorithm is loosely measured as a $O(2n)$ algorithm which means that if the input is $n$-digits that it takes $2n$ ``time'' to 
+complete.  It is possible to optimize this algorithm down to a $O(n)$ algorithm at a cost of making the algorithm slightly harder to follow.
+
+EXAM,bn_mp_mul_2d.c
+
+Notes to be revised when code is updated. -- Tom
+
+\subsection{Division by Power of Two}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div\_2d}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $c \leftarrow \lfloor a / 2^b \rfloor, d \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\
+\hline \\
+1.  If $b \le 0$ then do \\
+\hspace{3mm}1.1  $c \leftarrow a$ (\textit{mp\_copy}) \\
+\hspace{3mm}1.2  $d \leftarrow 0$ (\textit{mp\_zero}) \\
+\hspace{3mm}1.3  Return(\textit{MP\_OKAY}). \\
+2.  $c \leftarrow a$ \\
+3.  $d \leftarrow a \mbox{ (mod }2^b\mbox{)}$ (\textit{mp\_mod\_2d}) \\
+4.  If $b \ge lg(\beta)$ then do \\
+\hspace{3mm}4.1  $c \leftarrow \lfloor c/\beta^{\lfloor b/lg(\beta) \rfloor} \rfloor$ (\textit{mp\_rshd}). \\
+5.  $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
+6.  If $k \ne 0$ then do \\
+\hspace{3mm}6.1  $mask \leftarrow 2^k$ \\
+\hspace{3mm}6.2  $r \leftarrow 0$ \\
+\hspace{3mm}6.3  for $n$ from $c.used - 1$ to $0$ do \\
+\hspace{6mm}6.3.1  $rr \leftarrow c_n \mbox{ (mod }mask\mbox{)}$ \\
+\hspace{6mm}6.3.2  $c_n \leftarrow (c_n >> k) + (r << (lg(\beta) - k))$ \\
+\hspace{6mm}6.3.3  $r \leftarrow rr$ \\
+7.  Clamp excess digits of $c$.  (\textit{mp\_clamp}) \\
+8.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div\_2d}
+\end{figure}
+
+\textbf{Algorithm mp\_div\_2d.}
+This algorithm will divide an input $a$ by $2^b$ and produce the quotient and remainder.  The algorithm is designed much like algorithm 
+mp\_mul\_2d by first using whole digit shifts then single precision shifts.  This algorithm will also produce the remainder of the division
+by using algorithm mp\_mod\_2d.
+
+EXAM,bn_mp_div_2d.c
+
+The implementation of algorithm mp\_div\_2d is slightly different than the algorithm specifies.  The remainder $d$ may be optionally 
+ignored by passing \textbf{NULL} as the pointer to the mp\_int variable.    The temporary mp\_int variable $t$ is used to hold the 
+result of the remainder operation until the end.  This allows $d$ and $a$ to represent the same mp\_int without modifying $a$ before
+the quotient is obtained.
+
+The remainder of the source code is essentially the same as the source code for mp\_mul\_2d.  (-- Fix this paragraph up later, Tom).
+
+\subsection{Remainder of Division by Power of Two}
+
+The last algorithm in the series of polynomial basis power of two algorithms is calculating the remainder of division by $2^b$.  This
+algorithm benefits from the fact that in twos complement arithmetic $a \mbox{ (mod }2^b\mbox{)}$ is the same as $a$ AND $2^b - 1$.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mod\_2d}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $c \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\
+\hline \\
+1.  If $b \le 0$ then do \\
+\hspace{3mm}1.1  $c \leftarrow 0$ (\textit{mp\_zero}) \\
+\hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
+2.  If $b > a.used \cdot lg(\beta)$ then do \\
+\hspace{3mm}2.1  $c \leftarrow a$ (\textit{mp\_copy}) \\
+\hspace{3mm}2.2  Return the result of step 2.1. \\
+3.  $c \leftarrow a$ \\
+4.  If step 3 failed return(\textit{MP\_MEM}). \\
+5.  for $n$ from $\lceil b / lg(\beta) \rceil$ to $c.used - 1$ do \\
+\hspace{3mm}5.1  $c_n \leftarrow 0$ \\
+6.  $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
+7.  $c_{\lfloor b / lg(\beta) \rfloor} \leftarrow c_{\lfloor b / lg(\beta) \rfloor} \mbox{ (mod }2^{k}\mbox{)}$. \\
+8.  Clamp excess digits of $c$.  (\textit{mp\_clamp}) \\
+9.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mod\_2d}
+\end{figure}
+
+\textbf{Algorithm mp\_mod\_2d.}
+This algorithm will quickly calculate the value of $a \mbox{ (mod }2^b\mbox{)}$.  First if $b$ is less than or equal to zero the 
+result is set to zero.  If $b$ is greater than the number of bits in $a$ then it simply copies $a$ to $c$ and returns.  Otherwise, $a$ 
+is copied to $c$, leading digits are removed and the remaining leading digit is trimmed to the exact bit count.
+
+EXAM,bn_mp_mod_2d.c
+
+-- Add comments later, Tom.
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 3 \right ] $ & Devise an algorithm that performs $a \cdot 2^b$ for generic values of $b$ \\
+                      & in $O(n)$ time. \\
+                      &\\
+$\left [ 3 \right ] $ & Devise an efficient algorithm to multiply by small low Hamming  \\
+                      & weight values such as $3$, $5$ and $9$.  Extend it to handle all values \\
+                      & up to $64$ with a Hamming weight less than three. \\
+                      &\\
+$\left [ 2 \right ] $ & Modify the preceding algorithm to handle values of the form \\
+                      & $2^k - 1$ as well. \\
+                      &\\
+$\left [ 3 \right ] $ & Using only algorithms mp\_mul\_2, mp\_div\_2 and mp\_add create an \\
+                      & algorithm to multiply two integers in roughly $O(2n^2)$ time for \\
+                      & any $n$-bit input.  Note that the time of addition is ignored in the \\
+                      & calculation.  \\
+                      & \\
+$\left [ 5 \right ] $ & Improve the previous algorithm to have a working time of at most \\
+                      & $O \left (2^{(k-1)}n + \left ({2n^2 \over k} \right ) \right )$ for an appropriate choice of $k$.  Again ignore \\
+                      & the cost of addition. \\
+                      & \\
+$\left [ 2 \right ] $ & Devise a chart to find optimal values of $k$ for the previous problem \\
+                      & for $n = 64 \ldots 1024$ in steps of $64$. \\
+                      & \\
+$\left [ 2 \right ] $ & Using only algorithms mp\_abs and mp\_sub devise another method for \\
+                      & calculating the result of a signed comparison. \\
+                      &
+\end{tabular}
+
+\chapter{Multiplication and Squaring}
+\section{The Multipliers}
+For most number theoretic problems including certain public key cryptographic algorithms, the ``multipliers'' form the most important subset of 
+algorithms of any multiple precision integer package.  The set of multiplier algorithms include integer multiplication, squaring and modular reduction 
+where in each of the algorithms single precision multiplication is the dominant operation performed.  This chapter will discuss integer multiplication 
+and squaring, leaving modular reductions for the subsequent chapter.  
+
+The importance of the multiplier algorithms is for the most part driven by the fact that certain popular public key algorithms are based on modular 
+exponentiation, that is computing $d \equiv a^b \mbox{ (mod }c\mbox{)}$ for some arbitrary choice of $a$, $b$, $c$ and $d$.  During a modular
+exponentiation the majority\footnote{Roughly speaking a modular exponentiation will spend about 40\% of the time performing modular reductions, 
+35\% of the time performing squaring and 25\% of the time performing multiplications.} of the processor time is spent performing single precision 
+multiplications.
+
+For centuries general purpose multiplication has required a lengthy $O(n^2)$ process, whereby each digit of one multiplicand has to be multiplied 
+against every digit of the other multiplicand.  Traditional long-hand multiplication is based on this process;  while the techniques can differ the 
+overall algorithm used is essentially the same.  Only ``recently'' have faster algorithms been studied.  First Karatsuba multiplication was discovered in 
+1962.  This algorithm can multiply two numbers with considerably fewer single precision multiplications when compared to the long-hand approach.  
+This technique led to the discovery of polynomial basis algorithms (\textit{good reference?}) and subsequently Fourier Transform based solutions.  
+
+\section{Multiplication}
+\subsection{The Baseline Multiplication}
+\label{sec:basemult}
+\index{baseline multiplication}
+Computing the product of two integers in software can be achieved using a trivial adaptation of the standard $O(n^2)$ long-hand multiplication
+algorithm that school children are taught.  The algorithm is considered an $O(n^2)$ algorithm since for two $n$-digit inputs $n^2$ single precision 
+multiplications are required.  More specifically for a $m$ and $n$ digit input $m \cdot n$ single precision multiplications are required.  To 
+simplify most discussions, it will be assumed that the inputs have comparable number of digits.  
+
+The ``baseline multiplication'' algorithm is designed to act as the ``catch-all'' algorithm, only to be used when the faster algorithms cannot be 
+used.  This algorithm does not use any particularly interesting optimizations and should ideally be avoided if possible.    One important 
+facet of this algorithm, is that it has been modified to only produce a certain amount of output digits as resolution.  The importance of this 
+modification will become evident during the discussion of Barrett modular reduction.  Recall that for a $n$ and $m$ digit input the product 
+will be at most $n + m$ digits.  Therefore, this algorithm can be reduced to a full multiplier by having it produce $n + m$ digits of the product.  
+
+Recall from ~GAMMA~ the definition of $\gamma$ as the number of bits in the type \textbf{mp\_digit}.  We shall now extend the variable set to 
+include $\alpha$ which shall represent the number of bits in the type \textbf{mp\_word}.  This implies that $2^{\alpha} > 2 \cdot \beta^2$.  The 
+constant $\delta = 2^{\alpha - 2lg(\beta)}$ will represent the maximal weight of any column in a product (\textit{see ~COMBA~ for more information}).
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_mul\_digs}. \\
+\textbf{Input}.   mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
+\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
+\hline \\
+1.  If min$(a.used, b.used) < \delta$ then do \\
+\hspace{3mm}1.1  Calculate $c = \vert a \vert \cdot \vert b \vert$ by the Comba method (\textit{see algorithm~\ref{fig:COMBAMULT}}).  \\
+\hspace{3mm}1.2  Return the result of step 1.1 \\
+\\
+Allocate and initialize a temporary mp\_int. \\
+2.  Init $t$ to be of size $digs$ \\
+3.  If step 2 failed return(\textit{MP\_MEM}). \\
+4.  $t.used \leftarrow digs$ \\
+\\
+Compute the product. \\
+5.  for $ix$ from $0$ to $a.used - 1$ do \\
+\hspace{3mm}5.1  $u \leftarrow 0$ \\
+\hspace{3mm}5.2  $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\
+\hspace{3mm}5.3  If $pb < 1$ then goto step 6. \\
+\hspace{3mm}5.4  for $iy$ from $0$ to $pb - 1$ do \\
+\hspace{6mm}5.4.1  $\hat r \leftarrow t_{iy + ix} + a_{ix} \cdot b_{iy} + u$ \\
+\hspace{6mm}5.4.2  $t_{iy + ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}5.4.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+\hspace{3mm}5.5  if $ix + pb < digs$ then do \\
+\hspace{6mm}5.5.1  $t_{ix + pb} \leftarrow u$ \\
+6.  Clamp excess digits of $t$. \\
+7.  Swap $c$ with $t$ \\
+8.  Clear $t$ \\
+9.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm s\_mp\_mul\_digs}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_mul\_digs.}
+This algorithm computes the unsigned product of two inputs $a$ and $b$, limited to an output precision of $digs$ digits.  While it may seem
+a bit awkward to modify the function from its simple $O(n^2)$ description, the usefulness of partial multipliers will arise in a subsequent 
+algorithm.  The algorithm is loosely based on algorithm 14.12 from \cite[pp. 595]{HAC} and is similar to Algorithm M of Knuth \cite[pp. 268]{TAOCPV2}.  
+Algorithm s\_mp\_mul\_digs differs from these cited references since it can produce a variable output precision regardless of the precision of the 
+inputs.
+
+The first thing this algorithm checks for is whether a Comba multiplier can be used instead.   If the minimum digit count of either
+input is less than $\delta$, then the Comba method may be used instead.    After the Comba method is ruled out, the baseline algorithm begins.  A 
+temporary mp\_int variable $t$ is used to hold the intermediate result of the product.  This allows the algorithm to be used to 
+compute products when either $a = c$ or $b = c$ without overwriting the inputs.  
+
+All of step 5 is the infamous $O(n^2)$ multiplication loop slightly modified to only produce upto $digs$ digits of output.  The $pb$ variable
+is given the count of digits to read from $b$ inside the nested loop.  If $pb < 1$ then no more output digits can be produced and the algorithm
+will exit the loop.  The best way to think of the loops is as a series of $pb \times 1$ multiplications.    That is, in each pass of the 
+innermost loop $a_{ix}$ is multiplied against $b$ and the result is added (\textit{with an appropriate shift}) to $t$.  
+
+For example, consider multiplying $576$ by $241$.  That is equivalent to computing $10^0(1)(576) + 10^1(4)(576) + 10^2(2)(576)$ which is best
+visualized in the following table.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|c|l|}
+\hline   &&          & 5 & 7 & 6 & \\
+\hline   $\times$&&  & 2 & 4 & 1 & \\
+\hline &&&&&&\\
+  &&          & 5 & 7 & 6 & $10^0(1)(576)$ \\
+  &2 &   3    & 6 & 1 & 6 & $10^1(4)(576) + 10^0(1)(576)$ \\
+  1 & 3 & 8 & 8 & 1 & 6 &   $10^2(2)(576) + 10^1(4)(576) + 10^0(1)(576)$ \\
+\hline  
+\end{tabular}
+\end{center}
+\caption{Long-Hand Multiplication Diagram}
+\end{figure}
+
+Each row of the product is added to the result after being shifted to the left (\textit{multiplied by a power of the radix}) by the appropriate 
+count.  That is in pass $ix$ of the inner loop the product is added starting at the $ix$'th digit of the result.
+
+Step 5.4.1 introduces the hat symbol (\textit{e.g. $\hat r$}) which represents a double precision variable.  The multiplication on that step
+is assumed to be a double wide output single precision multiplication.  That is, two single precision variables are multiplied to produce a
+double precision result.  The step is somewhat optimized from a long-hand multiplication algorithm because the carry from the addition in step
+5.4.1 is propagated through the nested loop.  If the carry was not propagated immediately it would overflow the single precision digit 
+$t_{ix+iy}$ and the result would be lost.  
+
+At step 5.5 the nested loop is finished and any carry that was left over should be forwarded.  The carry does not have to be added to the $ix+pb$'th
+digit since that digit is assumed to be zero at this point.  However, if $ix + pb \ge digs$ the carry is not set as it would make the result
+exceed the precision requested.
+
+EXAM,bn_s_mp_mul_digs.c
+
+Lines @31,if@ to @35,}@ determine if the Comba method can be used first.  The conditions for using the Comba routine are that min$(a.used, b.used) < \delta$ and
+the number of digits of output is less than \textbf{MP\_WARRAY}.  This new constant is used to control 
+the stack usage in the Comba routines.  By default it is set to $\delta$ but can be reduced when memory is at a premium.
+
+Of particular importance is the calculation of the $ix+iy$'th column on lines @64,mp_word@, @65,mp_word@ and @66,mp_word@.  Note how all of the
+variables are cast to the type \textbf{mp\_word}, which is also the type of variable $\hat r$.  That is to ensure that double precision operations 
+are used instead of single precision.  The multiplication on line @65,) * (@ makes use of a specific GCC optimizer behaviour.  At the outset it looks like 
+the compiler will have to use a double precision multiplication to produce the result required.  Such an operation would be horribly slow on most 
+processors and drag this to a crawl.  However, GCC is smart enough to realize that double wide output single precision multipliers can be used.  For 
+example, the instruction ``MUL'' on the x86 processor can multiply two 32-bit values and produce a 64-bit result.  
+
+\subsection{Faster Multiplication by the ``Comba'' Method}
+MARK,COMBA
+
+One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be computed and propagated upwards.  This
+makes the nested loop very sequential and hard to unroll and implement in parallel.  The ``Comba'' \cite{COMBA} method is named after little known 
+(\textit{in cryptographic venues}) Paul G. Comba who described a method of implementing fast multipliers that do not require nested 
+carry fixup operations.  As an interesting aside it seems that Paul Barrett describes a similar technique in
+his 1986 paper \cite{BARRETT} written five years before.
+
+At the heart of the Comba technique is once again the long-hand algorithm.  Except in this case a slight twist is placed on how
+the columns of the result are produced.  In the standard long-hand algorithm rows of products are produced then added together to form the 
+final result.  In the baseline algorithm the columns are added together after each iteration to get the result instantaneously.  
+
+In the Comba algorithm the columns of the result are produced entirely independently of each other.  That is at the $O(n^2)$ level a 
+simple multiplication and addition step is performed.  The carries of the columns are propagated after the nested loop to reduce the amount
+of work required. Succinctly the first step of the algorithm is to compute the product vector $\vec x$ as follows. 
+
+\begin{equation}
+\vec x_n = \sum_{i+j = n} a_ib_j, \forall n \in \lbrace 0, 1, 2, \ldots, i + j \rbrace
+\end{equation}
+
+Where $\vec x_n$ is the $n'th$ column of the output vector.  Consider the following example which computes the vector $\vec x$ for the multiplication
+of $576$ and $241$.  
+
+\newpage\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|c|}
+  \hline &          & 5 & 7 & 6 & First Input\\
+  \hline $\times$ & & 2 & 4 & 1 & Second Input\\
+\hline            &                        & $1 \cdot 5 = 5$   & $1 \cdot 7 = 7$   & $1 \cdot 6 = 6$ & First pass \\
+                  &  $4 \cdot 5 = 20$      & $4 \cdot 7+5=33$  & $4 \cdot 6+7=31$  & 6               & Second pass \\
+   $2 \cdot 5 = 10$ &  $2 \cdot 7 + 20 = 34$ & $2 \cdot 6+33=45$ & 31                & 6             & Third pass \\
+\hline 10 & 34 & 45 & 31 & 6 & Final Result \\   
+\hline   
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Comba Multiplication Diagram}
+\end{figure}
+
+At this point the vector $x = \left < 10, 34, 45, 31, 6 \right >$ is the result of the first step of the Comba multiplier.  
+Now the columns must be fixed by propagating the carry upwards.  The resultant vector will have one extra dimension over the input vector which is
+congruent to adding a leading zero digit.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Comba Fixup}. \\
+\textbf{Input}.   Vector $\vec x$ of dimension $k$ \\
+\textbf{Output}.  Vector $\vec x$ such that the carries have been propagated. \\
+\hline \\
+1.  for $n$ from $0$ to $k - 1$ do \\
+\hspace{3mm}1.1 $\vec x_{n+1} \leftarrow \vec x_{n+1} + \lfloor \vec x_{n}/\beta \rfloor$ \\
+\hspace{3mm}1.2 $\vec x_{n} \leftarrow \vec x_{n} \mbox{ (mod }\beta\mbox{)}$ \\
+2.  Return($\vec x$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Comba Fixup}
+\end{figure}
+
+With that algorithm and $k = 5$ and $\beta = 10$ the following vector is produced $\vec x= \left < 1, 3, 8, 8, 1, 6 \right >$.  In this case 
+$241 \cdot 576$ is in fact $138816$ and the procedure succeeded.  If the algorithm is correct and as will be demonstrated shortly more
+efficient than the baseline algorithm why not simply always use this algorithm?
+
+\subsubsection{Column Weight.}
+At the nested $O(n^2)$ level the Comba method adds the product of two single precision variables to each column of the output 
+independently.  A serious obstacle is if the carry is lost, due to lack of precision before the algorithm has a chance to fix
+the carries.  For example, in the multiplication of two three-digit numbers the third column of output will be the sum of
+three single precision multiplications.  If the precision of the accumulator for the output digits is less than $3 \cdot (\beta - 1)^2$ then
+an overflow can occur and the carry information will be lost.  For any $m$ and $n$ digit inputs the maximum weight of any column is 
+min$(m, n)$ which is fairly obvious.
+
+The maximum number of terms in any column of a product is known as the ``column weight'' and strictly governs when the algorithm can be used.  Recall
+from earlier that a double precision type has $\alpha$ bits of resolution and a single precision digit has $lg(\beta)$ bits of precision.  Given these
+two quantities we must not violate the following
+
+\begin{equation}
+k \cdot \left (\beta - 1 \right )^2 < 2^{\alpha}
+\end{equation}
+
+Which reduces to 
+
+\begin{equation}
+k \cdot \left ( \beta^2 - 2\beta + 1 \right ) < 2^{\alpha}
+\end{equation}
+
+Let $\rho = lg(\beta)$ represent the number of bits in a single precision digit.  By further re-arrangement of the equation the final solution is
+found.
+
+\begin{equation}
+k  < {{2^{\alpha}} \over {\left (2^{2\rho} - 2^{\rho + 1} + 1 \right )}}
+\end{equation}
+
+The defaults for LibTomMath are $\beta = 2^{28}$ and $\alpha = 64$ which means that $k$ is bounded by $k < 257$.  In this configuration 
+the smaller input may not have more than $256$ digits if the Comba method is to be used.  This is quite satisfactory for most applications since 
+$256$ digits would allow for numbers in the range of $0 \le x < 2^{7168}$ which is much larger than most public key cryptographic algorithms require.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{fast\_s\_mp\_mul\_digs}. \\
+\textbf{Input}.   mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
+\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
+\hline \\
+Place an array of \textbf{MP\_WARRAY} double precision digits named $\hat W$ on the stack. \\
+1.  If $c.alloc < digs$ then grow $c$ to $digs$ digits. (\textit{mp\_grow}) \\
+2.  If step 1 failed return(\textit{MP\_MEM}).\\
+\\
+Zero the temporary array $\hat W$. \\
+3.  for $n$ from $0$ to $digs - 1$ do \\
+\hspace{3mm}3.1  $\hat W_n \leftarrow 0$ \\
+\\
+Compute the columns. \\
+4.  for $ix$ from $0$ to $a.used - 1$ do \\
+\hspace{3mm}4.1  $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\
+\hspace{3mm}4.2  If $pb < 1$ then goto step 5. \\
+\hspace{3mm}4.3  for $iy$ from $0$ to $pb - 1$ do \\
+\hspace{6mm}4.3.1  $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}b_{iy}$ \\
+\\
+Propagate the carries upwards. \\
+5.  $oldused \leftarrow c.used$ \\
+6.  $c.used \leftarrow digs$ \\
+7.  If $digs > 1$ then do \\
+\hspace{3mm}7.1.  for $ix$ from $1$ to $digs - 1$ do \\
+\hspace{6mm}7.1.1  $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix-1} / \beta \rfloor$ \\
+\hspace{6mm}7.1.2  $c_{ix - 1} \leftarrow \hat W_{ix - 1} \mbox{ (mod }\beta\mbox{)}$ \\
+8.  else do \\
+\hspace{3mm}8.1  $ix \leftarrow 0$ \\
+9.  $c_{ix} \leftarrow \hat W_{ix} \mbox{ (mod }\beta\mbox{)}$ \\
+\\
+Zero excess digits. \\
+10.  If $digs < oldused$ then do \\
+\hspace{3mm}10.1  for $n$ from $digs$ to $oldused - 1$ do \\
+\hspace{6mm}10.1.1  $c_n \leftarrow 0$ \\
+11.  Clamp excessive digits of $c$.  (\textit{mp\_clamp}) \\
+12.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm fast\_s\_mp\_mul\_digs}
+\label{fig:COMBAMULT}
+\end{figure}
+
+\textbf{Algorithm fast\_s\_mp\_mul\_digs.}
+This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision.  The algorithm
+essentially performs the same calculation as algorithm s\_mp\_mul\_digs, just much faster.
+
+The array $\hat W$ is meant to be on the stack when the algorithm is used.  The size of the array does not change which is ideal.  Note also that 
+unlike algorithm s\_mp\_mul\_digs no temporary mp\_int is required since the result is calculated directly in $\hat W$.  
+
+The $O(n^2)$ loop on step four is where the Comba method's advantages begin to show through in comparison to the baseline algorithm.  The lack of
+a carry variable or propagation in this loop allows the loop to be performed with only single precision multiplication and additions.  Now that each
+iteration of the inner loop can be performed independent of the others the inner loop can be performed with a high level of parallelism.
+
+To measure the benefits of the Comba method over the baseline method consider the number of operations that are required.  If the 
+cost in terms of time of a multiply and addition is $p$ and the cost of a carry propagation is $q$ then a baseline multiplication would require 
+$O \left ((p + q)n^2 \right )$ time to multiply two $n$-digit numbers.  The Comba method requires only $O(pn^2 + qn)$ time, however in practice, 
+the speed increase is actually much more.  With $O(n)$ space the algorithm can be reduced to $O(pn + qn)$ time by implementing the $n$ multiply
+and addition operations in the nested loop in parallel.  
+
+EXAM,bn_fast_s_mp_mul_digs.c
+
+The memset on line @47,memset@ clears the initial $\hat W$ array to zero in a single step. Like the slower baseline multiplication
+implementation a series of aliases (\textit{lines @67, tmpx@, @70, tmpy@ and @75,_W@}) are used to simplify the inner $O(n^2)$ loop.  
+In this case a new alias $\_\hat W$ has been added which refers to the double precision columns offset by $ix$ in each pass.  
+
+The inner loop on lines @83,for@, @84,mp_word@ and @85,}@ is where the algorithm will spend the majority of the time, which is why it has been 
+stripped to the bones of any extra baggage\footnote{Hence the pointer aliases.}.  On x86 processors the multiplication and additions amount to at the 
+very least five instructions (\textit{two loads, two additions, one multiply}) while on the ARMv4 processors they amount to only three 
+(\textit{one load, one store, one multiply-add}).   For both of the x86 and ARMv4 processors the GCC compiler performs a good job at unrolling the loop 
+and scheduling the instructions so there are very few dependency stalls.
+
+In theory the difference between the baseline and Comba algorithms is a mere $O(qn)$ time difference.  However, in the $O(n^2)$ nested loop of the
+baseline method there are dependency stalls as the algorithm must wait for the multiplier to finish before propagating the carry to the next 
+digit.  As a result fewer of the often multiple execution units\footnote{The AMD Athlon has three execution units and the Intel P4 has four.} can
+be simultaneously used.  
+
+\subsection{Polynomial Basis Multiplication}
+To break the $O(n^2)$ barrier in multiplication requires a completely different look at integer multiplication.  In the following algorithms
+the use of polynomial basis representation for two integers $a$ and $b$ as $f(x) = \sum_{i=0}^{n} a_i x^i$ and  
+$g(x) = \sum_{i=0}^{n} b_i x^i$ respectively, is required.  In this system both $f(x)$ and $g(x)$ have $n + 1$ terms and are of the $n$'th degree.
+ 
+The product $a \cdot b \equiv f(x)g(x)$ is the polynomial $W(x) = \sum_{i=0}^{2n} w_i x^i$.  The coefficients $w_i$ will
+directly yield the desired product when $\beta$ is substituted for $x$.  The direct solution to solve for the $2n + 1$ coefficients
+requires $O(n^2)$ time and would in practice be slower than the Comba technique.
+
+However, numerical analysis theory indicates that only $2n + 1$ distinct points in $W(x)$ are required to determine the values of the $2n + 1$ unknown 
+coefficients.   This means by finding $\zeta_y = W(y)$ for $2n + 1$ small values of $y$ the coefficients of $W(x)$ can be found with 
+Gaussian elimination.  This technique is also occasionally referred to as the \textit{interpolation technique} (\textit{references please...}) since in 
+effect an interpolation based on $2n + 1$ points will yield a polynomial equivalent to $W(x)$.  
+
+The coefficients of the polynomial $W(x)$ are unknown which makes finding $W(y)$ for any value of $y$ impossible.  However, since 
+$W(x) = f(x)g(x)$ the equivalent $\zeta_y = f(y) g(y)$ can be used in its place.  The benefit of this technique stems from the 
+fact that $f(y)$ and $g(y)$ are much smaller than either $a$ or $b$ respectively.  As a result finding the $2n + 1$ relations required 
+by multiplying $f(y)g(y)$ involves multiplying integers that are much smaller than either of the inputs.
+
+When picking points to gather relations there are always three obvious points to choose, $y = 0, 1$ and $ \infty$.  The $\zeta_0$ term
+is simply the product $W(0) = w_0 = a_0 \cdot b_0$.  The $\zeta_1$ term is the product 
+$W(1) = \left (\sum_{i = 0}^{n} a_i \right ) \left (\sum_{i = 0}^{n} b_i \right )$.  The third point $\zeta_{\infty}$ is less obvious but rather
+simple to explain.  The $2n + 1$'th coefficient of $W(x)$ is numerically equivalent to the most significant column in an integer multiplication.  
+The point at $\infty$ is used symbolically to represent the most significant column, that is $W(\infty) = w_{2n} = a_nb_n$.  Note that the 
+points at $y = 0$ and $\infty$ yield the coefficients $w_0$ and $w_{2n}$ directly.
+
+If more points are required they should be of small values and powers of two such as $2^q$ and the related \textit{mirror points} 
+$\left (2^q \right )^{2n}  \cdot \zeta_{2^{-q}}$ for small values of $q$.  The term ``mirror point'' stems from the fact that 
+$\left (2^q \right )^{2n}  \cdot \zeta_{2^{-q}}$ can be calculated in the exact opposite fashion as $\zeta_{2^q}$.  For
+example, when $n = 2$ and $q = 1$ then the following two equations are equivalent to the point $\zeta_{2}$ and its mirror.
+
+\begin{eqnarray}
+\zeta_{2}                  = f(2)g(2) = (4a_2 + 2a_1 + a_0)(4b_2 + 2b_1 + b_0) \nonumber \\
+16 \cdot \zeta_{1 \over 2} = 4f({1\over 2}) \cdot 4g({1 \over 2}) = (a_2 + 2a_1 + 4a_0)(b_2 + 2b_1 + 4b_0)
+\end{eqnarray}
+
+Using such points will allow the values of $f(y)$ and $g(y)$ to be independently calculated using only left shifts.  For example, when $n = 2$ the
+polynomial $f(2^q)$ is equal to $2^q((2^qa_2) + a_1) + a_0$.  This technique of polynomial representation is known as Horner's method.  
+
+As a general rule of the algorithm when the inputs are split into $n$ parts each there are $2n - 1$ multiplications.  Each multiplication is of 
+multiplicands that have $n$ times fewer digits than the inputs.  The asymptotic running time of this algorithm is 
+$O \left ( k^{lg_n(2n - 1)} \right )$ for $k$ digit inputs (\textit{assuming they have the same number of digits}).  Figure~\ref{fig:exponent}
+summarizes the exponents for various values of $n$.
+
+\begin{figure}
+\begin{center}
+\begin{tabular}{|c|c|c|}
+\hline \textbf{Split into $n$ Parts} & \textbf{Exponent}  & \textbf{Notes}\\
+\hline $2$ & $1.584962501$ & This is Karatsuba Multiplication. \\
+\hline $3$ & $1.464973520$ & This is Toom-Cook Multiplication. \\
+\hline $4$ & $1.403677461$ &\\
+\hline $5$ & $1.365212389$ &\\
+\hline $10$ & $1.278753601$ &\\
+\hline $100$ & $1.149426538$ &\\
+\hline $1000$ & $1.100270931$ &\\
+\hline $10000$ & $1.075252070$ &\\
+\hline
+\end{tabular}
+\end{center}
+\caption{Asymptotic Running Time of Polynomial Basis Multiplication}
+\label{fig:exponent}
+\end{figure}
+
+At first it may seem like a good idea to choose $n = 1000$ since the exponent is approximately $1.1$.  However, the overhead
+of solving for the 2001 terms of $W(x)$ will certainly consume any savings the algorithm could offer for all but exceedingly large
+numbers.  
+
+\subsubsection{Cutoff Point}
+The polynomial basis multiplication algorithms all require fewer single precision multiplications than a straight Comba approach.  However, 
+the algorithms incur an overhead (\textit{at the $O(n)$ work level}) since they require a system of equations to be solved.  This makes the
+polynomial basis approach more costly to use with small inputs.
+
+Let $m$ represent the number of digits in the multiplicands (\textit{assume both multiplicands have the same number of digits}).  There exists a 
+point $y$ such that when $m < y$ the polynomial basis algorithms are more costly than Comba, when $m = y$ they are roughly the same cost and 
+when $m > y$ the Comba methods are slower than the polynomial basis algorithms.  
+
+The exact location of $y$ depends on several key architectural elements of the computer platform in question.
+
+\begin{enumerate}
+\item  The ratio of clock cycles for single precision multiplication versus other simpler operations such as addition, shifting, etc.  For example
+on the AMD Athlon the ratio is roughly $17 : 1$ while on the Intel P4 it is $29 : 1$.  The higher the ratio in favour of multiplication the lower
+the cutoff point $y$ will be.  
+
+\item  The complexity of the linear system of equations (\textit{for the coefficients of $W(x)$}).  Generally speaking as the number of splits
+grows the complexity grows substantially.  Ideally solving the system will only involve addition, subtraction and shifting of integers.  This
+directly reflects on the ratio previously mentioned.
+
+\item  To a lesser extent memory bandwidth and function call overheads.  Provided the values are in the processor cache this is less of an
+influence over the cutoff point.
+
+\end{enumerate}
+
+A clean cutoff point separation occurs when a point $y$ is found such that all of the cutoff point conditions are met.  For example, if the point
+is too low then there will be values of $m$ such that $m > y$ and the Comba method is still faster.  Finding the cutoff points is fairly simple when
+a high resolution timer is available.  
+
+\subsection{Karatsuba Multiplication}
+Karatsuba \cite{KARA} multiplication when originally proposed in 1962 was among the first set of algorithms to break the $O(n^2)$ barrier for
+general purpose multiplication.  Given two polynomial basis representations $f(x) = ax + b$ and $g(x) = cx + d$, Karatsuba proved with 
+light algebra \cite{KARAP} that the following polynomial is equivalent to multiplication of the two integers the polynomials represent.
+
+\begin{equation}
+f(x) \cdot g(x) = acx^2 + ((ac + bd) - (a - b)(c - d))x + bd
+\end{equation}
+
+Using the observation that $ac$ and $bd$ could be re-used only three half sized multiplications would be required to produce the product.  Applying
+this algorithm recursively, the work factor becomes $O(n^{lg(3)})$ which is substantially better than the work factor $O(n^2)$ of the Comba technique.  It turns 
+out what Karatsuba did not know or at least did not publish was that this is simply polynomial basis multiplication with the points 
+$\zeta_0$, $\zeta_{\infty}$ and $-\zeta_{-1}$.  Consider the resultant system of equations.
+
+\begin{center}
+\begin{tabular}{rcrcrcrc}
+$\zeta_{0}$ &      $=$ &  &  &  & & $w_0$ \\
+$-\zeta_{-1}$ &    $=$ & $-w_2$ & $+$ & $w_1$ & $-$ & $w_0$ \\
+$\zeta_{\infty}$ & $=$ & $w_2$ &  & &  & \\
+\end{tabular}
+\end{center}
+
+By adding the first and last equation to the equation in the middle the term $w_1$ can be isolated and all three coefficients solved for.  The simplicity
+of this system of equations has made Karatsuba fairly popular.  In fact the cutoff point is often fairly low\footnote{With LibTomMath 0.18 it is 70 and 109 digits for the Intel P4 and AMD Athlon respectively.}
+making it an ideal algorithm to speed up certain public key cryptosystems such as RSA and Diffie-Hellman.  It is worth noting that the point 
+$\zeta_1$ could be substituted for $-\zeta_{-1}$.  In this case the first and third row are subtracted instead of added to the second row.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_karatsuba\_mul}. \\
+\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
+\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert$ \\
+\hline \\
+1.  Init the following mp\_int variables: $x0$, $x1$, $y0$, $y1$, $t1$, $x0y0$, $x1y1$.\\
+2.  If step 1 failed then return(\textit{MP\_MEM}). \\
+\\
+Split the input.  e.g. $a = x1 \cdot \beta^B + x0$ \\
+3.  $B \leftarrow \mbox{min}(a.used, b.used)/2$ \\
+4.  $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{mp\_mod\_2d}) \\
+5.  $y0 \leftarrow b \mbox{ (mod }\beta^B\mbox{)}$ \\
+6.  $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{mp\_rshd}) \\
+7.  $y1 \leftarrow \lfloor b / \beta^B \rfloor$ \\
+\\
+Calculate the three products. \\
+8.  $x0y0 \leftarrow x0 \cdot y0$ (\textit{mp\_mul}) \\
+9.  $x1y1 \leftarrow x1 \cdot y1$ \\
+10.  $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\
+11.  $x0 \leftarrow y1 - y0$ \\
+12.  $t1 \leftarrow t1 \cdot x0$ \\
+\\
+Calculate the middle term. \\
+13.  $x0 \leftarrow x0y0 + x1y1$ \\
+14.  $t1 \leftarrow x0 - t1$ \\
+\\
+Calculate the final product. \\
+15.  $t1 \leftarrow t1 \cdot \beta^B$ (\textit{mp\_lshd}) \\
+16.  $x1y1 \leftarrow x1y1 \cdot \beta^{2B}$ \\
+17.  $t1 \leftarrow x0y0 + t1$ \\
+18.  $c \leftarrow t1 + x1y1$ \\
+19.  Clear all of the temporary variables. \\
+20.  Return(\textit{MP\_OKAY}).\\
+\hline 
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_karatsuba\_mul}
+\end{figure}
+
+\textbf{Algorithm mp\_karatsuba\_mul.}
+This algorithm computes the unsigned product of two inputs using the Karatsuba multiplication algorithm.  It is loosely based on the description
+from Knuth \cite[pp. 294-295]{TAOCPV2}.  
+
+\index{radix point}
+In order to split the two inputs into their respective halves, a suitable \textit{radix point} must be chosen.  The radix point chosen must
+be used for both of the inputs meaning that it must be smaller than the smallest input.  Step 3 chooses the radix point $B$ as half of the 
+smallest input \textbf{used} count.  After the radix point is chosen the inputs are split into lower and upper halves.  Step 4 and 5 
+compute the lower halves.  Step 6 and 7 compute the upper halves.  
+
+After the halves have been computed the three intermediate half-size products must be computed.  Step 8 and 9 compute the trivial products
+$x0 \cdot y0$ and $x1 \cdot y1$.  The mp\_int $x0$ is used as a temporary variable after $x1 - x0$ has been computed.  By using $x0$ instead
+of an additional temporary variable, the algorithm can avoid an additional memory allocation operation.
+
+The remaining steps 13 through 18 compute the Karatsuba polynomial through a variety of digit shifting and addition operations.
+
+EXAM,bn_mp_karatsuba_mul.c
+
+The new coding element in this routine, not  seen in previous routines, is the usage of goto statements.  The conventional
+wisdom is that goto statements should be avoided.  This is generally true, however when every single function call can fail, it makes sense
+to handle error recovery with a single piece of code.  Lines @61,if@ to @75,if@ handle initializing all of the temporary variables 
+required.  Note how each of the if statements goes to a different label in case of failure.  This allows the routine to correctly free only
+the temporaries that have been successfully allocated so far.
+
+The temporary variables are all initialized using the mp\_init\_size routine since they are expected to be large.  This saves the 
+additional reallocation that would have been necessary.  Also $x0$, $x1$, $y0$ and $y1$ have to be able to hold at least their respective
+number of digits for the next section of code.
+
+The first algebraic portion of the algorithm is to split the two inputs into their halves.  However, instead of using mp\_mod\_2d and mp\_rshd
+to extract the halves, the respective code has been placed inline within the body of the function.  To initialize the halves, the \textbf{used} and 
+\textbf{sign} members are copied first.  The first for loop on line @98,for@ copies the lower halves.  Since they are both the same magnitude it 
+is simpler to calculate both lower halves in a single loop.  The for loop on lines @104,for@ and @109,for@ calculate the upper halves $x1$ and 
+$y1$ respectively.
+
+By inlining the calculation of the halves, the Karatsuba multiplier has a slightly lower overhead and can be used for smaller magnitude inputs.
+
+When line @152,err@ is reached, the algorithm has completed successfully.  The ``error status'' variable $err$ is set to \textbf{MP\_OKAY} so that
+the same code that handles errors can be used to clear the temporary variables and return.  
+
+\subsection{Toom-Cook $3$-Way Multiplication}
+Toom-Cook $3$-Way \cite{TOOM} multiplication is essentially the polynomial basis algorithm for $n = 2$ except that the points  are 
+chosen such that $\zeta$ is easy to compute and the resulting system of equations easy to reduce.  Here, the points $\zeta_{0}$, 
+$16 \cdot \zeta_{1 \over 2}$, $\zeta_1$, $\zeta_2$ and $\zeta_{\infty}$ make up the five required points to solve for the coefficients 
+of the $W(x)$.
+
+With the five relations that Toom-Cook specifies, the following system of equations is formed.
+
+\begin{center}
+\begin{tabular}{rcrcrcrcrcr}
+$\zeta_0$                    & $=$ & $0w_4$ & $+$ & $0w_3$ & $+$ & $0w_2$ & $+$ & $0w_1$ & $+$ & $1w_0$  \\
+$16 \cdot \zeta_{1 \over 2}$ & $=$ & $1w_4$ & $+$ & $2w_3$ & $+$ & $4w_2$ & $+$ & $8w_1$ & $+$ & $16w_0$  \\
+$\zeta_1$                    & $=$ & $1w_4$ & $+$ & $1w_3$ & $+$ & $1w_2$ & $+$ & $1w_1$ & $+$ & $1w_0$  \\
+$\zeta_2$                    & $=$ & $16w_4$ & $+$ & $8w_3$ & $+$ & $4w_2$ & $+$ & $2w_1$ & $+$ & $1w_0$  \\
+$\zeta_{\infty}$             & $=$ & $1w_4$ & $+$ & $0w_3$ & $+$ & $0w_2$ & $+$ & $0w_1$ & $+$ & $0w_0$  \\
+\end{tabular}
+\end{center}
+
+A trivial solution to this matrix requires $12$ subtractions, two multiplications by a small power of two, two divisions by a small power
+of two, two divisions by three and one multiplication by three.  All of these $19$ sub-operations require less than quadratic time, meaning that
+the algorithm can be faster than a baseline multiplication.  However, the greater complexity of this algorithm places the cutoff point
+(\textbf{TOOM\_MUL\_CUTOFF}) where Toom-Cook becomes more efficient much higher than the Karatsuba cutoff point.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_toom\_mul}. \\
+\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
+\textbf{Output}.  $c \leftarrow  a  \cdot  b $ \\
+\hline \\
+Split $a$ and $b$ into three pieces.  E.g. $a = a_2 \beta^{2k} + a_1 \beta^{k} + a_0$ \\
+1.  $k \leftarrow \lfloor \mbox{min}(a.used, b.used) / 3 \rfloor$ \\
+2.  $a_0 \leftarrow a \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+3.  $a_1 \leftarrow \lfloor a / \beta^k \rfloor$, $a_1 \leftarrow a_1 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+4.  $a_2 \leftarrow \lfloor a / \beta^{2k} \rfloor$, $a_2 \leftarrow a_2 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+5.  $b_0 \leftarrow b \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+6.  $b_1 \leftarrow \lfloor b / \beta^k \rfloor$, $b_1 \leftarrow b_1 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+7.  $b_2 \leftarrow \lfloor b / \beta^{2k} \rfloor$, $b_2 \leftarrow b_2 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+\\
+Find the five equations for $w_0, w_1, ..., w_4$. \\
+8.  $w_0 \leftarrow a_0 \cdot b_0$ \\
+9.  $w_4 \leftarrow a_2 \cdot b_2$ \\
+10. $tmp_1 \leftarrow 2 \cdot a_0$, $tmp_1 \leftarrow a_1 + tmp_1$, $tmp_1 \leftarrow 2 \cdot tmp_1$, $tmp_1 \leftarrow tmp_1 + a_2$ \\
+11. $tmp_2 \leftarrow 2 \cdot b_0$, $tmp_2 \leftarrow b_1 + tmp_2$, $tmp_2 \leftarrow 2 \cdot tmp_2$, $tmp_2 \leftarrow tmp_2 + b_2$ \\
+12. $w_1 \leftarrow tmp_1 \cdot tmp_2$ \\
+13. $tmp_1 \leftarrow 2 \cdot a_2$, $tmp_1 \leftarrow a_1 + tmp_1$, $tmp_1 \leftarrow 2 \cdot tmp_1$, $tmp_1 \leftarrow tmp_1 + a_0$ \\
+14. $tmp_2 \leftarrow 2 \cdot b_2$, $tmp_2 \leftarrow b_1 + tmp_2$, $tmp_2 \leftarrow 2 \cdot tmp_2$, $tmp_2 \leftarrow tmp_2 + b_0$ \\
+15. $w_3 \leftarrow tmp_1 \cdot tmp_2$ \\
+16. $tmp_1 \leftarrow a_0 + a_1$, $tmp_1 \leftarrow tmp_1 + a_2$, $tmp_2 \leftarrow b_0 + b_1$, $tmp_2 \leftarrow tmp_2 + b_2$ \\
+17. $w_2 \leftarrow tmp_1 \cdot tmp_2$ \\
+\\
+Continued on the next page.\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_toom\_mul}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_toom\_mul} (continued). \\
+\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
+\textbf{Output}.  $c \leftarrow a \cdot  b $ \\
+\hline \\
+Now solve the system of equations. \\
+18. $w_1 \leftarrow w_1 - w_4$, $w_3 \leftarrow w_3 - w_0$ \\
+19. $w_1 \leftarrow \lfloor w_1 / 2 \rfloor$, $w_3 \leftarrow \lfloor w_3 / 2 \rfloor$ \\
+20. $w_2 \leftarrow w_2 - w_0$, $w_2 \leftarrow w_2 - w_4$ \\
+21. $w_1 \leftarrow w_1 - w_2$, $w_3 \leftarrow w_3 - w_2$ \\
+22. $tmp_1 \leftarrow 8 \cdot w_0$, $w_1 \leftarrow w_1 - tmp_1$, $tmp_1 \leftarrow 8 \cdot w_4$, $w_3 \leftarrow w_3 - tmp_1$ \\
+23. $w_2 \leftarrow 3 \cdot w_2$, $w_2 \leftarrow w_2 - w_1$, $w_2 \leftarrow w_2 - w_3$ \\
+24. $w_1 \leftarrow w_1 - w_2$, $w_3 \leftarrow w_3 - w_2$ \\
+25. $w_1 \leftarrow \lfloor w_1 / 3 \rfloor, w_3 \leftarrow \lfloor w_3 / 3 \rfloor$ \\
+\\
+Now substitute $\beta^k$ for $x$ by shifting $w_0, w_1, ..., w_4$. \\
+26. for $n$ from $1$ to $4$ do \\
+\hspace{3mm}26.1  $w_n \leftarrow w_n \cdot \beta^{nk}$ \\
+27. $c \leftarrow w_0 + w_1$, $c \leftarrow c + w_2$, $c \leftarrow c + w_3$, $c \leftarrow c + w_4$ \\
+28. Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_toom\_mul (continued)}
+\end{figure}
+
+\textbf{Algorithm mp\_toom\_mul.}
+This algorithm computes the product of two mp\_int variables $a$ and $b$ using the Toom-Cook approach.  Compared to the Karatsuba multiplication, this 
+algorithm has a lower asymptotic running time of approximately $O(n^{1.464})$ but at an obvious cost in overhead.  In this
+description, several statements have been compounded to save space.  The intention is that the statements are executed from left to right across
+any given step.
+
+The two inputs $a$ and $b$ are first split into three $k$-digit integers $a_0, a_1, a_2$ and $b_0, b_1, b_2$ respectively.  From these smaller
+integers the coefficients of the polynomial basis representations $f(x)$ and $g(x)$ are known and can be used to find the relations required.
+
+The first two relations $w_0$ and $w_4$ are the points $\zeta_{0}$ and $\zeta_{\infty}$ respectively.  The relations $w_1, w_2$ and $w_3$ correspond
+to the points $16 \cdot \zeta_{1 \over 2}, \zeta_{1}$ and $\zeta_{2}$ respectively.  These are found using logical shifts to independently find
+$f(y)$ and $g(y)$ which significantly speeds up the algorithm.
+
+After the five relations $w_0, w_1, \ldots, w_4$ have been computed, the system they represent must be solved in order for the unknown coefficients 
+$w_1, w_2$ and $w_3$ to be isolated.  The steps 18 through 25 perform the system reduction required as previously described.  Each step of
+the reduction represents the comparable matrix operation that would be performed had this been performed by pencil.  For example, step 18 indicates
+that row $4$ must be subtracted from row $1$ and simultaneously row $0$ subtracted from row $3$.  
+
+Once the coefficients have been isolated, the polynomial $W(x) = \sum_{i=0}^{2n} w_i x^i$ is known.  By substituting $\beta^{k}$ for $x$, the integer 
+result $a \cdot b$ is produced.
+
+EXAM,bn_mp_toom_mul.c
+
+-- Comments to be added during editing phase.
+
+\subsection{Signed Multiplication}
+Now that algorithms to handle multiplications of every useful dimension have been developed, a rather simple finishing touch is required.  So far all
+of the multiplication algorithms have been unsigned multiplications which leaves only a signed multiplication algorithm to be established.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul}. \\
+\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
+\textbf{Output}.  $c \leftarrow a \cdot b$ \\
+\hline \\
+1.  If $a.sign = b.sign$ then \\
+\hspace{3mm}1.1  $sign = MP\_ZPOS$ \\
+2.  else \\
+\hspace{3mm}2.1  $sign = MP\_ZNEG$ \\
+3.  If min$(a.used, b.used) \ge TOOM\_MUL\_CUTOFF$ then  \\
+\hspace{3mm}3.1  $c \leftarrow a \cdot b$ using algorithm mp\_toom\_mul \\
+4.  else if min$(a.used, b.used) \ge KARATSUBA\_MUL\_CUTOFF$ then \\
+\hspace{3mm}4.1  $c \leftarrow a \cdot b$ using algorithm mp\_karatsuba\_mul \\
+5.  else \\
+\hspace{3mm}5.1  $digs \leftarrow a.used + b.used + 1$ \\
+\hspace{3mm}5.2  If $digs < MP\_WARRAY$ and min$(a.used, b.used) \le \delta$ then \\
+\hspace{6mm}5.2.1  $c \leftarrow a \cdot b \mbox{ (mod }\beta^{digs}\mbox{)}$ using algorithm fast\_s\_mp\_mul\_digs.  \\
+\hspace{3mm}5.3  else \\
+\hspace{6mm}5.3.1  $c \leftarrow a \cdot b \mbox{ (mod }\beta^{digs}\mbox{)}$ using algorithm s\_mp\_mul\_digs.  \\
+6.  $c.sign \leftarrow sign$ \\
+7.  Return the result of the unsigned multiplication performed. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul}
+\end{figure}
+
+\textbf{Algorithm mp\_mul.}
+This algorithm performs the signed multiplication of two inputs.  It will make use of any of the three unsigned multiplication algorithms 
+available when the input is of appropriate size.  The \textbf{sign} of the result is not set until the end of the algorithm since algorithm
+s\_mp\_mul\_digs will clear it.  
+
+EXAM,bn_mp_mul.c
+
+The implementation is rather simplistic and is not particularly noteworthy.  Line @22,?@ computes the sign of the result using the ``?'' 
+operator from the C programming language.  Line @37,<<@ computes $\delta$ using the fact that $1 << k$ is equal to $2^k$.  
+
+\section{Squaring}
+\label{sec:basesquare}
+
+Squaring is a special case of multiplication where both multiplicands are equal.  At first it may seem like there is no significant optimization
+available but in fact there is.  Consider the multiplication of $576$ against $241$.  In total there will be nine single precision multiplications
+performed which are $1\cdot 6$, $1 \cdot 7$, $1 \cdot 5$, $4 \cdot 6$, $4 \cdot 7$, $4 \cdot 5$, $2 \cdot  6$, $2 \cdot 7$ and $2 \cdot 5$.  Now consider 
+the multiplication of $123$ against $123$.  The nine products are $3 \cdot 3$, $3 \cdot 2$, $3 \cdot 1$, $2 \cdot 3$, $2 \cdot 2$, $2 \cdot 1$, 
+$1 \cdot 3$, $1 \cdot 2$ and $1 \cdot 1$.  On closer inspection some of the products are equivalent.  For example, $3 \cdot 2 = 2 \cdot 3$ 
+and $3 \cdot 1 = 1 \cdot 3$. 
+
+For any $n$-digit input, there are ${{\left (n^2 + n \right)}\over 2}$ possible unique single precision multiplications required compared to the $n^2$
+required for multiplication.  The following diagram gives an example of the operations required.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{ccccc|c}
+&&1&2&3&\\
+$\times$ &&1&2&3&\\
+\hline && $3 \cdot 1$ & $3 \cdot 2$ & $3 \cdot 3$ & Row 0\\
+       & $2 \cdot 1$  & $2 \cdot 2$ & $2 \cdot 3$ && Row 1 \\
+         $1 \cdot 1$  & $1 \cdot 2$ & $1 \cdot 3$ &&& Row 2 \\
+\end{tabular}
+\end{center}
+\caption{Squaring Optimization Diagram}
+\end{figure}
+
+MARK,SQUARE
+Starting from zero and numbering the columns from right to left a very simple pattern becomes obvious.  For the purposes of this discussion let $x$
+represent the number being squared.  The first observation is that in row $k$ the $2k$'th column of the product has a $\left (x_k \right)^2$ term in it.  
+
+The second observation is that every column $j$ in row $k$ where $j \ne 2k$ is part of a double product.  Every non-square term of a column will
+appear twice hence the name ``double product''.  Every odd column is made up entirely of double products.  In fact every column is made up of double 
+products and at most one square (\textit{see the exercise section}).  
+
+The third and final observation is that for row $k$ the first unique non-square term, that is, one that hasn't already appeared in an earlier row, 
+occurs at column $2k + 1$.  For example, on row $1$ of the previous squaring, column one is part of the double product with column one from row zero. 
+Column two of row one is a square and column three is the first unique column.
+
+\subsection{The Baseline Squaring Algorithm}
+The baseline squaring algorithm is meant to be a catch-all squaring algorithm.  It will handle any of the input sizes that the faster routines
+will not handle.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_sqr}. \\
+\textbf{Input}.   mp\_int $a$ \\
+\textbf{Output}.  $b \leftarrow a^2$ \\
+\hline \\
+1.  Init a temporary mp\_int of at least $2 \cdot a.used +1$ digits.  (\textit{mp\_init\_size}) \\
+2.  If step 1 failed return(\textit{MP\_MEM}) \\
+3.  $t.used \leftarrow 2 \cdot a.used + 1$ \\
+4.  For $ix$ from 0 to $a.used - 1$ do \\
+\hspace{3mm}Calculate the square. \\
+\hspace{3mm}4.1  $\hat r \leftarrow t_{2ix} + \left (a_{ix} \right )^2$ \\
+\hspace{3mm}4.2  $t_{2ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}Calculate the double products after the square. \\
+\hspace{3mm}4.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+\hspace{3mm}4.4  For $iy$ from $ix + 1$ to $a.used - 1$ do \\
+\hspace{6mm}4.4.1  $\hat r \leftarrow 2 \cdot a_{ix}a_{iy} + t_{ix + iy} + u$ \\
+\hspace{6mm}4.4.2  $t_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}4.4.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+\hspace{3mm}Set the last carry. \\
+\hspace{3mm}4.5  While $u > 0$ do \\
+\hspace{6mm}4.5.1  $iy \leftarrow iy + 1$ \\
+\hspace{6mm}4.5.2  $\hat r \leftarrow t_{ix + iy} + u$ \\
+\hspace{6mm}4.5.3  $t_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}4.5.4  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+5.  Clamp excess digits of $t$.  (\textit{mp\_clamp}) \\
+6.  Exchange $b$ and $t$. \\
+7.  Clear $t$ (\textit{mp\_clear}) \\
+8.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm s\_mp\_sqr}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_sqr.}
+This algorithm computes the square of an input using the three observations on squaring.  It is based fairly faithfully on  algorithm 14.16 of HAC
+\cite[pp.596-597]{HAC}.  Similar to algorithm s\_mp\_mul\_digs, a temporary mp\_int is allocated to hold the result of the squaring.  This allows the 
+destination mp\_int to be the same as the source mp\_int.
+
+The outer loop of this algorithm begins on step 4. It is best to think of the outer loop as walking down the rows of the partial results, while
+the inner loop computes the columns of the partial result.  Step 4.1 and 4.2 compute the square term for each row, and step 4.3 and 4.4 propagate
+the carry and compute the double products.  
+
+The requirement that a mp\_word be able to represent the range $0 \le x < 2 \beta^2$ arises from this
+very algorithm.  The product $a_{ix}a_{iy}$ will lie in the range $0 \le x \le \beta^2 - 2\beta + 1$ which is obviously less than $\beta^2$ meaning that
+when it is multiplied by two, it can be properly represented by a mp\_word.
+
+Similar to algorithm s\_mp\_mul\_digs, after every pass of the inner loop, the destination is correctly set to the sum of all of the partial 
+results calculated so far.  This involves expensive carry propagation which will be eliminated in the next algorithm.  
+
+EXAM,bn_s_mp_sqr.c
+
+Inside the outer loop (\textit{see line @32,for@}) the square term is calculated on line @35,r =@.  Line @42,>>@ extracts the carry from the square
+term.  Aliases for $a_{ix}$ and $t_{ix+iy}$ are initialized on lines @45,tmpx@ and @48,tmpt@ respectively.  The doubling is performed using two
+additions (\textit{see line @57,r + r@}) since it is usually faster than shifting, if not at least as fast.  
+
+\subsection{Faster Squaring by the ``Comba'' Method}
+A major drawback to the baseline method is the requirement for single precision shifting inside the $O(n^2)$ nested loop.  Squaring has an additional
+drawback that it must double the product inside the inner loop as well.  As for multiplication, the Comba technique can be used to eliminate these
+performance hazards.
+
+The first obvious solution is to make an array of mp\_words which will hold all of the columns.  This will indeed eliminate all of the carry
+propagation operations from the inner loop.  However, the inner product must still be doubled $O(n^2)$ times.  The solution stems from the simple fact
+that $2a + 2b + 2c = 2(a + b + c)$.  That is the sum of all of the double products is equal to double the sum of all the products.  For example,
+$ab + ba + ac + ca = 2ab + 2ac = 2(ab + ac)$.  
+
+However, we cannot simply double all of the columns, since the squares appear only once per row.  The most practical solution is to have two mp\_word
+arrays.  One array will hold the squares and the other array will hold the double products.  With both arrays the doubling and carry propagation can be 
+moved to a $O(n)$ work level outside the $O(n^2)$ level.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{fast\_s\_mp\_sqr}. \\
+\textbf{Input}.   mp\_int $a$ \\
+\textbf{Output}.  $b \leftarrow a^2$ \\
+\hline \\
+Place two arrays of \textbf{MP\_WARRAY} mp\_words named $\hat W$ and $\hat {X}$ on the stack. \\
+1.  If $b.alloc < 2a.used + 1$ then grow $b$ to $2a.used + 1$ digits.  (\textit{mp\_grow}). \\
+2.  If step 1 failed return(\textit{MP\_MEM}). \\
+3.  for $ix$ from $0$ to $2a.used + 1$ do \\
+\hspace{3mm}3.1  $\hat W_{ix} \leftarrow 0$ \\
+\hspace{3mm}3.2  $\hat {X}_{ix} \leftarrow 0$ \\
+4.  for $ix$ from $0$ to $a.used - 1$ do \\
+\hspace{3mm}Compute the square.\\
+\hspace{3mm}4.1  $\hat {X}_{ix+ix} \leftarrow \left ( a_{ix} \right )^2$ \\
+\\
+\hspace{3mm}Compute the double products.\\
+\hspace{3mm}4.2  for $iy$ from $ix + 1$ to $a.used - 1$ do \\
+\hspace{6mm}4.2.1  $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}a_{iy}$ \\
+5.  $oldused \leftarrow b.used$ \\
+6.  $b.used \leftarrow 2a.used + 1$ \\
+\\
+Double the products and propagate the carries simultaneously. \\
+7.  $\hat W_0 \leftarrow 2 \hat W_0 + \hat {X}_0$ \\
+8.  for $ix$ from $1$ to $2a.used$ do \\
+\hspace{3mm}8.1 $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ \\
+\hspace{3mm}8.2 $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix - 1} / \beta \rfloor$ \\
+\hspace{3mm}8.3 $b_{ix-1} \leftarrow \hat W_{ix-1} \mbox{ (mod }\beta\mbox{)}$ \\
+9.  $b_{2a.used} \leftarrow \hat W_{2a.used} \mbox{ (mod }\beta\mbox{)}$ \\
+10.  if $2a.used + 1 < oldused$ then do \\
+\hspace{3mm}10.1  for $ix$ from $2a.used + 1$ to $oldused$ do \\
+\hspace{6mm}10.1.1  $b_{ix} \leftarrow 0$ \\
+11.  Clamp excess digits from $b$.  (\textit{mp\_clamp}) \\
+12.  Return(\textit{MP\_OKAY}). \\ 
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm fast\_s\_mp\_sqr}
+\end{figure}
+
+\textbf{Algorithm fast\_s\_mp\_sqr.}
+This algorithm computes the square of an input using the Comba technique.  It is designed to be a replacement for algorithm s\_mp\_sqr when
+the number of input digits is less than \textbf{MP\_WARRAY} and less than $\delta \over 2$.  
+
+This routine requires two arrays of mp\_words to be placed on the stack.  The first array $\hat W$ will hold the double products and the second
+array $\hat X$ will hold the squares.  Though only at most $MP\_WARRAY \over 2$ words of $\hat X$ are used, it has proven faster on most 
+processors to simply make it a full size array.
+
+The loop on step 3 will zero the two arrays to prepare them for the squaring step.  Step 4.1 computes the squares of the product.  Note how 
+it simply assigns the value into the $\hat X$ array.  The nested loop on step 4.2 computes the doubles of the products.  This loop
+computes the sum of the products for each column.  They are not doubled until later.
+
+After the squaring loop, the products stored in $\hat W$ must be doubled and the carries propagated forwards.  It makes sense to do both
+operations at the same time.  The expression $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ computes the sum of the double product and the
+squares in place.  
+
+EXAM,bn_fast_s_mp_sqr.c
+
+-- Write something deep and insightful later, Tom.
+
+\subsection{Polynomial Basis Squaring}
+The same algorithm that performs optimal polynomial basis multiplication can be used to perform polynomial basis squaring.  The minor exception
+is that $\zeta_y = f(y)g(y)$ is actually equivalent to $\zeta_y = f(y)^2$ since $f(y) = g(y)$.  Instead of performing $2n + 1$
+multiplications to find the $\zeta$ relations, squaring operations are performed instead.  
+
+\subsection{Karatsuba Squaring}
+Let $f(x) = ax + b$ represent the polynomial basis representation of a number to square.  
+Let $h(x) = \left ( f(x) \right )^2$ represent the square of the polynomial.  The Karatsuba equation can be modified to square a 
+number with the following equation.
+
+\begin{equation}
+h(x) = a^2x^2 + \left (a^2 + b^2 - (a - b)^2 \right )x + b^2
+\end{equation}
+
+Upon closer inspection this equation only requires the calculation of three half-sized squares: $a^2$, $b^2$ and $(a - b)^2$.  As in 
+Karatsuba multiplication, this algorithm can be applied recursively on the input and will achieve an asymptotic running time of 
+$O \left ( n^{lg(3)} \right )$.
+
+If the asymptotic times of Karatsuba squaring and multiplication are the same, why not simply use the multiplication algorithm 
+instead?  The answer to this arises from the cutoff point for squaring.  As in multiplication there exists a cutoff point, at which the 
+time required for a Comba based squaring and a Karatsuba based squaring meet.  Due to the overhead inherent in the Karatsuba method, the cutoff 
+point is fairly high.  For example, on an AMD Athlon XP processor with $\beta = 2^{28}$, the cutoff point is around 127 digits.  
+
+Consider squaring a 200 digit number with this technique.  It will be split into two 100 digit halves which are subsequently squared.  
+The 100 digit halves will not be squared using Karatsuba, but instead using the faster Comba based squaring algorithm.  If Karatsuba multiplication
+were used instead, the 100 digit numbers would be squared with a slower Comba based multiplication.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_karatsuba\_sqr}. \\
+\textbf{Input}.   mp\_int $a$ \\
+\textbf{Output}.  $b \leftarrow a^2$ \\
+\hline \\
+1.  Initialize the following temporary mp\_ints:  $x0$, $x1$, $t1$, $t2$, $x0x0$ and $x1x1$. \\
+2.  If any of the initializations on step 1 failed return(\textit{MP\_MEM}). \\
+\\
+Split the input.  e.g. $a = x1\beta^B + x0$ \\
+3.  $B \leftarrow \lfloor a.used / 2 \rfloor$ \\
+4.  $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{mp\_mod\_2d}) \\
+5.  $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{mp\_rshd}) \\
+\\
+Calculate the three squares. \\
+6.  $x0x0 \leftarrow x0^2$ (\textit{mp\_sqr}) \\
+7.  $x1x1 \leftarrow x1^2$ \\
+8.  $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\
+9.  $t1 \leftarrow t1^2$ \\
+\\
+Compute the middle term. \\
+10.  $t2 \leftarrow x0x0 + x1x1$ (\textit{s\_mp\_add}) \\
+11.  $t1 \leftarrow t2 - t1$ \\
+\\
+Compute final product. \\
+12.  $t1 \leftarrow t1\beta^B$ (\textit{mp\_lshd}) \\
+13.  $x1x1 \leftarrow x1x1\beta^{2B}$ \\
+14.  $t1 \leftarrow t1 + x0x0$ \\
+15.  $b \leftarrow t1 + x1x1$ \\
+16.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_karatsuba\_sqr}
+\end{figure}
+
+\textbf{Algorithm mp\_karatsuba\_sqr.}
+This algorithm computes the square of an input $a$ using the Karatsuba technique.  This algorithm is very similar to the Karatsuba based
+multiplication algorithm with the exception that the three half-size multiplications have been replaced with three half-size squarings.
+
+The radix point for squaring is simply placed exactly in the middle of the digits when the input has an even number of digits, otherwise it is
+placed just below the middle.  Step 3, 4 and 5 compute the two halves required using $B$
+as the radix point.  The first two squares in steps 6 and 7 are rather straightforward while the last square is of a more compact form.
+
+By expanding $\left (x1 - x0 \right )^2$, the $x1^2$ and $x0^2$ terms in the middle disappear, that is $x1^2 + x0^2 - (x1 - x0)^2 = 2 \cdot x0 \cdot x1$.
+Now if $5n$ single precision additions and a squaring of $n$-digits is faster than multiplying two $n$-digit numbers and doubling then
+this method is faster.  Assuming no further recursions occur, the difference can be estimated with the following inequality.
+
+Let $p$ represent the cost of a single precision addition and $q$ the cost of a single precision multiplication both in terms of time\footnote{Or
+machine clock cycles.}. 
+
+\begin{equation}
+5pn +{{q(n^2 + n)} \over 2} \le pn + qn^2
+\end{equation}
+
+For example, on an AMD Athlon XP processor $p = {1 \over 3}$ and $q = 6$.  This implies that the following inequality should hold.
+\begin{center}
+\begin{tabular}{rcl}
+${5n \over 3} + 3n^2 + 3n$     & $<$ & ${n \over 3} + 6n^2$ \\
+${5 \over 3} + 3n + 3$     & $<$ & ${1 \over 3} + 6n$ \\
+${13 \over 9}$     & $<$ & $n$ \\
+\end{tabular}
+\end{center}
+
+This results in a cutoff point around $n = 2$.  As a consequence it is actually faster to compute the middle term the ``long way'' on processors
+where multiplication is substantially slower\footnote{On the Athlon there is a 1:17 ratio between clock cycles for addition and multiplication.  On
+the Intel P4 processor this ratio is 1:29 making this method even more beneficial.  The only common exception is the ARMv4 processor which has a
+ratio of 1:7.  } than simpler operations such as addition.  
+
+EXAM,bn_mp_karatsuba_sqr.c
+
+This implementation is largely based on the implementation of algorithm mp\_karatsuba\_mul.  It uses the same inline style to copy and 
+shift the input into the two halves.  The loop from line @54,{@ to line @70,}@ has been modified since only one input exists.  The \textbf{used}
+count of both $x0$ and $x1$ is fixed up and $x0$ is clamped before the calculations begin.  At this point $x1$ and $x0$ are valid equivalents
+to the respective halves as if mp\_rshd and mp\_mod\_2d had been used.  
+
+By inlining the copy and shift operations the cutoff point for Karatsuba squaring can be lowered.  On the Athlon the cutoff point
+is exactly at the point where Comba squaring can no longer be used (\textit{128 digits}).  On slower processors such as the Intel P4
+it is actually below the Comba limit (\textit{at 110 digits}).
+
+This routine uses the same error trap coding style as mp\_karatsuba\_mul.  As the temporary variables are initialized errors are redirected to
+the error trap higher up.  If the algorithm completes without error the error code is set to \textbf{MP\_OKAY} and mp\_clears are executed normally.
+
+\textit{Last paragraph sucks.  re-write! -- Tom}
+
+\subsection{Toom-Cook Squaring}
+The Toom-Cook squaring algorithm mp\_toom\_sqr is heavily based on the algorithm mp\_toom\_mul with the exception that squarings are used
+instead of multiplication to find the five relations.  The reader is encouraged to read the description of the latter algorithm and try to 
+derive their own Toom-Cook squaring algorithm.  
+
+\subsection{High Level Squaring}
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_sqr}. \\
+\textbf{Input}.   mp\_int $a$ \\
+\textbf{Output}.  $b \leftarrow a^2$ \\
+\hline \\
+1.  If $a.used \ge TOOM\_SQR\_CUTOFF$ then  \\
+\hspace{3mm}1.1  $b \leftarrow a^2$ using algorithm mp\_toom\_sqr \\
+2.  else if $a.used \ge KARATSUBA\_SQR\_CUTOFF$ then \\
+\hspace{3mm}2.1  $b \leftarrow a^2$ using algorithm mp\_karatsuba\_sqr \\
+3.  else \\
+\hspace{3mm}3.1  $digs \leftarrow 2 \cdot a.used + 1$ \\
+\hspace{3mm}3.2  If $digs < MP\_WARRAY$ and $a.used \le \delta$ then \\
+\hspace{6mm}3.2.1  $b \leftarrow a^2$ using algorithm fast\_s\_mp\_sqr.  \\
+\hspace{3mm}3.3  else \\
+\hspace{6mm}3.3.1  $b \leftarrow a^2$ using algorithm s\_mp\_sqr.  \\
+4.  $b.sign \leftarrow MP\_ZPOS$ \\
+5.  Return the result of the unsigned squaring performed. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_sqr}
+\end{figure}
+
+\textbf{Algorithm mp\_sqr.}
+This algorithm computes the square of the input using one of four different algorithms.  If the input is very large and has at least
+\textbf{TOOM\_SQR\_CUTOFF} or \textbf{KARATSUBA\_SQR\_CUTOFF} digits then either the Toom-Cook or the Karatsuba Squaring algorithm is used.  If
+neither of the polynomial basis algorithms should be used then either the Comba or baseline algorithm is used.  
+
+EXAM,bn_mp_sqr.c
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 3 \right ] $ & Devise an efficient algorithm for selection of the radix point to handle inputs \\
+                      & that have different number of digits in Karatsuba multiplication. \\
+                      & \\
+$\left [ 3 \right ] $ & In ~SQUARE~ the fact that every column of a squaring is made up \\
+                      & of double products and at most one square is stated.  Prove this statement. \\
+                      & \\                      
+$\left [ 2 \right ] $ & In the Comba squaring algorithm half of the $\hat X$ variables are not used. \\
+                      & Revise algorithm fast\_s\_mp\_sqr to shrink the $\hat X$ array. \\
+                      & \\
+$\left [ 3 \right ] $ & Prove the equation for Karatsuba squaring. \\
+                      & \\
+$\left [ 1 \right ] $ & Prove that Karatsuba squaring requires $O \left (n^{lg(3)} \right )$ time. \\
+                      & \\ 
+$\left [ 2 \right ] $ & Determine the minimal ratio between addition and multiplication clock cycles \\
+                      & required for equation $6.7$ to be true.  \\
+                      & \\
+\end{tabular}
+
+\chapter{Modular Reduction}
+MARK,REDUCTION
+\section{Basics of Modular Reduction}
+\index{modular residue}
+Modular reduction is an operation that arises quite often within public key cryptography algorithms and various number theoretic algorithms, 
+such as factoring.  Modular reduction algorithms are the third class of algorithms of the ``multipliers'' set.  A number $a$ is said to be \textit{reduced}
+modulo another number $b$ by finding the remainder of the division $a/b$.  Full integer division with remainder is a topic to be covered 
+in~\ref{sec:division}.
+
+Modular reduction is equivalent to solving for $r$ in the following equation.  $a = bq + r$ where $q = \lfloor a/b \rfloor$.  The result 
+$r$ is said to be ``congruent to $a$ modulo $b$'' which is also written as $r \equiv a \mbox{ (mod }b\mbox{)}$.  In other vernacular $r$ is known as the 
+``modular residue'' which leads to ``quadratic residue''\footnote{That's fancy talk for $b \equiv a^2 \mbox{ (mod }p\mbox{)}$.} and
+other forms of residues.  
+
+Modular reductions are normally used to create either finite groups, rings or fields.  The most common usage for performance driven modular reductions 
+is in modular exponentiation algorithms.  That is to compute $d = a^b \mbox{ (mod }c\mbox{)}$ as fast as possible.  This operation is used in the 
+RSA and Diffie-Hellman public key algorithms, for example.  Modular multiplication and squaring also appear as fundamental operations in 
+Elliptic Curve cryptographic algorithms.  As will be discussed in the subsequent chapter there exist fast algorithms for computing modular 
+exponentiations without having to perform (\textit{in this example}) $b - 1$ multiplications.  These algorithms will produce partial results in the 
+range $0 \le x < c^2$ which can be taken advantage of to create several efficient algorithms.   They have also been used to create redundancy check 
+algorithms known as CRCs, error correction codes such as Reed-Solomon and solve a variety of number theoretic problems.  
+
+\section{The Barrett Reduction}
+The Barrett reduction algorithm \cite{BARRETT} was inspired by fast division algorithms which multiply by the reciprocal to emulate
+division.  Barrett's observation was that the residue $c$ of $a$ modulo $b$ is equal to 
+
+\begin{equation}
+c = a - b \cdot \lfloor a/b \rfloor
+\end{equation}
+
+Since algorithms such as modular exponentiation would be using the same modulus extensively, typical DSP\footnote{It is worth noting that Barrett's paper 
+targeted the DSP56K processor.}  intuition would indicate the next step would be to replace $a/b$ by a multiplication by the reciprocal.  However, 
+DSP intuition on its own will not work as these numbers are considerably larger than the precision of common DSP floating point data types.  
+It would take another common optimization to optimize the algorithm.
+
+\subsection{Fixed Point Arithmetic}
+The trick used to optimize the above equation is based on a technique of emulating floating point data types with fixed precision integers.  Fixed
+point arithmetic would become very popular as it greatly optimized the ``3d-shooter'' genre of games in the mid 1990s when floating point units were 
+fairly slow if not unavailable.   The idea behind fixed point arithmetic is to take a normal $k$-bit integer data type and break it into a $p$-bit 
+integer and a $q$-bit fraction part (\textit{where $p+q = k$}).  
+
+In this system a $k$-bit integer $n$ would actually represent $n/2^q$.  For example, with $q = 4$ the integer $n = 37$ would actually represent the
+value $2.3125$.  To multiply two fixed point numbers the integers are multiplied using traditional arithmetic and subsequently normalized by 
+moving the implied decimal point back to where it should be.  For example, with $q = 4$ to multiply the integers $9$ and $5$ they must be converted 
+to fixed point first by multiplying by $2^q$.  Let $a = 9(2^q)$ represent the fixed point representation of $9$ and $b = 5(2^q)$ represent the 
+fixed point representation of $5$.  The product $ab$ is equal to $45(2^{2q})$ which when normalized by dividing by $2^q$ produces $45(2^q)$.  
+
+This technique became popular since a normal integer multiplication and logical shift right are the only required operations to perform a multiplication
+of two fixed point numbers.  Using fixed point arithmetic, division can be easily approximated by multiplying by the reciprocal.  If $2^q$ is 
+equivalent to one then $2^q/b$ is equivalent to the fixed point approximation of $1/b$ using real arithmetic.  Using this fact dividing an integer 
+$a$ by another integer $b$ can be achieved with the following expression.
+
+\begin{equation}
+\lfloor a / b \rfloor \mbox{ }\approx\mbox{ } \lfloor (a \cdot \lfloor 2^q / b \rfloor)/2^q \rfloor
+\end{equation}
+
+The precision of the division is proportional to the value of $q$.  If the divisor $b$ is used frequently as is the case with 
+modular exponentiation pre-computing $2^q/b$ will allow a division to be performed with a multiplication and a right shift.  Both operations
+are considerably faster than division on most processors.  
+
+Consider dividing $19$ by $5$.  The correct result is $\lfloor 19/5 \rfloor = 3$.  With $q = 3$ the reciprocal is $\lfloor 2^q/5 \rfloor = 1$ which
+leads to a product of $19$ which when divided by $2^q$ produces $2$.  However, with $q = 4$ the reciprocal is $\lfloor 2^q/5 \rfloor = 3$ and
+the result of the emulated division is $\lfloor 3 \cdot 19 / 2^q \rfloor = 3$ which is correct.  The value of $2^q$ must be close to or ideally
+larger than the dividend.  In effect if $a$ is the dividend then $q$ should allow $0 \le \lfloor a/2^q \rfloor \le 1$ in order for this approach
+to work correctly.  Plugging this form of division into the original equation the following modular residue equation arises.
+
+\begin{equation}
+c = a - b \cdot \lfloor (a \cdot \lfloor 2^q / b \rfloor)/2^q \rfloor
+\end{equation}
+
+Using the notation from \cite{BARRETT} the value of $\lfloor 2^q / b \rfloor$ will be represented by the $\mu$ symbol.  Using the $\mu$
+variable also helps re-inforce the idea that it is meant to be computed once and re-used.
+
+\begin{equation}
+c = a - b \cdot \lfloor (a \cdot \mu)/2^q \rfloor
+\end{equation}
+
+Provided that $2^q \ge a$ this algorithm will produce a quotient that is either exactly correct or off by a value of one.  In the context of Barrett
+reduction the value of $a$ is bound by $0 \le a \le (b - 1)^2$ meaning that $2^q \ge b^2$ is sufficient to ensure the reciprocal will have enough
+precision.  
+
+Let $n$ represent the number of digits in $b$.  This algorithm requires approximately $2n^2$ single precision multiplications to produce the quotient and 
+another $n^2$ single precision multiplications to find the residue.  In total $3n^2$ single precision multiplications are required to 
+reduce the number.  
+
+For example, if $b = 1179677$ and $q = 41$ ($2^q > b^2$), then the reciprocal $\mu$ is equal to $\lfloor 2^q / b \rfloor = 1864089$.  Consider reducing
+$a = 180388626447$ modulo $b$ using the above reduction equation.  The quotient using the new formula is $\lfloor (a \cdot \mu) / 2^q \rfloor = 152913$.
+By subtracting $152913b$ from $a$ the correct residue $a \equiv 677346 \mbox{ (mod }b\mbox{)}$ is found.
+
+\subsection{Choosing a Radix Point}
+Using the fixed point representation a modular reduction can be performed with $3n^2$ single precision multiplications.  If that were the best
+that could be achieved a full division\footnote{A division requires approximately $O(2cn^2)$ single precision multiplications for a small value of $c$.  
+See~\ref{sec:division} for further details.} might as well be used in its place.  The key to optimizing the reduction is to reduce the precision of
+the initial multiplication that finds the quotient.  
+
+Let $a$ represent the number of which the residue is sought.  Let $b$ represent the modulus used to find the residue.  Let $m$ represent
+the number of digits in $b$.  For the purposes of this discussion we will assume that the number of digits in $a$ is $2m$, which is generally true if 
+two $m$-digit numbers have been multiplied.  Dividing $a$ by $b$ is the same as dividing a $2m$ digit integer by a $m$ digit integer.  Digits below the 
+$m - 1$'th digit of $a$ will contribute at most a value of $1$ to the quotient because $\beta^k < b$ for any $0 \le k \le m - 1$.  Another way to
+express this is by re-writing $a$ as two parts.  If $a' \equiv a \mbox{ (mod }\beta^{m-1}\mbox{)}$ and $a'' = a - a'$ then 
+${a \over b} \equiv {{a' + a''} \over b}$ which is equivalent to ${a' \over b} + {a'' \over b}$.  Since $a'$ is bound to be less than $b$ the quotient
+is bound by $0 \le {a' \over b} < 1$.
+
+Since the digits of $a'$ do not contribute much to the quotient the observation is that they might as well be zero.  However, if the digits 
+``might as well be zero'' they might as well not be there in the first place.  Let $q_0 = \lfloor a/\beta^{m-1} \rfloor$ represent the input
+with the irrelevant digits trimmed.  Now the modular reduction is trimmed to the almost equivalent equation
+
+\begin{equation}
+c = a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor
+\end{equation}
+
+Note that the original divisor $2^q$ has been replaced with $\beta^{m+1}$ where in this case $q$ is a multiple of $lg(\beta)$. Also note that the 
+exponent on the divisor when added to the amount $q_0$ was shifted by equals $2m$.  If the optimization had not been performed the divisor 
+would have the exponent $2m$ so in the end the exponents do ``add up''. Using the above equation the quotient 
+$\lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ can be off from the true quotient by at most two.  The original fixed point quotient can be off
+by as much as one (\textit{provided the radix point is chosen suitably}) and now that the lower irrelevant digits have been trimmed the quotient
+can be off by an additional value of one for a total of at most two.  This implies that 
+$0 \le a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor < 3b$.  By first subtracting $b$ times the quotient and then conditionally subtracting 
+$b$ once or twice the residue is found.
+
+The quotient is now found using $(m + 1)(m) = m^2 + m$ single precision multiplications and the residue with an additional $m^2$ single
+precision multiplications, ignoring the subtractions required.  In total $2m^2 + m$ single precision multiplications are required to find the residue.  
+This is considerably faster than the original attempt.
+
+For example, let $\beta = 10$ represent the radix of the digits.  Let $b = 9999$ represent the modulus which implies $m = 4$. Let $a = 99929878$ 
+represent the value of which the residue is desired.  In this case $q = 8$ since $10^7 < 9999^2$ meaning that $\mu = \lfloor \beta^{q}/b \rfloor = 10001$.  
+With the new observation the multiplicand for the quotient is equal to $q_0 = \lfloor a / \beta^{m - 1} \rfloor = 99929$.  The quotient is then 
+$\lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor = 9993$.  Subtracting $9993b$ from $a$ and the correct residue $a \equiv 9871 \mbox{ (mod }b\mbox{)}$ 
+is found.  
+
+\subsection{Trimming the Quotient}
+So far the reduction algorithm has been optimized from $3m^2$ single precision multiplications down to $2m^2 + m$ single precision multiplications.  As 
+it stands now the algorithm is already fairly fast compared to a full integer division algorithm.  However, there is still room for
+optimization.  
+
+After the first multiplication inside the quotient ($q_0 \cdot \mu$) the value is shifted right by $m + 1$ places effectively nullifying the lower
+half of the product.  It would be nice to be able to remove those digits from the product to effectively cut down the number of single precision 
+multiplications.  If the number of digits in the modulus $m$ is far less than $\beta$ a full product is not required for the algorithm to work properly.  
+In fact the lower $m - 2$ digits will not affect the upper half of the product at all and do not need to be computed.  
+
+The value of $\mu$ is a $m$-digit number and $q_0$ is a $m + 1$ digit number.  Using a full multiplier $(m + 1)(m) = m^2 + m$ single precision
+multiplications would be required.  Using a multiplier that will only produce digits at and above the $m - 1$'th digit reduces the number
+of single precision multiplications to ${m^2 + m} \over 2$ single precision multiplications.  
+
+\subsection{Trimming the Residue}
+After the quotient has been calculated it is used to reduce the input.  As previously noted the algorithm is not exact and it can be off by a small
+multiple of the modulus, that is $0 \le a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor < 3b$.  If $b$ is $m$ digits then the 
+result of the reduction equation is a value of at most $m + 1$ digits (\textit{provided $3 < \beta$}) implying that the upper $m - 1$ digits are
+implicitly zero.  
+
+The next optimization arises from this very fact.  Instead of computing $b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ using a full
+$O(m^2)$ multiplication algorithm only the lower $m+1$ digits of the product have to be computed.  Similarly the value of $a$ can
+be reduced modulo $\beta^{m+1}$ before the multiple of $b$ is subtracted which simplifies the subtraction as well.  A multiplication that produces 
+only the lower $m+1$ digits requires ${m^2 + 3m - 2} \over 2$ single precision multiplications.  
+
+With both optimizations in place the algorithm is the algorithm Barrett proposed.  It requires $m^2 + 2m - 1$ single precision multiplications which
+is considerably faster than the straightforward $3m^2$ method.  
+
+\subsection{The Barrett Algorithm}
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_reduce}. \\
+\textbf{Input}.   mp\_int $a$, mp\_int $b$ and $\mu = \lfloor \beta^{2m}/b \rfloor, m = \lceil lg_{\beta}(b) \rceil, (0 \le a < b^2, b > 1)$ \\
+\textbf{Output}.  $a \mbox{ (mod }b\mbox{)}$ \\
+\hline \\
+Let $m$ represent the number of digits in $b$.  \\
+1.  Make a copy of $a$ and store it in $q$.  (\textit{mp\_init\_copy}) \\
+2.  $q \leftarrow \lfloor q / \beta^{m - 1} \rfloor$ (\textit{mp\_rshd}) \\
+\\
+Produce the quotient. \\
+3.  $q \leftarrow q \cdot \mu$  (\textit{note: only produce digits at or above $m-1$}) \\
+4.  $q \leftarrow \lfloor q / \beta^{m + 1} \rfloor$ \\
+\\
+Subtract the multiple of modulus from the input. \\
+5.  $a \leftarrow a \mbox{ (mod }\beta^{m+1}\mbox{)}$ (\textit{mp\_mod\_2d}) \\
+6.  $q \leftarrow q \cdot b \mbox{ (mod }\beta^{m+1}\mbox{)}$ (\textit{s\_mp\_mul\_digs}) \\
+7.  $a \leftarrow a - q$ (\textit{mp\_sub}) \\
+\\
+Add $\beta^{m+1}$ if a carry occurred. \\
+8.  If $a < 0$ then (\textit{mp\_cmp\_d}) \\
+\hspace{3mm}8.1  $q \leftarrow 1$ (\textit{mp\_set}) \\
+\hspace{3mm}8.2  $q \leftarrow q \cdot \beta^{m+1}$ (\textit{mp\_lshd}) \\
+\hspace{3mm}8.3  $a \leftarrow a + q$ \\
+\\
+Now subtract the modulus if the residue is too large (e.g. quotient too small). \\
+9.  While $a \ge b$ do (\textit{mp\_cmp}) \\
+\hspace{3mm}9.1  $a \leftarrow a - b$ \\
+10.  Clear $q$. \\
+11.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_reduce}
+\end{figure}
+
+\textbf{Algorithm mp\_reduce.}
+This algorithm will reduce the input $a$ modulo $b$ in place using the Barrett algorithm.  It is loosely based on algorithm 14.42 of HAC
+\cite[pp.  602]{HAC} which is based on the paper from Paul Barrett \cite{BARRETT}.  The algorithm has several restrictions and assumptions which must 
+be adhered to for the algorithm to work.
+
+First the modulus $b$ is assumed to be positive and greater than one.  If the modulus were less than or equal to one then subtracting
+a multiple of it would either accomplish nothing or actually enlarge the input.  The input $a$ must be in the range $0 \le a < b^2$ in order
+for the quotient to have enough precision.  If $a$ is the product of two numbers that were already reduced modulo $b$, this will not be a problem.
+Technically the algorithm will still work if $a \ge b^2$ but it will take much longer to finish.  The value of $\mu$ is passed as an argument to this 
+algorithm and is assumed to be calculated and stored before the algorithm is used.  
+
+Recall that the multiplication for the quotient on step 3 must only produce digits at or above the $m-1$'th position.  An algorithm called 
+$s\_mp\_mul\_high\_digs$ which has not been presented is used to accomplish this task.  The algorithm is based on $s\_mp\_mul\_digs$ except that
+instead of stopping at a given level of precision it starts at a given level of precision.  This optimal algorithm can only be used if the number
+of digits in $b$ is very much smaller than $\beta$.  
+
+While it is known that 
+$a \ge b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ only the lower $m+1$ digits are being used to compute the residue, so an implied 
+``borrow'' from the higher digits might leave a negative result.  After the multiple of the modulus has been subtracted from $a$ the residue must be 
+fixed up in case it is negative.  The invariant $\beta^{m+1}$ must be added to the residue to make it positive again.  
+
+The while loop at step 9 will subtract $b$ until the residue is less than $b$.  If the algorithm is performed correctly this step is 
+performed at most twice, and on average once. However, if $a \ge b^2$ then it will iterate substantially more times than it should.
+
+EXAM,bn_mp_reduce.c
+
+The first multiplication that determines the quotient can be performed by only producing the digits from $m - 1$ and up.  This essentially halves
+the number of single precision multiplications required.  However, the optimization is only safe if $\beta$ is much larger than the number of digits
+in the modulus.  In the source code this is evaluated on lines @36,if@ to @44,}@ where algorithm s\_mp\_mul\_high\_digs is used when it is
+safe to do so.  
+
+\subsection{The Barrett Setup Algorithm}
+In order to use algorithm mp\_reduce the value of $\mu$ must be calculated in advance.  Ideally this value should be computed once and stored for
+future use so that the Barrett algorithm can be used without delay.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_reduce\_setup}. \\
+\textbf{Input}.   mp\_int $b$ ($b > 1$)  \\
+\textbf{Output}.  $\mu \leftarrow \lfloor \beta^{2m}/b \rfloor$ \\
+\hline \\
+1.  $\mu \leftarrow 2^{2 \cdot lg(\beta) \cdot  m}$ (\textit{mp\_2expt}) \\
+2.  $\mu \leftarrow \lfloor \mu / b \rfloor$ (\textit{mp\_div}) \\
+3.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_reduce\_setup}
+\end{figure}
+
+\textbf{Algorithm mp\_reduce\_setup.}
+This algorithm computes the reciprocal $\mu$ required for Barrett reduction.  First $\beta^{2m}$ is calculated as $2^{2 \cdot lg(\beta) \cdot  m}$ which
+is equivalent and much faster.  The final value is computed by taking the integer quotient of $\lfloor \mu / b \rfloor$.
+
+EXAM,bn_mp_reduce_setup.c
+
+This simple routine calculates the reciprocal $\mu$ required by Barrett reduction.  Note the extended usage of algorithm mp\_div where the variable
+which would receive the remainder is passed as NULL.  As will be discussed in~\ref{sec:division} the division routine allows both the quotient and the 
+remainder to be passed as NULL meaning to ignore the value.  
+
+\section{The Montgomery Reduction}
+Montgomery reduction\footnote{Thanks to Niels Ferguson for his insightful explanation of the algorithm.} \cite{MONT} is by far the most interesting 
+form of reduction in common use.  It computes a modular residue which is not actually equal to the residue of the input yet instead equal to a 
+residue times a constant.  However, as perplexing as this may sound the algorithm is relatively simple and very efficient.  
+
+Throughout this entire section the variable $n$ will represent the modulus used to form the residue.  As will be discussed shortly the value of
+$n$ must be odd.  The variable $x$ will represent the quantity of which the residue is sought.  Similar to the Barrett algorithm the input
+is restricted to $0 \le x < n^2$.  To begin the description some simple number theory facts must be established.
+
+\textbf{Fact 1.}  Adding $n$ to $x$ does not change the residue since in effect it adds one to the quotient $\lfloor x / n \rfloor$.  Another way
+to explain this is that $n$ is (\textit{or multiples of $n$ are}) congruent to zero modulo $n$.  Adding zero will not change the value of the residue.  
+
+\textbf{Fact 2.}  If $x$ is even then performing a division by two in $\Z$ is congruent to $x \cdot 2^{-1} \mbox{ (mod }n\mbox{)}$.  Actually
+this is an application of the fact that if $x$ is evenly divisible by any $k \in \Z$ then division in $\Z$ will be congruent to 
+multiplication by $k^{-1}$ modulo $n$.  
+
+From these two simple facts the following simple algorithm can be derived.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Montgomery Reduction}. \\
+\textbf{Input}.   Integer $x$, $n$ and $k$ \\
+\textbf{Output}.  $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\
+\hline \\
+1.  for $t$ from $1$ to $k$ do \\
+\hspace{3mm}1.1  If $x$ is odd then \\
+\hspace{6mm}1.1.1  $x \leftarrow x + n$ \\
+\hspace{3mm}1.2  $x \leftarrow x/2$ \\
+2.  Return $x$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Montgomery Reduction}
+\end{figure}
+
+The algorithm reduces the input one bit at a time using the two congruencies stated previously.  Inside the loop $n$, which is odd, is
+added to $x$ if $x$ is odd.  This forces $x$ to be even which allows the division by two in $\Z$ to be congruent to a modular division by two.  Since
+$x$ is assumed to be initially much larger than $n$ the addition of $n$ will contribute an insignificant magnitude to $x$.  Let $r$ represent the 
+final result of the Montgomery algorithm.  If $k > lg(n)$ and $0 \le x < n^2$ then the final result is limited to 
+$0 \le r < \lfloor x/2^k \rfloor + n$.  As a result at most a single subtraction is required to get the residue desired.
+
+\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|l|}
+\hline \textbf{Step number ($t$)} & \textbf{Result ($x$)} \\
+\hline $1$ & $x + n = 5812$, $x/2 = 2906$ \\
+\hline $2$ & $x/2 = 1453$ \\
+\hline $3$ & $x + n = 1710$, $x/2 = 855$ \\
+\hline $4$ & $x + n = 1112$, $x/2 = 556$ \\
+\hline $5$ & $x/2 = 278$ \\
+\hline $6$ & $x/2 = 139$ \\
+\hline $7$ & $x + n = 396$, $x/2 = 198$ \\
+\hline $8$ & $x/2 = 99$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Example of Montgomery Reduction (I)}
+\label{fig:MONT1}
+\end{figure}
+
+Consider the example in figure~\ref{fig:MONT1} which reduces $x = 5555$ modulo $n = 257$ when $k = 8$.  The result of the algorithm $r = 99$ is
+congruent to the value of $2^{-8} \cdot 5555 \mbox{ (mod }257\mbox{)}$.  When $r$ is multiplied by $2^8$ modulo $257$ the correct residue 
+$r \equiv 158$ is produced.  
+
+Let $k = \lfloor lg(n) \rfloor + 1$ represent the number of bits in $n$.  The current algorithm requires $2k^2$ single precision shifts
+and $k^2$ single precision additions.  At this rate the algorithm is most certainly slower than Barrett reduction and not terribly useful.  
+Fortunately there exists an alternative representation of the algorithm.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Montgomery Reduction} (modified I). \\
+\textbf{Input}.   Integer $x$, $n$ and $k$ \\
+\textbf{Output}.  $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\
+\hline \\
+1.  for $t$ from $0$ to $k - 1$ do \\
+\hspace{3mm}1.1  If the $t$'th bit of $x$ is one then \\
+\hspace{6mm}1.1.1  $x \leftarrow x + 2^tn$ \\
+2.  Return $x/2^k$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Montgomery Reduction (modified I)}
+\end{figure}
+
+This algorithm is equivalent since $2^tn$ is a multiple of $n$ and the lower $k$ bits of $x$ are zero by step 2.  The number of single
+precision shifts has now been reduced from $2k^2$ to $k^2 + k$ which is only a small improvement.
+
+\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|l|r|}
+\hline \textbf{Step number ($t$)} & \textbf{Result ($x$)} & \textbf{Result ($x$) in Binary} \\
+\hline -- & $5555$ & $1010110110011$ \\
+\hline $1$ & $x + 2^{0}n = 5812$ &  $1011010110100$ \\
+\hline $2$ & $5812$ & $1011010110100$ \\
+\hline $3$ & $x + 2^{2}n = 6840$ & $1101010111000$ \\
+\hline $4$ & $x + 2^{3}n = 8896$ & $10001011000000$ \\
+\hline $5$ & $8896$ & $10001011000000$ \\
+\hline $6$ & $8896$ & $10001011000000$ \\
+\hline $7$ & $x + 2^{6}n = 25344$ & $110001100000000$ \\
+\hline $8$ & $25344$ & $110001100000000$ \\
+\hline -- & $x/2^k = 99$ & \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Example of Montgomery Reduction (II)}
+\label{fig:MONT2}
+\end{figure}
+
+Figure~\ref{fig:MONT2} demonstrates the modified algorithm reducing $x = 5555$ modulo $n = 257$ with $k = 8$. 
+With this algorithm a single shift right at the end is the only right shift required to reduce the input instead of $k$ right shifts inside the 
+loop.  Note that in iterations $t = 2, 5, 6$ and $8$ the result $x$ is not changed.  In those iterations the $t$'th bit of $x$ is 
+zero and the appropriate multiple of $n$ does not need to be added to force the $t$'th bit of the result to zero.  
+
+\subsection{Digit Based Montgomery Reduction}
+Instead of computing the reduction on a bit-by-bit basis it is actually much faster to compute it on a digit-by-digit basis.  Consider the
+previous algorithm re-written to compute the Montgomery reduction in this new fashion.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Montgomery Reduction} (modified II). \\
+\textbf{Input}.   Integer $x$, $n$ and $k$ \\
+\textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
+\hline \\
+1.  for $t$ from $0$ to $k - 1$ do \\
+\hspace{3mm}1.1  $x \leftarrow x + \mu n \beta^t$ \\
+2.  Return $x/\beta^k$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Montgomery Reduction (modified II)}
+\end{figure}
+
+The value $\mu n \beta^t$ is a multiple of the modulus $n$ meaning that it will not change the residue.  If the first digit of 
+the value $\mu n \beta^t$ equals the negative (modulo $\beta$) of the $t$'th digit of $x$ then the addition will result in a zero digit.  This
+problem breaks down to solving the following congruency.  
+
+\begin{center}
+\begin{tabular}{rcl}
+$x_t + \mu n_0$ & $\equiv$ & $0 \mbox{ (mod }\beta\mbox{)}$ \\
+$\mu n_0$ & $\equiv$ & $-x_t \mbox{ (mod }\beta\mbox{)}$ \\
+$\mu$ & $\equiv$ & $-x_t/n_0 \mbox{ (mod }\beta\mbox{)}$ \\
+\end{tabular}
+\end{center}
+
+In each iteration of the loop on step 1 a new value of $\mu$ must be calculated.  The value of $-1/n_0 \mbox{ (mod }\beta\mbox{)}$ is used 
+extensively in this algorithm and should be precomputed.  Let $\rho$ represent the negative of the modular inverse of $n_0$ modulo $\beta$.  
+
+For example, let $\beta = 10$ represent the radix.  Let $n = 17$ represent the modulus which implies $k = 2$ and $\rho \equiv 7$.  Let $x = 33$ 
+represent the value to reduce.
+
+\newpage\begin{figure}
+\begin{center}
+\begin{tabular}{|c|c|c|}
+\hline \textbf{Step ($t$)} & \textbf{Value of $x$} & \textbf{Value of $\mu$} \\
+\hline --                 & $33$ & --\\
+\hline $0$                 & $33 + \mu n = 50$ & $1$ \\
+\hline $1$                 & $50 + \mu n \beta = 900$ & $5$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Example of Montgomery Reduction}
+\end{figure}
+
+The final result $900$ is then divided by $\beta^k$ to produce the final result $9$.  The first observation is that $9 \nequiv x \mbox{ (mod }n\mbox{)}$ 
+which implies the result is not the modular residue of $x$ modulo $n$.  However, recall that the residue is actually multiplied by $\beta^{-k}$ in
+the algorithm.  To get the true residue the value must be multiplied by $\beta^k$.  In this case $\beta^k \equiv 15 \mbox{ (mod }n\mbox{)}$ and
+the correct residue is $9 \cdot 15 \equiv 16 \mbox{ (mod }n\mbox{)}$.  
+
+\subsection{Baseline Montgomery Reduction}
+The baseline Montgomery reduction algorithm will produce the residue for any size input.  It is designed to be a catch-all algorithm for 
+Montgomery reductions.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_montgomery\_reduce}. \\
+\textbf{Input}.   mp\_int $x$, mp\_int $n$ and a digit $\rho \equiv -1/n_0 \mbox{ (mod }\beta\mbox{)}$. \\
+\hspace{11.5mm}($0 \le x < n^2, n > 1, (n, \beta) = 1, \beta^k > n$) \\
+\textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
+\hline \\
+1.  $digs \leftarrow 2n.used + 1$ \\
+2.  If $digs < MP\_WARRAY$ and $n.used < \delta$ then \\
+\hspace{3mm}2.1  Use algorithm fast\_mp\_montgomery\_reduce instead. \\
+\\
+Setup $x$ for the reduction. \\
+3.  If $x.alloc < digs$ then grow $x$ to $digs$ digits. \\
+4.  $x.used \leftarrow digs$ \\
+\\
+Eliminate the lower $k$ digits. \\
+5.  For $ix$ from $0$ to $k - 1$ do \\
+\hspace{3mm}5.1  $\mu \leftarrow x_{ix} \cdot \rho \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}5.2  $u \leftarrow 0$ \\
+\hspace{3mm}5.3  For $iy$ from $0$ to $k - 1$ do \\
+\hspace{6mm}5.3.1  $\hat r \leftarrow \mu n_{iy} + x_{ix + iy} + u$ \\
+\hspace{6mm}5.3.2  $x_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}5.3.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+\hspace{3mm}5.4  While $u > 0$ do \\
+\hspace{6mm}5.4.1  $iy \leftarrow iy + 1$ \\
+\hspace{6mm}5.4.2  $x_{ix + iy} \leftarrow x_{ix + iy} + u$ \\
+\hspace{6mm}5.4.3  $u \leftarrow \lfloor x_{ix+iy} / \beta \rfloor$ \\
+\hspace{6mm}5.4.4  $x_{ix + iy} \leftarrow x_{ix+iy} \mbox{ (mod }\beta\mbox{)}$ \\
+\\
+Divide by $\beta^k$ and fix up as required. \\
+6.  $x \leftarrow \lfloor x / \beta^k \rfloor$ \\
+7.  If $x \ge n$ then \\
+\hspace{3mm}7.1  $x \leftarrow x - n$ \\
+8.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_montgomery\_reduce}
+\end{figure}
+
+\textbf{Algorithm mp\_montgomery\_reduce.}
+This algorithm reduces the input $x$ modulo $n$ in place using the Montgomery reduction algorithm.  The algorithm is loosely based
+on algorithm 14.32 of \cite[pp.601]{HAC} except it merges the multiplication of $\mu n \beta^t$ with the addition in the inner loop.  The
+restrictions on this algorithm are fairly easy to adapt to.  First $0 \le x < n^2$ bounds the input to numbers in the same range as 
+for the Barrett algorithm.  Additionally if $n > 1$ and $n$ is odd there will exist a modular inverse $\rho$.  $\rho$ must be calculated in
+advance of this algorithm.  Finally the variable $k$ is fixed and a pseudonym for $n.used$.  
+
+Step 2 decides whether a faster Montgomery algorithm can be used.  It is based on the Comba technique meaning that there are limits on
+the size of the input.  This algorithm is discussed in ~COMBARED~.
+
+Step 5 is the main reduction loop of the algorithm.  The value of $\mu$ is calculated once per iteration in the outer loop.  The inner loop
+calculates $x + \mu n \beta^{ix}$ by multiplying $\mu n$ and adding the result to $x$ shifted by $ix$ digits.  Both the addition and
+multiplication are performed in the same loop to save time and memory.  Step 5.4 will handle any additional carries that escape the inner loop.
+
+Using a quick inspection this algorithm requires $k$ single precision multiplications for the outer loop and $k^2$ single precision multiplications 
+in the inner loop.  In total $k^2 + k$ single precision multiplications are required, which compares favourably to Barrett at $k^2 + 2k - 1$ single precision
+multiplications.  
+
+EXAM,bn_mp_montgomery_reduce.c
+
+This is the baseline implementation of the Montgomery reduction algorithm.  Lines @30,digs@ to @35,}@ determine if the Comba based
+routine can be used instead.  Line @47,mu@ computes the value of $\mu$ for that particular iteration of the outer loop.  
+
+The multiplication $\mu n \beta^{ix}$ is performed in one step in the inner loop.  The alias $tmpx$ refers to the $ix$'th digit of $x$ and
+the alias $tmpn$ refers to the modulus $n$.  
+
+\subsection{Faster ``Comba'' Montgomery Reduction}
+MARK,COMBARED
+
+The Montgomery reduction requires fewer single precision multiplications than a Barrett reduction, however it is much slower due to the serial
+nature of the inner loop.  The Barrett reduction algorithm requires two slightly modified multipliers which can be implemented with the Comba
+technique.  The Montgomery reduction algorithm cannot directly use the Comba technique to any significant advantage since the inner loop calculates
+a $k \times 1$ product $k$ times. 
+
+The biggest obstacle is that at the $ix$'th iteration of the outer loop the value of $x_{ix}$ is required to calculate $\mu$.  This means the 
+carries from $0$ to $ix - 1$ must have been propagated upwards to form a valid $ix$'th digit.  The solution as it turns out is very simple.  
+Perform a Comba like multiplier and inside the outer loop just after the inner loop fix up the $ix + 1$'th digit by forwarding the carry.  
+
+With this change in place the Montgomery reduction algorithm can be performed with a Comba style multiplication loop which substantially increases
+the speed of the algorithm.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{fast\_mp\_montgomery\_reduce}. \\
+\textbf{Input}.   mp\_int $x$, mp\_int $n$ and a digit $\rho \equiv -1/n_0 \mbox{ (mod }\beta\mbox{)}$. \\
+\hspace{11.5mm}($0 \le x < n^2, n > 1, (n, \beta) = 1, \beta^k > n$) \\
+\textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
+\hline \\
+Place an array of \textbf{MP\_WARRAY} mp\_word variables called $\hat W$ on the stack. \\
+1.  if $x.alloc < n.used + 1$ then grow $x$ to $n.used + 1$ digits. \\
+Copy the digits of $x$ into the array $\hat W$ \\
+2.  For $ix$ from $0$ to $x.used - 1$ do \\
+\hspace{3mm}2.1  $\hat W_{ix} \leftarrow x_{ix}$ \\
+3.  For $ix$ from $x.used$ to $2n.used - 1$ do \\
+\hspace{3mm}3.1  $\hat W_{ix} \leftarrow 0$ \\
+Eliminate the lower $k$ digits. \\
+4.  for $ix$ from $0$ to $n.used - 1$ do \\
+\hspace{3mm}4.1  $\mu \leftarrow \hat W_{ix} \cdot \rho \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}4.2  For $iy$ from $0$ to $n.used - 1$ do \\
+\hspace{6mm}4.2.1  $\hat W_{iy + ix} \leftarrow \hat W_{iy + ix} + \mu \cdot n_{iy}$ \\
+\hspace{3mm}4.3  $\hat W_{ix + 1} \leftarrow \hat W_{ix + 1} + \lfloor \hat W_{ix} / \beta \rfloor$ \\
+Propagate carries upwards. \\
+5.  for $ix$ from $n.used$ to $2n.used + 1$ do \\
+\hspace{3mm}5.1  $\hat W_{ix + 1} \leftarrow \hat W_{ix + 1} + \lfloor \hat W_{ix} / \beta \rfloor$ \\
+Shift right and reduce modulo $\beta$ simultaneously. \\
+6.  for $ix$ from $0$ to $n.used + 1$ do \\
+\hspace{3mm}6.1  $x_{ix} \leftarrow \hat W_{ix + n.used} \mbox{ (mod }\beta\mbox{)}$ \\
+Zero excess digits and fixup $x$. \\
+7.  if $x.used > n.used + 1$ then do \\
+\hspace{3mm}7.1  for $ix$ from $n.used + 1$ to $x.used - 1$ do \\
+\hspace{6mm}7.1.1  $x_{ix} \leftarrow 0$ \\
+8.  $x.used \leftarrow n.used + 1$ \\
+9.  Clamp excessive digits of $x$. \\
+10.  If $x \ge n$ then \\
+\hspace{3mm}10.1  $x \leftarrow x - n$ \\
+11.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm fast\_mp\_montgomery\_reduce}
+\end{figure}
+
+\textbf{Algorithm fast\_mp\_montgomery\_reduce.}
+This algorithm will compute the Montgomery reduction of $x$ modulo $n$ using the Comba technique.  It is on most computer platforms significantly
+faster than algorithm mp\_montgomery\_reduce and algorithm mp\_reduce (\textit{Barrett reduction}).  The algorithm has the same restrictions
+on the input as the baseline reduction algorithm.  An additional two restrictions are imposed on this algorithm.  The number of digits $k$ in 
+the modulus $n$ must not violate $MP\_WARRAY > 2k +1$ and $k < \delta$.   When $\beta = 2^{28}$ this algorithm can be used to reduce modulo
+a modulus of at most $3,556$ bits in length.  
+
+As in the other Comba reduction algorithms there is a $\hat W$ array which stores the columns of the product.  It is initially filled with the
+contents of $x$ with the excess digits zeroed.  The reduction loop is very similar to the baseline loop at heart.  The multiplication on step
+4.1 can be single precision only since $ab \mbox{ (mod }\beta\mbox{)} \equiv (a \mbox{ mod }\beta)(b \mbox{ mod }\beta)$.  Some multipliers such
+as those on the ARM processors take a variable length time to complete depending on the number of bytes of result it must produce.  By performing
+a single precision multiplication instead half the amount of time is spent.
+
+Also note that digit $\hat W_{ix}$ must have the carry from the $ix - 1$'th digit propagated upwards in order for this to work.  That is what step
+4.3 will do.  In effect over the $n.used$ iterations of the outer loop the lower $n.used$ columns all have their carries propagated forwards.  Note
+how the upper bits of those same words are not reduced modulo $\beta$.  This is because those values will be discarded shortly and there is no
+point.
+
+Step 5 will propagate the remainder of the carries upwards.  On step 6 the columns are reduced modulo $\beta$ and shifted simultaneously as they are
+stored in the destination $x$.  
+
+EXAM,bn_fast_mp_montgomery_reduce.c
+
+The $\hat W$ array is first filled with digits of $x$ on line @49,for@ then the rest of the digits are zeroed on line @54,for@.  Both loops share
+the same alias variables to make the code easier to read.  
+
+The value of $\mu$ is calculated in an interesting fashion.  First the value $\hat W_{ix}$ is reduced modulo $\beta$ and cast to a mp\_digit.  This
+forces the compiler to use a single precision multiplication and prevents any concerns about loss of precision.   Line @101,>>@ fixes the carry 
+for the next iteration of the loop by propagating the carry from $\hat W_{ix}$ to $\hat W_{ix+1}$.
+
+The for loop on line @113,for@ propagates the rest of the carries upwards through the columns.  The for loop on line @126,for@ reduces the columns
+modulo $\beta$ and shifts them $k$ places at the same time.  The alias $\_ \hat W$ actually refers to the array $\hat W$ starting at the $n.used$'th
+digit, that is $\_ \hat W_{t} = \hat W_{n.used + t}$.  
+
+\subsection{Montgomery Setup}
+To calculate the variable $\rho$ a relatively simple algorithm will be required.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_montgomery\_setup}. \\
+\textbf{Input}.   mp\_int $n$ ($n > 1$ and $(n, 2) = 1$) \\
+\textbf{Output}.  $\rho \equiv -1/n_0 \mbox{ (mod }\beta\mbox{)}$ \\
+\hline \\
+1.  $b \leftarrow n_0$ \\
+2.  If $b$ is even return(\textit{MP\_VAL}) \\
+3.  $x \leftarrow (((b + 2) \mbox{ AND } 4) << 1) + b$ \\
+4.  for $k$ from 0 to $\lceil lg(lg(\beta)) \rceil - 2$ do \\
+\hspace{3mm}4.1  $x \leftarrow x \cdot (2 - bx)$ \\
+5.  $\rho \leftarrow \beta - x \mbox{ (mod }\beta\mbox{)}$ \\
+6.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_montgomery\_setup} 
+\end{figure}
+
+\textbf{Algorithm mp\_montgomery\_setup.}
+This algorithm will calculate the value of $\rho$ required within the Montgomery reduction algorithms.  It uses a very interesting trick 
+to calculate $1/n_0$ when $\beta$ is a power of two.  
+
+EXAM,bn_mp_montgomery_setup.c
+
+This source code computes the value of $\rho$ required to perform Montgomery reduction.  It has been modified to avoid performing excess
+multiplications when $\beta$ is not the default 28-bits.  
+
+\section{The Diminished Radix Algorithm}
+The Diminished Radix method of modular reduction \cite{DRMET} is a fairly clever technique which can be more efficient than either the Barrett
+or Montgomery methods for certain forms of moduli.  The technique is based on the following simple congruence.
+
+\begin{equation}
+(x \mbox{ mod } n) + k \lfloor x / n \rfloor \equiv x \mbox{ (mod }(n - k)\mbox{)}
+\end{equation}
+
+This observation was used in the MMB \cite{MMB} block cipher to create a diffusion primitive.  It used the fact that if $n = 2^{31}$ and $k=1$ 
+then an x86 multiplier could produce the 62-bit product and use the ``shrd'' instruction to perform a double-precision right shift.  The proof
+of the above equation is very simple.  First write $x$ in the product form.
+
+\begin{equation}
+x = qn + r
+\end{equation}
+
+Now reduce both sides modulo $(n - k)$.
+
+\begin{equation}
+x \equiv qk + r  \mbox{ (mod }(n-k)\mbox{)}
+\end{equation}
+
+The variable $n$ reduces modulo $n - k$ to $k$.  By putting $q = \lfloor x/n \rfloor$ and $r = x \mbox{ mod } n$ 
+into the equation the original congruence is reproduced, thus concluding the proof.  The following algorithm is based on this observation.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Diminished Radix Reduction}. \\
+\textbf{Input}.   Integer $x$, $n$, $k$ \\
+\textbf{Output}.  $x \mbox{ mod } (n - k)$ \\
+\hline \\
+1.  $q \leftarrow \lfloor x / n \rfloor$ \\
+2.  $q \leftarrow k \cdot q$ \\
+3.  $x \leftarrow x \mbox{ (mod }n\mbox{)}$ \\
+4.  $x \leftarrow x + q$ \\
+5.  If $x \ge (n - k)$ then \\
+\hspace{3mm}5.1  $x \leftarrow x - (n - k)$ \\
+\hspace{3mm}5.2  Goto step 1. \\
+6.  Return $x$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Diminished Radix Reduction}
+\label{fig:DR}
+\end{figure}
+
+This algorithm will reduce $x$ modulo $n - k$ and return the residue.  If $0 \le x < (n - k)^2$ then the algorithm will loop almost always
+once or twice and occasionally three times.  For simplicity's sake the value of $x$ is bounded by the following simple polynomial.
+
+\begin{equation} 
+0 \le x < n^2 + k^2 - 2nk
+\end{equation}
+
+The true bound is  $0 \le x < (n - k - 1)^2$ but this has quite a few more terms.  The value of $q$ after step 1 is bounded by the following.
+
+\begin{equation}
+q < n - 2k - k^2/n
+\end{equation}
+
+Since $k^2$ is going to be considerably smaller than $n$ that term will always be zero.  The value of $x$ after step 3 is bounded trivially as
+$0 \le x < n$.  By step four the sum $x + q$ is bounded by 
+
+\begin{equation}
+0 \le q + x < (k + 1)n - 2k^2 - 1
+\end{equation}
+
+With a second pass $q$ will be loosely bounded by $0 \le q < k^2$ after step 2 while $x$ will still be loosely bounded by $0 \le x < n$ after step 3.  After the second pass it is highly unlikely that the
+sum in step 4 will exceed $n - k$.  In practice fewer than three passes of the algorithm are required to reduce virtually every input in the 
+range $0 \le x < (n - k - 1)^2$.  
+
+\begin{figure}
+\begin{small}
+\begin{center}
+\begin{tabular}{|l|}
+\hline
+$x = 123456789, n = 256, k = 3$ \\
+\hline $q \leftarrow \lfloor x/n \rfloor = 482253$ \\
+$q \leftarrow q*k = 1446759$ \\
+$x \leftarrow x \mbox{ mod } n = 21$ \\
+$x \leftarrow x + q = 1446780$ \\
+$x \leftarrow x - (n - k) = 1446527$ \\
+\hline 
+$q \leftarrow \lfloor x/n \rfloor = 5650$ \\
+$q \leftarrow q*k = 16950$ \\
+$x \leftarrow x \mbox{ mod } n = 127$ \\
+$x \leftarrow x + q = 17077$ \\
+$x \leftarrow x - (n - k) = 16824$ \\
+\hline 
+$q \leftarrow \lfloor x/n \rfloor = 65$ \\
+$q \leftarrow q*k = 195$ \\
+$x \leftarrow x \mbox{ mod } n = 184$ \\
+$x \leftarrow x + q = 379$ \\
+$x \leftarrow x - (n - k) = 126$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Example Diminished Radix Reduction}
+\label{fig:EXDR}
+\end{figure}
+
+Figure~\ref{fig:EXDR} demonstrates the reduction of $x = 123456789$ modulo $n - k = 253$ when $n = 256$ and $k = 3$.  Note that even while $x$
+is considerably larger than $(n - k - 1)^2 = 63504$ the algorithm still converges on the modular residue exceedingly fast.  In this case only
+three passes were required to find the residue $x \equiv 126$.
+
+
+\subsection{Choice of Moduli}
+On the surface this algorithm looks like a very expensive algorithm.  It requires a couple of subtractions followed by multiplication and other
+modular reductions.  The usefulness of this algorithm becomes exceedingly clear when an appropriate modulus is chosen.
+
+Division in general is a very expensive operation to perform.  The one exception is when the division is by a power of the radix of representation used.  
+Division by ten for example is simple for pencil and paper mathematics since it amounts to shifting the decimal place to the right.  Similarly division 
+by two (\textit{or powers of two}) is very simple for binary computers to perform.  It would therefore seem logical to choose $n$ of the form $2^p$ 
+which would imply that $\lfloor x / n \rfloor$ is a simple shift of $x$ right $p$ bits.  
+
+However, there is one operation related to division by powers of two that is even faster than this.  If $n = \beta^p$ then the division may be 
+performed by moving whole digits to the right $p$ places.  In practice division by $\beta^p$ is much faster than division by $2^p$ for any $p$.  
+Also with the choice of $n = \beta^p$ reducing $x$ modulo $n$ merely requires zeroing the digits above the $p-1$'th digit of $x$.  
+
+Throughout the next section the term ``restricted modulus'' will refer to a modulus of the form $\beta^p - k$ whereas the term ``unrestricted
+modulus'' will refer to a modulus of the form $2^p - k$.  The word ``restricted'' in this case refers to the fact that it is based on the 
+$2^p$ logic except $p$ must be a multiple of $lg(\beta)$.  
+
+\subsection{Choice of $k$}
+Now that division and reduction (\textit{step 1 and 3 of figure~\ref{fig:DR}}) have been optimized to simple digit operations the multiplication by $k$
+in step 2 is the most expensive operation.  Fortunately the choice of $k$ is not terribly limited.  For all intents and purposes it might
+as well be a single digit.  The smaller the value of $k$ is the faster the algorithm will be.  
+
+\subsection{Restricted Diminished Radix Reduction}
+The restricted Diminished Radix algorithm can quickly reduce an input modulo a modulus of the form $n = \beta^p - k$.  This algorithm can reduce 
+an input $x$ within the range $0 \le x < n^2$ using only a couple passes of the algorithm demonstrated in figure~\ref{fig:DR}.  The implementation
+of this algorithm has been optimized to avoid additional overhead associated with a division by $\beta^p$, the multiplication by $k$ or the addition 
+of $x$ and $q$.  The resulting algorithm is very efficient and can lead to substantial improvements over Barrett and Montgomery reduction when modular 
+exponentiations are performed.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_dr\_reduce}. \\
+\textbf{Input}.   mp\_int $x$, $n$ and a mp\_digit $k = \beta - n_0$ \\
+\hspace{11.5mm}($0 \le x < n^2$, $n > 1$, $0 < k < \beta$) \\
+\textbf{Output}.  $x \mbox{ mod } n$ \\
+\hline \\
+1.  $m \leftarrow n.used$ \\
+2.  If $x.alloc < 2m$ then grow $x$ to $2m$ digits. \\
+3.  $\mu \leftarrow 0$ \\
+4.  for $i$ from $0$ to $m - 1$ do \\
+\hspace{3mm}4.1  $\hat r \leftarrow k \cdot x_{m+i} + x_{i} + \mu$ \\
+\hspace{3mm}4.2  $x_{i} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}4.3  $\mu \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+5.  $x_{m} \leftarrow \mu$ \\
+6.  for $i$ from $m + 1$ to $x.used - 1$ do \\
+\hspace{3mm}6.1  $x_{i} \leftarrow 0$ \\
+7.  Clamp excess digits of $x$. \\
+8.  If $x \ge n$ then \\
+\hspace{3mm}8.1  $x \leftarrow x - n$ \\
+\hspace{3mm}8.2  Goto step 3. \\
+9.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_dr\_reduce}
+\end{figure}
+
+\textbf{Algorithm mp\_dr\_reduce.}
+This algorithm will perform the Diminished Radix reduction of $x$ modulo $n$.  It has similar restrictions to that of the Barrett reduction
+with the addition that $n$ must be of the form $n = \beta^m - k$ where $0 < k <\beta$.  
+
+This algorithm essentially implements the pseudo-code in figure~\ref{fig:DR} except with a slight optimization.  The division by $\beta^m$, multiplication by $k$
+and addition of $x \mbox{ mod }\beta^m$ are all performed simultaneously inside the loop on step 4.  The division by $\beta^m$ is emulated by accessing
+the term at the $m+i$'th position which is subsequently multiplied by $k$ and added to the term at the $i$'th position.  After the loop the $m$'th
+digit is set to the carry and the upper digits are zeroed.  Steps 5 and 6 emulate the reduction modulo $\beta^m$ that should have happened to 
+$x$ before the addition of the multiple of the upper half.  
+
+At step 8 if $x$ is still greater than or equal to $n$ another pass of the algorithm is required.  First $n$ is subtracted from $x$ and then the algorithm resumes
+at step 3.  
+
+EXAM,bn_mp_dr_reduce.c
+
+The first step is to grow $x$ as required to $2m$ digits since the reduction is performed in place on $x$.  The label on line @49,top:@ is where
+the algorithm will resume if further reduction passes are required.  In theory it could be placed at the top of the function however, the size of
+the modulus and question of whether $x$ is large enough are invariant after the first pass meaning that it would be a waste of time.  
+
+The aliases $tmpx1$ and $tmpx2$ refer to the digits of $x$ where the latter is offset by $m$ digits.  By reading digits from $x$ offset by $m$ digits
+a division by $\beta^m$ can be simulated virtually for free.  The loop on line @61,for@ performs the bulk of the work (\textit{corresponds to step 4 of algorithm 7.11})
+in this algorithm.
+
+By line @68,mu@ the pointer $tmpx1$ points to the $m$'th digit of $x$ which is where the final carry will be placed.  Similarly by line @71,for@ the 
+same pointer will point to the $m+1$'th digit where the zeroes will be placed.  
+
+Since the algorithm is only valid if both $x$ and $n$ are greater than zero an unsigned comparison suffices to determine if another pass is required.  
+With the same logic at line @82,sub@ the value of $x$ is known to be greater than or equal to $n$ meaning that an unsigned subtraction can be used
+as well.  Since the destination of the subtraction is the larger of the inputs the call to algorithm s\_mp\_sub cannot fail and the return code
+does not need to be checked.
+
+\subsubsection{Setup}
+To setup the restricted Diminished Radix algorithm the value $k = \beta - n_0$ is required.  This algorithm is not really complicated but provided for
+completeness.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_dr\_setup}. \\
+\textbf{Input}.   mp\_int $n$ \\
+\textbf{Output}.  $k = \beta - n_0$ \\
+\hline \\
+1.  $k \leftarrow \beta - n_0$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_dr\_setup}
+\end{figure}
+
+EXAM,bn_mp_dr_setup.c
+
+\subsubsection{Modulus Detection}
+Another algorithm which will be useful is the ability to detect a restricted Diminished Radix modulus.  An integer is said to be
+of restricted Diminished Radix form if all of the digits are equal to $\beta - 1$ except the trailing digit which may be any value.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_dr\_is\_modulus}. \\
+\textbf{Input}.   mp\_int $n$ \\
+\textbf{Output}.  $1$ if $n$ is in D.R form, $0$ otherwise \\
+\hline
+1.  If $n.used < 2$ then return($0$). \\
+2.  for $ix$ from $1$ to $n.used - 1$ do \\
+\hspace{3mm}2.1  If $n_{ix} \ne \beta - 1$ return($0$). \\
+3.  Return($1$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_dr\_is\_modulus}
+\end{figure}
+
+\textbf{Algorithm mp\_dr\_is\_modulus.}
+This algorithm determines if a value is in Diminished Radix form.  Step 1 rejects obvious cases where fewer than two digits are
+in the mp\_int.  Step 2 tests all but the first digit to see if they are equal to $\beta - 1$.  If the algorithm manages to get to
+step 3 then $n$ must be of Diminished Radix form.
+
+EXAM,bn_mp_dr_is_modulus.c
+
+\subsection{Unrestricted Diminished Radix Reduction}
+The unrestricted Diminished Radix algorithm allows modular reductions to be performed when the modulus is of the form $2^p - k$.  This algorithm
+is a straightforward adaptation of algorithm~\ref{fig:DR}.
+
+In general the restricted Diminished Radix reduction algorithm is much faster since it has considerably lower overhead.  However, this new
+algorithm is much faster than either Montgomery or Barrett reduction when the moduli are of the appropriate form.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_reduce\_2k}. \\
+\textbf{Input}.   mp\_int $a$ and $n$.  mp\_digit $k$  \\
+\hspace{11.5mm}($a \ge 0$, $n > 1$, $0 < k < \beta$, $n + k$ is a power of two) \\
+\textbf{Output}.  $a \mbox{ (mod }n\mbox{)}$ \\
+\hline
+1.  $p \leftarrow \lceil lg(n) \rceil$  (\textit{mp\_count\_bits}) \\
+2.  While $a \ge n$ do \\
+\hspace{3mm}2.1  $q \leftarrow \lfloor a / 2^p \rfloor$ (\textit{mp\_div\_2d}) \\
+\hspace{3mm}2.2  $a \leftarrow a \mbox{ (mod }2^p\mbox{)}$ (\textit{mp\_mod\_2d}) \\
+\hspace{3mm}2.3  $q \leftarrow q \cdot k$ (\textit{mp\_mul\_d}) \\
+\hspace{3mm}2.4  $a \leftarrow a + q$ (\textit{s\_mp\_add}) \\
+\hspace{3mm}2.5  If $a \ge n$ then do \\
+\hspace{6mm}2.5.1  $a \leftarrow a - n$ \\
+3.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_reduce\_2k}
+\end{figure}
+
+\textbf{Algorithm mp\_reduce\_2k.}
+This algorithm quickly reduces an input $a$ modulo an unrestricted Diminished Radix modulus $n$.  Division by $2^p$ is emulated with a right
+shift which makes the algorithm fairly inexpensive to use.  
+
+EXAM,bn_mp_reduce_2k.c
+
+The algorithm mp\_count\_bits calculates the number of bits in an mp\_int which is used to find the initial value of $p$.  The call to mp\_div\_2d
+on line @31,mp_div_2d@ calculates both the quotient $q$ and the remainder $a$ required.  By doing both in a single function call the code size
+is kept fairly small.  The multiplication by $k$ is only performed if $k > 1$. This allows reductions modulo $2^p - 1$ to be performed without
+any multiplications.  
+
+The unsigned s\_mp\_add, mp\_cmp\_mag and s\_mp\_sub are used in place of their full sign counterparts since the inputs are only valid if they are 
+positive.  By using the unsigned versions the overhead is kept to a minimum.  
+
+\subsubsection{Unrestricted Setup}
+To setup this reduction algorithm the value of $k = 2^p - n$ is required.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_reduce\_2k\_setup}. \\
+\textbf{Input}.   mp\_int $n$   \\
+\textbf{Output}.  $k = 2^p - n$ \\
+\hline
+1.  $p \leftarrow \lceil lg(n) \rceil$  (\textit{mp\_count\_bits}) \\
+2.  $x \leftarrow 2^p$ (\textit{mp\_2expt}) \\
+3.  $x \leftarrow x - n$ (\textit{mp\_sub}) \\
+4.  $k \leftarrow x_0$ \\
+5.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_reduce\_2k\_setup}
+\end{figure}
+
+\textbf{Algorithm mp\_reduce\_2k\_setup.}
+This algorithm computes the value of $k$ required for the algorithm mp\_reduce\_2k.  By making a temporary variable $x$ equal to $2^p$ a subtraction
+is sufficient to solve for $k$.  Alternatively if $n$ has more than one digit the value of $k$ is simply $\beta - n_0$.  
+
+EXAM,bn_mp_reduce_2k_setup.c
+
+\subsubsection{Unrestricted Detection}
+An integer $n$ is a valid unrestricted Diminished Radix modulus if either of the following are true.
+
+\begin{enumerate}
+\item  The number has only one digit.
+\item  The number has more than one digit and every bit from the $lg(\beta)$'th to the most significant is one.
+\end{enumerate}
+
+If either condition is true then there is a power of two $2^p$ such that $0 < 2^p - n < \beta$.   If the input is only
+one digit then it will always be of the correct form.  Otherwise all of the bits above the first digit must be one.  This arises from the fact
+that there will be a value of $k$ that when added to the modulus causes a carry in the first digit which propagates all the way to the most
+significant bit.  The resulting sum will be a power of two.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_reduce\_is\_2k}. \\
+\textbf{Input}.   mp\_int $n$   \\
+\textbf{Output}.  $1$ if of proper form, $0$ otherwise \\
+\hline
+1.  If $n.used = 0$ then return($0$). \\
+2.  If $n.used = 1$ then return($1$). \\
+3.  $p \leftarrow \lceil lg(n) \rceil$  (\textit{mp\_count\_bits}) \\
+4.  for $x$ from $lg(\beta)$ to $p - 1$ do \\
+\hspace{3mm}4.1  If the ($x \mbox{ mod }lg(\beta)$)'th bit of the $\lfloor x / lg(\beta) \rfloor$'th digit of $n$ is zero then return($0$). \\
+5.  Return($1$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_reduce\_is\_2k}
+\end{figure}
+
+\textbf{Algorithm mp\_reduce\_is\_2k.}
+This algorithm quickly determines if a modulus is of the form required for algorithm mp\_reduce\_2k to function properly.  
+
+EXAM,bn_mp_reduce_is_2k.c
+
+
+
+\section{Algorithm Comparison}
+So far three very different algorithms for modular reduction have been discussed.  Each of the algorithms has its own strengths and weaknesses
+that make having such a selection very useful.  The following table summarizes the three algorithms along with comparisons of work factors.  Since
+all three algorithms have the restriction that $0 \le x < n^2$ and $n > 1$ those limitations are not included in the table.  
+
+\begin{center}
+\begin{small}
+\begin{tabular}{|c|c|c|c|c|c|}
+\hline \textbf{Method} & \textbf{Work Required} & \textbf{Limitations} & \textbf{$m = 8$} & \textbf{$m = 32$} & \textbf{$m = 64$} \\
+\hline Barrett    & $m^2 + 2m - 1$ & None              & $79$ & $1087$ & $4223$ \\
+\hline Montgomery & $m^2 + m$      & $n$ must be odd   & $72$ & $1056$ & $4160$ \\
+\hline D.R.       & $2m$           & $n = \beta^m - k$ & $16$ & $64$   & $128$  \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+
+In theory Montgomery and Barrett reductions would require roughly the same amount of time to complete.  However, in practice since Montgomery
+reduction can be written as a single function with the Comba technique it is much faster.  Barrett reduction suffers from the overhead of
+calling the half precision multipliers, addition and division by $\beta$ algorithms.
+
+For almost every cryptographic algorithm Montgomery reduction is the algorithm of choice.  The one set of algorithms where Diminished Radix reduction truly
+shines are based on the discrete logarithm problem such as Diffie-Hellman \cite{DH} and ElGamal \cite{ELGAMAL}.  In these algorithms
+primes of the form $\beta^m - k$ can be found and shared amongst users.  These primes will allow the Diminished Radix algorithm to be used in
+modular exponentiation to greatly speed up the operation.
+
+
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 3 \right ]$ & Prove that the ``trick'' in algorithm mp\_montgomery\_setup actually \\
+                     & calculates the correct value of $\rho$. \\
+                     & \\
+$\left [ 2 \right ]$ & Devise an algorithm to reduce modulo $n + k$ for small $k$ quickly.  \\
+                     & \\
+$\left [ 4 \right ]$ & Prove that the pseudo-code algorithm ``Diminished Radix Reduction'' \\
+                     & (\textit{figure~\ref{fig:DR}}) terminates.  Also prove the probability that it will \\
+                     & terminate within $1 \le k \le 10$ iterations. \\
+                     & \\
+\end{tabular}                     
+
+
+\chapter{Exponentiation}
+Exponentiation is the operation of raising one variable to the power of another, for example, $a^b$.  A variant of exponentiation, computed
+in a finite field or ring, is called modular exponentiation.  This latter style of operation is typically used in public key 
+cryptosystems such as RSA and Diffie-Hellman.  The ability to quickly compute modular exponentiations is of great benefit to any
+such cryptosystem and many methods have been sought to speed it up.
+
+\section{Exponentiation Basics}
+A trivial algorithm would simply multiply $a$ against itself $b - 1$ times to compute the exponentiation desired.  However, as $b$ grows in size
+the number of multiplications becomes prohibitive.  Imagine what would happen if $b$ $\approx$ $2^{1024}$ as is the case when computing an RSA signature
+with a $1024$-bit key.  Such a calculation could never be completed as it would take simply far too long.
+
+Fortunately there is a very simple algorithm based on the laws of exponents.  Recall that $lg_a(a^b) = b$ and that $lg_a(a^ba^c) = b + c$ which
+are two trivial relationships between the base and the exponent.  Let $b_i$ represent the $i$'th bit of $b$ starting from the least 
+significant bit.  If $b$ is a $k$-bit integer then the following equation is true.
+
+\begin{equation}
+a^b = \prod_{i=0}^{k-1} a^{2^i \cdot b_i}
+\end{equation}
+
+By taking the base $a$ logarithm of both sides of the equation the following equation is the result.
+
+\begin{equation}
+b = \sum_{i=0}^{k-1}2^i \cdot b_i
+\end{equation}
+
+The term $a^{2^i}$ can be found from the $i - 1$'th term by squaring the term since $\left ( a^{2^i} \right )^2$ is equal to
+$a^{2^{i+1}}$.  This observation forms the basis of essentially all fast exponentiation algorithms.  It requires $k$ squarings and on average
+$k \over 2$ multiplications to compute the result.  This is indeed quite an improvement over simply multiplying by $a$ a total of $b-1$ times.
+
+While this current method is a considerable speed up there are further improvements to be made.  For example, the $a^{2^i}$ term does not need to 
+be computed in an auxiliary variable.  Consider the following equivalent algorithm.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Left to Right Exponentiation}. \\
+\textbf{Input}.   Integer $a$, $b$ and $k$ \\
+\textbf{Output}.  $c = a^b$ \\
+\hline \\
+1.  $c \leftarrow 1$ \\
+2.  for $i$ from $k - 1$ to $0$ do \\
+\hspace{3mm}2.1  $c \leftarrow c^2$ \\
+\hspace{3mm}2.2  $c \leftarrow c \cdot a^{b_i}$ \\
+3.  Return $c$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Left to Right Exponentiation}
+\label{fig:LTOR}
+\end{figure}
+
+This algorithm starts from the most significant bit and works towards the least significant bit.  When the $i$'th bit of $b$ is set $a$ is
+multiplied against the current product.  In each iteration the product is squared which doubles the exponent of the individual terms of the
+product.  
+
+For example, let $b = 101100_2 \equiv 44_{10}$.  The following chart demonstrates the actions of the algorithm.
+
+\newpage\begin{figure}
+\begin{center}
+\begin{tabular}{|c|c|}
+\hline \textbf{Value of $i$} & \textbf{Value of $c$} \\
+\hline - & $1$ \\
+\hline $5$ & $a$ \\
+\hline $4$ & $a^2$ \\
+\hline $3$ & $a^4 \cdot a$ \\
+\hline $2$ & $a^8 \cdot a^2 \cdot a$ \\
+\hline $1$ & $a^{16} \cdot a^4 \cdot a^2$ \\
+\hline $0$ & $a^{32} \cdot a^8 \cdot a^4$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Example of Left to Right Exponentiation}
+\end{figure}
+
+When the product $a^{32} \cdot a^8 \cdot a^4$ is simplified it is equal to $a^{44}$ which is the desired exponentiation.  This particular algorithm is 
+called ``Left to Right'' because it reads the exponent in that order.  All of the exponentiation algorithms that will be presented are of this nature.  
+
+\subsection{Single Digit Exponentiation}
+The first algorithm in the series of exponentiation algorithms will be an unbounded algorithm where the exponent is a single digit.  It is intended 
+to be used when a small power of an input is required (\textit{e.g. $a^5$}).  It is faster than simply multiplying $b - 1$ times for all values of 
+$b$ that are greater than three.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_expt\_d}. \\
+\textbf{Input}.   mp\_int $a$ and mp\_digit $b$ \\
+\textbf{Output}.  $c = a^b$ \\
+\hline \\
+1.  $g \leftarrow a$ (\textit{mp\_init\_copy}) \\
+2.  $c \leftarrow 1$ (\textit{mp\_set}) \\
+3.  for $x$ from 1 to $lg(\beta)$ do \\
+\hspace{3mm}3.1  $c \leftarrow c^2$ (\textit{mp\_sqr}) \\
+\hspace{3mm}3.2  If $b$ AND $2^{lg(\beta) - 1} \ne 0$ then \\
+\hspace{6mm}3.2.1  $c \leftarrow c \cdot g$ (\textit{mp\_mul}) \\
+\hspace{3mm}3.3  $b \leftarrow b << 1$ \\
+4.  Clear $g$. \\
+5.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_expt\_d}
+\end{figure}
+
+\textbf{Algorithm mp\_expt\_d.}
+This algorithm computes the value of $a$ raised to the power of a single digit $b$.  It uses the left to right exponentiation algorithm to
+quickly compute the exponentiation.  It is loosely based on algorithm 14.79 of HAC \cite[pp. 615]{HAC} with the difference that the 
+exponent is a fixed width.  
+
+A copy of $a$ is made first to allow the destination variable $c$ to be the same as the source variable $a$.  The result is set to the initial value of 
+$1$ in the subsequent step.
+
+Inside the loop the exponent is read from the most significant bit first down to the least significant bit.  First $c$ is invariably squared
+on step 3.1.  In the following step if the most significant bit of $b$ is one the copy of $a$ is multiplied against $c$.  The value
+of $b$ is shifted left one bit to make the next bit down from the most significant bit the new most significant bit.  In effect each
+iteration of the loop moves the bits of the exponent $b$ upwards to the most significant location.
+
+EXAM,bn_mp_expt_d.c
+
+Line @29,mp_set@ sets the initial value of the result to $1$.  Next the loop on line @31,for@ steps through each bit of the exponent starting from
+the most significant down towards the least significant. The invariant squaring operation placed on line @33,mp_sqr@ is performed first.  After 
+the squaring the result $c$ is multiplied by the base $g$ if and only if the most significant bit of the exponent is set.  The shift on line
+@47,<<@ moves all of the bits of the exponent upwards towards the most significant location.  
+
+\section{$k$-ary Exponentiation}
+When calculating an exponentiation the most time consuming bottleneck is the multiplications which are in general a small factor
+slower than squaring.  Recall from the previous algorithm that $b_{i}$ refers to the $i$'th bit of the exponent $b$.  Suppose instead it referred to
+the $i$'th $k$-bit digit of the exponent of $b$.  For $k = 1$ the definitions are synonymous and for $k > 1$ algorithm~\ref{fig:KARY}
+computes the same exponentiation.  A group of $k$ bits from the exponent is called a \textit{window}.  That is it is a small window on only a
+portion of the entire exponent.  Consider the following modification to the basic left to right exponentiation algorithm.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{$k$-ary Exponentiation}. \\
+\textbf{Input}.   Integer $a$, $b$, $k$ and $t$ \\
+\textbf{Output}.  $c = a^b$ \\
+\hline \\
+1.  $c \leftarrow 1$ \\
+2.  for $i$ from $t - 1$ to $0$ do \\
+\hspace{3mm}2.1  $c \leftarrow c^{2^k} $ \\
+\hspace{3mm}2.2  Extract the $i$'th $k$-bit word from $b$ and store it in $g$. \\
+\hspace{3mm}2.3  $c \leftarrow c \cdot a^g$ \\
+3.  Return $c$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{$k$-ary Exponentiation}
+\label{fig:KARY}
+\end{figure}
+
+The squaring on step 2.1 can be calculated by squaring the value $c$ successively $k$ times.  If the values of $a^g$ for $0 < g < 2^k$ have been
+precomputed this algorithm requires only $t$ multiplications and $tk$ squarings.  The table can be generated with $2^{k - 1} - 1$ squarings and
+$2^{k - 1} + 1$ multiplications.  This algorithm assumes that the number of bits in the exponent is evenly divisible by $k$.  
+However, when it is not the remaining $0 < x \le k - 1$ bits can be handled with algorithm~\ref{fig:LTOR}.
+
+Suppose $k = 4$ and $t = 100$.  This modified algorithm will require $109$ multiplications and $408$ squarings to compute the exponentiation.  The
+original algorithm would on average have required $200$ multiplications and $400$ squarings to compute the same value.  The total number of squarings
+has increased slightly but the number of multiplications has nearly halved.
+
+\subsection{Optimal Values of $k$}
+An optimal value of $k$ will minimize $2^{k} + \lceil n / k \rceil + n - 1$ for a fixed number of bits in the exponent $n$.  The simplest
+approach is to brute force search amongst the values $k = 2, 3, \ldots, 8$ for the lowest result.  Table~\ref{fig:OPTK} lists optimal values of $k$
+for various exponent sizes and compares the number of multiplication and squarings required against algorithm~\ref{fig:LTOR}.  
+
+\begin{figure}[here]
+\begin{center}
+\begin{small}
+\begin{tabular}{|c|c|c|c|c|c|}
+\hline \textbf{Exponent (bits)} & \textbf{Optimal $k$} & \textbf{Work at $k$} & \textbf{Work with ~\ref{fig:LTOR}} \\
+\hline $16$ & $2$ & $27$ & $24$ \\
+\hline $32$ & $3$ & $49$ & $48$ \\
+\hline $64$ & $3$ & $92$ & $96$ \\
+\hline $128$ & $4$ & $175$ & $192$ \\
+\hline $256$ & $4$ & $335$ & $384$ \\
+\hline $512$ & $5$ & $645$ & $768$ \\
+\hline $1024$ & $6$ & $1257$ & $1536$ \\
+\hline $2048$ & $6$ & $2452$ & $3072$ \\
+\hline $4096$ & $7$ & $4808$ & $6144$ \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Optimal Values of $k$ for $k$-ary Exponentiation}
+\label{fig:OPTK}
+\end{figure}
+
+\subsection{Sliding-Window Exponentiation}
+A simple modification to the previous algorithm is to only generate the upper half of the table in the range $2^{k-1} \le g < 2^k$.  Essentially
+this is a table for all values of $g$ where the most significant bit of $g$ is a one.  However, in order for this to be allowed in the 
+algorithm values of $g$ in the range $0 \le g < 2^{k-1}$ must be avoided.  
+
+Table~\ref{fig:OPTK2} lists optimal values of $k$ for various exponent sizes and compares the work required against algorithm~\ref{fig:KARY}.  
+
+\begin{figure}[here]
+\begin{center}
+\begin{small}
+\begin{tabular}{|c|c|c|c|c|c|}
+\hline \textbf{Exponent (bits)} & \textbf{Optimal $k$} & \textbf{Work at $k$} & \textbf{Work with ~\ref{fig:KARY}} \\
+\hline $16$ & $3$ & $24$ & $27$ \\
+\hline $32$ & $3$ & $45$ & $49$ \\
+\hline $64$ & $4$ & $87$ & $92$ \\
+\hline $128$ & $4$ & $167$ & $175$ \\
+\hline $256$ & $5$ & $322$ & $335$ \\
+\hline $512$ & $6$ & $628$ & $645$ \\
+\hline $1024$ & $6$ & $1225$ & $1257$ \\
+\hline $2048$ & $7$ & $2403$ & $2452$ \\
+\hline $4096$ & $8$ & $4735$ & $4808$ \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Optimal Values of $k$ for Sliding Window Exponentiation}
+\label{fig:OPTK2}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Sliding Window $k$-ary Exponentiation}. \\
+\textbf{Input}.   Integer $a$, $b$, $k$ and $t$ \\
+\textbf{Output}.  $c = a^b$ \\
+\hline \\
+1.  $c \leftarrow 1$ \\
+2.  for $i$ from $t - 1$ to $0$ do \\
+\hspace{3mm}2.1  If the $i$'th bit of $b$ is a zero then \\
+\hspace{6mm}2.1.1   $c \leftarrow c^2$ \\
+\hspace{3mm}2.2  else do \\
+\hspace{6mm}2.2.1  $c \leftarrow c^{2^k}$ \\
+\hspace{6mm}2.2.2  Extract the $k$ bits from $(b_{i}b_{i-1}\ldots b_{i-(k-1)})$ and store it in $g$. \\
+\hspace{6mm}2.2.3  $c \leftarrow c \cdot a^g$ \\
+\hspace{6mm}2.2.4  $i \leftarrow i - k$ \\
+3.  Return $c$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Sliding Window $k$-ary Exponentiation}
+\end{figure}
+
+Similar to the previous algorithm this algorithm must have a special handler when fewer than $k$ bits are left in the exponent.  While this
+algorithm requires the same number of squarings it can potentially have fewer multiplications.  The pre-computed table $a^g$ is also half
+the size of the previous table.  
+
+Consider the exponent $b = 111101011001000_2 \equiv 31432_{10}$ with $k = 3$ using both algorithms.  The first algorithm will divide the exponent up as 
+the following five $3$-bit words $b \equiv \left ( 111, 101, 011, 001, 000 \right )_{2}$.  The second algorithm will break the 
+exponent as $b \equiv \left ( 111, 101, 0, 110, 0, 100, 0 \right )_{2}$.  The single $0$ digits in the second representation are where
+a single squaring took place instead of a squaring and multiplication.  In total the first method requires $10$ multiplications and $18$ 
+squarings.  The second method requires $8$ multiplications and $18$ squarings.  
+
+In general the sliding window method is never slower than the generic $k$-ary method and often it is slightly faster.  
+
+\section{Modular Exponentiation}
+
+Modular exponentiation is essentially computing the power of a base within a finite field or ring.  For example, computing 
+$d \equiv a^b \mbox{ (mod }c\mbox{)}$ is a modular exponentiation.  Instead of first computing $a^b$ and then reducing it 
+modulo $c$ the intermediate result is reduced modulo $c$ after every squaring or multiplication operation.  
+
+This guarantees that any intermediate result is bounded by $0 \le d \le c^2 - 2c + 1$ and can be reduced modulo $c$ quickly using
+one of the algorithms presented in ~REDUCTION~.  
+
+Before the actual modular exponentiation algorithm can be written a wrapper algorithm must be written first.  This algorithm
+will allow the exponent $b$ to be negative which is computed as $d \equiv \left (1 / a \right )^{\vert b \vert} \mbox{ (mod }c\mbox{)}$. The
+value of $(1/a) \mbox{ mod }c$ is computed using the modular inverse (\textit{see \ref{sec;modinv}}).  If no inverse exists the algorithm
+terminates with an error.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_exptmod}. \\
+\textbf{Input}.   mp\_int $g$, $x$ and $p$ \\
+\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
+\hline \\
+1.  If $p.sign = MP\_NEG$ return(\textit{MP\_VAL}). \\
+2.  If $x.sign = MP\_NEG$ then \\
+\hspace{3mm}2.1  $g' \leftarrow g^{-1} \mbox{ (mod }p\mbox{)}$ \\
+\hspace{3mm}2.2  $x' \leftarrow \vert x \vert$ \\
+\hspace{3mm}2.3  Compute $y \equiv g'^{x'} \mbox{ (mod }p\mbox{)}$ via recursion. \\
+3.  if $p$ is odd \textbf{OR} $p$ is a D.R. modulus then \\
+\hspace{3mm}3.1  Compute $y \equiv g^{x} \mbox{ (mod }p\mbox{)}$ via algorithm mp\_exptmod\_fast. \\
+4.  else \\
+\hspace{3mm}4.1  Compute $y \equiv g^{x} \mbox{ (mod }p\mbox{)}$ via algorithm s\_mp\_exptmod. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_exptmod}
+\end{figure}
+
+\textbf{Algorithm mp\_exptmod.}
+The first algorithm which actually performs modular exponentiation is algorithm s\_mp\_exptmod.  It is a sliding window $k$-ary algorithm 
+which uses Barrett reduction to reduce the product modulo $p$.  The second algorithm mp\_exptmod\_fast performs the same operation 
+except it uses either Montgomery or Diminished Radix reduction.  The two latter reduction algorithms are clumped in the same exponentiation
+algorithm since their arguments are essentially the same (\textit{two mp\_ints and one mp\_digit}).  
+
+EXAM,bn_mp_exptmod.c
+
+In order to keep the algorithms in a known state the first step on line @29,if@ is to reject any negative modulus as input.  If the exponent is
+negative the algorithm tries to perform a modular exponentiation with the modular inverse of the base $G$.  The temporary variable $tmpG$ is assigned
+the modular inverse of $G$ and $tmpX$ is assigned the absolute value of $X$.  The algorithm will recurse with these new values with a positive
+exponent.
+
+If the exponent is positive the algorithm resumes the exponentiation.  Line @63,dr_@ determines if the modulus is of the restricted Diminished Radix 
+form.  If it is not line @65,reduce@ attempts to determine if it is of an unrestricted Diminished Radix form.  The integer $dr$ will take on one
+of three values.
+
+\begin{enumerate}
+\item $dr = 0$ means that the modulus is not of either restricted or unrestricted Diminished Radix form.
+\item $dr = 1$ means that the modulus is of restricted Diminished Radix form.
+\item $dr = 2$ means that the modulus is of unrestricted Diminished Radix form.
+\end{enumerate}
+
+Line @69,if@ determines if the fast modular exponentiation algorithm can be used.  It is allowed if $dr \ne 0$ or if the modulus is odd.  Otherwise,
+the slower s\_mp\_exptmod algorithm is used which uses Barrett reduction.  
+
+\subsection{Barrett Modular Exponentiation}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_exptmod}. \\
+\textbf{Input}.   mp\_int $g$, $x$ and $p$ \\
+\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
+\hline \\
+1.  $k \leftarrow lg(x)$ \\
+2.  $winsize \leftarrow  \left \lbrace \begin{array}{ll}
+                              2 &  \mbox{if }k \le 7 \\
+                              3 &  \mbox{if }7 < k \le 36 \\
+                              4 &  \mbox{if }36 < k \le 140 \\
+                              5 &  \mbox{if }140 < k \le 450 \\
+                              6 &  \mbox{if }450 < k \le 1303 \\
+                              7 &  \mbox{if }1303 < k \le 3529 \\
+                              8 &  \mbox{if }3529 < k \\
+                              \end{array} \right .$ \\
+3.  Initialize $2^{winsize}$ mp\_ints in an array named $M$ and one mp\_int named $\mu$ \\
+4.  Calculate the $\mu$ required for Barrett Reduction (\textit{mp\_reduce\_setup}). \\
+5.  $M_1 \leftarrow g \mbox{ (mod }p\mbox{)}$ \\
+\\
+Setup the table of small powers of $g$.  First find $g^{2^{winsize - 1}}$ and then all of the consecutive powers above it. \\
+6.  $k \leftarrow 2^{winsize - 1}$ \\
+7.  $M_{k} \leftarrow M_1$ \\
+8.  for $ix$ from 0 to $winsize - 2$ do \\
+\hspace{3mm}8.1  $M_k \leftarrow \left ( M_k \right )^2$ (\textit{mp\_sqr})  \\
+\hspace{3mm}8.2  $M_k \leftarrow M_k \mbox{ (mod }p\mbox{)}$ (\textit{mp\_reduce}) \\
+9.  for $ix$ from $2^{winsize - 1} + 1$ to $2^{winsize} - 1$ do \\
+\hspace{3mm}9.1  $M_{ix} \leftarrow M_{ix - 1} \cdot M_{1}$ (\textit{mp\_mul}) \\
+\hspace{3mm}9.2  $M_{ix} \leftarrow M_{ix} \mbox{ (mod }p\mbox{)}$ (\textit{mp\_reduce}) \\
+10.  $res \leftarrow 1$ \\
+\\
+Start Sliding Window. \\
+11.  $mode \leftarrow 0, bitcnt \leftarrow 1, buf \leftarrow 0, digidx \leftarrow x.used - 1, bitcpy \leftarrow 0, bitbuf \leftarrow 0$ \\
+12.  Loop \\
+\hspace{3mm}12.1  $bitcnt \leftarrow bitcnt - 1$ \\
+\hspace{3mm}12.2  If $bitcnt = 0$ then do \\
+\hspace{6mm}12.2.1  If $digidx = -1$ goto step 13. \\
+\hspace{6mm}12.2.2  $buf \leftarrow x_{digidx}$ \\
+\hspace{6mm}12.2.3  $digidx \leftarrow digidx - 1$ \\
+\hspace{6mm}12.2.4  $bitcnt \leftarrow lg(\beta)$ \\
+Continued on next page. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm s\_mp\_exptmod}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_exptmod} (\textit{continued}). \\
+\textbf{Input}.   mp\_int $g$, $x$ and $p$ \\
+\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
+\hline \\
+\hspace{3mm}12.3  $y \leftarrow (buf >> (lg(\beta) - 1))$ AND $1$ \\
+\hspace{3mm}12.4  $buf \leftarrow buf << 1$ \\
+\hspace{3mm}12.5  if $mode = 0$ and $y = 0$ then goto step 12. \\
+\hspace{3mm}12.6  if $mode = 1$ and $y = 0$ then do \\
+\hspace{6mm}12.6.1  $res \leftarrow res^2$ \\
+\hspace{6mm}12.6.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+\hspace{6mm}12.6.3  Goto step 12. \\
+\hspace{3mm}12.7  $bitcpy \leftarrow bitcpy + 1$ \\
+\hspace{3mm}12.8  $bitbuf \leftarrow bitbuf + (y << (winsize - bitcpy))$ \\
+\hspace{3mm}12.9  $mode \leftarrow 2$ \\
+\hspace{3mm}12.10  If $bitcpy = winsize$ then do \\
+\hspace{6mm}Window is full so perform the squarings and single multiplication. \\
+\hspace{6mm}12.10.1  for $ix$ from $0$ to $winsize -1$ do \\
+\hspace{9mm}12.10.1.1  $res \leftarrow res^2$ \\
+\hspace{9mm}12.10.1.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+\hspace{6mm}12.10.2  $res \leftarrow res \cdot M_{bitbuf}$ \\
+\hspace{6mm}12.10.3  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+\hspace{6mm}Reset the window. \\
+\hspace{6mm}12.10.4  $bitcpy \leftarrow 0, bitbuf \leftarrow 0, mode \leftarrow 1$ \\
+\\
+No more windows left.  Check for residual bits of exponent. \\
+13.  If $mode = 2$ and $bitcpy > 0$ then do \\
+\hspace{3mm}13.1  for $ix$ from $0$ to $bitcpy - 1$ do \\
+\hspace{6mm}13.1.1  $res \leftarrow res^2$ \\
+\hspace{6mm}13.1.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+\hspace{6mm}13.1.3  $bitbuf \leftarrow bitbuf << 1$ \\
+\hspace{6mm}13.1.4  If $bitbuf$ AND $2^{winsize} \ne 0$ then do \\
+\hspace{9mm}13.1.4.1  $res \leftarrow res \cdot M_{1}$ \\
+\hspace{9mm}13.1.4.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+14.  $y \leftarrow res$ \\
+15.  Clear $res$, $\mu$ and the $M$ array. \\
+16.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm s\_mp\_exptmod (continued)}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_exptmod.}
+This algorithm computes the $x$'th power of $g$ modulo $p$ and stores the result in $y$.  It takes advantage of the Barrett reduction
+algorithm to keep the product small throughout the algorithm.
+
+The first two steps determine the optimal window size based on the number of bits in the exponent.  The larger the exponent the 
+larger the window size becomes.  After a window size $winsize$ has been chosen an array of $2^{winsize}$ mp\_int variables is allocated.  This
+table will hold the values of $g^x \mbox{ (mod }p\mbox{)}$ for $2^{winsize - 1} \le x < 2^{winsize}$.  
+
+After the table is allocated the first power of $g$ is found.  Since $g \ge p$ is allowed it must be first reduced modulo $p$ to make
+the rest of the algorithm more efficient.  The first element of the table at $2^{winsize - 1}$ is found by squaring $M_1$ successively $winsize - 1$
+times.  The rest of the table elements are found by multiplying the previous element by $M_1$ modulo $p$.
+
+Now that the table is available the sliding window may begin.  The following list describes the functions of all the variables in the window.
+\begin{enumerate}
+\item The variable $mode$ dictates how the bits of the exponent are interpreted.  
+\begin{enumerate}
+   \item When $mode = 0$ the bits are ignored since no non-zero bit of the exponent has been seen yet.  For example, if the exponent were simply 
+         $1$ then there would be $lg(\beta) - 1$ zero bits before the first non-zero bit.  In this case bits are ignored until a non-zero bit is found.  
+   \item When $mode = 1$ a non-zero bit has been seen before and a new $winsize$-bit window has not been formed yet.  In this mode leading $0$ bits 
+         are read and a single squaring is performed.  If a non-zero bit is read a new window is created.  
+   \item When $mode = 2$ the algorithm is in the middle of forming a window and new bits are appended to the window from the most significant bit
+         downwards.
+\end{enumerate}
+\item The variable $bitcnt$ indicates how many bits of the current digit of the exponent are left to be read.  When it reaches zero a new digit
+      is fetched from the exponent.
+\item The variable $buf$ holds the currently read digit of the exponent. 
+\item The variable $digidx$ is an index into the exponents digits.  It starts at the leading digit $x.used - 1$ and moves towards the trailing digit.
+\item The variable $bitcpy$ indicates how many bits are in the currently formed window.  When it reaches $winsize$ the window is flushed and
+      the appropriate operations performed.
+\item The variable $bitbuf$ holds the current bits of the window being formed.  
+\end{enumerate}
+
+All of step 12 is the window processing loop.  It will iterate while there are digits available from the exponent to read.  The first step
+inside this loop is to extract a new digit if no more bits are available in the current digit.  If there are no bits left a new digit is
+read and if there are no digits left then the loop terminates.  
+
+After a digit is made available step 12.3 will extract the most significant bit of the current digit and move all other bits in the digit
+upwards.  In effect the digit is read from most significant bit to least significant bit and since the digits are read from leading to 
+trailing edges the entire exponent is read from most significant bit to least significant bit.
+
+At step 12.5 if the $mode$ and currently extracted bit $y$ are both zero the bit is ignored and the next bit is read.  This prevents the 
+algorithm from having to perform trivial squaring and reduction operations before the first non-zero bit is read.  Step 12.6 and 12.7-10 handle
+the two cases of $mode = 1$ and $mode = 2$ respectively.  
+
+FIGU,expt_state,Sliding Window State Diagram
+
+By step 13 there are no more digits left in the exponent.  However, there may be partial bits in the window left.  If $mode = 2$ then 
+a Left-to-Right algorithm is used to process the remaining few bits.  
+
+EXAM,bn_s_mp_exptmod.c
+
+Lines @26,if@ through @40,}@ determine the optimal window size based on the length of the exponent in bits.  The window divisions are sorted
+from smallest to greatest so that in each \textbf{if} statement only one condition must be tested.  For example, by the \textbf{if} statement 
+on line @32,if@ the value of $x$ is already known to be greater than $140$.  
+
+The conditional piece of code beginning on line @42,ifdef@ allows the window size to be restricted to five bits.  This logic is used to ensure
+the table of precomputed powers of $G$ remains relatively small.  
+
+The for loop on line @49,for@ initializes the $M$ array while lines @59,mp_init@ and @62,mp_reduce@ compute the value of $\mu$ required for
+Barrett reduction.  
+
+-- More later.
+
+\section{Quick Power of Two}
+Calculating $b = 2^a$ can be performed much quicker than with any of the previous algorithms.  Recall that a logical shift left $m << k$ is
+equivalent to $m \cdot 2^k$.  By this logic when $m = 1$ a quick power of two can be achieved.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_2expt}. \\
+\textbf{Input}.   integer $b$ \\
+\textbf{Output}.  $a \leftarrow 2^b$ \\
+\hline \\
+1.  $a \leftarrow 0$ \\
+2.  If $a.alloc < \lfloor b / lg(\beta) \rfloor + 1$ then grow $a$ appropriately. \\
+3.  $a.used \leftarrow \lfloor b / lg(\beta) \rfloor + 1$ \\
+4.  $a_{\lfloor b / lg(\beta) \rfloor} \leftarrow 1 << (b \mbox{ mod } lg(\beta))$ \\
+5.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_2expt}
+\end{figure}
+
+\textbf{Algorithm mp\_2expt.}
+
+EXAM,bn_mp_2expt.c
+
+\chapter{Higher Level Algorithms}
+
+This chapter discusses the various higher level algorithms that are required to complete a well rounded multiple precision integer package.  These
+routines are less performance oriented than the algorithms of chapters five, six and seven but are no less important.  
+
+The first section describes a method of integer division with remainder that is universally well known.  It provides the signed division logic
+for the package.  The subsequent section discusses a set of algorithms which allow a single digit to be the 2nd operand for a variety of operations.  
+These algorithms serve mostly to simplify other algorithms where small constants are required.  The last two sections discuss how to manipulate 
+various representations of integers.  For example, converting from an mp\_int to a string of characters.
+
+\section{Integer Division with Remainder}
+\label{sec:division}
+
+Integer division aside from modular exponentiation is the most intensive algorithm to compute.  Like addition, subtraction and multiplication
+the basis of this algorithm is the long-hand division algorithm taught to school children.  Throughout this discussion several common variables
+will be used.  Let $x$ represent the divisor and $y$ represent the dividend.  Let $q$ represent the integer quotient $\lfloor y / x \rfloor$ and 
+let $r$ represent the remainder $r = y - x \lfloor y / x \rfloor$.  The following simple algorithm will be used to start the discussion.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Radix-$\beta$ Integer Division}. \\
+\textbf{Input}.   integer $x$ and $y$ \\
+\textbf{Output}.  $q = \lfloor y/x\rfloor, r = y - xq$ \\
+\hline \\
+1.  $q \leftarrow 0$ \\
+2.  $n \leftarrow \vert \vert y \vert \vert - \vert \vert x \vert \vert$ \\
+3.  for $t$ from $n$ down to $0$ do \\
+\hspace{3mm}3.1  Maximize $k$ such that $kx\beta^t$ is less than or equal to $y$ and $(k + 1)x\beta^t$ is greater. \\
+\hspace{3mm}3.2  $q \leftarrow q + k\beta^t$ \\
+\hspace{3mm}3.3  $y \leftarrow y - kx\beta^t$ \\
+4.  $r \leftarrow y$ \\
+5.  Return($q, r$) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Radix-$\beta$ Integer Division}
+\label{fig:raddiv}
+\end{figure}
+
+As children we are taught this very simple algorithm for the case of $\beta = 10$.  Almost instinctively several optimizations are taught whose
+reasons for existing are never explained.  For this example let $y = 5471$ represent the dividend and $x = 23$ represent the divisor.
+
+To find the first digit of the quotient the value of $k$ must be maximized such that $kx\beta^t$ is less than or equal to $y$ and 
+simultaneously $(k + 1)x\beta^t$ is greater than $y$.  Implicitly $k$ is the maximum value the $t$'th digit of the quotient may have.  The habitual method
+used to find the maximum is to ``eyeball'' the two numbers, typically only the leading digits and quickly estimate a quotient.  By only using leading
+digits a much simpler division may be used to form an educated guess at what the value must be.  In this case $k = \lfloor 54/23\rfloor = 2$ quickly 
+arises as a possible  solution.  Indeed $2x\beta^2 = 4600$ is less than $y = 5471$ and simultaneously $(k + 1)x\beta^2 = 6900$ is larger than $y$.  
+As a  result $k\beta^2$ is added to the quotient which now equals $q = 200$ and $4600$ is subtracted from $y$ to give a remainder of $y = 871$.
+
+Again this process is repeated to produce the quotient digit $k = 3$ which makes the quotient $q = 200 + 3\beta = 230$ and the remainder 
+$y = 871 - 3x\beta = 181$.  Finally the last iteration of the loop produces $k = 7$ which leads to the quotient $q = 230 + 7 = 237$ and the
+remainder $y = 181 - 7x = 20$.  The final quotient and remainder found are $q = 237$ and $r = y = 20$ which are indeed correct since 
+$237 \cdot 23 + 20 = 5471$ is true.  
+
+\subsection{Quotient Estimation}
+\label{sec:divest}
+As alluded to earlier the quotient digit $k$ can be estimated from only the leading digits of both the divisor and dividend.  When $p$ leading
+digits are used from both the divisor and dividend to form an estimation the accuracy of the estimation rises as $p$ grows.  Technically
+speaking the estimation is based on assuming the lower $\vert \vert y \vert \vert - p$ and $\vert \vert x \vert \vert - p$ digits of the
+dividend and divisor are zero.  
+
+The value of the estimation may be off by a few values in either direction and in general is fairly correct.  A simplification \cite[pp. 271]{TAOCPV2}
+of the estimation technique is to use $t + 1$ digits of the dividend and $t$ digits of the divisor, in particular when $t = 1$.  The estimate 
+using this technique is never too small.  For the following proof let $t = \vert \vert y \vert \vert - 1$ and $s = \vert \vert x \vert \vert - 1$ 
+represent the most significant digits of the dividend and divisor respectively.
+
+\textbf{Proof.}\textit{  The quotient $\hat k = \lfloor (y_t\beta + y_{t-1}) / x_s \rfloor$ is greater than or equal to 
+$k = \lfloor y / (x \cdot \beta^{\vert \vert y \vert \vert - \vert \vert x \vert \vert - 1}) \rfloor$. }
+The first obvious case is when $\hat k = \beta - 1$ in which case the proof is concluded since the real quotient cannot be larger.  For all other 
+cases $\hat k = \lfloor (y_t\beta + y_{t-1}) / x_s \rfloor$ and $\hat k x_s \ge y_t\beta + y_{t-1} - x_s + 1$.  The latter portion of the inequality
+$-x_s + 1$ arises from the fact that a truncated integer division will give the same quotient for at most $x_s - 1$ values.  Next a series of 
+inequalities will prove the hypothesis.
+
+\begin{equation}
+y - \hat k x \le y - \hat k x_s\beta^s
+\end{equation}
+
+This is trivially true since $x \ge x_s\beta^s$.  Next we replace $\hat kx_s\beta^s$ by the previous inequality for $\hat kx_s$.  
+
+\begin{equation}
+y - \hat k x \le y_t\beta^t + \ldots + y_0 - (y_t\beta^t + y_{t-1}\beta^{t-1} - x_s\beta^s + \beta^s)
+\end{equation}
+
+By simplifying the previous inequality the following inequality is formed.
+
+\begin{equation}
+y - \hat k x \le y_{t-2}\beta^{t-2} + \ldots + y_0 + x_s\beta^s - \beta^s
+\end{equation}
+
+Subsequently,
+
+\begin{equation}
+y_{t-2}\beta^{t-2} + \ldots +  y_0  + x_s\beta^s - \beta^s < x_s\beta^s \le x
+\end{equation}
+
+Which proves that $y - \hat kx \le x$ and by consequence $\hat k \ge k$ which concludes the proof.  \textbf{QED}
+
+
+\subsection{Normalized Integers}
+For the purposes of division a normalized input is when the divisor's leading digit $x_n$ is greater than or equal to $\beta / 2$.  By multiplying both
+$x$ and $y$ by $j = \lfloor (\beta / 2) / x_n \rfloor$ the quotient remains unchanged and the remainder is simply $j$ times the original
+remainder.  The purpose of normalization is to ensure the leading digit of the divisor is sufficiently large such that the estimated quotient will
+lie in the domain of a single digit.  Consider the maximum dividend $(\beta - 1) \cdot \beta + (\beta - 1)$ and the minimum divisor $\beta / 2$.  
+
+\begin{equation} 
+{{\beta^2 - 1} \over { \beta / 2}} \le 2\beta - {2 \over \beta} 
+\end{equation}
+
+At most the quotient approaches $2\beta$, however, in practice this will not occur since that would imply the previous quotient digit was too small.  
+
+\subsection{Radix-$\beta$ Division with Remainder}
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div}. \\
+\textbf{Input}.   mp\_int $a, b$ \\
+\textbf{Output}.  $c = \lfloor a/b \rfloor$, $d = a - bc$ \\
+\hline \\
+1.  If $b = 0$ return(\textit{MP\_VAL}). \\
+2.  If $\vert a \vert < \vert b \vert$ then do \\
+\hspace{3mm}2.1  $d \leftarrow a$ \\
+\hspace{3mm}2.2  $c \leftarrow 0$ \\
+\hspace{3mm}2.3  Return(\textit{MP\_OKAY}). \\
+\\
+Setup the quotient to receive the digits. \\
+3.  Grow $q$ to $a.used + 2$ digits. \\
+4.  $q \leftarrow 0$ \\
+5.  $x \leftarrow \vert a \vert , y \leftarrow \vert b \vert$ \\
+6.  $sign \leftarrow  \left \lbrace \begin{array}{ll}
+                              MP\_ZPOS &  \mbox{if }a.sign = b.sign \\
+                              MP\_NEG  &  \mbox{otherwise} \\
+                              \end{array} \right .$ \\
+\\
+Normalize the inputs such that the leading digit of $y$ is greater than or equal to $\beta / 2$. \\
+7.  $norm \leftarrow (lg(\beta) - 1) - (\lceil lg(y) \rceil \mbox{ (mod }lg(\beta)\mbox{)})$ \\
+8.  $x \leftarrow x \cdot 2^{norm}, y \leftarrow y \cdot 2^{norm}$ \\
+\\
+Find the leading digit of the quotient. \\
+9.  $n \leftarrow x.used - 1, t \leftarrow y.used - 1$ \\
+10.  $y \leftarrow y \cdot \beta^{n - t}$ \\
+11.  While ($x \ge y$) do \\
+\hspace{3mm}11.1  $q_{n - t} \leftarrow q_{n - t} + 1$ \\
+\hspace{3mm}11.2  $x \leftarrow x - y$ \\
+12.  $y \leftarrow \lfloor y / \beta^{n-t} \rfloor$ \\
+\\
+Continued on the next page. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div} (continued). \\
+\textbf{Input}.   mp\_int $a, b$ \\
+\textbf{Output}.  $c = \lfloor a/b \rfloor$, $d = a - bc$ \\
+\hline \\
+Now find the remainder of the digits. \\
+13.  for $i$ from $n$ down to $(t + 1)$ do \\
+\hspace{3mm}13.1  If $i > x.used$ then jump to the next iteration of this loop. \\
+\hspace{3mm}13.2  If $x_{i} = y_{t}$ then \\
+\hspace{6mm}13.2.1  $q_{i - t - 1} \leftarrow \beta - 1$ \\
+\hspace{3mm}13.3  else \\
+\hspace{6mm}13.3.1  $\hat r \leftarrow x_{i} \cdot \beta + x_{i - 1}$ \\
+\hspace{6mm}13.3.2  $\hat r \leftarrow \lfloor \hat r / y_{t} \rfloor$ \\
+\hspace{6mm}13.3.3  $q_{i - t - 1} \leftarrow \hat r$ \\
+\hspace{3mm}13.4  $q_{i - t - 1} \leftarrow q_{i - t - 1} + 1$ \\
+\\
+Fixup quotient estimation. \\
+\hspace{3mm}13.5  Loop \\
+\hspace{6mm}13.5.1  $q_{i - t - 1} \leftarrow q_{i - t - 1} - 1$ \\
+\hspace{6mm}13.5.2  t$1 \leftarrow 0$ \\
+\hspace{6mm}13.5.3  t$1_0 \leftarrow y_{t - 1}, $ t$1_1 \leftarrow y_t,$ t$1.used \leftarrow 2$ \\
+\hspace{6mm}13.5.4  $t1 \leftarrow t1 \cdot q_{i - t - 1}$ \\
+\hspace{6mm}13.5.5  t$2_0 \leftarrow x_{i - 2}, $ t$2_1 \leftarrow x_{i - 1}, $ t$2_2 \leftarrow x_i, $ t$2.used \leftarrow 3$ \\
+\hspace{6mm}13.5.6  If $\vert t1 \vert > \vert t2 \vert$ then goto step 13.5. \\
+\hspace{3mm}13.6  t$1 \leftarrow y \cdot q_{i - t - 1}$ \\
+\hspace{3mm}13.7  t$1 \leftarrow $ t$1 \cdot \beta^{i - t - 1}$ \\
+\hspace{3mm}13.8  $x \leftarrow x - $ t$1$ \\
+\hspace{3mm}13.9  If $x.sign = MP\_NEG$ then \\
+\hspace{6mm}13.9.1  t$1 \leftarrow y$ \\
+\hspace{6mm}13.9.2  t$1 \leftarrow $ t$1 \cdot \beta^{i - t - 1}$ \\
+\hspace{6mm}13.9.3  $x \leftarrow x + $ t$1$ \\
+\hspace{6mm}13.9.4  $q_{i - t - 1} \leftarrow q_{i - t - 1} - 1$ \\
+\\
+Finalize the result. \\
+14.  Clamp excess digits of $q$ \\
+15.  $c \leftarrow q, c.sign \leftarrow sign$ \\
+16.  $x.sign \leftarrow a.sign$ \\
+17.  $d \leftarrow \lfloor x / 2^{norm} \rfloor$ \\
+18.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div (continued)}
+\end{figure}
+\textbf{Algorithm mp\_div.}
+This algorithm will calculate quotient and remainder from an integer division given a dividend and divisor.  The algorithm is a signed
+division and will produce a fully qualified quotient and remainder.
+
+First the divisor $b$ must be non-zero which is enforced in step one.  If the divisor is larger than the dividend then the quotient is implicitly 
+zero and the remainder is the dividend.  
+
+After the first two trivial cases of inputs are handled the variable $q$ is setup to receive the digits of the quotient.  Two unsigned copies of the
+divisor $y$ and dividend $x$ are made as well.  The core of the division algorithm is an unsigned division and will only work if the values are
+positive.  Now the two values $x$ and $y$ must be normalized such that the leading digit of $y$ is greater than or equal to $\beta / 2$.  
+This is performed by shifting both to the left by enough bits to get the desired normalization.  
+
+At this point the division algorithm can begin producing digits of the quotient.  Recall that the maximum value of the estimation used is 
+$2\beta - {2 \over \beta}$ which means that a digit of the quotient must be first produced by another means.  In this case $y$ is shifted
+to the left (\textit{step ten}) so that it has the same number of digits as $x$.  The loop on step eleven will subtract multiples of the 
+shifted copy of $y$ until $x$ is smaller.  Since the leading digit of $y$ is greater than or equal to $\beta/2$ this loop will iterate at most two
+times to produce the desired leading digit of the quotient.  
+
+Now the remainder of the digits can be produced.  The equation $\hat q = \lfloor {{x_i \beta + x_{i-1}}\over y_t} \rfloor$ is used to fairly
+accurately approximate the true quotient digit.  The estimation can in theory produce an estimation as high as $2\beta - {2 \over \beta}$ but by
+induction the upper quotient digit is correct (\textit{as established on step eleven}) and the estimate must be less than $\beta$.  
+
+Recall from section~\ref{sec:divest} that the estimation is never too low but may be too high.  The next step of the estimation process is
+to refine the estimation.  The loop on step 13.5 uses $x_i\beta^2 + x_{i-1}\beta + x_{i-2}$ and $q_{i - t - 1}(y_t\beta + y_{t-1})$ as a higher
+order approximation to adjust the quotient digit.
+
+After both phases of estimation the quotient digit may still be off by a value of one\footnote{This is similar to the error introduced
+by optimizing Barrett reduction.}.  Steps 13.6 through 13.8 subtract the multiple of the divisor from the dividend (\textit{similar to step 3.3 of
+algorithm~\ref{fig:raddiv}}) and step 13.9 subsequently adds a multiple of the divisor back if the quotient was too large.  
+
+Now that the quotient has been determined, finalizing the result is a matter of clamping the quotient, fixing the sizes and de-normalizing the 
+remainder.  An important aspect of this algorithm seemingly overlooked in other descriptions such as that of Algorithm 14.20 HAC \cite[pp. 598]{HAC}
+is that when the estimations are being made (\textit{inside the loop on step 13.5}) that the digits $y_{t-1}$, $x_{i-2}$ and $x_{i-1}$ may lie 
+outside their respective boundaries.  For example, if $t = 0$ or $i \le 1$ then the digits would be undefined.  In those cases the digits should
+respectively be replaced with a zero.  
+
+EXAM,bn_mp_div.c
+
+The implementation of this algorithm differs slightly from the pseudo code presented previously.  In this algorithm either of the quotient $c$ or
+remainder $d$ may be passed as a \textbf{NULL} pointer which indicates their value is not desired.  For example, the C code to call the division
+algorithm with only the quotient is 
+
+\begin{verbatim}
+mp_div(&a, &b, &c, NULL);  /* c = [a/b] */
+\end{verbatim}
+
+Lines @37,if@ and @42,if@ handle the two trivial cases of inputs which are division by zero and dividend smaller than the divisor 
+respectively.  After the two trivial cases all of the temporary variables are initialized.  Line @76,neg@ determines the sign of 
+the quotient and line @77,sign@ ensures that both $x$ and $y$ are positive.  
+
+The number of bits in the leading digit is calculated on line @80,norm@.  Implicitly an mp\_int with $r$ digits will require $lg(\beta)(r-1) + k$ bits
+of precision which when reduced modulo $lg(\beta)$ produces the value of $k$.  In this case $k$ is the number of bits in the leading digit which is
+exactly what is required.  For the algorithm to operate $k$ must equal $lg(\beta) - 1$ and when it does not the inputs must be normalized by shifting
+them to the left by $lg(\beta) - 1 - k$ bits.
+
+Throughout the variables $n$ and $t$ will represent the highest digit of $x$ and $y$ respectively.  These are first used to produce the 
+leading digit of the quotient.  The loop beginning on line @113,for@ will produce the remainder of the quotient digits.
+
+The conditional ``continue'' on line @114,if@ is used to prevent the algorithm from reading past the leading edge of $x$ which can occur when the
+algorithm eliminates multiple non-zero digits in a single iteration.  This ensures that $x_i$ is always non-zero since by definition the digits
+above the $i$'th position $x$ must be zero in order for the quotient to be precise\footnote{Precise as far as integer division is concerned.}.  
+
+Lines @142,t1@, @143,t1@ and @150,t2@ through @152,t2@ manually construct the high accuracy estimations by setting the digits of the two mp\_int 
+variables directly.  
+
+\section{Single Digit Helpers}
+
+This section briefly describes a series of single digit helper algorithms which come in handy when working with small constants.  All of 
+the helper functions assume the single digit input is positive and will treat them as such.
+
+\subsection{Single Digit Addition and Subtraction}
+
+Both addition and subtraction are performed by ``cheating'' and using mp\_set followed by the higher level addition or subtraction 
+algorithms.   As a result these algorithms are substantially simpler with a slight cost in performance.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_add\_d}. \\
+\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
+\textbf{Output}.  $c = a + b$ \\
+\hline \\
+1.  $t \leftarrow b$ (\textit{mp\_set}) \\
+2.  $c \leftarrow a + t$ \\
+3.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_add\_d}
+\end{figure}
+
+\textbf{Algorithm mp\_add\_d.}
+This algorithm initiates a temporary mp\_int with the value of the single digit and uses algorithm mp\_add to add the two values together.
+
+EXAM,bn_mp_add_d.c
+
+Clever use of the letter 't'.
+
+\subsubsection{Subtraction}
+The single digit subtraction algorithm mp\_sub\_d is essentially the same except it uses mp\_sub to subtract the digit from the mp\_int.
+
+\subsection{Single Digit Multiplication}
+Single digit multiplication arises enough in division and radix conversion that it ought to be implemented as a special case of the baseline
+multiplication algorithm.  Essentially this algorithm is a modified version of algorithm s\_mp\_mul\_digs where one of the multiplicands
+only has one digit.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul\_d}. \\
+\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
+\textbf{Output}.  $c = ab$ \\
+\hline \\
+1.  $pa \leftarrow a.used$ \\
+2.  Grow $c$ to at least $pa + 1$ digits. \\
+3.  $oldused \leftarrow c.used$ \\
+4.  $c.used \leftarrow pa + 1$ \\
+5.  $c.sign \leftarrow a.sign$ \\
+6.  $\mu \leftarrow 0$ \\
+7.  for $ix$ from $0$ to $pa - 1$ do \\
+\hspace{3mm}7.1  $\hat r \leftarrow \mu + a_{ix}b$ \\
+\hspace{3mm}7.2  $c_{ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}7.3  $\mu \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+8.  $c_{pa} \leftarrow \mu$ \\
+9.  for $ix$ from $pa + 1$ to $oldused$ do \\
+\hspace{3mm}9.1  $c_{ix} \leftarrow 0$ \\
+10.  Clamp excess digits of $c$. \\
+11.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul\_d}
+\end{figure}
+\textbf{Algorithm mp\_mul\_d.}
+This algorithm quickly multiplies an mp\_int by a small single digit value.  It is specially tailored to the job and has minimal overhead.  
+Unlike the full multiplication algorithms this algorithm does not require any significant temporary storage or memory allocations.  
+
+EXAM,bn_mp_mul_d.c
+
+In this implementation the destination $c$ may point to the same mp\_int as the source $a$ since the result is written after the digit is 
+read from the source.  This function uses pointer aliases $tmpa$ and $tmpc$ for the digits of $a$ and $c$ respectively.  
+
+\subsection{Single Digit Division}
+Like the single digit multiplication algorithm, single digit division is also a fairly common algorithm used in radix conversion.  Since the
+divisor is only a single digit a specialized variant of the division algorithm can be used to compute the quotient.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div\_d}. \\
+\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
+\textbf{Output}.  $c = \lfloor a / b \rfloor, d = a - cb$ \\
+\hline \\
+1.  If $b = 0$ then return(\textit{MP\_VAL}).\\
+2.  If $b = 3$ then use algorithm mp\_div\_3 instead. \\
+3.  Init $q$ to $a.used$ digits.  \\
+4.  $q.used \leftarrow a.used$ \\
+5.  $q.sign \leftarrow a.sign$ \\
+6.  $\hat w \leftarrow 0$ \\
+7.  for $ix$ from $a.used - 1$ down to $0$ do \\
+\hspace{3mm}7.1  $\hat w \leftarrow \hat w \beta + a_{ix}$ \\
+\hspace{3mm}7.2  If $\hat w \ge b$ then \\
+\hspace{6mm}7.2.1  $t \leftarrow \lfloor \hat w / b \rfloor$ \\
+\hspace{6mm}7.2.2  $\hat w \leftarrow \hat w \mbox{ (mod }b\mbox{)}$ \\
+\hspace{3mm}7.3  else\\
+\hspace{6mm}7.3.1  $t \leftarrow 0$ \\
+\hspace{3mm}7.4  $q_{ix} \leftarrow t$ \\
+8.  $d \leftarrow \hat w$ \\
+9.  Clamp excess digits of $q$. \\
+10.  $c \leftarrow q$ \\
+11.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div\_d}
+\end{figure}
+\textbf{Algorithm mp\_div\_d.}
+This algorithm divides the mp\_int $a$ by the single mp\_digit $b$ using an optimized approach.  Essentially in every iteration of the
+algorithm another digit of the dividend is reduced and another digit of quotient produced.  Provided $b < \beta$ the value of $\hat w$
+after step 7.1 will be limited such that $0 \le \lfloor \hat w / b \rfloor < \beta$.  
+
+If the divisor $b$ is equal to three a variant of this algorithm is used which is called mp\_div\_3.  It replaces the division by three with
+a multiplication by $\lfloor \beta / 3 \rfloor$ and the appropriate shift and residual fixup.  In essence it is much like the Barrett reduction
+from chapter seven.  
+
+EXAM,bn_mp_div_d.c
+
+Like the implementation of algorithm mp\_div this algorithm allows either of the quotient or remainder to be passed as a \textbf{NULL} pointer to
+indicate the respective value is not required.  This allows a trivial single digit modular reduction algorithm, mp\_mod\_d to be created.
+
+The division and remainder on lines @44,/@ and @45,%@ can often be replaced by a single division on most processors.  For example, the 32-bit x86 based 
+processors can divide a 64-bit quantity by a 32-bit quantity and produce the quotient and remainder simultaneously.  Unfortunately the GCC 
+compiler does not recognize that optimization and will actually produce two function calls to find the quotient and remainder respectively.  
+
+\subsection{Single Digit Root Extraction}
+
+Finding the $n$'th root of an integer is fairly easy as far as numerical analysis is concerned.  Algorithms such as the Newton-Raphson approximation 
+(\ref{eqn:newton}) series will converge very quickly to a root for any continuous function $f(x)$.  
+
+\begin{equation}
+x_{i+1} = x_i - {f(x_i) \over f'(x_i)}
+\label{eqn:newton}
+\end{equation}
+
+In this case the $n$'th root is desired and $f(x) = x^n - a$ where $a$ is the integer of which the root is desired.  The derivative of $f(x)$ is 
+simply $f'(x) = nx^{n - 1}$.  Of particular importance is that this algorithm will be used over the integers, not over a more continuous domain
+such as the real numbers.  As a result the root found can be above the true root by a few and must be manually adjusted.  Ideally at the end of the 
+algorithm the $n$'th root $b$ of an integer $a$ is desired such that $b^n \le a$.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_n\_root}. \\
+\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
+\textbf{Output}.  $c^b \le a$ \\
+\hline \\
+1.  If $b$ is even and $a.sign = MP\_NEG$ return(\textit{MP\_VAL}). \\
+2.  $sign \leftarrow a.sign$ \\
+3.  $a.sign \leftarrow MP\_ZPOS$ \\
+4.  t$2 \leftarrow 2$ \\
+5.  Loop \\
+\hspace{3mm}5.1  t$1 \leftarrow $ t$2$ \\
+\hspace{3mm}5.2  t$3 \leftarrow $ t$1^{b - 1}$ \\
+\hspace{3mm}5.3  t$2 \leftarrow $ t$3 $ $\cdot$ t$1$ \\
+\hspace{3mm}5.4  t$2 \leftarrow $ t$2 - a$ \\
+\hspace{3mm}5.5  t$3 \leftarrow $ t$3 \cdot b$ \\
+\hspace{3mm}5.6  t$3 \leftarrow \lfloor $t$2 / $t$3 \rfloor$ \\
+\hspace{3mm}5.7  t$2 \leftarrow $ t$1 - $ t$3$ \\
+\hspace{3mm}5.8  If t$1 \ne $ t$2$ then goto step 5.  \\
+6.  Loop \\
+\hspace{3mm}6.1  t$2 \leftarrow $ t$1^b$ \\
+\hspace{3mm}6.2  If t$2 > a$ then \\
+\hspace{6mm}6.2.1  t$1 \leftarrow $ t$1 - 1$ \\
+\hspace{6mm}6.2.2  Goto step 6. \\
+7.  $a.sign \leftarrow sign$ \\
+8.  $c \leftarrow $ t$1$ \\
+9.  $c.sign \leftarrow sign$  \\
+10.  Return(\textit{MP\_OKAY}).  \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_n\_root}
+\end{figure}
+\textbf{Algorithm mp\_n\_root.}
+This algorithm finds the integer $n$'th root of an input using the Newton-Raphson approach.  It is partially optimized based on the observation
+that the numerator of ${f(x) \over f'(x)}$ can be derived from a partial denominator.  That is at first the denominator is calculated by finding
+$x^{b - 1}$.  This value can then be multiplied by $x$ and have $a$ subtracted from it to find the numerator.  This saves a total of $b - 1$ 
+multiplications by t$1$ inside the loop.  
+
+The initial value of the approximation is t$2 = 2$ which allows the algorithm to start with very small values and quickly converge on the
+root.  Ideally this algorithm is meant to find the $n$'th root of an input where $n$ is bounded by $2 \le n \le 5$.  
+
+EXAM,bn_mp_n_root.c
+
+\section{Random Number Generation}
+
+Random numbers come up in a variety of activities from public key cryptography to simple simulations and various randomized algorithms.  Pollard-Rho 
+factoring for example, can make use of random values as starting points to find factors of a composite integer.  In this case the algorithm presented
+is solely for simulations and not intended for cryptographic use.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_rand}. \\
+\textbf{Input}.   An integer $b$ \\
+\textbf{Output}.  A pseudo-random number of $b$ digits \\
+\hline \\
+1.  $a \leftarrow 0$ \\
+2.  If $b \le 0$ return(\textit{MP\_OKAY}) \\
+3.  Pick a non-zero random digit $d$. \\
+4.  $a \leftarrow a + d$ \\
+5.  for $ix$ from 1 to $b - 1$ do \\
+\hspace{3mm}5.1  $a \leftarrow a \cdot \beta$ \\
+\hspace{3mm}5.2  Pick a random digit $d$. \\
+\hspace{3mm}5.3  $a \leftarrow a + d$ \\
+6.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_rand}
+\end{figure}
+\textbf{Algorithm mp\_rand.}
+This algorithm produces a pseudo-random integer of $b$ digits.  By ensuring that the first digit is non-zero the algorithm also guarantees that the
+final result has at least $b$ digits.  It relies heavily on a third-party random number generator which should ideally generate uniformly all of
+the integers from $0$ to $\beta - 1$.  
+
+EXAM,bn_mp_rand.c
+
+\section{Formatted Representations}
+The ability to emit a radix-$n$ textual representation of an integer is useful for interacting with human parties.  For example, the ability to
+be given a string of characters such as ``114585'' and turn it into the radix-$\beta$ equivalent would make it easier to enter numbers
+into a program.
+
+\subsection{Reading Radix-n Input}
+For the purposes of this text we will assume that a simple lower ASCII map (\ref{fig:ASC}) is used to map the values from $0$ to $63$ to 
+printable characters.  For example, when the character ``N'' is read it represents the integer $23$.  The first $16$ characters of the
+map are for the common representations up to hexadecimal.  After that they match the ``base64'' encoding scheme, whose characters are suitably chosen
+such that they are printable.  While outputting as base64 may not be too helpful for human operators it does allow communication via non binary
+mediums.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{cc|cc|cc|cc}
+\hline \textbf{Value} & \textbf{Char} & \textbf{Value} & \textbf{Char} & \textbf{Value} & \textbf{Char} &  \textbf{Value} & \textbf{Char} \\
+\hline 
+0 & 0 & 1 & 1 & 2 & 2 & 3 & 3 \\
+4 & 4 & 5 & 5 & 6 & 6 & 7 & 7 \\
+8 & 8 & 9 & 9 & 10 & A & 11 & B \\
+12 & C & 13 & D & 14 & E & 15 & F \\
+16 & G & 17 & H & 18 & I & 19 & J \\
+20 & K & 21 & L & 22 & M & 23 & N \\
+24 & O & 25 & P & 26 & Q & 27 & R \\
+28 & S & 29 & T & 30 & U & 31 & V \\
+32 & W & 33 & X & 34 & Y & 35 & Z \\
+36 & a & 37 & b & 38 & c & 39 & d \\
+40 & e & 41 & f & 42 & g & 43 & h \\
+44 & i & 45 & j & 46 & k & 47 & l \\
+48 & m & 49 & n & 50 & o & 51 & p \\
+52 & q & 53 & r & 54 & s & 55 & t \\
+56 & u & 57 & v & 58 & w & 59 & x \\
+60 & y & 61 & z & 62 & $+$ & 63 & $/$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Lower ASCII Map}
+\label{fig:ASC}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_read\_radix}. \\
+\textbf{Input}.   A string $str$ of length $sn$ and radix $r$. \\
+\textbf{Output}.  The radix-$\beta$ equivalent mp\_int. \\
+\hline \\
+1.  If $r < 2$ or $r > 64$ return(\textit{MP\_VAL}). \\
+2.  $ix \leftarrow 0$ \\
+3.  If $str_0 =$ ``-'' then do \\
+\hspace{3mm}3.1  $ix \leftarrow ix + 1$ \\
+\hspace{3mm}3.2  $sign \leftarrow MP\_NEG$ \\
+4.  else \\
+\hspace{3mm}4.1  $sign \leftarrow MP\_ZPOS$ \\
+5.  $a \leftarrow 0$ \\
+6.  for $iy$ from $ix$ to $sn - 1$ do \\
+\hspace{3mm}6.1  Let $y$ denote the position in the map of $str_{iy}$. \\
+\hspace{3mm}6.2  If $str_{iy}$ is not in the map or $y \ge r$ then goto step 7. \\
+\hspace{3mm}6.3  $a \leftarrow a \cdot r$ \\
+\hspace{3mm}6.4  $a \leftarrow a + y$ \\
+7.  If $a \ne 0$ then $a.sign \leftarrow sign$ \\
+8.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_read\_radix}
+\end{figure}
+\textbf{Algorithm mp\_read\_radix.}
+This algorithm will read an ASCII string and produce the radix-$\beta$ mp\_int representation of the same integer.  A minus symbol ``-'' may precede the 
+string  to indicate the value is negative, otherwise it is assumed to be positive.  The algorithm will read up to $sn$ characters from the input
+and will stop early as soon as it reads a character it cannot map.  This allows numbers to be embedded
+as part of larger input without any significant problem.
+
+EXAM,bn_mp_read_radix.c
+
+\subsection{Generating Radix-$n$ Output}
+Generating radix-$n$ output is fairly trivial with a division and remainder algorithm.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_toradix}. \\
+\textbf{Input}.   A mp\_int $a$ and an integer $r$\\
+\textbf{Output}.  The radix-$r$ representation of $a$ \\
+\hline \\
+1.  If $r < 2$ or $r > 64$ return(\textit{MP\_VAL}). \\
+2.  If $a = 0$ then $str = $ ``$0$'' and return(\textit{MP\_OKAY}).  \\
+3.  $t \leftarrow a$ \\
+4.  $str \leftarrow$ ``'' \\
+5.  if $t.sign = MP\_NEG$ then \\
+\hspace{3mm}5.1  $str \leftarrow str + $ ``-'' \\
+\hspace{3mm}5.2  $t.sign = MP\_ZPOS$ \\
+6.  While ($t \ne 0$) do \\
+\hspace{3mm}6.1  $d \leftarrow t \mbox{ (mod }r\mbox{)}$ \\
+\hspace{3mm}6.2  $t \leftarrow \lfloor t / r \rfloor$ \\
+\hspace{3mm}6.3  Look up $d$ in the map and store the equivalent character in $y$. \\
+\hspace{3mm}6.4  $str \leftarrow str + y$ \\
+7.  If $str_0 = $``$-$'' then \\
+\hspace{3mm}7.1  Reverse the digits $str_1, str_2, \ldots str_n$. \\
+8.  Otherwise \\
+\hspace{3mm}8.1  Reverse the digits $str_0, str_1, \ldots str_n$. \\
+9.  Return(\textit{MP\_OKAY}).\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_toradix}
+\end{figure}
+\textbf{Algorithm mp\_toradix.}
+This algorithm computes the radix-$r$ representation of an mp\_int $a$.  The ``digits'' of the representation are extracted by reducing 
+the successive quotients $\lfloor a / r^k \rfloor$ of the input modulo $r$ until $r^k > a$.  Note that instead of actually dividing by $r^k$ in
+each iteration the quotient $\lfloor a / r \rfloor$ is saved for the next iteration.  As a result a series of trivial $n \times 1$ divisions
+are required instead of a series of $n \times k$ divisions.  One design flaw of this approach is that the digits are produced in the reverse order 
+(see~\ref{fig:mpradix}).  To remedy this flaw the digits must be swapped or simply ``reversed''.
+
+\begin{figure}
+\begin{center}
+\begin{tabular}{|c|c|c|}
+\hline \textbf{Value of $a$} & \textbf{Value of $d$} & \textbf{Value of $str$} \\
+\hline $1234$ & -- & -- \\
+\hline $123$  & $4$ & ``4'' \\
+\hline $12$   & $3$ & ``43'' \\
+\hline $1$    & $2$ & ``432'' \\
+\hline $0$    & $1$ & ``4321'' \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Example of Algorithm mp\_toradix.}
+\label{fig:mpradix}
+\end{figure}
+
+EXAM,bn_mp_toradix.c
+
+\chapter{Number Theoretic Algorithms}
+This chapter discusses several fundamental number theoretic algorithms such as the greatest common divisor, least common multiple and Jacobi 
+symbol computation.  These algorithms arise as essential components in several key cryptographic algorithms such as the RSA public key algorithm and
+various Sieve based factoring algorithms.
+
+\section{Greatest Common Divisor}
+The greatest common divisor of two integers $a$ and $b$, often denoted as $(a, b)$ is the largest integer $k$ that is a proper divisor of
+both $a$ and $b$.  That is, $k$ is the largest integer such that $0 \equiv a \mbox{ (mod }k\mbox{)}$ and $0 \equiv b \mbox{ (mod }k\mbox{)}$ occur
+simultaneously.
+
+The most common approach (cite) is to reduce one input modulo another.  That is if $a$ and $b$ are divisible by some integer $k$ and if $qa + r = b$ then
+$r$ is also divisible by $k$.  The reduction pattern follows $\left < a , b \right > \rightarrow \left < b, a \mbox{ mod } b \right >$.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Greatest Common Divisor (I)}. \\
+\textbf{Input}.   Two positive integers $a$ and $b$ greater than zero. \\
+\textbf{Output}.  The greatest common divisor $(a, b)$.  \\
+\hline \\
+1.  While ($b > 0$) do \\
+\hspace{3mm}1.1  $r \leftarrow a \mbox{ (mod }b\mbox{)}$ \\
+\hspace{3mm}1.2  $a \leftarrow b$ \\
+\hspace{3mm}1.3  $b \leftarrow r$ \\
+2.  Return($a$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Greatest Common Divisor (I)}
+\label{fig:gcd1}
+\end{figure}
+
+This algorithm will quickly converge on the greatest common divisor since the residue $r$ tends to diminish rapidly.  However, divisions are
+relatively expensive operations to perform and should ideally be avoided.  There is another approach based on a similar relationship of 
+greatest common divisors.  The faster approach is based on the observation that if $k$ divides both $a$ and $b$ it will also divide $a - b$.  
+In particular, we would like $a - b$ to decrease in magnitude which implies that $b \ge a$.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Greatest Common Divisor (II)}. \\
+\textbf{Input}.   Two positive integers $a$ and $b$ greater than zero. \\
+\textbf{Output}.  The greatest common divisor $(a, b)$.  \\
+\hline \\
+1.  While ($b > 0$) do \\
+\hspace{3mm}1.1  Swap $a$ and $b$ such that $a$ is the smallest of the two. \\
+\hspace{3mm}1.2  $b \leftarrow b - a$ \\
+2.  Return($a$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Greatest Common Divisor (II)}
+\label{fig:gcd2}
+\end{figure}
+
+\textbf{Proof} \textit{Algorithm~\ref{fig:gcd2} will return the greatest common divisor of $a$ and $b$.}
+The algorithm in figure~\ref{fig:gcd2} will eventually terminate: since $b \ge a$, the subtraction in step 1.2 will produce a value less than $b$.  In other
+words in every iteration the tuple $\left < a, b \right >$ decreases in magnitude until eventually $a = b$.  Since both $a$ and $b$ are always 
+divisible by the greatest common divisor (\textit{until the last iteration}) and in the last iteration of the algorithm $b = 0$, therefore, in the 
+second to last iteration of the algorithm $b = a$ and clearly $(a, a) = a$ which concludes the proof.  \textbf{QED}.
+
+As a matter of practicality algorithm \ref{fig:gcd2} decreases far too slowly to be useful, especially if $b$ is much larger than $a$ such that 
+$b - a$ is still very much larger than $a$.  A simple addition to the algorithm is to divide $b - a$ by a power of some integer $p$ which does
+not divide the greatest common divisor but will divide $b - a$.  In this case ${b - a} \over p$ is also an integer and still divisible by
+the greatest common divisor.
+
+However, instead of factoring $b - a$ to find a suitable value of $p$ the powers of $p$ that $a$ and $b$ have in common can be removed first.  
+Then inside the loop whenever $b - a$ is divisible by some power of $p$ it can be safely removed.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Greatest Common Divisor (III)}. \\
+\textbf{Input}.   Two positive integers $a$ and $b$ greater than zero. \\
+\textbf{Output}.  The greatest common divisor $(a, b)$.  \\
+\hline \\
+1.  $k \leftarrow 0$ \\
+2.  While $a$ and $b$ are both divisible by $p$ do \\
+\hspace{3mm}2.1  $a \leftarrow \lfloor a / p \rfloor$ \\
+\hspace{3mm}2.2  $b \leftarrow \lfloor b / p \rfloor$ \\
+\hspace{3mm}2.3  $k \leftarrow k + 1$ \\
+3.  While $a$ is divisible by $p$ do \\
+\hspace{3mm}3.1  $a \leftarrow \lfloor a / p \rfloor$ \\
+4.  While $b$ is divisible by $p$ do \\
+\hspace{3mm}4.1  $b \leftarrow \lfloor b / p \rfloor$ \\
+5.  While ($b > 0$) do \\
+\hspace{3mm}5.1  Swap $a$ and $b$ such that $a$ is the smallest of the two. \\
+\hspace{3mm}5.2  $b \leftarrow b - a$ \\
+\hspace{3mm}5.3  While $b$ is divisible by $p$ do \\
+\hspace{6mm}5.3.1  $b \leftarrow \lfloor b / p \rfloor$ \\
+6.  Return($a \cdot p^k$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Greatest Common Divisor (III)}
+\label{fig:gcd3}
+\end{figure}
+
+This algorithm is based on the previous one (figure~\ref{fig:gcd2}) except it removes powers of $p$ first and inside the main loop to ensure the tuple $\left < a, b \right >$ 
+decreases more rapidly.  The first loop on step two removes powers of $p$ that are in common.  A count, $k$, is kept which will represent a common
+divisor of $p^k$.  After step two the remaining common divisor of $a$ and $b$ cannot be divisible by $p$.  This means that $p$ can be safely 
+divided out of the difference $b - a$ so long as the division leaves no remainder.  
+
+In particular the value of $p$ should be chosen such that the division on step 5.3.1 occurs often.  It also helps that division by $p$ be easy
+to compute.  The ideal choice of $p$ is two since division by two amounts to a right logical shift.  Another important observation is that by
+step five both $a$ and $b$ are odd.  Therefore, the difference $b - a$ must be even which means that each iteration removes at least one bit from the 
+largest of the pair.
+
+\subsection{Complete Greatest Common Divisor}
+The algorithms presented so far cannot handle inputs which are zero or negative.  The following algorithm can handle all input cases properly
+and will produce the greatest common divisor.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_gcd}. \\
+\textbf{Input}.   mp\_int $a$ and $b$ \\
+\textbf{Output}.  The greatest common divisor $c = (a, b)$.  \\
+\hline \\
+1.  If $a = 0$ and $b \ne 0$ then \\
+\hspace{3mm}1.1  $c \leftarrow b$ \\
+\hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
+2.  If $a \ne 0$ and $b = 0$ then \\
+\hspace{3mm}2.1  $c \leftarrow a$ \\
+\hspace{3mm}2.2  Return(\textit{MP\_OKAY}). \\
+3.  If $a = b = 0$ then \\
+\hspace{3mm}3.1  $c \leftarrow 0$ \\
+\hspace{3mm}3.2  Return(\textit{MP\_OKAY}). \\
+4.  $u \leftarrow \vert a \vert, v \leftarrow \vert b \vert$ \\
+5.  $k \leftarrow 0$ \\
+6.  While $u.used > 0$ and $v.used > 0$ and $u_0 \equiv v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}6.1  $k \leftarrow k + 1$ \\
+\hspace{3mm}6.2  $u \leftarrow \lfloor u / 2 \rfloor$ \\
+\hspace{3mm}6.3  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+7.  While $u.used > 0$ and $u_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}7.1  $u \leftarrow \lfloor u / 2 \rfloor$ \\
+8.  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}8.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+9.  While $v.used > 0$ \\
+\hspace{3mm}9.1  If $\vert u \vert > \vert v \vert$ then \\
+\hspace{6mm}9.1.1  Swap $u$ and $v$. \\
+\hspace{3mm}9.2  $v \leftarrow \vert v \vert - \vert u \vert$ \\
+\hspace{3mm}9.3  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{6mm}9.3.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+10.  $c \leftarrow u \cdot 2^k$ \\
+11.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_gcd}
+\end{figure}
+\textbf{Algorithm mp\_gcd.}
+This algorithm will produce the greatest common divisor of two mp\_ints $a$ and $b$.  The algorithm was originally based on Algorithm B of
+Knuth \cite[pp. 338]{TAOCPV2} but has been modified to be simpler to explain.  In theory it achieves the same asymptotic working time as
+Algorithm B and in practice this appears to be true.  
+
+The first three steps handle the cases where either one of or both inputs are zero.  If either input is zero the greatest common divisor is the 
+largest input or zero if they are both zero.  If the inputs are not trivial then $u$ and $v$ are assigned the absolute values of 
+$a$ and $b$ respectively and the algorithm will proceed to reduce the pair.
+
+Step six will divide out any common factors of two and keep track of the count in the variable $k$.  After this step two is no longer a
+factor of the remaining greatest common divisor between $u$ and $v$ and can be safely evenly divided out of either whenever they are even.  Steps 
+seven and eight ensure that $u$ and $v$ respectively have no more factors of two.  At most only one of the while loops will iterate since 
+they cannot both be even.
+
+By step nine both of $u$ and $v$ are odd which is required for the inner logic.  First the pair are swapped such that $v$ is equal to
+or greater than $u$.  This ensures that the subtraction on step 9.2 will always produce a non-negative and even result.  Step 9.3 removes any
+factors of two from the difference $v$ to ensure that in the next iteration of the loop both are once again odd.
+
+After $v = 0$ occurs the variable $u$ has the greatest common divisor of the pair $\left < u, v \right >$ just after step six.  The result
+must be adjusted by multiplying by the common factors of two ($2^k$) removed earlier.  
+
+EXAM,bn_mp_gcd.c
+
+This function makes use of the macros mp\_iszero and mp\_iseven.  The former evaluates to $1$ if the input mp\_int is equivalent to the 
+integer zero otherwise it evaluates to $0$.  The latter evaluates to $1$ if the input mp\_int represents a non-zero even integer otherwise
+it evaluates to $0$.  Note that just because mp\_iseven may evaluate to $0$ does not mean the input is odd, it could also be zero.  The three 
+trivial cases of inputs are handled on lines @25,zero@ through @34,}@.  After those lines the inputs are assumed to be non-zero.
+
+Lines @36,if@ and @40,if@ make local copies $u$ and $v$ of the inputs $a$ and $b$ respectively.  At this point the common factors of two 
+must be divided out of the two inputs.  The while loop on line @49,while@ iterates so long as both are even.  The local integer $k$ is used to
+keep track of how many factors of $2$ are pulled out of both values.  It is assumed that the number of factors will not exceed the maximum 
+value of a C ``int'' data type\footnote{Strictly speaking no array in C may have more entries than are accessible by an ``int'' so this is not 
+a limitation.}.  
+
+At this point there are no more common factors of two in the two values.  The while loops on lines @60,while@ and @65,while@ remove any independent
+factors of two such that both $u$ and $v$ are guaranteed to be an odd integer before hitting the main body of the algorithm.  The while loop
+on line @71, while@ performs the reduction of the pair until $v$ is equal to zero.  The unsigned comparison and subtraction algorithms are used in
+place of the full signed routines since both values are guaranteed to be positive and the result of the subtraction is guaranteed to be non-negative.
+
+\section{Least Common Multiple}
+The least common multiple of a pair of integers is their product divided by their greatest common divisor.  For two integers $a$ and $b$ the
+least common multiple is normally denoted as $[ a, b ]$ and numerically equivalent to ${ab} \over {(a, b)}$.  For example, if $a = 2 \cdot 2 \cdot 3 = 12$
+and $b = 2 \cdot 3 \cdot 3 \cdot 7 = 126$ the least common multiple is ${{12 \cdot 126} \over {(12, 126)}} = {1512 \over 6} = 252$.
+
+The least common multiple arises often in coding theory as well as number theory.  If two functions have periods of $a$ and $b$ respectively they will
+collide, that is be in synchronous states, after only $[ a, b ]$ iterations.  This is why, for example, random number generators based on 
+Linear Feedback Shift Registers (LFSR) tend to use registers with periods which are co-prime (\textit{i.e. the greatest common divisor is one}).  
+Similarly in number theory if a composite $n$ has two prime factors $p$ and $q$ then the maximal order of any unit of $\Z/n\Z$ will be $[ p - 1, q - 1] $.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_lcm}. \\
+\textbf{Input}.   mp\_int $a$ and $b$ \\
+\textbf{Output}.  The least common multiple $c = [a, b]$.  \\
+\hline \\
+1.  $c \leftarrow (a, b)$ \\
+2.  $t \leftarrow a \cdot b$ \\
+3.  $c \leftarrow \lfloor t / c \rfloor$ \\
+4.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_lcm}
+\end{figure}
+\textbf{Algorithm mp\_lcm.}
+This algorithm computes the least common multiple of two mp\_int inputs $a$ and $b$.  It computes the least common multiple directly by
+dividing the product of the two inputs by their greatest common divisor.
+
+EXAM,bn_mp_lcm.c
+
+\section{Jacobi Symbol Computation}
+To explain the Jacobi Symbol we shall first discuss the Legendre function\footnote{Arrg.  What is the name of this?} upon which the Jacobi symbol is 
+defined.  The Legendre function computes whether or not an integer $a$ is a quadratic residue modulo an odd prime $p$.  Numerically it is
+equivalent to equation \ref{eqn:legendre}.
+
+\begin{equation}
+a^{(p-1)/2} \equiv \begin{array}{rl}
+                              -1 &  \mbox{if }a\mbox{ is a quadratic non-residue.} \\
+                              0  &  \mbox{if }p\mbox{ divides }a\mbox{.} \\
+                              1  &  \mbox{if }a\mbox{ is a quadratic residue}. 
+                              \end{array} \mbox{ (mod }p\mbox{)}
+\label{eqn:legendre}                              
+\end{equation}
+
+\textbf{Proof.} \textit{Equation \ref{eqn:legendre} correctly identifies the residue status of an integer $a$ modulo a prime $p$.}
+An integer $a$ is a quadratic residue if the following equation has a solution.
+
+\begin{equation}
+x^2 \equiv a \mbox{ (mod }p\mbox{)}
+\label{eqn:root}
+\end{equation}
+
+Consider the following equation.
+
+\begin{equation}
+0 \equiv x^{p-1} - 1 \equiv \left \lbrace \left (x^2 \right )^{(p-1)/2} - a^{(p-1)/2} \right \rbrace + \left ( a^{(p-1)/2} - 1 \right ) \mbox{ (mod }p\mbox{)}
+\label{eqn:rooti}
+\end{equation}
+
+Whether equation \ref{eqn:root} has a solution or not equation \ref{eqn:rooti} is always true.  If $a^{(p-1)/2} - 1 \equiv 0 \mbox{ (mod }p\mbox{)}$
+then the quantity in the braces must be zero.  By reduction,
+
+\begin{eqnarray}
+\left (x^2 \right )^{(p-1)/2} - a^{(p-1)/2} \equiv 0  \nonumber \\
+\left (x^2 \right )^{(p-1)/2} \equiv a^{(p-1)/2} \nonumber \\
+x^2 \equiv a \mbox{ (mod }p\mbox{)} 
+\end{eqnarray}
+
+As a result there must be a solution to the quadratic equation and in turn $a$ must be a quadratic residue.  If $p$ does not divide $a$ and $a$
+is not a quadratic residue then the only other value $a^{(p-1)/2}$ may be congruent to is $-1$ since
+\begin{equation}
+0 \equiv a^{p - 1} - 1 \equiv (a^{(p-1)/2} + 1)(a^{(p-1)/2} - 1) \mbox{ (mod }p\mbox{)}
+\end{equation}
+One of the terms on the right hand side must be zero.  \textbf{QED}
+
+\subsection{Jacobi Symbol}
+The Jacobi symbol is a generalization of the Legendre function to any odd, not necessarily prime, modulus $p$ greater than 2.  If $p = \prod_{i=0}^n p_i$ then
+the Jacobi symbol $\left ( { a \over p } \right )$ is equal to the following equation.
+
+\begin{equation}
+\left ( { a \over p } \right ) = \left ( { a \over p_0} \right ) \left ( { a \over p_1} \right ) \ldots \left ( { a \over p_n} \right )
+\end{equation}
+
+By inspection if $p$ is prime the Jacobi symbol is equivalent to the Legendre function.  The following facts\footnote{See HAC \cite[pp. 72-74]{HAC} for
+further details.} will be used to derive an efficient Jacobi symbol algorithm.  Where $p$ is an odd integer greater than two and $a, b \in \Z$ the
+following are true.  
+
+\begin{enumerate}
+\item $\left ( { a \over p} \right )$ equals $-1$, $0$ or $1$. 
+\item $\left ( { ab \over p} \right ) = \left ( { a \over p} \right )\left ( { b \over p} \right )$.
+\item If $a \equiv b \mbox{ (mod }p\mbox{)}$ then $\left ( { a \over p} \right ) = \left ( { b \over p} \right )$.
+\item $\left ( { 2 \over p} \right )$ equals $1$ if $p \equiv 1$ or $7 \mbox{ (mod }8\mbox{)}$.  Otherwise, it equals $-1$.
+\item $\left ( { a \over p} \right ) \equiv \left ( { p \over a} \right ) \cdot (-1)^{(p-1)(a-1)/4}$.  More specifically 
+$\left ( { a \over p} \right ) = \left ( { p \over a} \right )$ if $p \equiv a \equiv 1 \mbox{ (mod }4\mbox{)}$.  
+\end{enumerate}
+
+Using these facts if $a = 2^k \cdot a'$ then
+
+\begin{eqnarray}
+\left ( { a \over p } \right ) = \left ( {{2^k} \over p } \right ) \left ( {a' \over p} \right ) \nonumber \\
+                               = \left ( {2 \over p } \right )^k \left ( {a' \over p} \right ) 
+\label{eqn:jacobi}
+\end{eqnarray}
+
+By fact five, 
+
+\begin{equation}
+\left ( { a \over p } \right ) = \left ( { p \over a } \right ) \cdot (-1)^{(p-1)(a-1)/4} 
+\end{equation}
+
+Subsequently by fact three since $p \equiv (p \mbox{ mod }a) \mbox{ (mod }a\mbox{)}$ then 
+
+\begin{equation}
+\left ( { a \over p } \right ) = \left ( { {p \mbox{ mod } a} \over a } \right ) \cdot (-1)^{(p-1)(a-1)/4} 
+\end{equation}
+
+By putting both observations into equation \ref{eqn:jacobi} the following simplified equation is formed.
+
+\begin{equation}
+\left ( { a \over p } \right ) = \left ( {2 \over p } \right )^k \left ( {{p\mbox{ mod }a'} \over a'} \right )  \cdot (-1)^{(p-1)(a'-1)/4} 
+\end{equation}
+
+The value of $\left ( {{p \mbox{ mod }a'} \over a'} \right )$ can be found by using the same equation recursively.  The value of 
+$\left ( {2 \over p } \right )^k$ equals $1$ if $k$ is even otherwise it equals $\left ( {2 \over p } \right )$.  Using this approach the 
+factors of $p$ do not have to be known.  Furthermore, if $(a, p) = 1$ then the algorithm will terminate when the recursion requests the 
+Jacobi symbol computation of $\left ( {1 \over a'} \right )$ which is simply $1$.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_jacobi}. \\
+\textbf{Input}.   mp\_int $a$ and $p$, $a \ge 0$, $p \ge 3$, $p \equiv 1 \mbox{ (mod }2\mbox{)}$ \\
+\textbf{Output}.  The Jacobi symbol $c = \left ( {a \over p } \right )$. \\
+\hline \\
+1.  If $a = 0$ then \\
+\hspace{3mm}1.1  $c \leftarrow 0$ \\
+\hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
+2.  If $a = 1$ then \\
+\hspace{3mm}2.1  $c \leftarrow 1$ \\
+\hspace{3mm}2.2  Return(\textit{MP\_OKAY}). \\
+3.  $a' \leftarrow a$ \\
+4.  $k \leftarrow 0$ \\
+5.  While $a'.used > 0$ and $a'_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}5.1  $k \leftarrow k + 1$ \\
+\hspace{3mm}5.2  $a' \leftarrow \lfloor a' / 2 \rfloor$ \\
+6.  If $k \equiv 0 \mbox{ (mod }2\mbox{)}$ then \\
+\hspace{3mm}6.1  $s \leftarrow 1$ \\
+7.  else \\
+\hspace{3mm}7.1  $r \leftarrow p_0 \mbox{ (mod }8\mbox{)}$ \\
+\hspace{3mm}7.2  If $r = 1$ or $r = 7$ then \\
+\hspace{6mm}7.2.1  $s \leftarrow 1$ \\
+\hspace{3mm}7.3  else \\
+\hspace{6mm}7.3.1  $s \leftarrow -1$ \\
+8.  If $p_0 \equiv a'_0 \equiv 3 \mbox{ (mod }4\mbox{)}$ then \\
+\hspace{3mm}8.1  $s \leftarrow -s$ \\
+9.  If $a' \ne 1$ then \\
+\hspace{3mm}9.1  $p' \leftarrow p \mbox{ (mod }a'\mbox{)}$ \\
+\hspace{3mm}9.2  $s \leftarrow s \cdot \mbox{mp\_jacobi}(p', a')$ \\
+10.  $c \leftarrow s$ \\
+11.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_jacobi}
+\end{figure}
+\textbf{Algorithm mp\_jacobi.}
+This algorithm computes the Jacobi symbol for an arbitrary non-negative integer $a$ with respect to an odd integer $p$ greater than or equal to three.  The algorithm
+is based on algorithm 2.149 of HAC \cite[pp. 73]{HAC}.  
+
+Step numbers one and two handle the trivial cases of $a = 0$ and $a = 1$ respectively.  Step five determines the number of two factors in the
+input $a$.  If $k$ is even then the term $\left ( { 2 \over p } \right )^k$ must always evaluate to one.  If $k$ is odd then the term evaluates to one 
+if $p_0$ is congruent to one or seven modulo eight, otherwise it evaluates to $-1$. After the $\left ( { 2 \over p } \right )^k$ term is handled 
+the $(-1)^{(p-1)(a'-1)/4}$ term is computed and multiplied against the current product $s$.  The latter term evaluates to negative one if both $p$ and $a'$ 
+are congruent to three modulo four, otherwise it evaluates to one.
+
+By step nine if $a'$ does not equal one a recursion is required.  Step 9.1 computes $p' \equiv p \mbox{ (mod }a'\mbox{)}$ and will recurse to compute
+$\left ( {p' \over a'} \right )$ which is multiplied against the current Jacobi product.
+
+EXAM,bn_mp_jacobi.c
+
+As a matter of practicality the variable $a'$ as per the pseudo-code is represented by the variable $a1$ since the $'$ symbol is not valid for a C 
+variable name character. 
+
+The two simple cases of $a = 0$ and $a = 1$ are handled at the very beginning to simplify the algorithm.  If the input is non-trivial the algorithm
+has to proceed to compute the Jacobi symbol.  The variable $s$ is used to hold the current Jacobi product.  Note that $s$ is merely a C ``int'' data type since
+the values it may obtain are merely $-1$, $0$ and $1$.  
+
+After a local copy of $a$ is made all of the factors of two are divided out and the total stored in $k$.  Technically only the least significant
+bit of $k$ is required, however, it makes the algorithm simpler to follow to perform an addition. In practice an exclusive-or and addition have the same 
+processor requirements and neither is faster than the other.
+
+Line @59, if@ through @70, }@ determines the value of $\left ( { 2 \over p } \right )^k$.  If the least significant bit of $k$ is zero then
+$k$ is even and the value is one.  Otherwise, the value of $s$ depends on which residue class $p$ belongs to modulo eight.  The value of
+$(-1)^{(p-1)(a'-1)/4}$ is computed and multiplied against $s$ on lines @73, if@ through @75, }@.  
+
+Finally, if $a1$ does not equal one the algorithm must recurse and compute $\left ( {p' \over a'} \right )$.  
+
+\textit{-- Comment about default $s$ and such...}
+
+\section{Modular Inverse}
+\label{sec:modinv}
+The modular inverse of a number actually refers to the modular multiplicative inverse.  Essentially for any integer $a$ such that $(a, p) = 1$ there
+exist another integer $b$ such that $ab \equiv 1 \mbox{ (mod }p\mbox{)}$.  The integer $b$ is called the multiplicative inverse of $a$ which is
+denoted as $b = a^{-1}$.  Technically speaking modular inversion is a well defined operation for any finite ring or field not just for rings and 
+fields of integers.  However, the former will be the matter of discussion.
+
+The simplest approach is to compute the algebraic inverse of the input.  That is to compute $b \equiv a^{\Phi(p) - 1}$.  If $\Phi(p)$ is the 
+order of the multiplicative subgroup modulo $p$ then $b$ must be the multiplicative inverse of $a$.  The proof of which is trivial.
+
+\begin{equation}
+ab \equiv a \left (a^{\Phi(p) - 1} \right ) \equiv a^{\Phi(p)} \equiv a^0 \equiv 1 \mbox{ (mod }p\mbox{)}
+\end{equation}
+
+However, as simple as this approach may be it has two serious flaws.  It requires that the value of $\Phi(p)$ be known which if $p$ is composite 
+requires all of the prime factors.  This approach also is very slow as the size of $p$ grows.  
+
+A simpler approach is based on the observation that solving for the multiplicative inverse is equivalent to solving the linear 
+Diophantine\footnote{See LeVeque \cite[pp. 40-43]{LeVeque} for more information.} equation.
+
+\begin{equation}
+ab + pq = 1
+\end{equation}
+
+Where $a$, $b$, $p$ and $q$ are all integers.  If such a pair of integers $ \left < b, q \right >$ exists then $b$ is the multiplicative inverse of 
+$a$ modulo $p$.  The extended Euclidean algorithm (Knuth \cite[pp. 342]{TAOCPV2}) can be used to solve such equations provided $(a, p) = 1$.  
+However, instead of using that algorithm directly a variant known as the binary Extended Euclidean algorithm will be used in its place.  The
+binary approach is very similar to the binary greatest common divisor algorithm except it will produce a full solution to the Diophantine 
+equation.  
+
+\subsection{General Case}
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_invmod}. \\
+\textbf{Input}.   mp\_int $a$ and $b$, $(a, b) = 1$, $b \ge 2$, $0 < a < b$.  \\
+\textbf{Output}.  The modular inverse $c \equiv a^{-1} \mbox{ (mod }b\mbox{)}$. \\
+\hline \\
+1.  If $b \le 0$ then return(\textit{MP\_VAL}). \\
+2.  If $b_0 \equiv 1 \mbox{ (mod }2\mbox{)}$ then use algorithm fast\_mp\_invmod. \\
+3.  $x \leftarrow \vert a \vert, y \leftarrow b$ \\
+4.  If $x_0 \equiv y_0  \equiv 0 \mbox{ (mod }2\mbox{)}$ then return(\textit{MP\_VAL}). \\
+5.  $u \leftarrow x, v \leftarrow y, A \leftarrow 1, B \leftarrow 0, C \leftarrow 0, D \leftarrow 1$ \\
+6.  While $u.used > 0$ and $u_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}6.1  $u \leftarrow \lfloor u / 2 \rfloor$ \\
+\hspace{3mm}6.2  If ($A.used > 0$ and $A_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) or ($B.used > 0$ and $B_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) then \\
+\hspace{6mm}6.2.1  $A \leftarrow A + y$ \\
+\hspace{6mm}6.2.2  $B \leftarrow B - x$ \\
+\hspace{3mm}6.3  $A \leftarrow \lfloor A / 2 \rfloor$ \\
+\hspace{3mm}6.4  $B \leftarrow \lfloor B / 2 \rfloor$ \\
+7.  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}7.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+\hspace{3mm}7.2  If ($C.used > 0$ and $C_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) or ($D.used > 0$ and $D_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) then \\
+\hspace{6mm}7.2.1  $C \leftarrow C + y$ \\
+\hspace{6mm}7.2.2  $D \leftarrow D - x$ \\
+\hspace{3mm}7.3  $C \leftarrow \lfloor C / 2 \rfloor$ \\
+\hspace{3mm}7.4  $D \leftarrow \lfloor D / 2 \rfloor$ \\
+8.  If $u \ge v$ then \\
+\hspace{3mm}8.1  $u \leftarrow u - v$ \\
+\hspace{3mm}8.2  $A \leftarrow A - C$ \\
+\hspace{3mm}8.3  $B \leftarrow B - D$ \\
+9.  else \\
+\hspace{3mm}9.1  $v \leftarrow v - u$ \\
+\hspace{3mm}9.2  $C \leftarrow C - A$ \\
+\hspace{3mm}9.3  $D \leftarrow D - B$ \\
+10.  If $u \ne 0$ goto step 6. \\
+11.  If $v \ne 1$ return(\textit{MP\_VAL}). \\
+12.  While $C \le 0$ do \\
+\hspace{3mm}12.1  $C \leftarrow C + b$ \\
+13.  While $C \ge b$ do \\
+\hspace{3mm}13.1  $C \leftarrow C - b$ \\
+14.  $c \leftarrow C$ \\
+15.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\end{figure}
+\textbf{Algorithm mp\_invmod.}
+This algorithm computes the modular multiplicative inverse of an integer $a$ modulo an integer $b$.  This algorithm is a variation of the 
+extended binary Euclidean algorithm from HAC \cite[pp. 608]{HAC}.  It has been modified to only compute the modular inverse and not a complete
+Diophantine solution.  
+
+If $b \le 0$ then the modulus is invalid and MP\_VAL is returned.  Similarly if both $a$ and $b$ are even then there cannot be a multiplicative
+inverse for $a$ and the error is reported.  
+
+The astute reader will observe that steps six through nine are very similar to the binary greatest common divisor algorithm mp\_gcd.  In this case
+the other variables to the Diophantine equation are solved.  The algorithm terminates when $u = 0$ in which case the solution is
+
+\begin{equation}
+Ca + Db = v
+\end{equation}
+
+If $v$, the greatest common divisor of $a$ and $b$ is not equal to one then the algorithm will report an error as no inverse exists.  Otherwise, $C$
+is the modular inverse of $a$.  The actual value of $C$ is congruent to, but not necessarily equal to, the ideal modular inverse which should lie 
+within $1 \le a^{-1} < b$.  Step numbers twelve and thirteen adjust the inverse until it is in range.  If the original input $a$ is within $0 < a < b$ 
+then only a couple of additions or subtractions will be required to adjust the inverse.
+
+EXAM,bn_mp_invmod.c
+
+\subsubsection{Odd Moduli}
+
+When the modulus $b$ is odd the variables $A$ and $C$ are fixed and are not required to compute the inverse.  In particular by attempting to solve
+the Diophantine $Cb + Da = 1$ only $B$ and $D$ are required to find the inverse of $a$.  
+
+The algorithm fast\_mp\_invmod is a direct adaptation of algorithm mp\_invmod with all steps involving either $A$ or $C$ removed.  This 
+optimization will halve the time required to compute the modular inverse.
+
+\section{Primality Tests}
+
+An integer $a > 1$ is said to be prime if it is not divisible by any positive integer other than one and itself.  For example, $a = 7$ is prime 
+since the integers $2 \ldots 6$ do not evenly divide $a$.  By contrast, $a = 6$ is not prime since $a = 6 = 2 \cdot 3$. 
+
+Prime numbers arise in cryptography considerably as they allow finite fields to be formed.  The ability to determine whether an integer is prime or
+not quickly has been a viable subject in cryptography and number theory for considerable time.  The algorithms that will be presented are all
+probabilistic algorithms in that when they report an integer is composite it must be composite.  However, when the algorithms report an integer is
+prime the algorithm may be incorrect.  
+
+As will be discussed it is possible to limit the probability of error so well that for practical purposes the probability of error might as 
+well be zero.  For the purposes of these discussions let $n$ represent the candidate integer of which the primality is in question.
+
+\subsection{Trial Division}
+
+Trial division means to attempt to evenly divide a candidate integer by small prime integers.  If the candidate can be evenly divided it obviously
+cannot be prime.  By dividing by all primes $1 < p \le \sqrt{n}$ this test can actually prove whether an integer is prime.  However, such a test
+would require a prohibitive amount of time as $n$ grows.
+
+Instead of dividing by every prime, a smaller, more manageable set of primes may be used instead.  By performing trial division with only a subset
+of the primes less than $\sqrt{n} + 1$ the algorithm cannot prove if a candidate is prime.  However, often it can prove a candidate is not prime.
+
+The benefit of this test is that trial division by small values is fairly efficient, especially compared to the other algorithms that will be
+discussed shortly.  The probability that this approach correctly identifies a composite candidate when tested with all primes up to $q$ is given by
+$1 - {1.12 \over \ln(q)}$.  The graph (\ref{pic:primality}, will be added later) demonstrates the probability of success for the range 
+$3 \le q \le 100$.  
+
+At approximately $q = 30$ the gain of performing further tests diminishes fairly quickly.  At $q = 90$ further testing is generally not going to 
+be of any practical use.  In the case of LibTomMath the default limit $q = 256$ was chosen since it is not too high and will eliminate 
+approximately $80\%$ of all candidate integers.  The constant \textbf{PRIME\_SIZE} is equal to the number of primes in the test base.  The 
+array \_\_prime\_tab is an array of the first \textbf{PRIME\_SIZE} prime numbers.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_prime\_is\_divisible}. \\
+\textbf{Input}.   mp\_int $n$ \\
+\textbf{Output}.  $c = 1$ if $n$ is divisible by a small prime, otherwise $c = 0$.  \\
+\hline \\
+1.  for $ix$ from $0$ to $PRIME\_SIZE - 1$ do \\
+\hspace{3mm}1.1  $d \leftarrow n \mbox{ (mod }\_\_prime\_tab_{ix}\mbox{)}$ \\
+\hspace{3mm}1.2  If $d = 0$ then \\
+\hspace{6mm}1.2.1  $c \leftarrow 1$ \\
+\hspace{6mm}1.2.2  Return(\textit{MP\_OKAY}). \\
+2.  $c \leftarrow 0$ \\
+3.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_prime\_is\_divisible}
+\end{figure}
+\textbf{Algorithm mp\_prime\_is\_divisible.}
+This algorithm attempts to determine if a candidate integer $n$ is composite by performing trial divisions.  
+
+EXAM,bn_mp_prime_is_divisible.c
+
+The algorithm defaults to a return of $0$ in case an error occurs.  The values in the prime table are all specified to be in the range of a 
+mp\_digit.  The table \_\_prime\_tab is defined in the following file.
+
+EXAM,bn_prime_tab.c
+
+Note that there are two possible tables.  When an mp\_digit is 7-bits long only the primes up to $127$ may be included, otherwise the primes
+up to $1619$ are used.  Note that the value of \textbf{PRIME\_SIZE} is a constant dependent on the size of a mp\_digit. 
+
+\subsection{The Fermat Test}
+The Fermat test is probably one of the oldest tests to have a non-trivial probability of success.  It is based on the fact that if $n$ is in 
+fact prime then $a^{n} \equiv a \mbox{ (mod }n\mbox{)}$ for all $0 < a < n$.  The reason being that if $n$ is prime then the order of
+the multiplicative subgroup is $n - 1$.  Any base $a$ must have an order which divides $n - 1$ and as such $a^n$ is equivalent to 
+$a^1 = a$.  
+
+If $n$ is composite then any given base $a$ does not have to have a period which divides $n - 1$.  In which case 
+it is possible that $a^n \nequiv a \mbox{ (mod }n\mbox{)}$.  However, this test is not absolute as it is possible that the order
+of a base will divide $n - 1$ which would then be reported as prime.  Such a base yields what is known as a Fermat pseudo-prime.  Several 
+integers known as Carmichael numbers will be a pseudo-prime to all valid bases.  Fortunately such numbers are extremely rare as $n$ grows
+in size.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_prime\_fermat}. \\
+\textbf{Input}.   mp\_int $a$ and $b$, $a \ge 2$, $0 < b < a$.  \\
+\textbf{Output}.  $c = 1$ if $b^a \equiv b \mbox{ (mod }a\mbox{)}$, otherwise $c = 0$.  \\
+\hline \\
+1.  $t \leftarrow b^a \mbox{ (mod }a\mbox{)}$ \\
+2.  If $t = b$ then \\
+\hspace{3mm}2.1  $c = 1$ \\
+3.  else \\
+\hspace{3mm}3.1  $c = 0$ \\
+4.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_prime\_fermat}
+\end{figure}
+\textbf{Algorithm mp\_prime\_fermat.}
+This algorithm determines whether an mp\_int $a$ passes the Fermat test to the base $b$, that is, whether $a$ is either prime or a Fermat pseudo-prime to that base.  It uses a single modular exponentiation to
+determine the result.  
+
+EXAM,bn_mp_prime_fermat.c
+
+\subsection{The Miller-Rabin Test}
+The Miller-Rabin (citation) test is another primality test which has tighter error bounds than the Fermat test specifically with sequentially chosen 
+candidate  integers.  The algorithm is based on the observation that if $n - 1 = 2^kr$ and if $b^r \nequiv \pm 1$ then after up to $k - 1$ squarings the 
+value must be equal to $-1$.  The squarings are stopped as soon as $-1$ is observed.  If the value of $1$ is observed first it means that
+some value not congruent to $\pm 1$ when squared equals one which cannot occur if $n$ is prime.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_prime\_miller\_rabin}. \\
+\textbf{Input}.   mp\_int $a$ and $b$, $a \ge 2$, $0 < b < a$.  \\
+\textbf{Output}.  $c = 1$ if $a$ is a Miller-Rabin prime to the base $b$, otherwise $c = 0$.  \\
+\hline
+1.  $a' \leftarrow a - 1$ \\
+2.  $r  \leftarrow a'$    \\
+3.  $c \leftarrow 0, s  \leftarrow 0$ \\
+4.  While $r.used > 0$ and $r_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}4.1  $s \leftarrow s + 1$ \\
+\hspace{3mm}4.2  $r \leftarrow \lfloor r / 2 \rfloor$ \\
+5.  $y \leftarrow b^r \mbox{ (mod }a\mbox{)}$ \\
+6.  If $y \nequiv \pm 1$ then \\
+\hspace{3mm}6.1  $j \leftarrow 1$ \\
+\hspace{3mm}6.2  While $j \le (s - 1)$ and $y \nequiv a'$ \\
+\hspace{6mm}6.2.1  $y \leftarrow y^2 \mbox{ (mod }a\mbox{)}$ \\
+\hspace{6mm}6.2.2  If $y = 1$ then goto step 8. \\
+\hspace{6mm}6.2.3  $j \leftarrow j + 1$ \\
+\hspace{3mm}6.3  If $y \nequiv a'$ goto step 8. \\
+7.  $c \leftarrow 1$\\
+8.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_prime\_miller\_rabin}
+\end{figure}
+\textbf{Algorithm mp\_prime\_miller\_rabin.}
+This algorithm performs one trial round of the Miller-Rabin algorithm to the base $b$.  It will set $c = 1$ if the algorithm cannot determine
+if $a$ is composite or $c = 0$ if $a$ is provably composite.  The values of $s$ and $r$ are computed such that $a' = a - 1 = 2^sr$.  
+
+If the value $y \equiv b^r$ is congruent to $\pm 1$ then the algorithm cannot prove if $a$ is composite or not.  Otherwise, the algorithm will
+square $y$ up to $s - 1$ times stopping only when $y \equiv -1$.  If $y^2 \equiv 1$ and $y \nequiv \pm 1$ then the algorithm can report that $a$
+is provably composite.  If the algorithm performs $s - 1$ squarings and $y \nequiv -1$ then $a$ is provably composite.  If $a$ is not provably 
+composite then it is \textit{probably} prime.
+
+EXAM,bn_mp_prime_miller_rabin.c
+
+
+
+
+\backmatter
+\appendix
+\begin{thebibliography}{ABCDEF}
+\bibitem[1]{TAOCPV2}
+Donald Knuth, \textit{The Art of Computer Programming}, Third Edition, Volume Two, Seminumerical Algorithms, Addison-Wesley, 1998
+
+\bibitem[2]{HAC}
+A. Menezes, P. van Oorschot, S. Vanstone, \textit{Handbook of Applied Cryptography}, CRC Press, 1996
+
+\bibitem[3]{ROSE}
+Michael Rosing, \textit{Implementing Elliptic Curve Cryptography}, Manning Publications, 1999
+
+\bibitem[4]{COMBA}
+Paul G. Comba, \textit{Exponentiation Cryptosystems on the IBM PC}. IBM Systems Journal 29(4): 526-538 (1990)
+
+\bibitem[5]{KARA}
+A. Karatsuba, Doklady Akad. Nauk SSSR 145 (1962), pp.293-294
+
+\bibitem[6]{KARAP}
+Andre Weimerskirch and Christof Paar, \textit{Generalizations of the Karatsuba Algorithm for Polynomial Multiplication}, Submitted to Design, Codes and Cryptography, March 2002
+
+\bibitem[7]{BARRETT}
+Paul Barrett, \textit{Implementing the Rivest Shamir and Adleman Public Key Encryption Algorithm on a Standard Digital Signal Processor}, Advances in Cryptology, Crypto '86, Springer-Verlag.
+
+\bibitem[8]{MONT}
+P.L.Montgomery. \textit{Modular multiplication without trial division}. Mathematics of Computation, 44(170):519-521, April 1985.
+
+\bibitem[9]{DRMET}
+Chae Hoon Lim and Pil Joong Lee, \textit{Generating Efficient Primes for Discrete Log Cryptosystems}, POSTECH Information Research Laboratories
+
+\bibitem[10]{MMB}
+J. Daemen and R. Govaerts and J. Vandewalle, \textit{Block ciphers based on Modular Arithmetic}, State and {P}rogress in the {R}esearch of {C}ryptography, 1993, pp. 80-89
+
+\bibitem[11]{RSAREF}
+R.L. Rivest, A. Shamir, L. Adleman, \textit{A Method for Obtaining Digital Signatures and Public-Key Cryptosystems}
+
+\bibitem[12]{DHREF}
+Whitfield Diffie, Martin E. Hellman, \textit{New Directions in Cryptography}, IEEE Transactions on Information Theory, 1976
+
+\bibitem[13]{IEEE}
+IEEE Standard for Binary Floating-Point Arithmetic (ANSI/IEEE Std 754-1985)
+
+\bibitem[14]{GMP}
+GNU Multiple Precision (GMP), \url{http://www.swox.com/gmp/}
+
+\bibitem[15]{MPI}
+Multiple Precision Integer Library (MPI), Michael Fromberger, \url{http://thayer.dartmouth.edu/~sting/mpi/}
+
+\bibitem[16]{OPENSSL}
+OpenSSL Cryptographic Toolkit, \url{http://openssl.org}
+
+\bibitem[17]{LIP}
+Large Integer Package, \url{http://home.hetnet.nl/~ecstr/LIP.zip}
+
+\bibitem[18]{ISOC}
+JTC1/SC22/WG14, ISO/IEC 9899:1999, ``A draft rationale for the C99 standard.''
+
+\bibitem[19]{JAVA}
+The Sun Java Website, \url{http://java.sun.com/}
+
+\end{thebibliography}
+
+\input{tommath.ind}
+
+\end{document}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tommath.tex	Sun Dec 19 15:57:19 2004 +0000
@@ -0,0 +1,10768 @@
+\documentclass[b5paper]{book}
+\usepackage{hyperref}
+\usepackage{makeidx}
+\usepackage{amssymb}
+\usepackage{color}
+\usepackage{alltt}
+\usepackage{graphicx}
+\usepackage{layout}
+\def\union{\cup}
+\def\intersect{\cap}
+\def\getsrandom{\stackrel{\rm R}{\gets}}
+\def\cross{\times}
+\def\cat{\hspace{0.5em} \| \hspace{0.5em}}
+\def\catn{$\|$}
+\def\divides{\hspace{0.3em} | \hspace{0.3em}}
+\def\nequiv{\not\equiv}
+\def\approx{\raisebox{0.2ex}{\mbox{\small $\sim$}}}
+\def\lcm{{\rm lcm}}
+\def\gcd{{\rm gcd}}
+\def\log{{\rm log}}
+\def\ord{{\rm ord}}
+\def\abs{{\mathit{abs}}}
+\def\rep{{\mathit{rep}}}
+\def\mod{{\ \mathit{mod}\ }}
+\renewcommand{\pmod}[1]{\ ({\rm mod\ }{#1})}
+\newcommand{\floor}[1]{\left\lfloor{#1}\right\rfloor}
+\newcommand{\ceil}[1]{\left\lceil{#1}\right\rceil}
+\def\Or{{\rm\ or\ }}
+\def\And{{\rm\ and\ }}
+\def\iff{\hspace{1em}\Longleftrightarrow\hspace{1em}}
+\def\implies{\Rightarrow}
+\def\undefined{{\rm ``undefined"}}
+\def\Proof{\vspace{1ex}\noindent {\bf Proof:}\hspace{1em}}
+\let\oldphi\phi
+\def\phi{\varphi}
+\def\Pr{{\rm Pr}}
+\newcommand{\str}[1]{{\mathbf{#1}}}
+\def\F{{\mathbb F}}
+\def\N{{\mathbb N}}
+\def\Z{{\mathbb Z}}
+\def\R{{\mathbb R}}
+\def\C{{\mathbb C}}
+\def\Q{{\mathbb Q}}
+\definecolor{DGray}{gray}{0.5}
+\newcommand{\emailaddr}[1]{\mbox{$<${#1}$>$}}
+\def\twiddle{\raisebox{0.3ex}{\mbox{\tiny $\sim$}}}
+\def\gap{\vspace{0.5ex}}
+\makeindex
+\begin{document}
+\frontmatter
+\pagestyle{empty}
+\title{Implementing Multiple Precision Arithmetic \\ ~ \\ Draft Edition }
+\author{\mbox{
+%\begin{small}
+\begin{tabular}{c}
+Tom St Denis \\
+Algonquin College \\
+\\
+Mads Rasmussen \\
+Open Communications Security \\
+\\
+Greg Rose \\
+QUALCOMM Australia \\
+\end{tabular}
+%\end{small}
+}
+}
+\maketitle
+This text has been placed in the public domain.  This text corresponds to the v0.30 release of the 
+LibTomMath project.
+
+\begin{alltt}
+Tom St Denis
+111 Banning Rd
+Ottawa, Ontario
+K2L 1C3
+Canada
+
+Phone: 1-613-836-3160
+Email: [email protected]
+\end{alltt}
+
+This text is formatted to the international B5 paper size of 176mm wide by 250mm tall using the \LaTeX{} 
+{\em book} macro package and the Perl {\em booker} package.
+
+\tableofcontents
+\listoffigures
+\chapter*{Prefaces to the Draft Edition}
+I started this text in April 2003 to complement my LibTomMath library.  That is, explain how to implement the functions
+contained in LibTomMath.  The goal is to have a textbook that any Computer Science student can use when implementing their
+own multiple precision arithmetic.  The plan I wanted to follow was flesh out all the
+ideas and concepts I had floating around in my head and then work on it afterwards refining a little bit at a time.  Chance
+would have it that I ended up with my summer off from Algonquin College and I was given four months solid to work on the
+text.  
+
+Choosing to not waste any time I dove right into the project even before my spring semester was finished.  I wrote a bit
+off and on at first.  The moment my exams were finished I jumped into long 12 to 16 hour days.  The result after only
+a couple of months was a ten chapter, three hundred page draft that I quickly had distributed to anyone who wanted
+to read it.  I had Jean-Luc Cooke print copies for me and I brought them to Crypto'03 in Santa Barbara.  So far I have
+managed to grab a certain level of attention; having people from around the world ask me for copies of the text was certainly
+rewarding.
+
+Now we are past December 2003.  By this time I had pictured that I would have at least finished my second draft of the text.  
+Currently I am far off from this goal.  I've done partial re-writes of chapters one, two and three but they are not even
+finished yet.  I haven't given up on the project, only had some setbacks.  First O'Reilly declined to publish the text then
+Addison-Wesley and Greg has tried another which I don't know the name of.  However, at this point I want to focus my energy
+onto finishing the book not securing a contract.
+
+So why am I writing this text?  It seems like a lot of work right?  Most certainly it is a lot of work writing a textbook.  
+Even the simplest introductory material has to be lined with references and figures.  A lot of the text has to be re-written
+from point form to prose form to ensure an easier read.  Why am I doing all this work for free then?  Simple. My philosophy
+is quite simply ``Open Source.  Open Academia.  Open Minds'' which means that to achieve a goal of open minds, that is,
+people willing to accept new ideas and explore the unknown you have to make available material they can access freely 
+without hindrance.  
+
+I've been writing free software since I was about sixteen but only recently have I hit upon software that people have come
+to depend upon.  I started LibTomCrypt in December 2001 and now several major companies use it as integral portions of their
+software.  Several educational institutions use it as a matter of course and many freelance developers use it as
+part of their projects.  To further my contributions I started the LibTomMath project in December 2002 aimed at providing
+multiple precision arithmetic routines that students could learn from.  That is write routines that are not only easy
+to understand and follow but provide quite impressive performance considering they are all in standard portable ISO C.  
+
+The second leg of my philosophy is ``Open Academia'' which is where this textbook comes in.  In the end, when all is
+said and done the text will be useable by educational institutions as a reference on multiple precision arithmetic.  
+
+At this time I feel I should share a little information about myself.  The most common question I was asked at 
+Crypto'03, perhaps just out of professional courtesy, was which school I either taught at or attended.  The unfortunate
+truth is that I neither teach at nor attend a school of academic reputation.  I'm currently at Algonquin College which 
+is what I'd like to call a ``somewhat academic but mostly vocational'' college.  In other words, job training.
+
+I'm a 21 year old computer science student mostly self-taught in the areas I am aware of (which includes a half-dozen
+computer science fields, a few fields of mathematics and some English).  I look forward to teaching someday but I am
+still far off from that goal.  
+
+Now it would be improper for me to not introduce the rest of the text's co-authors.  While they are only contributing 
+corrections and editorial feedback their support has been tremendously helpful in presenting the concepts laid out
+in the text so far.  Greg has always been there for me.  He has tracked my LibTom projects since their inception and even
+sent cheques to help pay tuition from time to time.  His background has provided a wonderful source to bounce ideas off
+of and improve the quality of my writing.  Mads is another fellow who has just ``been there''.  I don't even recall what
+his interest in the LibTom projects is but I'm definitely glad he has been around.  His ability to catch logical errors
+in my written English has saved me on several occasions to say the least.
+
+What to expect next?  Well this is still a rough draft.  I've only had the chance to update a few chapters.  However, I've
+been getting the feeling that people are starting to use my text and I owe them some updated material.  My current tentative
+plan is to edit one chapter every two weeks starting January 4th.  It seems insane but my lower course load at college
+should provide ample time.  By Crypto'04 I plan to have a 2nd draft of the text polished and ready to hand out to as many
+people who will take it.
+
+\begin{flushright} Tom St Denis \end{flushright}
+
+\newpage
+I found the opportunity to work with Tom appealing for several reasons, not only could I broaden my own horizons, but also 
+contribute to educate others facing the problem of having to handle big number mathematical calculations.
+
+This book is Tom's child and he has been caring and fostering the project ever since the beginning with a clear mind of 
+how he wanted the project to turn out. I have helped by proofreading the text and we have had several discussions about 
+the layout and language used.
+
+I hold a masters degree in cryptography from the University of Southern Denmark and have always been interested in the 
+practical aspects of cryptography. 
+
+Having worked in the security consultancy business for several years in S\~{a}o Paulo, Brazil, I have been in touch with a 
+great deal of work in which multiple precision mathematics was needed. Understanding the possibilities for speeding up 
+multiple precision calculations is often very important since we deal with outdated machine architecture where modular 
+reductions, for example, become painfully slow.
+
+This text is for people who stop and wonder when examining algorithms such as RSA for the first time and ask 
+themselves, ``You tell me this is only secure for large numbers, fine; but how do you implement these numbers?''
+
+\begin{flushright}
+Mads Rasmussen
+
+S\~{a}o Paulo - SP
+
+Brazil
+\end{flushright}
+
+\newpage
+It's all because I broke my leg. That just happened to be at about the same time that Tom asked for someone to review the section of the book about 
+Karatsuba multiplication. I was laid up, alone and immobile, and thought ``Why not?'' I vaguely knew what Karatsuba multiplication was, but not 
+really, so I thought I could help, learn, and stop myself from watching daytime cable TV, all at once.
+
+At the time of writing this, I've still not met Tom or Mads in meatspace. I've been following Tom's progress since his first splash on the 
+sci.crypt Usenet news group. I watched him go from a clueless newbie, to the cryptographic equivalent of a reformed smoker, to a real
+contributor to the field, over a period of about two years. I've been impressed with his obvious intelligence, and astounded by his productivity. 
+Of course, he's young enough to be my own child, so he doesn't have my problems with staying awake.
+
+When I reviewed that single section of the book, in its very earliest form, I was very pleasantly surprised. So I decided to collaborate more fully, 
+and at least review all of it, and perhaps write some bits too. There's still a long way to go with it, and I have watched a number of close 
+friends go through the mill of publication, so I think that the way to go is longer than Tom thinks it is. Nevertheless, it's a good effort, 
+and I'm pleased to be involved with it.
+
+\begin{flushright}
+Greg Rose, Sydney, Australia, June 2003. 
+\end{flushright}
+
+\mainmatter
+\pagestyle{headings}
+\chapter{Introduction}
+\section{Multiple Precision Arithmetic}
+
+\subsection{What is Multiple Precision Arithmetic?}
+When we think of long-hand arithmetic such as addition or multiplication we rarely consider the fact that we instinctively
+raise or lower the precision of the numbers we are dealing with.  For example, in decimal we can almost immediately 
+reason that $7$ times $6$ is $42$.  However, $42$ has two digits of precision as opposed to one digit we started with.  
+Further multiplications of say $3$ result in a larger precision result $126$.  In these few examples we have multiple 
+precisions for the numbers we are working with.  Despite the various levels of precision a single subset\footnote{With the occasional optimization.}
+ of algorithms can be designed to accommodate them.  
+
+By way of comparison a fixed or single precision operation would lose precision on various operations.  For example, in
+the decimal system with a fixed precision of one digit $6 \cdot 7 = 2$.
+
+Essentially at the heart of computer based multiple precision arithmetic are the same long-hand algorithms taught in
+schools to manually add, subtract, multiply and divide.  
+
+\subsection{The Need for Multiple Precision Arithmetic}
+The most prevalent need for multiple precision arithmetic, often referred to as ``bignum'' math, is within the implementation
+of public-key cryptography algorithms.   Algorithms such as RSA \cite{RSAREF} and Diffie-Hellman \cite{DHREF} require 
+integers of significant magnitude to resist known cryptanalytic attacks.  For example, at the time of this writing a 
+typical RSA modulus would be at least greater than $10^{309}$.  However, modern programming languages such as ISO C \cite{ISOC} and 
+Java \cite{JAVA} only provide intrinsic support for integers which are relatively small and single precision.
+
+\begin{figure}[!here]
+\begin{center}
+\begin{tabular}{|r|c|}
+\hline \textbf{Data Type} & \textbf{Range} \\
+\hline char  & $-128 \ldots 127$ \\
+\hline short & $-32768 \ldots 32767$ \\
+\hline long  & $-2147483648 \ldots 2147483647$ \\
+\hline long long & $-9223372036854775808 \ldots 9223372036854775807$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Typical Data Types for the C Programming Language}
+\label{fig:ISOC}
+\end{figure}
+
+The largest data type guaranteed to be provided by the ISO C programming 
+language\footnote{As per the ISO C standard.  However, each compiler vendor is allowed to augment the precision as they 
+see fit.}  can only represent values up to $10^{19}$ as shown in figure \ref{fig:ISOC}. On its own the C language is 
+insufficient to accommodate the magnitude required for the problem at hand.  An RSA modulus of magnitude $10^{19}$ could be 
+trivially factored\footnote{A Pollard-Rho factoring would take only $2^{16}$ time.} on the average desktop computer, 
+rendering any protocol based on the algorithm insecure.  Multiple precision algorithms solve this very problem by 
+extending the range of representable integers while using single precision data types.
+
+Most advancements in fast multiple precision arithmetic stem from the need for faster and more efficient cryptographic 
+primitives.  Faster modular reduction and exponentiation algorithms such as Barrett's algorithm, which have appeared in 
+various cryptographic journals, can render algorithms such as RSA and Diffie-Hellman more efficient.  In fact, several 
+major companies such as RSA Security, Certicom and Entrust have built entire product lines on the implementation and 
+deployment of efficient algorithms.
+
+However, cryptography is not the only field of study that can benefit from fast multiple precision integer routines.  
+Another auxiliary use of multiple precision integers is high precision floating point data types.  
+The basic IEEE \cite{IEEE} standard floating point type is made up of an integer mantissa $q$, an exponent $e$ and a sign bit $s$.  
+Numbers are given in the form $n = q \cdot b^e \cdot (-1)^s$ where $b = 2$ is the most common base for IEEE.  Since IEEE 
+floating point is meant to be implemented in hardware the precision of the mantissa is often fairly small 
+(\textit{23, 52 and 64 bits}).  The mantissa is merely an integer and a multiple precision integer could be used to create
+a mantissa of much larger precision than hardware alone can efficiently support.  This approach could be useful where 
+scientific applications must minimize the total output error over long calculations.
+
+Yet another use for large integers is within arithmetic on polynomials of large characteristic (i.e. $GF(p)[x]$ for large $p$).
+In fact the library discussed within this text has already been used to form a polynomial basis library\footnote{See \url{http://poly.libtomcrypt.org} for more details.}.
+
+\subsection{Benefits of Multiple Precision Arithmetic}
+\index{precision}
+The benefit of multiple precision representations over single or fixed precision representations is that 
+no precision is lost while representing the result of an operation which requires excess precision.  For example, 
+the product of two $n$-bit integers requires at least $2n$ bits of precision to be represented faithfully.  A multiple 
+precision algorithm would augment the precision of the destination to accommodate the result while a single precision system 
+would truncate excess bits to maintain a fixed level of precision.
+
+It is possible to implement algorithms which require large integers with fixed precision algorithms.  For example, elliptic
+curve cryptography (\textit{ECC}) is often implemented on smartcards by fixing the precision of the integers to the maximum 
+size the system will ever need.  Such an approach can lead to vastly simpler algorithms which can accommodate the 
+integers required even if the host platform cannot natively accommodate them\footnote{For example, the average smartcard 
+processor has an 8 bit accumulator.}.  However, as efficient as such an approach may be, the resulting source code is not
+normally very flexible.  It cannot, at runtime, accommodate inputs of higher magnitude than the designer anticipated.
+
+Multiple precision algorithms have the most overhead of any style of arithmetic.  For the most part the 
+overhead can be kept to a minimum with careful planning, but overall, it is not well suited for most memory starved
+platforms.  However, multiple precision algorithms do offer the most flexibility in terms of the magnitude of the 
+inputs.  That is, the same algorithms based on multiple precision integers can accommodate any reasonable size input 
+without the designer's explicit forethought.  This leads to lower cost of ownership for the code as it only has to 
+be written and tested once.
+
+\section{Purpose of This Text}
+The purpose of this text is to instruct the reader regarding how to implement efficient multiple precision algorithms.  
+That is to not only explain a limited subset of the core theory behind the algorithms but also the various ``house keeping'' 
+elements that are neglected by authors of other texts on the subject.  Several well renowned texts \cite{TAOCPV2,HAC} 
+give considerably detailed explanations of the theoretical aspects of algorithms and often very little information 
+regarding the practical implementation aspects.  
+
+In most cases how an algorithm is explained and how it is actually implemented are two very different concepts.  For 
+example, the Handbook of Applied Cryptography (\textit{HAC}), algorithm 14.7 on page 594, gives a relatively simple 
+algorithm for performing multiple precision integer addition.  However, the description lacks any discussion concerning 
+the fact that the two integer inputs may be of differing magnitudes.  As a result the implementation is not as simple
+as the text would lead people to believe.  Similarly the division routine (\textit{algorithm 14.20, pp. 598}) does not 
+discuss how to handle sign or handle the dividend's decreasing magnitude in the main loop (\textit{step \#3}).
+
+Both texts also do not discuss several key optimal algorithms required such as ``Comba'' and Karatsuba multipliers 
+and fast modular inversion, which we consider practical oversights.  These optimal algorithms are vital to achieve 
+any form of useful performance in non-trivial applications.  
+
+To solve this problem the focus of this text is on the practical aspects of implementing a multiple precision integer
+package.  As a case study the ``LibTomMath''\footnote{Available at \url{http://math.libtomcrypt.org}} package is used 
+to demonstrate algorithms with real implementations\footnote{In the ISO C programming language.} that have been field 
+tested and work very well.  The LibTomMath library is freely available on the Internet for all uses and this text 
+discusses a very large portion of the inner workings of the library.
+
+The algorithms that are presented will always include at least one ``pseudo-code'' description followed 
+by the actual C source code that implements the algorithm.  The pseudo-code can be used to implement the same 
+algorithm in other programming languages as the reader sees fit.  
+
+This text shall also serve as a walkthrough of the creation of multiple precision algorithms from scratch, showing
+the reader how the algorithms fit together as well as where to start on various tasks.  
+
+\section{Discussion and Notation}
+\subsection{Notation}
+A multiple precision integer of $n$-digits shall be denoted as $x = (x_{n-1}, \ldots, x_1, x_0)_{ \beta }$ and represent
+the integer $x \equiv \sum_{i=0}^{n-1} x_i\beta^i$.  The elements of the array $x$ are said to be the radix $\beta$ digits 
+of the integer.  For example, $x = (1,2,3)_{10}$ would represent the integer 
+$1\cdot 10^2 + 2\cdot10^1 + 3\cdot10^0 = 123$.  
+
+\index{mp\_int}
+The term ``mp\_int'' shall refer to a composite structure which contains the digits of the integer it represents, as well 
+as auxiliary data required to manipulate the data.  These additional members are discussed further in section 
+\ref{sec:MPINT}.  For the purposes of this text a ``multiple precision integer'' and an ``mp\_int'' are assumed to be 
+synonymous.  When an algorithm is specified to accept an mp\_int variable it is assumed the various auxiliary data members 
+are present as well.  An expression of the type \textit{variablename.item} implies that it should evaluate to the 
+member named ``item'' of the variable.  For example, a string of characters may have a member ``length'' which would 
+evaluate to the number of characters in the string.  If the string $a$ equals ``hello'' then it follows that 
+$a.length = 5$.  
+
+For certain discussions more generic algorithms are presented to help the reader understand the final algorithm used
+to solve a given problem.  When an algorithm is described as accepting an integer input it is assumed the input is 
+a plain integer with no additional multiple-precision members.  That is, algorithms that use integers as opposed to 
+mp\_ints as inputs do not concern themselves with the housekeeping operations required such as memory management.  These 
+algorithms will be used to establish the relevant theory which will subsequently be used to describe a multiple
+precision algorithm to solve the same problem.  
+
+\subsection{Precision Notation}
+The variable $\beta$ represents the radix of a single digit of a multiple precision integer and 
+must be of the form $q^p$ for $q, p \in \Z^+$.  A single precision variable must be able to represent integers in 
+the range $0 \le x < q \beta$ while a double precision variable must be able to represent integers in the range 
+$0 \le x < q \beta^2$.  The extra radix-$q$ factor allows additions and subtractions to proceed without truncation of the 
+carry.  Since all modern computers are binary, it is assumed that $q$ is two.
+
+\index{mp\_digit} \index{mp\_word}
+Within the source code that will be presented for each algorithm, the data type \textbf{mp\_digit} will represent 
+a single precision integer type, while, the data type \textbf{mp\_word} will represent a double precision integer type.  In 
+several algorithms (notably the Comba routines) temporary results will be stored in arrays of double precision mp\_words.  
+For the purposes of this text $x_j$ will refer to the $j$'th digit of a single precision array and $\hat x_j$ will refer to 
+the $j$'th digit of a double precision array.  Whenever an expression is to be assigned to a double precision
+variable it is assumed that all single precision variables are promoted to double precision during the evaluation.  
+Expressions that are assigned to a single precision variable are truncated to fit within the precision of a single
+precision data type.
+
+For example, if $\beta = 10^2$ a single precision data type may represent a value in the 
+range $0 \le x < 10^3$, while a double precision data type may represent a value in the range $0 \le x < 10^5$.  Let
+$a = 23$ and $b = 49$ represent two single precision variables.  The single precision product shall be written
+as $c \leftarrow a \cdot b$ while the double precision product shall be written as $\hat c \leftarrow a \cdot b$.
+In this particular case, $\hat c = 1127$ and $c = 127$.  The most significant digit of the product would not fit 
+in a single precision data type and as a result $c \ne \hat c$.  
+
+\subsection{Algorithm Inputs and Outputs}
+Within the algorithm descriptions all variables are assumed to be scalars of either single or double precision
+as indicated.  The only exception to this rule is when variables have been indicated to be of type mp\_int.  This 
+distinction is important as scalars are often used as array indices and various other counters.  
+
+\subsection{Mathematical Expressions}
+The $\lfloor \mbox{ } \rfloor$ brackets imply an expression truncated to an integer not greater than the expression 
+itself.  For example, $\lfloor 5.7 \rfloor = 5$.  Similarly the $\lceil \mbox{ } \rceil$ brackets imply an expression
+rounded to an integer not less than the expression itself.  For example, $\lceil 5.1 \rceil = 6$.  Typically when 
+the $/$ division symbol is used the intention is to perform an integer division with truncation.  For example, 
+$5/2 = 2$ which will often be written as $\lfloor 5/2 \rfloor = 2$ for clarity.  When an expression is written as a 
+fraction a real value division is implied, for example ${5 \over 2} = 2.5$.  
+
+The norm of a multiple precision integer, for example $\vert \vert x \vert \vert$, will be used to represent the number of digits in the representation
+of the integer.  For example, $\vert \vert 123 \vert \vert = 3$ and $\vert \vert 79452 \vert \vert = 5$.  
+
+\subsection{Work Effort}
+\index{big-Oh}
+To measure the efficiency of the specified algorithms, a modified big-Oh notation is used.  In this system all 
+single precision operations are considered to have the same cost\footnote{Except where explicitly noted.}.  
+That is a single precision addition, multiplication and division are assumed to take the same time to 
+complete.  While this is generally not true in practice, it will simplify the discussions considerably.
+
+Some algorithms have slight advantages over others which is why some constants will not be removed in 
+the notation.  For example, a normal baseline multiplication (section \ref{sec:basemult}) requires $O(n^2)$ work while a 
+baseline squaring (section \ref{sec:basesquare}) requires $O({{n^2 + n}\over 2})$ work.  In standard big-Oh notation these 
+would both be said to be equivalent to $O(n^2)$.  However, 
+in the context of this text this is not the case as the magnitude of the inputs will typically be rather small.  As a 
+result small constant factors in the work effort will make an observable difference in algorithm efficiency.
+
+All of the algorithms presented in this text have a polynomial time work level.  That is, of the form 
+$O(n^k)$ for $n, k \in \Z^{+}$.  This will help make useful comparisons in terms of the speed of the algorithms and how 
+various optimizations will help pay off in the long run.
+
+\section{Exercises}
+Within the more advanced chapters a section will be set aside to give the reader some challenging exercises related to
+the discussion at hand.  These exercises are not designed to be prize winning problems, but instead to be thought 
+provoking.  Wherever possible the problems are forward minded, stating problems that will be answered in subsequent 
+chapters.  The reader is encouraged to finish the exercises as they appear to get a better understanding of the 
+subject material.  
+
+That being said, the problems are designed to affirm knowledge of a particular subject matter.  Students in particular
+are encouraged to verify they can answer the problems correctly before moving on.
+
+Similar to the exercises of \cite[pp. ix]{TAOCPV2} these exercises are given a scoring system based on the difficulty of
+the problem.  However, unlike \cite{TAOCPV2} the problems do not get nearly as hard.  The scoring of these 
+exercises ranges from one (the easiest) to five (the hardest).  The following table summarizes the 
+scoring system used.
+
+\begin{figure}[here]
+\begin{center}
+\begin{small}
+\begin{tabular}{|c|l|}
+\hline $\left [ 1 \right ]$ & An easy problem that should only take the reader a matter of \\
+                            & minutes to solve.  Usually does not involve much computer time \\
+                            & to solve. \\
+\hline $\left [ 2 \right ]$ & An easy problem that involves a marginal amount of computer \\
+                     & time usage.  Usually requires a program to be written to \\
+                     & solve the problem. \\
+\hline $\left [ 3 \right ]$ & A moderately hard problem that requires a non-trivial amount \\
+                     & of work.  Usually involves trivial research and development of \\
+                     & new theory from the perspective of a student. \\
+\hline $\left [ 4 \right ]$ & A moderately hard problem that involves a non-trivial amount \\
+                     & of work and research, the solution to which will demonstrate \\
+                     & a higher mastery of the subject matter. \\
+\hline $\left [ 5 \right ]$ & A hard problem that involves concepts that are difficult for a \\
+                     & novice to solve.  Solutions to these problems will demonstrate a \\
+                     & complete mastery of the given subject. \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Exercise Scoring System}
+\end{figure}
+
+Problems at the first level are meant to be simple questions that the reader can answer quickly without programming a solution or
+devising new theory.  These problems are quick tests to see if the material is understood.  Problems at the second level 
+are also designed to be easy but will require a program or algorithm to be implemented to arrive at the answer.  These
+two levels are essentially entry level questions.  
+
+Problems at the third level are meant to be a bit more difficult than the first two levels.  The answer is often 
+fairly obvious but arriving at an exacting solution requires some thought and skill.  These problems will almost always 
+involve devising a new algorithm or implementing a variation of another algorithm previously presented.  Readers who can
+answer these questions will feel comfortable with the concepts behind the topic at hand.
+
+Problems at the fourth level are meant to be similar to those of the level three questions except they will require 
+additional research to be completed.  The reader will most likely not know the answer right away, nor will the text provide 
+the exact details of the answer until a subsequent chapter.  
+
+Problems at the fifth level are meant to be the hardest 
+problems relative to all the other problems in the chapter.  People who can correctly answer fifth level problems have a 
+mastery of the subject matter at hand.
+
+Often problems will be tied together.  The purpose of this is to start a chain of thought that will be discussed in future chapters.  The reader
+is encouraged to answer the follow-up problems and try to draw the relevance of problems.
+
+\section{Introduction to LibTomMath}
+
+\subsection{What is LibTomMath?}
+LibTomMath is a free and open source multiple precision integer library written entirely in portable ISO C.  By portable it 
+is meant that the library does not contain any code that is computer platform dependent or otherwise problematic to use on 
+any given platform.  
+
+The library has been successfully tested under numerous operating systems including Unix\footnote{All of these
+trademarks belong to their respective rightful owners.}, MacOS, Windows, Linux, PalmOS and on standalone hardware such 
+as the Gameboy Advance.  The library is designed to contain enough functionality to be able to develop applications such 
+as public key cryptosystems and still maintain a relatively small footprint.
+
+\subsection{Goals of LibTomMath}
+
+Libraries which obtain the most efficiency are rarely written in a high level programming language such as C.  However, 
+even though this library is written entirely in ISO C, considerable care has been taken to optimize the algorithm implementations within the 
+library.  Specifically the code has been written to work well with the GNU C Compiler (\textit{GCC}) on both x86 and ARM 
+processors.  Wherever possible, highly efficient algorithms, such as Karatsuba multiplication, sliding window 
+exponentiation and Montgomery reduction have been provided to make the library more efficient.  
+
+Even with the nearly optimal and specialized algorithms that have been included the Application Programming Interface 
+(\textit{API}) has been kept as simple as possible.  Often generic place holder routines will make use of specialized 
+algorithms automatically without the developer's specific attention.  One such example is the generic multiplication 
+algorithm \textbf{mp\_mul()} which will automatically use Toom--Cook, Karatsuba, Comba or baseline multiplication 
+based on the magnitude of the inputs and the configuration of the library.  
+
+Making LibTomMath as efficient as possible is not the only goal of the LibTomMath project.  Ideally the library should 
+be source compatible with another popular library which makes it more attractive for developers to use.  In this case the
+MPI library was used as an API template for all the basic functions.  MPI was chosen because it is another library that fits 
+in the same niche as LibTomMath.  Even though LibTomMath uses MPI as the template for the function names and argument 
+passing conventions, it has been written from scratch by Tom St Denis.
+
+The project is also meant to act as a learning tool for students, the logic being that no easy-to-follow ``bignum'' 
+library exists which can be used to teach computer science students how to perform fast and reliable multiple precision 
+integer arithmetic.  To this end the source code has been given quite a few comments and algorithm discussion points.  
+
+\section{Choice of LibTomMath}
+LibTomMath was chosen as the case study of this text not only because the author of both projects is one and the same but
+for more worthy reasons.  Other libraries such as GMP \cite{GMP}, MPI \cite{MPI}, LIP \cite{LIP} and OpenSSL 
+\cite{OPENSSL} have multiple precision integer arithmetic routines but would not be ideal for this text for 
+reasons that will be explained in the following sub-sections.
+
+\subsection{Code Base}
+The LibTomMath code base is all portable ISO C source code.  This means that there are no platform dependent conditional
+segments of code littered throughout the source.  This clean and uncluttered approach to the library means that a
+developer can more readily discern the true intent of a given section of source code without trying to keep track of
+what conditional code will be used.
+
+The code base of LibTomMath is well organized.  Each function is in its own separate source code file 
+which allows the reader to find a given function very quickly.  On average there are $76$ lines of code per source
+file which makes the source very easy to follow.  By comparison MPI and LIP are single file projects making code tracing
+very hard.  GMP has many conditional code segments which also hinder tracing.  
+
+When compiled with GCC for the x86 processor and optimized for speed the entire library is approximately $100$KiB\footnote{The notation ``KiB'' means $2^{10}$ octets, similarly ``MiB'' means $2^{20}$ octets.}
+ which is fairly small compared to GMP (over $250$KiB).  LibTomMath is slightly larger than MPI (which compiles to about 
+$50$KiB) but LibTomMath is also much faster and more complete than MPI.
+
+\subsection{API Simplicity}
+LibTomMath is designed after the MPI library and shares the API design.  Quite often programs that use MPI will build 
+with LibTomMath without change. The function names correlate directly to the action they perform.  Almost all of the 
+functions share the same parameter passing convention.  The learning curve is fairly shallow with the API provided 
+which is an extremely valuable benefit for the student and developer alike.  
+
+The LIP library is an example of a library with an API that is awkward to work with.  LIP uses function names that are often ``compressed'' to 
+illegible short hand.  LibTomMath does not share this characteristic.  
+
+The GMP library also does not return error codes.  Instead it uses a POSIX.1 \cite{POSIX1} signal system where errors
+are signaled to the host application.  This happens to be the fastest approach but definitely not the most versatile.  In
+effect a math error (e.g. invalid input, heap error, etc) can cause a program to stop functioning which is definitely 
+undesirable in many situations.
+
+\subsection{Optimizations}
+While LibTomMath is certainly not the fastest library (GMP often beats LibTomMath by a factor of two) it does
+feature a set of optimal algorithms for tasks such as modular reduction, exponentiation, multiplication and squaring.  GMP 
+and LIP also feature such optimizations while MPI only uses baseline algorithms with no optimizations.  GMP lacks a few
+of the additional modular reduction optimizations that LibTomMath features\footnote{At the time of this writing GMP
+only had Barrett and Montgomery modular reduction algorithms.}.  
+
+LibTomMath is almost always an order of magnitude faster than the MPI library at computationally expensive tasks such as modular
+exponentiation.  In the grand scheme of ``bignum'' libraries LibTomMath is faster than the average library and usually  
+slower than the best libraries such as GMP and OpenSSL by only a small factor.
+
+\subsection{Portability and Stability}
+LibTomMath will build ``out of the box'' on any platform equipped with a modern version of the GNU C Compiler 
+(\textit{GCC}).  This means that without changes the library will build without configuration or setting up any 
+variables.  LIP and MPI will build ``out of the box'' as well but have numerous known bugs.  Most notably the author of 
+MPI has recently stopped working on his library and LIP has long since been discontinued.  
+
+GMP requires a configuration script to run and will not build out of the box.   GMP and LibTomMath are still in active
+development and are very stable across a variety of platforms.
+
+\subsection{Choice}
+LibTomMath is a relatively compact, well documented, highly optimized and portable library which seems only natural for
+the case study of this text.  Various source files from the LibTomMath project will be included within the text.  However, 
+the reader is encouraged to download their own copy of the library to actually be able to work with the library.  
+
+\chapter{Getting Started}
+\section{Library Basics}
+The trick to writing any useful library of source code is to build a solid foundation and work outwards from it.  First, 
+a problem along with allowable solution parameters should be identified and analyzed.  In this particular case the 
+inability to accommodate multiple precision integers is the problem.  Furthermore, the solution must be written
+as portable source code that is reasonably efficient across several different computer platforms.
+
+After a foundation is formed the remainder of the library can be designed and implemented in a hierarchical fashion.  
+That is, to implement the lowest level dependencies first and work towards the most abstract functions last.  For example, 
+before implementing a modular exponentiation algorithm one would implement a modular reduction algorithm.
+By building outwards from a base foundation instead of using a parallel design methodology the resulting project is 
+highly modular.  Being highly modular is a desirable property of any project as it often means the resulting product
+has a small footprint and updates are easy to perform.  
+
+Usually when I start a project I will begin with the header files.  I define the data types I think I will need and 
+prototype the initial functions that are not dependent on other functions (within the library).  After I 
+implement these base functions I prototype more dependent functions and implement them.   The process repeats until
+I implement all of the functions I require.  For example, in the case of LibTomMath I implemented functions such as 
+mp\_init() well before I implemented mp\_mul() and even further before I implemented mp\_exptmod().  As an example as to 
+why this design works note that the Karatsuba and Toom-Cook multipliers were written \textit{after} the 
+dependent function mp\_exptmod() was written.  Adding the new multiplication algorithms did not require changes to the 
+mp\_exptmod() function itself and lowered the total cost of ownership (\textit{so to speak}) and of development 
+for new algorithms.  This methodology allows new algorithms to be tested in a complete framework with relative ease.
+
+\begin{center}
+\begin{figure}[here]
+\includegraphics{pics/design_process.ps}
+\caption{Design Flow of the First Few Original LibTomMath Functions.}
+\label{pic:design_process}
+\end{figure}
+\end{center}
+
+Only after the majority of the functions were in place did I pursue a less hierarchical approach to auditing and optimizing
+the source code.  For example, one day I may audit the multipliers and the next day the polynomial basis functions.  
+
+It only makes sense to begin the text with the preliminary data types and support algorithms required as well.  
+This chapter discusses the core algorithms of the library which are the dependencies of every other algorithm.
+
+\section{What is a Multiple Precision Integer?}
+Recall that most programming languages, in particular ISO C \cite{ISOC}, only have fixed precision data types that on their own cannot 
+be used to represent values larger than their precision will allow. The purpose of multiple precision algorithms is 
+to use fixed precision data types to create and manipulate multiple precision integers which may represent values 
+that are very large.  
+
+As a well known analogy, school children are taught how to form numbers larger than nine by prepending more radix ten digits.  In the decimal system
+the largest single digit value is $9$.  However, by concatenating digits together larger numbers may be represented.  Newly prepended digits 
+(\textit{to the left}) are said to be in a different power of ten column.  That is, the number $123$ can be described as having a $1$ in the hundreds 
+column, $2$ in the tens column and $3$ in the ones column.  Or more formally $123 = 1 \cdot 10^2 + 2 \cdot 10^1 + 3 \cdot 10^0$.  Computer based 
+multiple precision arithmetic is essentially the same concept.  Larger integers are represented by adjoining fixed 
+precision computer words with the exception that a different radix is used.
+
+What most people probably do not think about explicitly are the various other attributes that describe a multiple precision 
+integer.  For example, the integer $154_{10}$ has two immediately obvious properties.  First, the integer is positive, 
+that is the sign of this particular integer is positive as opposed to negative.  Second, the integer has three digits in 
+its representation.  There is an additional property that the integer possesses that does not concern pencil-and-paper 
+arithmetic.  The third property is how many digit placeholders are available to hold the integer.  
+
+The human analogy of this third property is ensuring there is enough space on the paper to write the integer.  For example,
+if one starts writing a large number too far to the right on a piece of paper they will have to erase it and move left.  
+Similarly, computer algorithms must maintain strict control over memory usage to ensure that the digits of an integer
+will not exceed the allowed boundaries.  These three properties make up what is known as a multiple precision 
+integer or mp\_int for short.  
+
+\subsection{The mp\_int Structure}
+\label{sec:MPINT}
+The mp\_int structure is the ISO C based manifestation of what represents a multiple precision integer.  The ISO C standard does not provide for 
+any such data type but it does provide for making composite data types known as structures.  The following is the structure definition 
+used within LibTomMath.
+
+\index{mp\_int}
+\begin{figure}[here]
+\begin{center}
+\begin{small}
+%\begin{verbatim}
+\begin{tabular}{|l|}
+\hline
+typedef struct \{ \\
+\hspace{3mm}int used, alloc, sign;\\
+\hspace{3mm}mp\_digit *dp;\\
+\} \textbf{mp\_int}; \\
+\hline
+\end{tabular}
+%\end{verbatim}
+\end{small}
+\caption{The mp\_int Structure}
+\label{fig:mpint}
+\end{center}
+\end{figure}
+
+The mp\_int structure (fig. \ref{fig:mpint}) can be broken down as follows.
+
+\begin{enumerate}
+\item The \textbf{used} parameter denotes how many digits of the array \textbf{dp} contain the digits used to represent
+a given integer.  The \textbf{used} count must be positive (or zero) and may not exceed the \textbf{alloc} count.  
+
+\item The \textbf{alloc} parameter denotes how 
+many digits are available in the array to use by functions before it has to increase in size.  When the \textbf{used} count 
+of a result would exceed the \textbf{alloc} count all of the algorithms will automatically increase the size of the 
+array to accommodate the precision of the result.  
+
+\item The pointer \textbf{dp} points to a dynamically allocated array of digits that represent the given multiple 
+precision integer.  It is padded with $(\textbf{alloc} - \textbf{used})$ zero digits.  The array is maintained in a least 
+significant digit order.  As a pencil and paper analogy the array is organized such that the right most digits are stored
+first starting at the location indexed by zero\footnote{In C all arrays begin at zero.} in the array.  For example, 
+if \textbf{dp} contains $\lbrace a, b, c, \ldots \rbrace$ where \textbf{dp}$_0 = a$, \textbf{dp}$_1 = b$, \textbf{dp}$_2 = c$, $\ldots$ then 
+it would represent the integer $a + b\beta + c\beta^2 + \ldots$  
+
+\index{MP\_ZPOS} \index{MP\_NEG}
+\item The \textbf{sign} parameter denotes the sign as either zero/positive (\textbf{MP\_ZPOS}) or negative (\textbf{MP\_NEG}).  
+\end{enumerate}
+
+\subsubsection{Valid mp\_int Structures}
+Several rules are placed on the state of an mp\_int structure and are assumed to be followed for reasons of efficiency.  
+The only exceptions are when the structure is passed to initialization functions such as mp\_init() and mp\_init\_copy().
+
+\begin{enumerate}
+\item The value of \textbf{alloc} may not be less than one.  That is \textbf{dp} always points to a previously allocated
+array of digits.
+\item The value of \textbf{used} may not exceed \textbf{alloc} and must be greater than or equal to zero.
+\item The value of \textbf{used} implies the digit at index $(used - 1)$ of the \textbf{dp} array is non-zero.  That is, 
+leading zero digits in the most significant positions must be trimmed.
+   \begin{enumerate}
+   \item Digits in the \textbf{dp} array at and above the \textbf{used} location must be zero.
+   \end{enumerate}
+\item The value of \textbf{sign} must be \textbf{MP\_ZPOS} if \textbf{used} is zero; 
+this represents the mp\_int value of zero.
+\end{enumerate}
+
+\section{Argument Passing}
+A convention of argument passing must be adopted early on in the development of any library.  Making the function 
+prototypes consistent will help eliminate many headaches in the future as the library grows to significant complexity.  
+In LibTomMath the multiple precision integer functions accept parameters from left to right as pointers to mp\_int 
+structures.  That means that the source (input) operands are placed on the left and the destination (output) on the right.   
+Consider the following examples.
+
+\begin{verbatim}
+   mp_mul(&a, &b, &c);   /* c = a * b */
+   mp_add(&a, &b, &a);   /* a = a + b */
+   mp_sqr(&a, &b);       /* b = a * a */
+\end{verbatim}
+
+The left to right order is a fairly natural way to implement the functions since it lets the developer read aloud the
+functions and make sense of them.  For example, the first function would read ``multiply a and b and store in c''.
+
+Certain libraries (\textit{LIP by Lenstra for instance}) accept parameters the other way around, to mimic the order
+of assignment expressions.  That is, the destination (output) is on the left and arguments (inputs) are on the right.  In 
+truth, it is entirely a matter of preference.  In the case of LibTomMath the convention from the MPI library has been 
+adopted.  
+
+Another very useful design consideration, provided for in LibTomMath, is whether to allow argument sources to also be a 
+destination.  For example, the second example (\textit{mp\_add}) adds $a$ to $b$ and stores in $a$.  This is an important 
+feature to implement since it allows the calling functions to cut down on the number of variables it must maintain.  
+However, to implement this feature specific care has to be given to ensure the destination is not modified before the 
+source is fully read.
+
+\section{Return Values}
+A well implemented application, no matter what its purpose, should trap as many runtime errors as possible and return them 
+to the caller.  By catching runtime errors a library can be guaranteed to prevent undefined behaviour.  However, the end 
+developer can still manage to cause a library to crash.  For example, by passing an invalid pointer an application may
+fault by dereferencing memory not owned by the application.
+
+In the case of LibTomMath the only errors that are checked for are related to inappropriate inputs (division by zero for 
+instance) and memory allocation errors.  It will not check that the mp\_int passed to any function is valid nor 
+will it check pointers for validity.  Any function that can cause a runtime error will return an error code as an 
+\textbf{int} data type with one of the following values (fig \ref{fig:errcodes}).
+
+\index{MP\_OKAY} \index{MP\_VAL} \index{MP\_MEM}
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{|l|l|}
+\hline \textbf{Value} & \textbf{Meaning} \\
+\hline \textbf{MP\_OKAY} & The function was successful \\
+\hline \textbf{MP\_VAL}  & One of the input value(s) was invalid \\
+\hline \textbf{MP\_MEM}  & The function ran out of heap memory \\
+\hline
+\end{tabular}
+\end{center}
+\caption{LibTomMath Error Codes}
+\label{fig:errcodes}
+\end{figure}
+
+When an error is detected within a function it should free any memory it allocated, often during the initialization of
+temporary mp\_ints, and return as soon as possible.  The goal is to leave the system in the same state it was when the 
+function was called.  Error checking with this style of API is fairly simple.
+
+\begin{verbatim}
+   int err;
+   if ((err = mp_add(&a, &b, &c)) != MP_OKAY) {
+      printf("Error: %s\n", mp_error_to_string(err));
+      exit(EXIT_FAILURE);
+   }
+\end{verbatim}
+
+The GMP \cite{GMP} library uses C style \textit{signals} to flag errors which is of questionable use.  Not all errors are fatal 
+and it was not deemed ideal by the author of LibTomMath to force developers to have signal handlers for such cases.
+
+\section{Initialization and Clearing}
+The logical starting point when actually writing multiple precision integer functions is the initialization and 
+clearing of the mp\_int structures.  These two algorithms will be used by the majority of the higher level algorithms.
+
+Given the basic mp\_int structure an initialization routine must first allocate memory to hold the digits of
+the integer.  Often it is optimal to allocate a sufficiently large pre-set number of digits even though
+the initial integer will represent zero.  If only a single digit were allocated quite a few subsequent re-allocations
+would occur when operations are performed on the integers.  There is a tradeoff between how many default digits to allocate
+and how many re-allocations are tolerable.  Obviously allocating an excessive amount of digits initially will waste 
+memory and become unmanageable.  
+
+If the memory for the digits has been successfully allocated then the rest of the members of the structure must
+be initialized.  Since the initial state of an mp\_int is to represent the zero integer, the allocated digits must be set
+to zero.  The \textbf{used} count is set to zero and \textbf{sign} is set to \textbf{MP\_ZPOS}.
+
+\subsection{Initializing an mp\_int}
+An mp\_int is said to be initialized if it is set to a valid, preferably default, state such that all of the members of the
+structure are set to valid values.  The mp\_init algorithm will perform such an action.
+
+\index{mp\_init}
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Allocate memory and initialize $a$ to a known valid mp\_int state.  \\
+\hline \\
+1.  Allocate memory for \textbf{MP\_PREC} digits. \\
+2.  If the allocation failed return(\textit{MP\_MEM}) \\
+3.  for $n$ from $0$ to $MP\_PREC - 1$ do  \\
+\hspace{3mm}3.1  $a_n \leftarrow 0$\\
+4.  $a.sign \leftarrow MP\_ZPOS$\\
+5.  $a.used \leftarrow 0$\\
+6.  $a.alloc \leftarrow MP\_PREC$\\
+7.  Return(\textit{MP\_OKAY})\\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_init}
+\end{figure}
+
+\textbf{Algorithm mp\_init.}
+The purpose of this function is to initialize an mp\_int structure so that the rest of the library can properly
+manipulate it.  It is assumed that the input may not have had any of its members previously initialized which is certainly
+a valid assumption if the input resides on the stack.  
+
+Before any of the members such as \textbf{sign}, \textbf{used} or \textbf{alloc} are initialized the memory for
+the digits is allocated.  If this fails the function returns before setting any of the other members.  The \textbf{MP\_PREC} 
+name represents a constant\footnote{Defined in the ``tommath.h'' header file within LibTomMath.} 
+used to dictate the minimum precision of newly initialized mp\_int integers.  Ideally, it is at least equal to the smallest
+precision number you'll be working with.
+
+Allocating a block of digits at first instead of a single digit has the benefit of lowering the number of usually slow
+heap operations later functions will have to perform in the future.  If \textbf{MP\_PREC} is set correctly the slack 
+memory and the number of heap operations will be trivial.
+
+Once the allocation has been made the digits have to be set to zero as well as the \textbf{used}, \textbf{sign} and
+\textbf{alloc} members initialized.  This ensures that the mp\_int will always represent the default state of zero regardless
+of the original condition of the input.
+
+\textbf{Remark.}
+This function introduces the idiosyncrasy that all iterative loops, commonly initiated with the ``for'' keyword, iterate incrementally
+when the ``to'' keyword is placed between two expressions.  For example, ``for $a$ from $b$ to $c$ do'' means that
+a subsequent expression (or body of expressions) is to be evaluated up to $c - b + 1$ times so long as $b \le c$.  In each
+iteration the variable $a$ is substituted for a new integer that lies inclusively between $b$ and $c$.  If $b > c$ occurred
+the loop would not iterate.  By contrast if the ``downto'' keyword were used in place of ``to'' the loop would iterate 
+decrementally.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_init.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* init a new mp_int */
+018   int mp_init (mp_int * a)
+019   \{
+020     int i;
+021   
+022     /* allocate memory required and clear it */
+023     a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * MP_PREC);
+024     if (a->dp == NULL) \{
+025       return MP_MEM;
+026     \}
+027   
+028     /* set the digits to zero */
+029     for (i = 0; i < MP_PREC; i++) \{
+030         a->dp[i] = 0;
+031     \}
+032   
+033     /* set the used to zero, allocated digits to the default precision
+034      * and sign to positive */
+035     a->used  = 0;
+036     a->alloc = MP_PREC;
+037     a->sign  = MP_ZPOS;
+038   
+039     return MP_OKAY;
+040   \}
+041   #endif
+\end{alltt}
+\end{small}
+
+One immediate observation of this initialization function is that it does not return a pointer to a mp\_int structure.  It 
+is assumed that the caller has already allocated memory for the mp\_int structure, typically on the application stack.  The 
+call to mp\_init() is used only to initialize the members of the structure to a known default state.  
+
+Here we see (line 23) the memory allocation is performed first.  This allows us to exit cleanly and quickly
+if there is an error.  If the allocation fails the routine will return \textbf{MP\_MEM} to the caller to indicate there
+was a memory error.  The function XMALLOC is what actually allocates the memory.  Technically XMALLOC is not a function
+but a macro defined in ``tommath.h''.  By default, XMALLOC will evaluate to malloc() which is the C library's built--in
+memory allocation routine.
+
+In order to assure the mp\_int is in a known state the digits must be set to zero.  On most platforms this could have been
+accomplished by using calloc() instead of malloc().  However,  to correctly initialize an integer type to a given value in a 
+portable fashion you have to actually assign the value.  The for loop (line 29) performs this required
+operation.
+
+After the memory has been successfully initialized the remainder of the members are initialized 
+(lines 35 through 37) to their respective default states.  At this point the algorithm has succeeded and
+a success code is returned to the calling function.  If this function returns \textbf{MP\_OKAY} it is safe to assume the 
+mp\_int structure has been properly initialized and is safe to use with other functions within the library.  
+
+\subsection{Clearing an mp\_int}
+When an mp\_int is no longer required by the application, the memory that has been allocated for its digits must be 
+returned to the application's memory pool with the mp\_clear algorithm.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_clear}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  The memory for $a$ shall be deallocated.  \\
+\hline \\
+1.  If $a$ has been previously freed then return(\textit{MP\_OKAY}). \\
+2.  for $n$ from 0 to $a.used - 1$ do \\
+\hspace{3mm}2.1  $a_n \leftarrow 0$ \\
+3.  Free the memory allocated for the digits of $a$. \\
+4.  $a.used \leftarrow 0$ \\
+5.  $a.alloc \leftarrow 0$ \\
+6.  $a.sign \leftarrow MP\_ZPOS$ \\
+7.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_clear}
+\end{figure}
+
+\textbf{Algorithm mp\_clear.}
+This algorithm accomplishes two goals.  First, it clears the digits and the other mp\_int members.  This ensures that 
+if a developer accidentally re-uses a cleared structure it is less likely to cause problems.  The second goal
+is to free the allocated memory.
+
+The logic behind the algorithm is extended by marking cleared mp\_int structures so that subsequent calls to this
+algorithm will not try to free the memory multiple times.  Cleared mp\_ints are detectable by having a pre-defined invalid 
+digit pointer \textbf{dp} setting.
+
+Once an mp\_int has been cleared the mp\_int structure is no longer in a valid state for any other algorithm
+with the exception of algorithms mp\_init, mp\_init\_copy, mp\_init\_size and mp\_clear.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_clear.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* clear one (frees)  */
+018   void
+019   mp_clear (mp_int * a)
+020   \{
+021     int i;
+022   
+023     /* only do anything if a hasn't been freed previously */
+024     if (a->dp != NULL) \{
+025       /* first zero the digits */
+026       for (i = 0; i < a->used; i++) \{
+027           a->dp[i] = 0;
+028       \}
+029   
+030       /* free ram */
+031       XFREE(a->dp);
+032   
+033       /* reset members to make debugging easier */
+034       a->dp    = NULL;
+035       a->alloc = a->used = 0;
+036       a->sign  = MP_ZPOS;
+037     \}
+038   \}
+039   #endif
+\end{alltt}
+\end{small}
+
+The algorithm only operates on the mp\_int if it hasn't been previously cleared.  The if statement (line 24)
+checks to see if the \textbf{dp} member is not \textbf{NULL}.  If the mp\_int is a valid mp\_int then \textbf{dp} cannot be
+\textbf{NULL} in which case the if statement will evaluate to true.
+
+The digits of the mp\_int are cleared by the for loop (line 26) which assigns a zero to every digit.  Similar to mp\_init()
+the digits are assigned zero instead of using block memory operations (such as memset()) since this is more portable.  
+
+The digits are deallocated off the heap via the XFREE macro.  Similar to XMALLOC the XFREE macro actually evaluates to
+a standard C library function.  In this case the free() function.  Since free() only deallocates the memory the pointer
+still has to be reset to \textbf{NULL} manually (line 34).  
+
+Now that the digits have been cleared and deallocated the other members are set to their final values (lines 35 and 36).
+
+\section{Maintenance Algorithms}
+
+The previous sections describe how to initialize and clear an mp\_int structure.  To further support operations
+that are to be performed on mp\_int structures (such as addition and multiplication) the dependent algorithms must be
+able to augment the precision of an mp\_int and 
+initialize mp\_ints with differing initial conditions.  
+
+These algorithms complete the set of low level algorithms required to work with mp\_int structures in the higher level
+algorithms such as addition, multiplication and modular exponentiation.
+
+\subsection{Augmenting an mp\_int's Precision}
+When storing a value in an mp\_int structure, a sufficient number of digits must be available to accommodate the entire 
+result of an operation without loss of precision.  Quite often the size of the array given by the \textbf{alloc} member 
+is large enough to simply increase the \textbf{used} digit count.  However, when the size of the array is too small it 
+must be re-sized appropriately to accommodate the result.  The mp\_grow algorithm will provide this functionality.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_grow}. \\
+\textbf{Input}.   An mp\_int $a$ and an integer $b$. \\
+\textbf{Output}.  $a$ is expanded to accommodate $b$ digits. \\
+\hline \\
+1.  if $a.alloc \ge b$ then return(\textit{MP\_OKAY}) \\
+2.  $u \leftarrow b\mbox{ (mod }MP\_PREC\mbox{)}$ \\
+3.  $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\
+4.  Re-allocate the array of digits $a$ to size $v$ \\
+5.  If the allocation failed then return(\textit{MP\_MEM}). \\
+6.  for n from a.alloc to $v - 1$ do  \\
+\hspace{+3mm}6.1  $a_n \leftarrow 0$ \\
+7.  $a.alloc \leftarrow v$ \\
+8.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_grow}
+\end{figure}
+
+\textbf{Algorithm mp\_grow.}
+It is ideal to prevent re-allocations from being performed if they are not required (step one).  This is useful to 
+prevent mp\_ints from growing excessively in code that erroneously calls mp\_grow.  
+
+The requested digit count is padded up to the next multiple of \textbf{MP\_PREC} plus an additional \textbf{MP\_PREC} (steps two and three).  
+This helps prevent many trivial reallocations that would grow an mp\_int by trivially small values.  
+
+It is assumed that the reallocation (step four) leaves the lower $a.alloc$ digits of the mp\_int intact.  This is much 
+akin to how the \textit{realloc} function from the standard C library works.  Since the newly allocated digits are 
+assumed to contain undefined values they are initially set to zero.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_grow.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* grow as required */
+018   int mp_grow (mp_int * a, int size)
+019   \{
+020     int     i;
+021     mp_digit *tmp;
+022   
+023     /* if the alloc size is smaller alloc more ram */
+024     if (a->alloc < size) \{
+025       /* ensure there are always at least MP_PREC digits extra on top */
+026       size += (MP_PREC * 2) - (size % MP_PREC);
+027   
+028       /* reallocate the array a->dp
+029        *
+030        * We store the return in a temporary variable
+031        * in case the operation failed we don't want
+032        * to overwrite the dp member of a.
+033        */
+034       tmp = OPT_CAST(mp_digit) XREALLOC (a->dp, sizeof (mp_digit) * size);
+035       if (tmp == NULL) \{
+036         /* reallocation failed but "a" is still valid [can be freed] */
+037         return MP_MEM;
+038       \}
+039   
+040       /* reallocation succeeded so set a->dp */
+041       a->dp = tmp;
+042   
+043       /* zero excess digits */
+044       i        = a->alloc;
+045       a->alloc = size;
+046       for (; i < a->alloc; i++) \{
+047         a->dp[i] = 0;
+048       \}
+049     \}
+050     return MP_OKAY;
+051   \}
+052   #endif
+\end{alltt}
+\end{small}
+
+A quick optimization is to first determine if a memory re-allocation is required at all.  The if statement (line 24) checks
+if the \textbf{alloc} member of the mp\_int is smaller than the requested digit count.  If the count is not larger than \textbf{alloc}
+the function skips the re-allocation part thus saving time.
+
+When a re-allocation is performed it is turned into an optimal request to save time in the future.  The requested digit count is
+padded upwards to the 2nd multiple of \textbf{MP\_PREC} larger than the requested count (line 26).  The XREALLOC function is used
+to re-allocate the memory.  As per the other functions XREALLOC is actually a macro which evaluates to realloc by default.  The realloc
+function leaves the base of the allocation intact which means the first \textbf{alloc} digits of the mp\_int are the same as before
+the re-allocation.  All that is left is to clear the newly allocated digits and return.
+
+Note that the re-allocation result is actually stored in a temporary pointer $tmp$.  This is to allow this function to return
+an error with a valid pointer.  Earlier releases of the library stored the result of XREALLOC into the mp\_int $a$.  That would
+result in a memory leak if XREALLOC ever failed.  
+
+\subsection{Initializing Variable Precision mp\_ints}
+Occasionally the number of digits required will be known in advance of an initialization, based on, for example, the size 
+of input mp\_ints to a given algorithm.  The purpose of algorithm mp\_init\_size is similar to mp\_init except that it 
+will allocate \textit{at least} a specified number of digits.  
+
+\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init\_size}. \\
+\textbf{Input}.   An mp\_int $a$ and the requested number of digits $b$. \\
+\textbf{Output}.  $a$ is initialized to hold at least $b$ digits. \\
+\hline \\
+1.  $u \leftarrow b \mbox{ (mod }MP\_PREC\mbox{)}$ \\
+2.  $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\
+3.  Allocate $v$ digits. \\
+4.  for $n$ from $0$ to $v - 1$ do \\
+\hspace{3mm}4.1  $a_n \leftarrow 0$ \\
+5.  $a.sign \leftarrow MP\_ZPOS$\\
+6.  $a.used \leftarrow 0$\\
+7.  $a.alloc \leftarrow v$\\
+8.  Return(\textit{MP\_OKAY})\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_init\_size}
+\end{figure}
+
+\textbf{Algorithm mp\_init\_size.}
+This algorithm will initialize an mp\_int structure $a$ like algorithm mp\_init with the exception that the number of 
+digits allocated can be controlled by the second input argument $b$.  The input size is padded upwards so it is a 
+multiple of \textbf{MP\_PREC} plus an additional \textbf{MP\_PREC} digits.  This padding is used to prevent trivial 
+allocations from becoming a bottleneck in the rest of the algorithms.
+
+Like algorithm mp\_init, the mp\_int structure is initialized to a default state representing the integer zero.  This 
+particular algorithm is useful if it is known ahead of time the approximate size of the input.  If the approximation is
+correct no further memory re-allocations are required to work with the mp\_int.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_init\_size.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* init an mp_init for a given size */
+018   int mp_init_size (mp_int * a, int size)
+019   \{
+020     int x;
+021   
+022     /* pad size so there are always extra digits */
+023     size += (MP_PREC * 2) - (size % MP_PREC);    
+024     
+025     /* alloc mem */
+026     a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * size);
+027     if (a->dp == NULL) \{
+028       return MP_MEM;
+029     \}
+030   
+031     /* set the members */
+032     a->used  = 0;
+033     a->alloc = size;
+034     a->sign  = MP_ZPOS;
+035   
+036     /* zero the digits */
+037     for (x = 0; x < size; x++) \{
+038         a->dp[x] = 0;
+039     \}
+040   
+041     return MP_OKAY;
+042   \}
+043   #endif
+\end{alltt}
+\end{small}
+
+The number of digits $b$ requested is padded (line 23) by first augmenting it to the next multiple of 
+\textbf{MP\_PREC} and then adding \textbf{MP\_PREC} to the result.  If the memory can be successfully allocated the 
+mp\_int is placed in a default state representing the integer zero.  Otherwise, the error code \textbf{MP\_MEM} will be 
+returned (line 28).  
+
+The digits are allocated with the XMALLOC macro (line 26) and are subsequently set to zero by the for loop (line 37).  The 
+\textbf{used} count is set to zero, the \textbf{alloc} count set to the padded digit count and the \textbf{sign} flag set 
+to \textbf{MP\_ZPOS} to achieve a default valid mp\_int state (lines 32, 33 and 34).  If the function 
+returns successfully then it is correct to assume that the mp\_int structure is in a valid state for the remainder of the 
+functions to work with.
+
+\subsection{Multiple Integer Initializations and Clearings}
+Occasionally a function will require a series of mp\_int data types to be made available simultaneously.  
+The purpose of algorithm mp\_init\_multi is to initialize a variable length array of mp\_int structures in a single
+statement.  It is essentially a shortcut to multiple initializations.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init\_multi}. \\
+\textbf{Input}.   Variable length array $V_k$ of mp\_int variables of length $k$. \\
+\textbf{Output}.  The array is initialized such that each mp\_int of $V_k$ is ready to use. \\
+\hline \\
+1.  for $n$ from 0 to $k - 1$ do \\
+\hspace{+3mm}1.1.  Initialize the mp\_int $V_n$ (\textit{mp\_init}) \\
+\hspace{+3mm}1.2.  If initialization failed then do \\
+\hspace{+6mm}1.2.1.  for $j$ from $0$ to $n - 1$ do \\
+\hspace{+9mm}1.2.1.1.  Free the mp\_int $V_j$ (\textit{mp\_clear}) \\
+\hspace{+6mm}1.2.2.   Return(\textit{MP\_MEM}) \\
+2.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_init\_multi}
+\end{figure}
+
+\textbf{Algorithm mp\_init\_multi.}
+The algorithm will initialize the array of mp\_int variables one at a time.  If a runtime error has been detected 
+(\textit{step 1.2}) all of the previously initialized variables are cleared.  The goal is an ``all or nothing'' 
+initialization which allows for quick recovery from runtime errors.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_init\_multi.c
+\vspace{-3mm}
+\begin{alltt}
+016   #include <stdarg.h>
+017   
+018   int mp_init_multi(mp_int *mp, ...) 
+019   \{
+020       mp_err res = MP_OKAY;      /* Assume ok until proven otherwise */
+021       int n = 0;                 /* Number of ok inits */
+022       mp_int* cur_arg = mp;
+023       va_list args;
+024   
+025       va_start(args, mp);        /* init args to next argument from caller */
+026       while (cur_arg != NULL) \{
+027           if (mp_init(cur_arg) != MP_OKAY) \{
+028               /* Oops - error! Back-track and mp_clear what we already
+029                  succeeded in init-ing, then return error.
+030               */
+031               va_list clean_args;
+032               
+033               /* end the current list */
+034               va_end(args);
+035               
+036               /* now start cleaning up */            
+037               cur_arg = mp;
+038               va_start(clean_args, mp);
+039               while (n--) \{
+040                   mp_clear(cur_arg);
+041                   cur_arg = va_arg(clean_args, mp_int*);
+042               \}
+043               va_end(clean_args);
+044               res = MP_MEM;
+045               break;
+046           \}
+047           n++;
+048           cur_arg = va_arg(args, mp_int*);
+049       \}
+050       va_end(args);
+051       return res;                /* Assumed ok, if error flagged above. */
+052   \}
+053   
+054   #endif
+\end{alltt}
+\end{small}
+
+This function initializes a variable length list of mp\_int structure pointers.  However, instead of having the mp\_int
+structures in an actual C array they are simply passed as arguments to the function.  This function makes use of the 
+``...'' argument syntax of the C programming language.  The list is terminated with a final \textbf{NULL} argument 
+appended on the right.  
+
+The function uses the ``stdarg.h'' \textit{va} functions to step portably through the arguments to the function.  A count
+$n$ of successfully initialized mp\_int structures is maintained (line 47) such that if a failure does occur,
+the algorithm can backtrack and free the previously initialized structures (lines 27 to 46).  
+
+
+\subsection{Clamping Excess Digits}
+When a function anticipates a result will be $n$ digits it is simpler to assume this is true within the body of 
+the function instead of checking during the computation.  For example, a multiplication of a $i$ digit number by a 
+$j$ digit produces a result of at most $i + j$ digits.  It is entirely possible that the result is $i + j - 1$ 
+though, with no final carry into the last position.  However, suppose the destination had to be first expanded 
+(\textit{via mp\_grow}) to accommodate $i + j - 1$ digits then further expanded to accommodate the final carry.  
+That would be a considerable waste of time since heap operations are relatively slow.
+
+The ideal solution is to always assume the result is $i + j$ and fix up the \textbf{used} count after the function
+terminates.  This way a single heap operation (\textit{at most}) is required.  However, if the result was not checked
+there would be an excess high order zero digit.  
+
+For example, suppose the product of two integers was $x_n = (0x_{n-1}x_{n-2}...x_0)_{\beta}$.  The leading zero digit 
+will not contribute to the precision of the result.  In fact, through subsequent operations more leading zero digits would
+accumulate to the point the size of the integer would be prohibitive.  As a result even though the precision is very 
+low the representation is excessively large.  
+
+The mp\_clamp algorithm is designed to solve this very problem.  It will trim high-order zeros by decrementing the 
+\textbf{used} count until a non-zero most significant digit is found.  Also in this system, zero is considered to be a 
+positive number which means that if the \textbf{used} count is decremented to zero, the sign must be set to 
+\textbf{MP\_ZPOS}.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_clamp}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Any excess leading zero digits of $a$ are removed \\
+\hline \\
+1.  while $a.used > 0$ and $a_{a.used - 1} = 0$ do \\
+\hspace{+3mm}1.1  $a.used \leftarrow a.used - 1$ \\
+2.  if $a.used = 0$ then do \\
+\hspace{+3mm}2.1  $a.sign \leftarrow MP\_ZPOS$ \\
+\hline \\
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_clamp}
+\end{figure}
+
+\textbf{Algorithm mp\_clamp.}
+As can be expected this algorithm is very simple.  The loop on step one is expected to iterate only once or twice at
+the most.  For example, this will happen in cases where there is not a carry to fill the last position.  Step two fixes the sign for 
+when all of the digits are zero to ensure that the mp\_int is valid at all times.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_clamp.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* trim unused digits 
+018    *
+019    * This is used to ensure that leading zero digits are
+020    * trimed and the leading "used" digit will be non-zero
+021    * Typically very fast.  Also fixes the sign if there
+022    * are no more leading digits
+023    */
+024   void
+025   mp_clamp (mp_int * a)
+026   \{
+027     /* decrease used while the most significant digit is
+028      * zero.
+029      */
+030     while (a->used > 0 && a->dp[a->used - 1] == 0) \{
+031       --(a->used);
+032     \}
+033   
+034     /* reset the sign flag if used == 0 */
+035     if (a->used == 0) \{
+036       a->sign = MP_ZPOS;
+037     \}
+038   \}
+039   #endif
+\end{alltt}
+\end{small}
+
+Note on line 30 how the test for the \textbf{used} count is made on the left of the \&\& operator.  In the C programming
+language the terms to \&\& are evaluated left to right with a boolean short-circuit if any condition fails.  This is 
+important since if the \textbf{used} count is zero the test on the right would fetch below the array.  That is obviously 
+undesirable.  The parentheses on line 31 are used to make sure the \textbf{used} count is decremented and not
+the pointer ``a''.  
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 1 \right ]$ & Discuss the relevance of the \textbf{used} member of the mp\_int structure. \\
+                     & \\
+$\left [ 1 \right ]$ & Discuss the consequences of not using padding when performing allocations.  \\
+                     & \\
+$\left [ 2 \right ]$ & Estimate an ideal value for \textbf{MP\_PREC} when performing 1024-bit RSA \\
+                     & encryption when $\beta = 2^{28}$.  \\
+                     & \\
+$\left [ 1 \right ]$ & Discuss the relevance of the algorithm mp\_clamp.  What does it prevent? \\
+                     & \\
+$\left [ 1 \right ]$ & Give an example of when the algorithm  mp\_init\_copy might be useful. \\
+                     & \\
+\end{tabular}
+
+
+%%%
+% CHAPTER FOUR
+%%%
+
+\chapter{Basic Operations}
+
+\section{Introduction}
+In the previous chapter a series of low level algorithms were established that dealt with initializing and maintaining
+mp\_int structures.  This chapter will discuss another set of seemingly non-algebraic algorithms which will form the low 
+level basis of the entire library.  While these algorithms are relatively trivial it is important to understand how they
+work before proceeding since these algorithms will be used almost intrinsically in the following chapters.
+
+The algorithms in this chapter deal primarily with more ``programmer'' related tasks such as creating copies of
+mp\_int structures, assigning small values to mp\_int structures and comparisons of the values mp\_int structures
+represent.   
+
+\section{Assigning Values to mp\_int Structures}
+\subsection{Copying an mp\_int}
+Assigning the value that a given mp\_int structure represents to another mp\_int structure shall be known as making
+a copy for the purposes of this text.  The copy of the mp\_int will be a separate entity that represents the same
+value as the mp\_int it was copied from.  The mp\_copy algorithm provides this functionality. 
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_copy}. \\
+\textbf{Input}.  An mp\_int $a$ and $b$. \\
+\textbf{Output}.  Store a copy of $a$ in $b$. \\
+\hline \\
+1.  If $b.alloc < a.used$ then grow $b$ to $a.used$ digits.  (\textit{mp\_grow}) \\
+2.  for $n$ from 0 to $a.used - 1$ do \\
+\hspace{3mm}2.1  $b_{n} \leftarrow a_{n}$ \\
+3.  for $n$ from $a.used$ to $b.used - 1$ do \\
+\hspace{3mm}3.1  $b_{n} \leftarrow 0$ \\
+4.  $b.used \leftarrow a.used$ \\
+5.  $b.sign \leftarrow a.sign$ \\
+6.  return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_copy}
+\end{figure}
+
+\textbf{Algorithm mp\_copy.}
+This algorithm copies the mp\_int $a$ such that upon successful termination of the algorithm the mp\_int $b$ will
+represent the same integer as the mp\_int $a$.  The mp\_int $b$ shall be a complete and distinct copy of the 
+mp\_int $a$ meaning that the mp\_int $a$ can be modified and it shall not affect the value of the mp\_int $b$.
+
+If $b$ does not have enough room for the digits of $a$ it must first have its precision augmented via the mp\_grow 
+algorithm.  The digits of $a$ are copied over the digits of $b$ and any excess digits of $b$ are set to zero (step two
+and three).  The \textbf{used} and \textbf{sign} members of $a$ are finally copied over the respective members of
+$b$.
+
+\textbf{Remark.}  This algorithm also introduces a new idiosyncrasy that will be used throughout the rest of the
+text.  The error return codes of other algorithms are not explicitly checked in the pseudo-code presented.  For example, in 
+step one of the mp\_copy algorithm the return of mp\_grow is not explicitly checked to ensure it succeeded.  Text space is 
+limited so it is assumed that if an algorithm fails it will clear all temporarily allocated mp\_ints and return
+the error code itself.  However, the C code presented will demonstrate all of the error handling logic required to 
+implement the pseudo-code.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_copy.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* copy, b = a */
+018   int
+019   mp_copy (mp_int * a, mp_int * b)
+020   \{
+021     int     res, n;
+022   
+023     /* if dst == src do nothing */
+024     if (a == b) \{
+025       return MP_OKAY;
+026     \}
+027   
+028     /* grow dest */
+029     if (b->alloc < a->used) \{
+030        if ((res = mp_grow (b, a->used)) != MP_OKAY) \{
+031           return res;
+032        \}
+033     \}
+034   
+035     /* zero b and copy the parameters over */
+036     \{
+037       register mp_digit *tmpa, *tmpb;
+038   
+039       /* pointer aliases */
+040   
+041       /* source */
+042       tmpa = a->dp;
+043   
+044       /* destination */
+045       tmpb = b->dp;
+046   
+047       /* copy all the digits */
+048       for (n = 0; n < a->used; n++) \{
+049         *tmpb++ = *tmpa++;
+050       \}
+051   
+052       /* clear high digits */
+053       for (; n < b->used; n++) \{
+054         *tmpb++ = 0;
+055       \}
+056     \}
+057   
+058     /* copy used count and sign */
+059     b->used = a->used;
+060     b->sign = a->sign;
+061     return MP_OKAY;
+062   \}
+063   #endif
+\end{alltt}
+\end{small}
+
+Occasionally a dependent algorithm may copy an mp\_int effectively into itself such as when the input and output
+mp\_int structures passed to a function are one and the same.  For this case it is optimal to return immediately without 
+copying digits (line 24).  
+
+The mp\_int $b$ must have enough digits to accommodate the used digits of the mp\_int $a$.  If $b.alloc$ is less than
+$a.used$ the algorithm mp\_grow is used to augment the precision of $b$ (lines 29 to 33).  In order to
+simplify the inner loop that copies the digits from $a$ to $b$, two aliases $tmpa$ and $tmpb$ point directly at the digits
+of the mp\_ints $a$ and $b$ respectively.  These aliases (lines 42 and 45) allow the compiler to access the digits without first dereferencing the
+mp\_int pointers and then subsequently the pointer to the digits.  
+
+After the aliases are established the digits from $a$ are copied into $b$ (lines 48 to 50) and then the excess 
+digits of $b$ are set to zero (lines 53 to 55).  Both ``for'' loops make use of the pointer aliases and in 
+fact the alias for $b$ is carried through into the second ``for'' loop to clear the excess digits.  This optimization 
+allows the alias to stay in a machine register fairly easily between the two loops.
+
+\textbf{Remarks.}  The use of pointer aliases is an implementation methodology first introduced in this function that will
+be used considerably in other functions.  Technically, a pointer alias is simply a short hand alias used to lower the 
+number of pointer dereferencing operations required to access data.  For example, a for loop may resemble
+
+\begin{alltt}
+for (x = 0; x < 100; x++) \{
+    a->num[4]->dp[x] = 0;
+\}
+\end{alltt}
+
+This could be re-written using aliases as 
+
+\begin{alltt}
+mp_digit *tmpa;
+tmpa = a->num[4]->dp;
+for (x = 0; x < 100; x++) \{
+    *tmpa++ = 0;
+\}
+\end{alltt}
+
+In this case an alias is used to access the 
+array of digits within an mp\_int structure directly.  It may seem that a pointer alias is strictly not required 
+as a compiler may optimize out the redundant pointer operations.  However, there are two dominant reasons to use aliases.
+
+The first reason is that most compilers will not effectively optimize pointer arithmetic.  For example, some optimizations 
+may work for the Microsoft Visual C++ compiler (MSVC) and not for the GNU C Compiler (GCC).  Also some optimizations may 
+work for GCC and not MSVC.  As such it is ideal to find a common ground for as many compilers as possible.  Pointer 
+aliases optimize the code considerably before the compiler even reads the source code which means the end compiled code 
+stands a better chance of being faster.
+
+The second reason is that pointer aliases often can make an algorithm simpler to read.  Consider the first ``for'' 
+loop of the function mp\_copy() re-written to not use pointer aliases.
+
+\begin{alltt}
+    /* copy all the digits */
+    for (n = 0; n < a->used; n++) \{
+      b->dp[n] = a->dp[n];
+    \}
+\end{alltt}
+
+Whether this code is harder to read depends strongly on the individual.  However, it is quantifiably slightly more 
+complicated as there are four variables within the statement instead of just two.
+
+\subsubsection{Nested Statements}
+Another commonly used technique in the source routines is that certain sections of code are nested.  This is used in
+particular with the pointer aliases to highlight code phases.  For example, a Comba multiplier (discussed in chapter six)
+will typically have three different phases.  First the temporaries are initialized, then the columns calculated and 
+finally the carries are propagated.  In this example the middle column production phase will typically be nested as it
+uses temporary variables and aliases the most.
+
+The nesting also simplifies the source code as variables that are nested are only valid for their scope.  As a result
+the various temporary variables required do not propagate into other sections of code.
+
+
+\subsection{Creating a Clone}
+Another common operation is to make a local temporary copy of an mp\_int argument.  To initialize an mp\_int 
+and then copy another existing mp\_int into the newly initialized mp\_int will be known as creating a clone.  This is 
+useful within functions that need to modify an argument but do not wish to actually modify the original copy.  The 
+mp\_init\_copy algorithm has been designed to help perform this task.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init\_copy}. \\
+\textbf{Input}.   An mp\_int $a$ and $b$\\
+\textbf{Output}.  $a$ is initialized to be a copy of $b$. \\
+\hline \\
+1.  Init $a$.  (\textit{mp\_init}) \\
+2.  Copy $b$ to $a$.  (\textit{mp\_copy}) \\
+3.  Return the status of the copy operation. \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_init\_copy}
+\end{figure}
+
+\textbf{Algorithm mp\_init\_copy.}
+This algorithm will initialize an mp\_int variable and copy another previously initialized mp\_int variable into it.  As 
+such this algorithm will perform two operations in one step.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_init\_copy.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* creates "a" then copies b into it */
+018   int mp_init_copy (mp_int * a, mp_int * b)
+019   \{
+020     int     res;
+021   
+022     if ((res = mp_init (a)) != MP_OKAY) \{
+023       return res;
+024     \}
+025     return mp_copy (b, a);
+026   \}
+027   #endif
+\end{alltt}
+\end{small}
+
+This will initialize \textbf{a} and make it a verbatim copy of the contents of \textbf{b}.  Note that 
+\textbf{a} will have its own memory allocated which means that \textbf{b} may be cleared after the call
+and \textbf{a} will be left intact.  
+
+\section{Zeroing an Integer}
+Resetting an mp\_int to the default state is a common step in many algorithms.  The mp\_zero algorithm will be the algorithm used to
+perform this task.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_zero}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Zero the contents of $a$ \\
+\hline \\
+1.  $a.used \leftarrow 0$ \\
+2.  $a.sign \leftarrow$ MP\_ZPOS \\
+3.  for $n$ from 0 to $a.alloc - 1$ do \\
+\hspace{3mm}3.1  $a_n \leftarrow 0$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_zero}
+\end{figure}
+
+\textbf{Algorithm mp\_zero.}
+This algorithm simply resets an mp\_int to the default state.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_zero.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* set to zero */
+018   void
+019   mp_zero (mp_int * a)
+020   \{
+021     a->sign = MP_ZPOS;
+022     a->used = 0;
+023     memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
+024   \}
+025   #endif
+\end{alltt}
+\end{small}
+
+After the function is completed, all of the digits are zeroed, the \textbf{used} count is zeroed and the 
+\textbf{sign} variable is set to \textbf{MP\_ZPOS}.
+
+\section{Sign Manipulation}
+\subsection{Absolute Value}
+With the mp\_int representation of an integer, calculating the absolute value is trivial.  The mp\_abs algorithm will compute
+the absolute value of an mp\_int.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_abs}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Computes $b = \vert a \vert$ \\
+\hline \\
+1.  Copy $a$ to $b$.  (\textit{mp\_copy}) \\
+2.  If the copy failed return(\textit{MP\_MEM}). \\
+3.  $b.sign \leftarrow MP\_ZPOS$ \\
+4.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_abs}
+\end{figure}
+
+\textbf{Algorithm mp\_abs.}
+This algorithm computes the absolute value of an mp\_int input.  First it copies $a$ over $b$.  This is an example of an
+algorithm where the check in mp\_copy that determines if the source and destination are equal proves useful.  This allows,
+for instance, the developer to pass the same mp\_int as the source and destination to this function without additional 
+logic to handle it.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_abs.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* b = |a| 
+018    *
+019    * Simple function copies the input and fixes the sign to positive
+020    */
+021   int
+022   mp_abs (mp_int * a, mp_int * b)
+023   \{
+024     int     res;
+025   
+026     /* copy a to b */
+027     if (a != b) \{
+028        if ((res = mp_copy (a, b)) != MP_OKAY) \{
+029          return res;
+030        \}
+031     \}
+032   
+033     /* force the sign of b to positive */
+034     b->sign = MP_ZPOS;
+035   
+036     return MP_OKAY;
+037   \}
+038   #endif
+\end{alltt}
+\end{small}
+
+\subsection{Integer Negation}
+With the mp\_int representation of an integer, calculating the negation is also trivial.  The mp\_neg algorithm will compute
+the negative of an mp\_int input.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_neg}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Computes $b = -a$ \\
+\hline \\
+1.  Copy $a$ to $b$.  (\textit{mp\_copy}) \\
+2.  If the copy failed return(\textit{MP\_MEM}). \\
+3.  If $a.used = 0$ then return(\textit{MP\_OKAY}). \\
+4.  If $a.sign = MP\_ZPOS$ then do \\
+\hspace{3mm}4.1  $b.sign = MP\_NEG$. \\
+5.  else do \\
+\hspace{3mm}5.1  $b.sign = MP\_ZPOS$. \\
+6.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_neg}
+\end{figure}
+
+\textbf{Algorithm mp\_neg.}
+This algorithm computes the negation of an input.  First it copies $a$ over $b$.  If $a$ has no used digits then
+the algorithm returns immediately.  Otherwise it flips the sign flag and stores the result in $b$.  Note that if 
+$a$ had no digits then it must be positive by definition.  Had step three been omitted then the algorithm would return
+zero as negative.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_neg.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* b = -a */
+018   int mp_neg (mp_int * a, mp_int * b)
+019   \{
+020     int     res;
+021     if ((res = mp_copy (a, b)) != MP_OKAY) \{
+022       return res;
+023     \}
+024     if (mp_iszero(b) != MP_YES) \{
+025        b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+026     \}
+027     return MP_OKAY;
+028   \}
+029   #endif
+\end{alltt}
+\end{small}
+
+\section{Small Constants}
+\subsection{Setting Small Constants}
+Often an mp\_int must be set to a relatively small value such as $1$ or $2$.  For these cases the mp\_set algorithm is useful.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_set}. \\
+\textbf{Input}.   An mp\_int $a$ and a digit $b$ \\
+\textbf{Output}.  Make $a$ equivalent to $b$ \\
+\hline \\
+1.  Zero $a$ (\textit{mp\_zero}). \\
+2.  $a_0 \leftarrow b \mbox{ (mod }\beta\mbox{)}$ \\
+3.  $a.used \leftarrow  \left \lbrace \begin{array}{ll}
+                              1 &  \mbox{if }a_0 > 0 \\
+                              0 &  \mbox{if }a_0 = 0 
+                              \end{array} \right .$ \\
+\hline                              
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_set}
+\end{figure}
+
+\textbf{Algorithm mp\_set.}
+This algorithm sets an mp\_int to a small single digit value.  Step number 1 ensures that the integer is reset to the default state.  The
+single digit is set (\textit{modulo $\beta$}) and the \textbf{used} count is adjusted accordingly.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_set.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* set to a digit */
+018   void mp_set (mp_int * a, mp_digit b)
+019   \{
+020     mp_zero (a);
+021     a->dp[0] = b & MP_MASK;
+022     a->used  = (a->dp[0] != 0) ? 1 : 0;
+023   \}
+024   #endif
+\end{alltt}
+\end{small}
+
+Line 20 calls mp\_zero() to clear the mp\_int and reset the sign.  Line 21 copies the digit 
+into the least significant location.  Note the usage of a new constant \textbf{MP\_MASK}.  This constant is used to quickly
+reduce an integer modulo $\beta$.  Since $\beta$ is of the form $2^k$ for any suitable $k$ it suffices to perform a binary AND with 
+$MP\_MASK = 2^k - 1$ to perform the reduction.  Finally line 22 will set the \textbf{used} member with respect to the 
+digit actually set. This function will always make the integer positive.
+
+One important limitation of this function is that it will only set one digit.  The size of a digit is not fixed, meaning source that uses 
+this function should take that into account.  Only trivially small constants can be set using this function.
+
+\subsection{Setting Large Constants}
+To overcome the limitations of the mp\_set algorithm the mp\_set\_int algorithm is ideal.  It accepts a ``long''
+data type as input and will always treat it as a 32-bit integer.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_set\_int}. \\
+\textbf{Input}.   An mp\_int $a$ and a ``long'' integer $b$ \\
+\textbf{Output}.  Make $a$ equivalent to $b$ \\
+\hline \\
+1.  Zero $a$ (\textit{mp\_zero}) \\
+2.  for $n$ from 0 to 7 do \\
+\hspace{3mm}2.1  $a \leftarrow a \cdot 16$ (\textit{mp\_mul2d}) \\
+\hspace{3mm}2.2  $u \leftarrow \lfloor b / 2^{4(7 - n)} \rfloor \mbox{ (mod }16\mbox{)}$\\
+\hspace{3mm}2.3  $a_0 \leftarrow a_0 + u$ \\
+\hspace{3mm}2.4  $a.used \leftarrow a.used + 1$ \\
+3.  Clamp excess used digits (\textit{mp\_clamp}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_set\_int}
+\end{figure}
+
+\textbf{Algorithm mp\_set\_int.}
+The algorithm performs eight iterations of a simple loop where in each iteration four bits from the source are added to the 
+mp\_int.  Step 2.1 will multiply the current result by sixteen making room for four more bits in the less significant positions.  In step 2.2 the
+next four bits from the source are extracted and are added to the mp\_int. The \textbf{used} digit count is 
+incremented to reflect the addition.  The \textbf{used} digit counter is incremented since if any of the leading digits were zero the mp\_int would have
+zero digits used and the newly added four bits would be ignored.
+
+Excess zero digits are trimmed in steps 2.1 and 3 by using higher level algorithms mp\_mul2d and mp\_clamp.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_set\_int.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* set a 32-bit const */
+018   int mp_set_int (mp_int * a, unsigned long b)
+019   \{
+020     int     x, res;
+021   
+022     mp_zero (a);
+023     
+024     /* set four bits at a time */
+025     for (x = 0; x < 8; x++) \{
+026       /* shift the number up four bits */
+027       if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) \{
+028         return res;
+029       \}
+030   
+031       /* OR in the top four bits of the source */
+032       a->dp[0] |= (b >> 28) & 15;
+033   
+034       /* shift the source up to the next four bits */
+035       b <<= 4;
+036   
+037       /* ensure that digits are not clamped off */
+038       a->used += 1;
+039     \}
+040     mp_clamp (a);
+041     return MP_OKAY;
+042   \}
+043   #endif
+\end{alltt}
+\end{small}
+
+This function sets four bits of the number at a time to handle all practical \textbf{DIGIT\_BIT} sizes.  The weird
+addition on line 38 ensures that the newly added in bits are added to the number of digits.  While it may not 
+seem obvious as to why the digit counter does not grow exceedingly large it is because of the shift on line 27 
+as well as the  call to mp\_clamp() on line 40.  Both functions will clamp excess leading digits which keeps 
+the number of used digits low.
+
+\section{Comparisons}
+\subsection{Unsigned Comparisons}
+Comparing a multiple precision integer is performed with the exact same algorithm used to compare two decimal numbers.  For example,
+to compare $1,234$ to $1,264$ the digits are extracted by their positions.  That is we compare $1 \cdot 10^3 + 2 \cdot 10^2 + 3 \cdot 10^1 + 4 \cdot 10^0$
+to $1 \cdot 10^3 + 2 \cdot 10^2 + 6 \cdot 10^1 + 4 \cdot 10^0$ by comparing single digits at a time starting with the highest magnitude 
+positions.  If any leading digit of one integer is greater than a digit in the same position of another integer then obviously it must be greater.  
+
+The first comparison routine that will be developed is the unsigned magnitude compare which will perform a comparison based on the digits of two
+mp\_int variables alone.  It will ignore the sign of the two inputs.  Such a function is useful when an absolute comparison is required or if the 
+signs are known to agree in advance.
+
+To facilitate working with the results of the comparison functions three constants are required.  
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{|r|l|}
+\hline \textbf{Constant} & \textbf{Meaning} \\
+\hline \textbf{MP\_GT} & Greater Than \\
+\hline \textbf{MP\_EQ} & Equal To \\
+\hline \textbf{MP\_LT} & Less Than \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Comparison Return Codes}
+\end{figure}
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_cmp\_mag}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$.  \\
+\textbf{Output}.  Unsigned comparison results ($a$ to the left of $b$). \\
+\hline \\
+1.  If $a.used > b.used$ then return(\textit{MP\_GT}) \\
+2.  If $a.used < b.used$ then return(\textit{MP\_LT}) \\
+3.  for n from $a.used - 1$ to 0 do \\
+\hspace{+3mm}3.1  if $a_n > b_n$ then return(\textit{MP\_GT}) \\
+\hspace{+3mm}3.2  if $a_n < b_n$ then return(\textit{MP\_LT}) \\
+4.  Return(\textit{MP\_EQ}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_cmp\_mag}
+\end{figure}
+
+\textbf{Algorithm mp\_cmp\_mag.}
+By saying ``$a$ to the left of $b$'' it is meant that the comparison is with respect to $a$, that is if $a$ is greater than $b$ it will return
+\textbf{MP\_GT} and similar with respect to when $a = b$ and $a < b$.  The first two steps compare the number of digits used in both $a$ and $b$.  
+Obviously if the digit counts differ there would be an imaginary zero digit in the smaller number where the leading digit of the larger number is.  
+If both have the same number of digits then the actual digits themselves must be compared starting at the leading digit.  
+
+By step three both inputs must have the same number of digits so it is safe to start from either $a.used - 1$ or $b.used - 1$ and count down to
+the zero'th digit.  If after all of the digits have been compared, no difference is found, the algorithm returns \textbf{MP\_EQ}.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_cmp\_mag.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* compare maginitude of two ints (unsigned) */
+018   int mp_cmp_mag (mp_int * a, mp_int * b)
+019   \{
+020     int     n;
+021     mp_digit *tmpa, *tmpb;
+022   
+023     /* compare based on # of non-zero digits */
+024     if (a->used > b->used) \{
+025       return MP_GT;
+026     \}
+027     
+028     if (a->used < b->used) \{
+029       return MP_LT;
+030     \}
+031   
+032     /* alias for a */
+033     tmpa = a->dp + (a->used - 1);
+034   
+035     /* alias for b */
+036     tmpb = b->dp + (a->used - 1);
+037   
+038     /* compare based on digits  */
+039     for (n = 0; n < a->used; ++n, --tmpa, --tmpb) \{
+040       if (*tmpa > *tmpb) \{
+041         return MP_GT;
+042       \}
+043   
+044       if (*tmpa < *tmpb) \{
+045         return MP_LT;
+046       \}
+047     \}
+048     return MP_EQ;
+049   \}
+050   #endif
+\end{alltt}
+\end{small}
+
+The two if statements on lines 24 and 28 compare the number of digits in the two inputs.  These two are performed before all of the digits
+are compared since it is a very cheap test to perform and can potentially save considerable time.  The implementation given is also not valid 
+without those two statements.  $b.alloc$ may be smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the 
+array of digits.
+
+\subsection{Signed Comparisons}
+Comparing with sign considerations is also fairly critical in several routines (\textit{division for example}).  Based on an unsigned magnitude 
+comparison a trivial signed comparison algorithm can be written.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_cmp}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$ \\
+\textbf{Output}.  Signed Comparison Results ($a$ to the left of $b$) \\
+\hline \\
+1.  if $a.sign = MP\_NEG$ and $b.sign = MP\_ZPOS$ then return(\textit{MP\_LT}) \\
+2.  if $a.sign = MP\_ZPOS$ and $b.sign = MP\_NEG$ then return(\textit{MP\_GT}) \\
+3.  if $a.sign = MP\_NEG$ then \\
+\hspace{+3mm}3.1  Return the unsigned comparison of $b$ and $a$ (\textit{mp\_cmp\_mag}) \\
+4.  Otherwise \\
+\hspace{+3mm}4.1  Return the unsigned comparison of $a$ and $b$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_cmp}
+\end{figure}
+
+\textbf{Algorithm mp\_cmp.}
+The first two steps compare the signs of the two inputs.  If the signs do not agree then it can return right away with the appropriate 
+comparison code.  When the signs are equal the digits of the inputs must be compared to determine the correct result.  In step 
+three the unsigned comparison flips the order of the arguments since they are both negative.  For instance, if $-a > -b$ then 
+$\vert a \vert < \vert b \vert$.  Step number four will compare the two when they are both positive.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_cmp.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* compare two ints (signed)*/
+018   int
+019   mp_cmp (mp_int * a, mp_int * b)
+020   \{
+021     /* compare based on sign */
+022     if (a->sign != b->sign) \{
+023        if (a->sign == MP_NEG) \{
+024           return MP_LT;
+025        \} else \{
+026           return MP_GT;
+027        \}
+028     \}
+029     
+030     /* compare digits */
+031     if (a->sign == MP_NEG) \{
+032        /* if negative compare opposite direction */
+033        return mp_cmp_mag(b, a);
+034     \} else \{
+035        return mp_cmp_mag(a, b);
+036     \}
+037   \}
+038   #endif
+\end{alltt}
+\end{small}
+
+The two if statements on lines 22 and 23 perform the initial sign comparison.  If the signs are not equal then whichever
+has the positive sign is larger.   At line 31, the inputs are compared based on magnitudes.  If the signs were both negative then 
+the unsigned comparison is performed in the opposite direction (\textit{line 33}).  Otherwise, the signs are assumed to 
+be both positive and a forward direction unsigned comparison is performed.
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 2 \right ]$ & Modify algorithm mp\_set\_int to accept as input a variable length array of bits. \\
+                     & \\
+$\left [ 3 \right ]$ & Give the probability that algorithm mp\_cmp\_mag will have to compare $k$ digits  \\
+                     & of two random integers (of equal magnitude) before a difference is found. \\
+                     & \\
+$\left [ 1 \right ]$ & Suggest a simple method to speed up the implementation of mp\_cmp\_mag based  \\
+                     & on the observations made in the previous problem. \\
+                     &
+\end{tabular}
+
+\chapter{Basic Arithmetic}
+\section{Introduction}
+At this point algorithms for initialization, clearing, zeroing, copying, comparing and setting small constants have been 
+established.  The next logical set of algorithms to develop are addition, subtraction and digit shifting algorithms.  These 
+algorithms make use of the lower level algorithms and are the crucial building blocks for the multiplication algorithms.  It is very important 
+that these algorithms are highly optimized.  On their own they are simple $O(n)$ algorithms but they can be called from higher level algorithms 
+which easily places them at $O(n^2)$ or even $O(n^3)$ work levels.  
+
+All of the algorithms within this chapter make use of the logical bit shift operations denoted by $<<$ and $>>$ for left and right 
+logical shifts respectively.  A logical shift is analogous to sliding the decimal point of radix-10 representations.  For example, the real 
+number $0.9345$ is equivalent to $93.45\%$ which is found by sliding the decimal point two places to the right (\textit{multiplying by $\beta^2 = 10^2$}).  
+Algebraically a binary logical shift is equivalent to a division or multiplication by a power of two.  
+For example, $a << k = a \cdot 2^k$ while $a >> k = \lfloor a/2^k \rfloor$.
+
+One significant difference between a logical shift and the way decimals are shifted is that digits below the zero'th position are removed
+from the number.  For example, consider $1101_2 >> 1$ using decimal notation this would produce $110.1_2$.  However, with a logical shift the 
+result is $110_2$.  
+
+\section{Addition and Subtraction}
+In common twos complement fixed precision arithmetic negative numbers are easily represented by subtraction from the modulus.  For example, with 32-bit integers
+$a - b\mbox{ (mod }2^{32}\mbox{)}$ is the same as $a + (2^{32} - b) \mbox{ (mod }2^{32}\mbox{)}$  since $2^{32} \equiv 0 \mbox{ (mod }2^{32}\mbox{)}$.  
+As a result subtraction can be performed with a trivial series of logical operations and an addition.
+
+However, in multiple precision arithmetic negative numbers are not represented in the same way.  Instead a sign flag is used to keep track of the
+sign of the integer.  As a result signed addition and subtraction are actually implemented as conditional usage of lower level addition or 
+subtraction algorithms with the sign fixed up appropriately.
+
+The lower level algorithms will add or subtract integers without regard to the sign flag.  That is they will add or subtract the magnitude of
+the integers respectively.
+
+\subsection{Low Level Addition}
+An unsigned addition of multiple precision integers is performed with the same long-hand algorithm used to add decimal numbers.  That is to add the 
+trailing digits first and propagate the resulting carry upwards.  Since this is a lower level algorithm the name will have a ``s\_'' prefix.  
+Historically that convention stems from the MPI library where ``s\_'' stood for static functions that were hidden from the developer entirely.
+
+\newpage
+\begin{figure}[!here]
+\begin{center}
+\begin{small}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_add}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$ \\
+\textbf{Output}.  The unsigned addition $c = \vert a \vert + \vert b \vert$. \\
+\hline \\
+1.  if $a.used > b.used$ then \\
+\hspace{+3mm}1.1  $min \leftarrow b.used$ \\
+\hspace{+3mm}1.2  $max \leftarrow a.used$ \\
+\hspace{+3mm}1.3  $x   \leftarrow a$ \\
+2.  else  \\
+\hspace{+3mm}2.1  $min \leftarrow a.used$ \\
+\hspace{+3mm}2.2  $max \leftarrow b.used$ \\
+\hspace{+3mm}2.3  $x   \leftarrow b$ \\
+3.  If $c.alloc < max + 1$ then grow $c$ to hold at least $max + 1$ digits (\textit{mp\_grow}) \\
+4.  $oldused \leftarrow c.used$ \\
+5.  $c.used \leftarrow max + 1$ \\
+6.  $u \leftarrow 0$ \\
+7.  for $n$ from $0$ to $min - 1$ do \\
+\hspace{+3mm}7.1  $c_n \leftarrow a_n + b_n + u$ \\
+\hspace{+3mm}7.2  $u \leftarrow c_n >> lg(\beta)$ \\
+\hspace{+3mm}7.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+8.  if $min \ne max$ then do \\
+\hspace{+3mm}8.1  for $n$ from $min$ to $max - 1$ do \\
+\hspace{+6mm}8.1.1  $c_n \leftarrow x_n + u$ \\
+\hspace{+6mm}8.1.2  $u \leftarrow c_n >> lg(\beta)$ \\
+\hspace{+6mm}8.1.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+9.  $c_{max} \leftarrow u$ \\
+10.  if $oldused > max$ then \\
+\hspace{+3mm}10.1  for $n$ from $max + 1$ to $oldused - 1$ do \\
+\hspace{+6mm}10.1.1  $c_n \leftarrow 0$ \\
+11.  Clamp excess digits in $c$.  (\textit{mp\_clamp}) \\
+12.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Algorithm s\_mp\_add}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_add.}
+This algorithm is loosely based on algorithm 14.7 of HAC \cite[pp. 594]{HAC} but has been extended to allow the inputs to have different magnitudes.  
+Coincidentally the description of algorithm A in Knuth \cite[pp. 266]{TAOCPV2} shares the same deficiency as the algorithm from \cite{HAC}.  Even the 
+MIX pseudo  machine code presented by Knuth \cite[pp. 266-267]{TAOCPV2} is incapable of handling inputs which are of different magnitudes.
+
+The first thing that has to be accomplished is to sort out which of the two inputs is the largest.  The addition logic
+will simply add all of the smallest input to the largest input and store that first part of the result in the
+destination.  Then it will apply a simpler addition loop to excess digits of the larger input.
+
+The first two steps will handle sorting the inputs such that $min$ and $max$ hold the digit counts of the two 
+inputs.  The variable $x$ will be an mp\_int alias for the largest input or the second input $b$ if they have the
+same number of digits.  After the inputs are sorted the destination $c$ is grown as required to accommodate the sum 
+of the two inputs.  The original \textbf{used} count of $c$ is copied and set to the new used count.  
+
+At this point the first addition loop will go through as many digit positions that both inputs have.  The carry
+variable $u$ is set to zero outside the loop.  Inside the loop an ``addition'' step requires three statements to produce
+one digit of the sum.  First
+two digits from $a$ and $b$ are added together along with the carry $u$.  The carry of this step is extracted and stored
+in $u$ and finally the digit of the result $c_n$ is truncated within the range $0 \le c_n < \beta$.
+
+Now all of the digit positions that both inputs have in common have been exhausted.  If $min \ne max$ then $x$ is an alias
+for one of the inputs that has more digits.  A simplified addition loop is then used to essentially copy the remaining digits
+and the carry to the destination.
+
+The final carry is stored in $c_{max}$ and digits above $max$ up to $oldused$ are zeroed which completes the addition.
+
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_add.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* low level addition, based on HAC pp.594, Algorithm 14.7 */
+018   int
+019   s_mp_add (mp_int * a, mp_int * b, mp_int * c)
+020   \{
+021     mp_int *x;
+022     int     olduse, res, min, max;
+023   
+024     /* find sizes, we let |a| <= |b| which means we have to sort
+025      * them.  "x" will point to the input with the most digits
+026      */
+027     if (a->used > b->used) \{
+028       min = b->used;
+029       max = a->used;
+030       x = a;
+031     \} else \{
+032       min = a->used;
+033       max = b->used;
+034       x = b;
+035     \}
+036   
+037     /* init result */
+038     if (c->alloc < max + 1) \{
+039       if ((res = mp_grow (c, max + 1)) != MP_OKAY) \{
+040         return res;
+041       \}
+042     \}
+043   
+044     /* get old used digit count and set new one */
+045     olduse = c->used;
+046     c->used = max + 1;
+047   
+048     \{
+049       register mp_digit u, *tmpa, *tmpb, *tmpc;
+050       register int i;
+051   
+052       /* alias for digit pointers */
+053   
+054       /* first input */
+055       tmpa = a->dp;
+056   
+057       /* second input */
+058       tmpb = b->dp;
+059   
+060       /* destination */
+061       tmpc = c->dp;
+062   
+063       /* zero the carry */
+064       u = 0;
+065       for (i = 0; i < min; i++) \{
+066         /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
+067         *tmpc = *tmpa++ + *tmpb++ + u;
+068   
+069         /* U = carry bit of T[i] */
+070         u = *tmpc >> ((mp_digit)DIGIT_BIT);
+071   
+072         /* take away carry bit from T[i] */
+073         *tmpc++ &= MP_MASK;
+074       \}
+075   
+076       /* now copy higher words if any, that is in A+B 
+077        * if A or B has more digits add those in 
+078        */
+079       if (min != max) \{
+080         for (; i < max; i++) \{
+081           /* T[i] = X[i] + U */
+082           *tmpc = x->dp[i] + u;
+083   
+084           /* U = carry bit of T[i] */
+085           u = *tmpc >> ((mp_digit)DIGIT_BIT);
+086   
+087           /* take away carry bit from T[i] */
+088           *tmpc++ &= MP_MASK;
+089         \}
+090       \}
+091   
+092       /* add carry */
+093       *tmpc++ = u;
+094   
+095       /* clear digits above oldused */
+096       for (i = c->used; i < olduse; i++) \{
+097         *tmpc++ = 0;
+098       \}
+099     \}
+100   
+101     mp_clamp (c);
+102     return MP_OKAY;
+103   \}
+104   #endif
+\end{alltt}
+\end{small}
+
+Lines 27 to 35 perform the initial sorting of the inputs and determine the $min$ and $max$ variables.  Note that $x$ is a pointer to a 
+mp\_int assigned to the largest input, in effect it is a local alias.  Lines 37 to 42 ensure that the destination is grown to 
+accommodate the result of the addition. 
+
+Similar to the implementation of mp\_copy this function uses the braced code and local aliases coding style.  The three aliases that are on 
+lines 55, 58 and 61 represent the two inputs and destination variables respectively.  These aliases are used to ensure the
+compiler does not have to dereference $a$, $b$ or $c$ (respectively) to access the digits of the respective mp\_int.
+
+The initial carry $u$ is cleared on line 64, note that $u$ is of type mp\_digit which ensures type compatibility within the 
+implementation.  The initial addition loop begins on line 65 and ends on line 74.  Similarly the conditional addition loop
+begins on line 80 and ends on line 90.  The addition is finished with the final carry being stored in $tmpc$ on line 93.  
+Note the ``++'' operator on the same line.  After line 93 $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$.  This is useful
+for the next loop on lines 96 to 99 which set any old upper digits to zero.
+
+\subsection{Low Level Subtraction}
+The low level unsigned subtraction algorithm is very similar to the low level unsigned addition algorithm.  The principal difference is that the
+unsigned subtraction algorithm requires the result to be positive.  That is when computing $a - b$ the condition $\vert a \vert \ge \vert b\vert$ must 
+be met for this algorithm to function properly.  Keep in mind this low level algorithm is not meant to be used in higher level algorithms directly.  
+This algorithm as will be shown can be used to create functional signed addition and subtraction algorithms.
+
+
+For this algorithm a new variable is required to make the description simpler.  Recall from section 1.3.1 that a mp\_digit must be able to represent
+the range $0 \le x < 2\beta$ for the algorithms to work correctly.  However, it is allowable that a mp\_digit represent a larger range of values.  For 
+this algorithm we will assume that the variable $\gamma$ represents the number of bits available in a 
+mp\_digit (\textit{this implies $2^{\gamma} > \beta$}).  
+
+For example, the default for LibTomMath is to use a ``unsigned long'' for the mp\_digit ``type'' while $\beta = 2^{28}$.  In ISO C an ``unsigned long''
+data type must be able to represent $0 \le x < 2^{32}$ meaning that in this case $\gamma = 32$.
+
+\newpage\begin{figure}[!here]
+\begin{center}
+\begin{small}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_sub}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$ ($\vert a \vert \ge \vert b \vert$) \\
+\textbf{Output}.  The unsigned subtraction $c = \vert a \vert - \vert b \vert$. \\
+\hline \\
+1.  $min \leftarrow b.used$ \\
+2.  $max \leftarrow a.used$ \\
+3.  If $c.alloc < max$ then grow $c$ to hold at least $max$ digits.  (\textit{mp\_grow}) \\
+4.  $oldused \leftarrow c.used$ \\ 
+5.  $c.used \leftarrow max$ \\
+6.  $u \leftarrow 0$ \\
+7.  for $n$ from $0$ to $min - 1$ do \\
+\hspace{3mm}7.1  $c_n \leftarrow a_n - b_n - u$ \\
+\hspace{3mm}7.2  $u   \leftarrow c_n >> (\gamma - 1)$ \\
+\hspace{3mm}7.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+8.  if $min < max$ then do \\
+\hspace{3mm}8.1  for $n$ from $min$ to $max - 1$ do \\
+\hspace{6mm}8.1.1  $c_n \leftarrow a_n - u$ \\
+\hspace{6mm}8.1.2  $u   \leftarrow c_n >> (\gamma - 1)$ \\
+\hspace{6mm}8.1.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+9. if $oldused > max$ then do \\
+\hspace{3mm}9.1  for $n$ from $max$ to $oldused - 1$ do \\
+\hspace{6mm}9.1.1  $c_n \leftarrow 0$ \\
+10. Clamp excess digits of $c$.  (\textit{mp\_clamp}). \\
+11. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Algorithm s\_mp\_sub}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_sub.}
+This algorithm performs the unsigned subtraction of two mp\_int variables under the restriction that the result must be positive.  That is when
+passing variables $a$ and $b$ the condition that $\vert a \vert \ge \vert b \vert$ must be met for the algorithm to function correctly.  This
+algorithm is loosely based on algorithm 14.9 \cite[pp. 595]{HAC} and is similar to algorithm S in \cite[pp. 267]{TAOCPV2} as well.  As was the case
+of the algorithm s\_mp\_add both other references lack discussion concerning various practical details such as when the inputs differ in magnitude.
+
+The initial sorting of the inputs is trivial in this algorithm since $a$ is guaranteed to have at least the same magnitude as $b$.  Steps 1 and 2 
+set the $min$ and $max$ variables.  Unlike the addition routine there is guaranteed to be no carry which means that the final result can be at 
+most $max$ digits in length as opposed to $max + 1$.  Similar to the addition algorithm the \textbf{used} count of $c$ is copied locally and 
+set to the maximal count for the operation.
+
+The subtraction loop that begins on step seven is essentially the same as the addition loop of algorithm s\_mp\_add except single precision 
+subtraction is used instead.  Note the use of the $\gamma$ variable to extract the carry (\textit{also known as the borrow}) within the subtraction 
+loops.  Under the assumption that two's complement single precision arithmetic is used this will successfully extract the desired carry.  
+
+For example, consider subtracting $0101_2$ from $0100_2$ where $\gamma = 4$ and $\beta = 2$.  The least significant bit will force a carry upwards to 
+the third bit which will be set to zero after the borrow.  After the very first bit has been subtracted $4 - 1 \equiv 0011_2$ will remain.  When the 
+third bit of $0101_2$ is subtracted from the result it will cause another carry.  In this case though the carry will be forced to propagate all the 
+way to the most significant bit.  
+
+Recall that $\beta < 2^{\gamma}$.  This means that if a carry does occur just before the $lg(\beta)$'th bit it will propagate all the way to the most 
+significant bit.  Thus, the high order bits of the mp\_digit that are not part of the actual digit will either be all zero, or all one. All that
+is needed is a single zero or one bit for the carry.  Therefore a single logical shift right by $\gamma - 1$ positions is sufficient to extract the 
+carry.  This method of carry extraction may seem awkward but the reason for it becomes apparent when the implementation is discussed.  
+
+If $b$ has a smaller magnitude than $a$ then step 8 will force the carry and copy operation to propagate through the larger input $a$ into $c$.  Step
+9 will ensure that any leading digits of $c$ above the $max$'th position are zeroed.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_sub.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
+018   int
+019   s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
+020   \{
+021     int     olduse, res, min, max;
+022   
+023     /* find sizes */
+024     min = b->used;
+025     max = a->used;
+026   
+027     /* init result */
+028     if (c->alloc < max) \{
+029       if ((res = mp_grow (c, max)) != MP_OKAY) \{
+030         return res;
+031       \}
+032     \}
+033     olduse = c->used;
+034     c->used = max;
+035   
+036     \{
+037       register mp_digit u, *tmpa, *tmpb, *tmpc;
+038       register int i;
+039   
+040       /* alias for digit pointers */
+041       tmpa = a->dp;
+042       tmpb = b->dp;
+043       tmpc = c->dp;
+044   
+045       /* set carry to zero */
+046       u = 0;
+047       for (i = 0; i < min; i++) \{
+048         /* T[i] = A[i] - B[i] - U */
+049         *tmpc = *tmpa++ - *tmpb++ - u;
+050   
+051         /* U = carry bit of T[i]
+052          * Note this saves performing an AND operation since
+053          * if a carry does occur it will propagate all the way to the
+054          * MSB.  As a result a single shift is enough to get the carry
+055          */
+056         u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
+057   
+058         /* Clear carry from T[i] */
+059         *tmpc++ &= MP_MASK;
+060       \}
+061   
+062       /* now copy higher words if any, e.g. if A has more digits than B  */
+063       for (; i < max; i++) \{
+064         /* T[i] = A[i] - U */
+065         *tmpc = *tmpa++ - u;
+066   
+067         /* U = carry bit of T[i] */
+068         u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
+069   
+070         /* Clear carry from T[i] */
+071         *tmpc++ &= MP_MASK;
+072       \}
+073   
+074       /* clear digits above used (since we may not have grown result above) */
+      
+075       for (i = c->used; i < olduse; i++) \{
+076         *tmpc++ = 0;
+077       \}
+078     \}
+079   
+080     mp_clamp (c);
+081     return MP_OKAY;
+082   \}
+083   
+084   #endif
+\end{alltt}
+\end{small}
+
+Lines 24 and 25 perform the initial hardcoded sorting of the inputs.  In reality the $min$ and $max$ variables are only aliases and are only 
+used to make the source code easier to read.  Again the pointer alias optimization is used within this algorithm.  Lines 41, 42 and 43 initialize the aliases for 
+$a$, $b$ and $c$ respectively.
+
+The first subtraction loop occurs on lines 46 through 60.  The theory behind the subtraction loop is exactly the same as that for
+the addition loop.  As remarked earlier there is an implementation reason for using the ``awkward'' method of extracting the carry 
+(\textit{see line 56}).  The traditional method for extracting the carry would be to shift by $lg(\beta)$ positions and logically AND 
+the least significant bit.  The AND operation is required because all of the bits above the $\lg(\beta)$'th bit will be set to one after a carry
+occurs from subtraction.  This carry extraction requires two relatively cheap operations to extract the carry.  The other method is to simply 
+shift the most significant bit to the least significant bit thus extracting the carry with a single cheap operation.  This optimization only works on
+two's complement machines which is a safe assumption to make.
+
+If $a$ has a larger magnitude than $b$ an additional loop (\textit{see lines 63 through 72}) is required to propagate the carry through
+$a$ and copy the result to $c$.  
+
+\subsection{High Level Addition}
+Now that both lower level addition and subtraction algorithms have been established an effective high level signed addition algorithm can be
+established.  This high level addition algorithm will be what other algorithms and developers will use to perform addition of mp\_int data 
+types.  
+
+Recall from section 5.2 that an mp\_int represents an integer with an unsigned mantissa (\textit{the array of digits}) and a \textbf{sign} 
+flag.  A high level addition is actually performed as a series of eight separate cases which can be optimized down to three unique cases.
+
+\begin{figure}[!here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_add}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$  \\
+\textbf{Output}.  The signed addition $c = a + b$. \\
+\hline \\
+1.  if $a.sign = b.sign$ then do \\
+\hspace{3mm}1.1  $c.sign \leftarrow a.sign$  \\
+\hspace{3mm}1.2  $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{s\_mp\_add})\\
+2.  else do \\
+\hspace{3mm}2.1  if $\vert a \vert < \vert b \vert$ then do (\textit{mp\_cmp\_mag})  \\
+\hspace{6mm}2.1.1  $c.sign \leftarrow b.sign$ \\
+\hspace{6mm}2.1.2  $c \leftarrow \vert b \vert - \vert a \vert$ (\textit{s\_mp\_sub}) \\
+\hspace{3mm}2.2  else do \\
+\hspace{6mm}2.2.1  $c.sign \leftarrow a.sign$ \\
+\hspace{6mm}2.2.2  $c \leftarrow \vert a \vert - \vert b \vert$ \\
+3.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_add}
+\end{figure}
+
+\textbf{Algorithm mp\_add.}
+This algorithm performs the signed addition of two mp\_int variables.  There is no reference algorithm to draw upon from 
+either \cite{TAOCPV2} or \cite{HAC} since they both only provide unsigned operations.  The algorithm is fairly 
+straightforward but restricted since subtraction can only produce positive results.
+
+\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|}
+\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert > \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\
+\hline $+$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $+$ & $+$ & No  & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $-$ & No  & $c = a + b$ & $a.sign$ \\
+\hline &&&&\\
+
+\hline $+$ & $-$ & No  & $c = b - a$ & $b.sign$ \\
+\hline $-$ & $+$ & No  & $c = b - a$ & $b.sign$ \\
+
+\hline &&&&\\
+
+\hline $+$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\
+\hline $-$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\
+
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Addition Guide Chart}
+\label{fig:AddChart}
+\end{figure}
+
+Figure~\ref{fig:AddChart} lists all of the eight possible input combinations and is sorted to show that only three 
+specific cases need to be handled.  The return codes of the unsigned operations at steps 1.2, 2.1.2 and 2.2.2 are 
+forwarded to step three to check for errors.  This simplifies the description of the algorithm considerably and best 
+follows how the implementation actually was achieved.
+
+Also note how the \textbf{sign} is set before the unsigned addition or subtraction is performed.  Recall from the descriptions of algorithms
+s\_mp\_add and s\_mp\_sub that the mp\_clamp function is used at the end to trim excess digits.  The mp\_clamp algorithm will set the \textbf{sign}
+to \textbf{MP\_ZPOS} when the \textbf{used} digit count reaches zero.
+
+For example, consider performing $-a + a$ with algorithm mp\_add.  By the description of the algorithm the sign is set to \textbf{MP\_NEG} which would
+produce a result of $-0$.  However, since the sign is set first and the unsigned subtraction is performed afterwards, the subsequent usage of algorithm mp\_clamp 
+within algorithm s\_mp\_sub will force $-0$ to become $0$.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_add.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* high level addition (handles signs) */
+018   int mp_add (mp_int * a, mp_int * b, mp_int * c)
+019   \{
+020     int     sa, sb, res;
+021   
+022     /* get sign of both inputs */
+023     sa = a->sign;
+024     sb = b->sign;
+025   
+026     /* handle two cases, not four */
+027     if (sa == sb) \{
+028       /* both positive or both negative */
+029       /* add their magnitudes, copy the sign */
+030       c->sign = sa;
+031       res = s_mp_add (a, b, c);
+032     \} else \{
+033       /* one positive, the other negative */
+034       /* subtract the one with the greater magnitude from */
+035       /* the one of the lesser magnitude.  The result gets */
+036       /* the sign of the one with the greater magnitude. */
+037       if (mp_cmp_mag (a, b) == MP_LT) \{
+038         c->sign = sb;
+039         res = s_mp_sub (b, a, c);
+040       \} else \{
+041         c->sign = sa;
+042         res = s_mp_sub (a, b, c);
+043       \}
+044     \}
+045     return res;
+046   \}
+047   
+048   #endif
+\end{alltt}
+\end{small}
+
+The source code follows the algorithm fairly closely.  The most notable new source code addition is the usage of the $res$ integer variable which
+is used to pass the result of the unsigned operations forward.  Unlike in the algorithm, the variable $res$ is merely returned as is without
+explicitly checking it and returning the constant \textbf{MP\_OKAY}.  The observation is this algorithm will succeed or fail only if the lower
+level functions do so.  Returning their return code is sufficient.
+
+\subsection{High Level Subtraction}
+The high level signed subtraction algorithm is essentially the same as the high level signed addition algorithm.  
+
+\newpage\begin{figure}[!here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_sub}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$  \\
+\textbf{Output}.  The signed subtraction $c = a - b$. \\
+\hline \\
+1.  if $a.sign \ne b.sign$ then do \\
+\hspace{3mm}1.1  $c.sign \leftarrow a.sign$ \\
+\hspace{3mm}1.2  $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{s\_mp\_add}) \\
+2.  else do \\
+\hspace{3mm}2.1  if $\vert a \vert \ge \vert b \vert$ then do (\textit{mp\_cmp\_mag}) \\
+\hspace{6mm}2.1.1  $c.sign \leftarrow a.sign$ \\
+\hspace{6mm}2.1.2  $c \leftarrow \vert a \vert  - \vert b \vert$ (\textit{s\_mp\_sub}) \\
+\hspace{3mm}2.2  else do \\
+\hspace{6mm}2.2.1  $c.sign \leftarrow  \left \lbrace \begin{array}{ll}
+                              MP\_ZPOS &  \mbox{if }a.sign = MP\_NEG \\
+                              MP\_NEG  &  \mbox{otherwise} \\
+                              \end{array} \right .$ \\
+\hspace{6mm}2.2.2  $c \leftarrow \vert b \vert  - \vert a \vert$ \\
+3.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_sub}
+\end{figure}
+
+\textbf{Algorithm mp\_sub.}
+This algorithm performs the signed subtraction of two inputs.  Similar to algorithm mp\_add there is no reference in either \cite{TAOCPV2} or 
+\cite{HAC}.  Also this algorithm is restricted by algorithm s\_mp\_sub.  Chart \ref{fig:SubChart} lists the eight possible inputs and
+the operations required.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|}
+\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert \ge \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\
+\hline $+$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $+$ & $-$ & No  & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $+$ & No  & $c = a + b$ & $a.sign$ \\
+\hline &&&& \\
+\hline $+$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\
+\hline $-$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\
+\hline &&&& \\
+\hline $+$ & $+$ & No  & $c = b - a$ & $\mbox{opposite of }a.sign$ \\
+\hline $-$ & $-$ & No  & $c = b - a$ & $\mbox{opposite of }a.sign$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Subtraction Guide Chart}
+\label{fig:SubChart}
+\end{figure}
+
+Similar to the case of algorithm mp\_add the \textbf{sign} is set first before the unsigned addition or subtraction.  That is to prevent the 
+algorithm from producing $-a - -a = -0$ as a result.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_sub.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* high level subtraction (handles signs) */
+018   int
+019   mp_sub (mp_int * a, mp_int * b, mp_int * c)
+020   \{
+021     int     sa, sb, res;
+022   
+023     sa = a->sign;
+024     sb = b->sign;
+025   
+026     if (sa != sb) \{
+027       /* subtract a negative from a positive, OR */
+028       /* subtract a positive from a negative. */
+029       /* In either case, ADD their magnitudes, */
+030       /* and use the sign of the first number. */
+031       c->sign = sa;
+032       res = s_mp_add (a, b, c);
+033     \} else \{
+034       /* subtract a positive from a positive, OR */
+035       /* subtract a negative from a negative. */
+036       /* First, take the difference between their */
+037       /* magnitudes, then... */
+038       if (mp_cmp_mag (a, b) != MP_LT) \{
+039         /* Copy the sign from the first */
+040         c->sign = sa;
+041         /* The first has a larger or equal magnitude */
+042         res = s_mp_sub (a, b, c);
+043       \} else \{
+044         /* The result has the *opposite* sign from */
+045         /* the first number. */
+046         c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+047         /* The second has a larger magnitude */
+048         res = s_mp_sub (b, a, c);
+049       \}
+050     \}
+051     return res;
+052   \}
+053   
+054   #endif
+\end{alltt}
+\end{small}
+
+Much like the implementation of algorithm mp\_add the variable $res$ is used to catch the return code of the unsigned addition or subtraction operations
+and forward it to the end of the function.  On line 38 the ``not equal to'' \textbf{MP\_LT} expression is used to emulate a 
+``greater than or equal to'' comparison.  
+
+\section{Bit and Digit Shifting}
+It is quite common to think of a multiple precision integer as a polynomial in $x$, that is $y = f(\beta)$ where $f(x) = \sum_{i=0}^{n-1} a_i x^i$.  
+This notation arises within discussion of Montgomery and Diminished Radix Reduction as well as Karatsuba multiplication and squaring.  
+
+In order to facilitate operations on polynomials in $x$ as above a series of simple ``digit'' algorithms have to be established.  That is to shift
+the digits left or right as well as to shift individual bits of the digits left and right.  It is important to note that not all ``shift'' operations
+are on radix-$\beta$ digits.  
+
+\subsection{Multiplication by Two}
+
+In a binary system where the radix is a power of two, multiplication by two not only arises often in other algorithms, it is also a fairly efficient 
+operation to perform.  A single precision logical shift left is sufficient to multiply a single digit by two.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul\_2}. \\
+\textbf{Input}.   One mp\_int $a$ \\
+\textbf{Output}.  $b = 2a$. \\
+\hline \\
+1.  If $b.alloc < a.used + 1$ then grow $b$ to hold $a.used + 1$ digits.  (\textit{mp\_grow}) \\
+2.  $oldused \leftarrow b.used$ \\
+3.  $b.used \leftarrow a.used$ \\
+4.  $r \leftarrow 0$ \\
+5.  for $n$ from 0 to $a.used - 1$ do \\
+\hspace{3mm}5.1  $rr \leftarrow a_n >> (lg(\beta) - 1)$ \\
+\hspace{3mm}5.2  $b_n \leftarrow (a_n << 1) + r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}5.3  $r \leftarrow rr$ \\
+6.  If $r \ne 0$ then do \\
+\hspace{3mm}6.1  $b_{n + 1} \leftarrow r$ \\
+\hspace{3mm}6.2  $b.used \leftarrow b.used + 1$ \\
+7.  If $b.used < oldused$ then do \\
+\hspace{3mm}7.1  for $n$ from $b.used$ to $oldused - 1$ do \\
+\hspace{6mm}7.1.1  $b_n \leftarrow 0$ \\
+8.  $b.sign \leftarrow a.sign$ \\
+9.  Return(\textit{MP\_OKAY}).\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul\_2}
+\end{figure}
+
+\textbf{Algorithm mp\_mul\_2.}
+This algorithm will quickly multiply a mp\_int by two provided $\beta$ is a power of two.  Neither \cite{TAOCPV2} nor \cite{HAC} describe such 
+an algorithm despite the fact it arises often in other algorithms.  The algorithm is set up much like the lower level algorithm s\_mp\_add since 
+it is for all intents and purposes equivalent to the operation $b = \vert a \vert + \vert a \vert$.  
+
+Steps 1 and 2 grow the destination $b$ as required to accommodate the maximum number of \textbf{used} digits in the result.  The initial \textbf{used} count
+is set to $a.used$ at step 4.  Only if there is a final carry will the \textbf{used} count require adjustment.
+
+Step 6 is an optimized implementation of the addition loop for this specific case.  That is since the two values being added together 
+are the same there is no need to perform two reads from the digits of $a$.  Step 6.1 performs a single precision shift on the current digit $a_n$ to
+obtain what will be the carry for the next iteration.  Step 6.2 calculates the $n$'th digit of the result as single precision shift of $a_n$ plus
+the previous carry.  Recall from section 4.1 that $a_n << 1$ is equivalent to $a_n \cdot 2$.  An iteration of the addition loop is finished with 
+forwarding the carry to the next iteration.
+
+Step 7 takes care of any final carry by setting the $a.used$'th digit of the result to the carry and augmenting the \textbf{used} count of $b$.  
+Step 8 clears any leading digits of $b$ in case it originally had a larger magnitude than $a$.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_mul\_2.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* b = a*2 */
+018   int mp_mul_2(mp_int * a, mp_int * b)
+019   \{
+020     int     x, res, oldused;
+021   
+022     /* grow to accomodate result */
+023     if (b->alloc < a->used + 1) \{
+024       if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) \{
+025         return res;
+026       \}
+027     \}
+028   
+029     oldused = b->used;
+030     b->used = a->used;
+031   
+032     \{
+033       register mp_digit r, rr, *tmpa, *tmpb;
+034   
+035       /* alias for source */
+036       tmpa = a->dp;
+037       
+038       /* alias for dest */
+039       tmpb = b->dp;
+040   
+041       /* carry */
+042       r = 0;
+043       for (x = 0; x < a->used; x++) \{
+044       
+045         /* get what will be the *next* carry bit from the 
+046          * MSB of the current digit 
+047          */
+048         rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
+049         
+050         /* now shift up this digit, add in the carry [from the previous] */
+051         *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
+052         
+053         /* copy the carry that would be from the source 
+054          * digit into the next iteration 
+055          */
+056         r = rr;
+057       \}
+058   
+059       /* new leading digit? */
+060       if (r != 0) \{
+061         /* add a MSB which is always 1 at this point */
+062         *tmpb = 1;
+063         ++(b->used);
+064       \}
+065   
+066       /* now zero any excess digits on the destination 
+067        * that we didn't write to 
+068        */
+069       tmpb = b->dp + b->used;
+070       for (x = b->used; x < oldused; x++) \{
+071         *tmpb++ = 0;
+072       \}
+073     \}
+074     b->sign = a->sign;
+075     return MP_OKAY;
+076   \}
+077   #endif
+\end{alltt}
+\end{small}
+
+This implementation is essentially an optimized implementation of s\_mp\_add for the case of doubling an input.  The only noteworthy difference
+is the use of the logical shift operator on line 51 to perform a single precision doubling.  
+
+\subsection{Division by Two}
+A division by two can just as easily be accomplished with a logical shift right as multiplication by two can be with a logical shift left.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div\_2}. \\
+\textbf{Input}.   One mp\_int $a$ \\
+\textbf{Output}.  $b = a/2$. \\
+\hline \\
+1.  If $b.alloc < a.used$ then grow $b$ to hold $a.used$ digits.  (\textit{mp\_grow}) \\
+2.  If the reallocation failed return(\textit{MP\_MEM}). \\
+3.  $oldused \leftarrow b.used$ \\
+4.  $b.used \leftarrow a.used$ \\
+5.  $r \leftarrow 0$ \\
+6.  for $n$ from $b.used - 1$ to $0$ do \\
+\hspace{3mm}6.1  $rr \leftarrow a_n \mbox{ (mod }2\mbox{)}$\\
+\hspace{3mm}6.2  $b_n \leftarrow (a_n >> 1) + (r << (lg(\beta) - 1)) \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}6.3  $r \leftarrow rr$ \\
+7.  If $b.used < oldused - 1$ then do \\
+\hspace{3mm}7.1  for $n$ from $b.used$ to $oldused - 1$ do \\
+\hspace{6mm}7.1.1  $b_n \leftarrow 0$ \\
+8.  $b.sign \leftarrow a.sign$ \\
+9.  Clamp excess digits of $b$.  (\textit{mp\_clamp}) \\
+10.  Return(\textit{MP\_OKAY}).\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div\_2}
+\end{figure}
+
+\textbf{Algorithm mp\_div\_2.}
+This algorithm will divide an mp\_int by two using logical shifts to the right.  Like mp\_mul\_2 it uses a modified low level addition
+core as the basis of the algorithm.  Unlike mp\_mul\_2 the shift operations work from the leading digit to the trailing digit.  The algorithm
+could be written to work from the trailing digit to the leading digit however, it would have to stop one short of $a.used - 1$ digits to prevent
+reading past the end of the array of digits.
+
+Essentially the loop at step 6 is similar to that of mp\_mul\_2 except the logical shifts go in the opposite direction and the carry is at the 
+least significant bit not the most significant bit.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_div\_2.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* b = a/2 */
+018   int mp_div_2(mp_int * a, mp_int * b)
+019   \{
+020     int     x, res, oldused;
+021   
+022     /* copy */
+023     if (b->alloc < a->used) \{
+024       if ((res = mp_grow (b, a->used)) != MP_OKAY) \{
+025         return res;
+026       \}
+027     \}
+028   
+029     oldused = b->used;
+030     b->used = a->used;
+031     \{
+032       register mp_digit r, rr, *tmpa, *tmpb;
+033   
+034       /* source alias */
+035       tmpa = a->dp + b->used - 1;
+036   
+037       /* dest alias */
+038       tmpb = b->dp + b->used - 1;
+039   
+040       /* carry */
+041       r = 0;
+042       for (x = b->used - 1; x >= 0; x--) \{
+043         /* get the carry for the next iteration */
+044         rr = *tmpa & 1;
+045   
+046         /* shift the current digit, add in carry and store */
+047         *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
+048   
+049         /* forward carry to next iteration */
+050         r = rr;
+051       \}
+052   
+053       /* zero excess digits */
+054       tmpb = b->dp + b->used;
+055       for (x = b->used; x < oldused; x++) \{
+056         *tmpb++ = 0;
+057       \}
+058     \}
+059     b->sign = a->sign;
+060     mp_clamp (b);
+061     return MP_OKAY;
+062   \}
+063   #endif
+\end{alltt}
+\end{small}
+
+\section{Polynomial Basis Operations}
+Recall from section 4.3 that any integer can be represented as a polynomial in $x$ as $y = f(\beta)$.  Such a representation is also known as
+the polynomial basis \cite[pp. 48]{ROSE}. Given such a notation a multiplication or division by $x$ amounts to shifting whole digits a single 
+place.  The need for such operations arises in several other higher level algorithms such as Barrett and Montgomery reduction, integer
+division and Karatsuba multiplication.  
+
+Converting from an array of digits to polynomial basis is very simple.  Consider the integer $y \equiv (a_2, a_1, a_0)_{\beta}$ and recall that
+$y = \sum_{i=0}^{2} a_i \beta^i$.  Simply replace $\beta$ with $x$ and the expression is in polynomial basis.  For example, $f(x) = 8x + 9$ is the
+polynomial basis representation for $89$ using radix ten.  That is, $f(10) = 8(10) + 9 = 89$.  
+
+\subsection{Multiplication by $x$}
+
+Given a polynomial in $x$ such as $f(x) = a_n x^n + a_{n-1} x^{n-1} + ... + a_0$ multiplying by $x$ amounts to shifting the coefficients up one 
+degree.  In this case $f(x) \cdot x = a_n x^{n+1} + a_{n-1} x^n + ... + a_0 x$.  From a scalar basis point of view multiplying by $x$ is equivalent to
+multiplying by the integer $\beta$.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_lshd}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $a \leftarrow a \cdot \beta^b$ (equivalent to multiplication by $x^b$). \\
+\hline \\
+1.  If $b \le 0$ then return(\textit{MP\_OKAY}). \\
+2.  If $a.alloc < a.used + b$ then grow $a$ to at least $a.used + b$ digits.  (\textit{mp\_grow}). \\
+3.  If the reallocation failed return(\textit{MP\_MEM}). \\
+4.  $a.used \leftarrow a.used + b$ \\
+5.  $i \leftarrow a.used - 1$ \\
+6.  $j \leftarrow a.used - 1 - b$ \\
+7.  for $n$ from $a.used - 1$ to $b$ do \\
+\hspace{3mm}7.1  $a_{i} \leftarrow a_{j}$ \\
+\hspace{3mm}7.2  $i \leftarrow i - 1$ \\
+\hspace{3mm}7.3  $j \leftarrow j - 1$ \\
+8.  for $n$ from 0 to $b - 1$ do \\
+\hspace{3mm}8.1  $a_n \leftarrow 0$ \\
+9.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_lshd}
+\end{figure}
+
+\textbf{Algorithm mp\_lshd.}
+This algorithm multiplies an mp\_int by the $b$'th power of $x$.  This is equivalent to multiplying by $\beta^b$.  The algorithm differs 
+from the other algorithms presented so far as it performs the operation in place instead of storing the result in a separate location.  The
+motivation behind this change is due to the way this function is typically used.  Algorithms such as mp\_add store the result in an optionally
+different third mp\_int because the original inputs are often still required.  Algorithm mp\_lshd (\textit{and similarly algorithm mp\_rshd}) is
+typically used on values where the original value is no longer required.  The algorithm will return success immediately if 
+$b \le 0$ since the rest of the algorithm is only valid when $b > 0$.  
+
+First the destination $a$ is grown as required to accommodate the result.  The counters $i$ and $j$ are used to form a \textit{sliding window} over
+the digits of $a$ of length $b$.  The head of the sliding window is at $i$ (\textit{the leading digit}) and the tail at $j$ (\textit{the trailing digit}).  
+The loop on step 7 copies the digit from the tail to the head.  In each iteration the window is moved down one digit.   The last loop on 
+step 8 sets the lower $b$ digits to zero.
+
+\newpage
+\begin{center}
+\begin{figure}[here]
+\includegraphics{pics/sliding_window.ps}
+\caption{Sliding Window Movement}
+\label{pic:sliding_window}
+\end{figure}
+\end{center}
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_lshd.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* shift left a certain amount of digits */
+018   int mp_lshd (mp_int * a, int b)
+019   \{
+020     int     x, res;
+021   
+022     /* if its less than zero return */
+023     if (b <= 0) \{
+024       return MP_OKAY;
+025     \}
+026   
+027     /* grow to fit the new digits */
+028     if (a->alloc < a->used + b) \{
+029        if ((res = mp_grow (a, a->used + b)) != MP_OKAY) \{
+030          return res;
+031        \}
+032     \}
+033   
+034     \{
+035       register mp_digit *top, *bottom;
+036   
+037       /* increment the used by the shift amount then copy upwards */
+038       a->used += b;
+039   
+040       /* top */
+041       top = a->dp + a->used - 1;
+042   
+043       /* base */
+044       bottom = a->dp + a->used - 1 - b;
+045   
+046       /* much like mp_rshd this is implemented using a sliding window
+047        * except the window goes the otherway around.  Copying from
+048        * the bottom to the top.  see bn_mp_rshd.c for more info.
+049        */
+050       for (x = a->used - 1; x >= b; x--) \{
+051         *top-- = *bottom--;
+052       \}
+053   
+054       /* zero the lower digits */
+055       top = a->dp;
+056       for (x = 0; x < b; x++) \{
+057         *top++ = 0;
+058       \}
+059     \}
+060     return MP_OKAY;
+061   \}
+062   #endif
+\end{alltt}
+\end{small}
+
+The if statement on line 23 ensures that the $b$ variable is greater than zero.  The \textbf{used} count is incremented by $b$ before
+the copy loop begins.  This eliminates the need for an additional variable in the for loop.  The variable $top$ on line 41 is an alias
+for the leading digit while $bottom$ on line 44 is an alias for the trailing edge.  The aliases form a window of exactly $b$ digits
+over the input.  
+
+\subsection{Division by $x$}
+
+Division by powers of $x$ is easily achieved by shifting the digits right and removing any that will end up to the right of the zero'th digit.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_rshd}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $a \leftarrow a / \beta^b$ (Divide by $x^b$). \\
+\hline \\
+1.  If $b \le 0$ then return. \\
+2.  If $a.used \le b$ then do \\
+\hspace{3mm}2.1  Zero $a$.  (\textit{mp\_zero}). \\
+\hspace{3mm}2.2  Return. \\
+3.  $i \leftarrow 0$ \\
+4.  $j \leftarrow b$ \\
+5.  for $n$ from 0 to $a.used - b - 1$ do \\
+\hspace{3mm}5.1  $a_i \leftarrow a_j$ \\
+\hspace{3mm}5.2  $i \leftarrow i + 1$ \\
+\hspace{3mm}5.3  $j \leftarrow j + 1$ \\
+6.  for $n$ from $a.used - b$ to $a.used - 1$ do \\
+\hspace{3mm}6.1  $a_n \leftarrow 0$ \\
+7.  $a.used \leftarrow a.used - b$ \\
+8.  Return. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_rshd}
+\end{figure}
+
+\textbf{Algorithm mp\_rshd.}
+This algorithm divides the input in place by the $b$'th power of $x$.  It is analogous to dividing by $\beta^b$ but much quicker since
+it does not require single precision division.  This algorithm does not actually return an error code as it cannot fail.  
+
+If the input $b$ is less than one the algorithm quickly returns without performing any work.  If the \textbf{used} count is less than or equal
+to the shift count $b$ then it will simply zero the input and return.
+
+After the trivial cases of inputs have been handled the sliding window is set up.  Much like the case of algorithm mp\_lshd a sliding window that
+is $b$ digits wide is used to copy the digits.  Unlike mp\_lshd the window slides in the opposite direction from the trailing to the leading digit.  
+Also the digits are copied from the leading to the trailing edge.
+
+Once the window copy is complete the upper digits must be zeroed and the \textbf{used} count decremented.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_rshd.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* shift right a certain amount of digits */
+018   void mp_rshd (mp_int * a, int b)
+019   \{
+020     int     x;
+021   
+022     /* if b <= 0 then ignore it */
+023     if (b <= 0) \{
+024       return;
+025     \}
+026   
+027     /* if b > used then simply zero it and return */
+028     if (a->used <= b) \{
+029       mp_zero (a);
+030       return;
+031     \}
+032   
+033     \{
+034       register mp_digit *bottom, *top;
+035   
+036       /* shift the digits down */
+037   
+038       /* bottom */
+039       bottom = a->dp;
+040   
+041       /* top [offset into digits] */
+042       top = a->dp + b;
+043   
+044       /* this is implemented as a sliding window where 
+045        * the window is b-digits long and digits from 
+046        * the top of the window are copied to the bottom
+047        *
+048        * e.g.
+049   
+050        b-2 | b-1 | b0 | b1 | b2 | ... | bb |   ---->
+051                    /\symbol{92}                   |      ---->
+052                     \symbol{92}-------------------/      ---->
+053        */
+054       for (x = 0; x < (a->used - b); x++) \{
+055         *bottom++ = *top++;
+056       \}
+057   
+058       /* zero the top digits */
+059       for (; x < a->used; x++) \{
+060         *bottom++ = 0;
+061       \}
+062     \}
+063     
+064     /* remove excess digits */
+065     a->used -= b;
+066   \}
+067   #endif
+\end{alltt}
+\end{small}
+
+The only noteworthy element of this routine is the lack of a return type.  
+
+-- Will update later to give it a return type...Tom
+
+\section{Powers of Two}
+
+Now that algorithms for moving single bits as well as whole digits exist algorithms for moving the ``in between'' distances are required.  For 
+example, to quickly multiply by $2^k$ for any $k$ without using a full multiplier algorithm would prove useful.  Instead of performing single
+shifts $k$ times to achieve a multiplication by $2^{\pm k}$ a mixture of whole digit shifting and partial digit shifting is employed.  
+
+\subsection{Multiplication by Power of Two}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul\_2d}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $c \leftarrow a \cdot 2^b$. \\
+\hline \\
+1.  $c \leftarrow a$.  (\textit{mp\_copy}) \\
+2.  If $c.alloc < c.used + \lfloor b / lg(\beta) \rfloor + 2$ then grow $c$ accordingly. \\
+3.  If the reallocation failed return(\textit{MP\_MEM}). \\
+4.  If $b \ge lg(\beta)$ then \\
+\hspace{3mm}4.1  $c \leftarrow c \cdot \beta^{\lfloor b / lg(\beta) \rfloor}$ (\textit{mp\_lshd}). \\
+\hspace{3mm}4.2  If step 4.1 failed return(\textit{MP\_MEM}). \\
+5.  $d \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
+6.  If $d \ne 0$ then do \\
+\hspace{3mm}6.1  $mask \leftarrow 2^d$ \\
+\hspace{3mm}6.2  $r \leftarrow 0$ \\
+\hspace{3mm}6.3  for $n$ from $0$ to $c.used - 1$ do \\
+\hspace{6mm}6.3.1  $rr \leftarrow c_n >> (lg(\beta) - d) \mbox{ (mod }mask\mbox{)}$ \\
+\hspace{6mm}6.3.2  $c_n \leftarrow (c_n << d) + r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}6.3.3  $r \leftarrow rr$ \\
+\hspace{3mm}6.4  If $r > 0$ then do \\
+\hspace{6mm}6.4.1  $c_{c.used} \leftarrow r$ \\
+\hspace{6mm}6.4.2  $c.used \leftarrow c.used + 1$ \\
+7.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul\_2d}
+\end{figure}
+
+\textbf{Algorithm mp\_mul\_2d.}
+This algorithm multiplies $a$ by $2^b$ and stores the result in $c$.  The algorithm uses algorithm mp\_lshd and a derivative of algorithm mp\_mul\_2 to
+quickly compute the product.
+
+First the algorithm will multiply $a$ by $x^{\lfloor b / lg(\beta) \rfloor}$ which will ensure that the remainder multiplicand is less than 
+$\beta$.  For example, if $b = 37$ and $\beta = 2^{28}$ then this step will multiply by $x$ leaving a multiplication by $2^{37 - 28} = 2^{9}$ 
+left.
+
+After the digits have been shifted appropriately at most $lg(\beta) - 1$ shifts are left to perform.  Step 5 calculates the number of remaining shifts 
+required.  If it is non-zero a modified shift loop is used to calculate the remaining product.  
+Essentially the loop is a generic version of algorithm mp\_mul\_2 designed to handle any shift count in the range $1 \le d < lg(\beta)$.  The $mask$
+variable is used to extract the upper $d$ bits to form the carry for the next iteration.  
+
+This algorithm is loosely measured as an $O(2n)$ algorithm, which means that if the input is $n$ digits long it takes $2n$ ``time'' to 
+complete.  It is possible to optimize this algorithm down to a $O(n)$ algorithm at a cost of making the algorithm slightly harder to follow.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_mul\_2d.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* shift left by a certain bit count */
+018   int mp_mul_2d (mp_int * a, int b, mp_int * c)
+019   \{
+020     mp_digit d;
+021     int      res;
+022   
+023     /* copy */
+024     if (a != c) \{
+025        if ((res = mp_copy (a, c)) != MP_OKAY) \{
+026          return res;
+027        \}
+028     \}
+029   
+030     if (c->alloc < (int)(c->used + b/DIGIT_BIT + 1)) \{
+031        if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) \{
+032          return res;
+033        \}
+034     \}
+035   
+036     /* shift by as many digits in the bit count */
+037     if (b >= (int)DIGIT_BIT) \{
+038       if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) \{
+039         return res;
+040       \}
+041     \}
+042   
+043     /* shift any bit count < DIGIT_BIT */
+044     d = (mp_digit) (b % DIGIT_BIT);
+045     if (d != 0) \{
+046       register mp_digit *tmpc, shift, mask, r, rr;
+047       register int x;
+048   
+049       /* bitmask for carries */
+050       mask = (((mp_digit)1) << d) - 1;
+051   
+052       /* shift for msbs */
+053       shift = DIGIT_BIT - d;
+054   
+055       /* alias */
+056       tmpc = c->dp;
+057   
+058       /* carry */
+059       r    = 0;
+060       for (x = 0; x < c->used; x++) \{
+061         /* get the higher bits of the current word */
+062         rr = (*tmpc >> shift) & mask;
+063   
+064         /* shift the current word and OR in the carry */
+065         *tmpc = ((*tmpc << d) | r) & MP_MASK;
+066         ++tmpc;
+067   
+068         /* set the carry to the carry bits of the current word */
+069         r = rr;
+070       \}
+071       
+072       /* set final carry */
+073       if (r != 0) \{
+074          c->dp[(c->used)++] = r;
+075       \}
+076     \}
+077     mp_clamp (c);
+078     return MP_OKAY;
+079   \}
+080   #endif
+\end{alltt}
+\end{small}
+
+Notes to be revised when code is updated. -- Tom
+
+\subsection{Division by Power of Two}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div\_2d}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $c \leftarrow \lfloor a / 2^b \rfloor, d \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\
+\hline \\
+1.  If $b \le 0$ then do \\
+\hspace{3mm}1.1  $c \leftarrow a$ (\textit{mp\_copy}) \\
+\hspace{3mm}1.2  $d \leftarrow 0$ (\textit{mp\_zero}) \\
+\hspace{3mm}1.3  Return(\textit{MP\_OKAY}). \\
+2.  $c \leftarrow a$ \\
+3.  $d \leftarrow a \mbox{ (mod }2^b\mbox{)}$ (\textit{mp\_mod\_2d}) \\
+4.  If $b \ge lg(\beta)$ then do \\
+\hspace{3mm}4.1  $c \leftarrow \lfloor c/\beta^{\lfloor b/lg(\beta) \rfloor} \rfloor$ (\textit{mp\_rshd}). \\
+5.  $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
+6.  If $k \ne 0$ then do \\
+\hspace{3mm}6.1  $mask \leftarrow 2^k$ \\
+\hspace{3mm}6.2  $r \leftarrow 0$ \\
+\hspace{3mm}6.3  for $n$ from $c.used - 1$ to $0$ do \\
+\hspace{6mm}6.3.1  $rr \leftarrow c_n \mbox{ (mod }mask\mbox{)}$ \\
+\hspace{6mm}6.3.2  $c_n \leftarrow (c_n >> k) + (r << (lg(\beta) - k))$ \\
+\hspace{6mm}6.3.3  $r \leftarrow rr$ \\
+7.  Clamp excess digits of $c$.  (\textit{mp\_clamp}) \\
+8.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div\_2d}
+\end{figure}
+
+\textbf{Algorithm mp\_div\_2d.}
+This algorithm will divide an input $a$ by $2^b$ and produce the quotient and remainder.  The algorithm is designed much like algorithm 
+mp\_mul\_2d by first using whole digit shifts then single precision shifts.  This algorithm will also produce the remainder of the division
+by using algorithm mp\_mod\_2d.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_div\_2d.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* shift right by a certain bit count (store quotient in c, optional remaind
+      er in d) */
+018   int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
+019   \{
+020     mp_digit D, r, rr;
+021     int     x, res;
+022     mp_int  t;
+023   
+024   
+025     /* if the shift count is <= 0 then we do no work */
+026     if (b <= 0) \{
+027       res = mp_copy (a, c);
+028       if (d != NULL) \{
+029         mp_zero (d);
+030       \}
+031       return res;
+032     \}
+033   
+034     if ((res = mp_init (&t)) != MP_OKAY) \{
+035       return res;
+036     \}
+037   
+038     /* get the remainder */
+039     if (d != NULL) \{
+040       if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) \{
+041         mp_clear (&t);
+042         return res;
+043       \}
+044     \}
+045   
+046     /* copy */
+047     if ((res = mp_copy (a, c)) != MP_OKAY) \{
+048       mp_clear (&t);
+049       return res;
+050     \}
+051   
+052     /* shift by as many digits in the bit count */
+053     if (b >= (int)DIGIT_BIT) \{
+054       mp_rshd (c, b / DIGIT_BIT);
+055     \}
+056   
+057     /* shift any bit count < DIGIT_BIT */
+058     D = (mp_digit) (b % DIGIT_BIT);
+059     if (D != 0) \{
+060       register mp_digit *tmpc, mask, shift;
+061   
+062       /* mask */
+063       mask = (((mp_digit)1) << D) - 1;
+064   
+065       /* shift for lsb */
+066       shift = DIGIT_BIT - D;
+067   
+068       /* alias */
+069       tmpc = c->dp + (c->used - 1);
+070   
+071       /* carry */
+072       r = 0;
+073       for (x = c->used - 1; x >= 0; x--) \{
+074         /* get the lower  bits of this word in a temp */
+075         rr = *tmpc & mask;
+076   
+077         /* shift the current word and mix in the carry bits from the previous 
+      word */
+078         *tmpc = (*tmpc >> D) | (r << shift);
+079         --tmpc;
+080   
+081         /* set the carry to the carry bits of the current word found above */
+082         r = rr;
+083       \}
+084     \}
+085     mp_clamp (c);
+086     if (d != NULL) \{
+087       mp_exch (&t, d);
+088     \}
+089     mp_clear (&t);
+090     return MP_OKAY;
+091   \}
+092   #endif
+\end{alltt}
+\end{small}
+
+The implementation of algorithm mp\_div\_2d is slightly different than the algorithm specifies.  The remainder $d$ may be optionally 
+ignored by passing \textbf{NULL} as the pointer to the mp\_int variable.    The temporary mp\_int variable $t$ is used to hold the 
+result of the remainder operation until the end.  This allows $d$ and $a$ to represent the same mp\_int without modifying $a$ before
+the quotient is obtained.
+
+The remainder of the source code is essentially the same as the source code for mp\_mul\_2d.  (-- Fix this paragraph up later, Tom).
+
+\subsection{Remainder of Division by Power of Two}
+
+The last algorithm in the series of polynomial basis power of two algorithms is calculating the remainder of division by $2^b$.  This
+algorithm benefits from the fact that in twos complement arithmetic $a \mbox{ (mod }2^b\mbox{)}$ is the same as $a$ AND $2^b - 1$.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mod\_2d}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $c \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\
+\hline \\
+1.  If $b \le 0$ then do \\
+\hspace{3mm}1.1  $c \leftarrow 0$ (\textit{mp\_zero}) \\
+\hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
+2.  If $b > a.used \cdot lg(\beta)$ then do \\
+\hspace{3mm}2.1  $c \leftarrow a$ (\textit{mp\_copy}) \\
+\hspace{3mm}2.2  Return the result of step 2.1. \\
+3.  $c \leftarrow a$ \\
+4.  If step 3 failed return(\textit{MP\_MEM}). \\
+5.  for $n$ from $\lceil b / lg(\beta) \rceil$ to $c.used - 1$ do \\
+\hspace{3mm}5.1  $c_n \leftarrow 0$ \\
+6.  $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
+7.  $c_{\lfloor b / lg(\beta) \rfloor} \leftarrow c_{\lfloor b / lg(\beta) \rfloor} \mbox{ (mod }2^{k}\mbox{)}$. \\
+8.  Clamp excess digits of $c$.  (\textit{mp\_clamp}) \\
+9.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mod\_2d}
+\end{figure}
+
+\textbf{Algorithm mp\_mod\_2d.}
+This algorithm will quickly calculate the value of $a \mbox{ (mod }2^b\mbox{)}$.  First if $b$ is less than or equal to zero the 
+result is set to zero.  If $b$ is greater than the number of bits in $a$ then it simply copies $a$ to $c$ and returns.  Otherwise, $a$ 
+is copied to $c$, the leading digits are removed and the remaining leading digit is trimmed to the exact bit count.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_mod\_2d.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* calc a value mod 2**b */
+018   int
+019   mp_mod_2d (mp_int * a, int b, mp_int * c)
+020   \{
+021     int     x, res;
+022   
+023     /* if b is <= 0 then zero the int */
+024     if (b <= 0) \{
+025       mp_zero (c);
+026       return MP_OKAY;
+027     \}
+028   
+029     /* if the modulus is larger than the value than return */
+030     if (b > (int) (a->used * DIGIT_BIT)) \{
+031       res = mp_copy (a, c);
+032       return res;
+033     \}
+034   
+035     /* copy */
+036     if ((res = mp_copy (a, c)) != MP_OKAY) \{
+037       return res;
+038     \}
+039   
+040     /* zero digits above the last digit of the modulus */
+041     for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x+
+      +) \{
+042       c->dp[x] = 0;
+043     \}
+044     /* clear the digit that is not completely outside/inside the modulus */
+045     c->dp[b / DIGIT_BIT] &=
+046       (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digi
+      t) 1));
+047     mp_clamp (c);
+048     return MP_OKAY;
+049   \}
+050   #endif
+\end{alltt}
+\end{small}
+
+-- Add comments later, Tom.
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 3 \right ] $ & Devise an algorithm that performs $a \cdot 2^b$ for generic values of $b$ \\
+                      & in $O(n)$ time. \\
+                      &\\
+$\left [ 3 \right ] $ & Devise an efficient algorithm to multiply by small low Hamming  \\
+                      & weight values such as $3$, $5$ and $9$.  Extend it to handle all values \\
+                      & up to $64$ with a Hamming weight less than three. \\
+                      &\\
+$\left [ 2 \right ] $ & Modify the preceding algorithm to handle values of the form \\
+                      & $2^k - 1$ as well. \\
+                      &\\
+$\left [ 3 \right ] $ & Using only algorithms mp\_mul\_2, mp\_div\_2 and mp\_add create an \\
+                      & algorithm to multiply two integers in roughly $O(2n^2)$ time for \\
+                      & any $n$-bit input.  Note that the time of addition is ignored in the \\
+                      & calculation.  \\
+                      & \\
+$\left [ 5 \right ] $ & Improve the previous algorithm to have a working time of at most \\
+                      & $O \left (2^{(k-1)}n + \left ({2n^2 \over k} \right ) \right )$ for an appropriate choice of $k$.  Again ignore \\
+                      & the cost of addition. \\
+                      & \\
+$\left [ 2 \right ] $ & Devise a chart to find optimal values of $k$ for the previous problem \\
+                      & for $n = 64 \ldots 1024$ in steps of $64$. \\
+                      & \\
+$\left [ 2 \right ] $ & Using only algorithms mp\_abs and mp\_sub devise another method for \\
+                      & calculating the result of a signed comparison. \\
+                      &
+\end{tabular}
+
+\chapter{Multiplication and Squaring}
+\section{The Multipliers}
+For most number theoretic problems including certain public key cryptographic algorithms, the ``multipliers'' form the most important subset of 
+algorithms of any multiple precision integer package.  The set of multiplier algorithms include integer multiplication, squaring and modular reduction 
+where in each of the algorithms single precision multiplication is the dominant operation performed.  This chapter will discuss integer multiplication 
+and squaring, leaving modular reductions for the subsequent chapter.  
+
+The importance of the multiplier algorithms is for the most part driven by the fact that certain popular public key algorithms are based on modular 
+exponentiation, that is computing $d \equiv a^b \mbox{ (mod }c\mbox{)}$ for some arbitrary choice of $a$, $b$, $c$ and $d$.  During a modular
+exponentiation the majority\footnote{Roughly speaking a modular exponentiation will spend about 40\% of the time performing modular reductions, 
+35\% of the time performing squaring and 25\% of the time performing multiplications.} of the processor time is spent performing single precision 
+multiplications.
+
+For centuries general purpose multiplication has required a lengthy $O(n^2)$ process, whereby each digit of one multiplicand has to be multiplied 
+against every digit of the other multiplicand.  Traditional long-hand multiplication is based on this process;  while the techniques can differ the 
+overall algorithm used is essentially the same.  Only ``recently'' have faster algorithms been studied.  First Karatsuba multiplication was discovered in 
+1962.  This algorithm can multiply two numbers with considerably fewer single precision multiplications when compared to the long-hand approach.  
+This technique led to the discovery of polynomial basis algorithms (\textit{good reference?}) and subsequently Fourier Transform based solutions.  
+
+\section{Multiplication}
+\subsection{The Baseline Multiplication}
+\label{sec:basemult}
+\index{baseline multiplication}
+Computing the product of two integers in software can be achieved using a trivial adaptation of the standard $O(n^2)$ long-hand multiplication
+algorithm that school children are taught.  The algorithm is considered an $O(n^2)$ algorithm since for two $n$-digit inputs $n^2$ single precision 
+multiplications are required.  More specifically for a $m$ and $n$ digit input $m \cdot n$ single precision multiplications are required.  To 
+simplify most discussions, it will be assumed that the inputs have comparable number of digits.  
+
+The ``baseline multiplication'' algorithm is designed to act as the ``catch-all'' algorithm, only to be used when the faster algorithms cannot be 
+used.  This algorithm does not use any particularly interesting optimizations and should ideally be avoided if possible.    One important 
+facet of this algorithm, is that it has been modified to only produce a certain amount of output digits as resolution.  The importance of this 
+modification will become evident during the discussion of Barrett modular reduction.  Recall that for a $n$ and $m$ digit input the product 
+will be at most $n + m$ digits.  Therefore, this algorithm can be reduced to a full multiplier by having it produce $n + m$ digits of the product.  
+
+Recall from sub-section 4.2.2 the definition of $\gamma$ as the number of bits in the type \textbf{mp\_digit}.  We shall now extend the variable set to 
+include $\alpha$ which shall represent the number of bits in the type \textbf{mp\_word}.  This implies that $2^{\alpha} > 2 \cdot \beta^2$.  The 
+constant $\delta = 2^{\alpha - 2lg(\beta)}$ will represent the maximal weight of any column in a product (\textit{see sub-section 5.2.2 for more information}).
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_mul\_digs}. \\
+\textbf{Input}.   mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
+\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
+\hline \\
+1.  If min$(a.used, b.used) < \delta$ then do \\
+\hspace{3mm}1.1  Calculate $c = \vert a \vert \cdot \vert b \vert$ by the Comba method (\textit{see algorithm~\ref{fig:COMBAMULT}}).  \\
+\hspace{3mm}1.2  Return the result of step 1.1 \\
+\\
+Allocate and initialize a temporary mp\_int. \\
+2.  Init $t$ to be of size $digs$ \\
+3.  If step 2 failed return(\textit{MP\_MEM}). \\
+4.  $t.used \leftarrow digs$ \\
+\\
+Compute the product. \\
+5.  for $ix$ from $0$ to $a.used - 1$ do \\
+\hspace{3mm}5.1  $u \leftarrow 0$ \\
+\hspace{3mm}5.2  $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\
+\hspace{3mm}5.3  If $pb < 1$ then goto step 6. \\
+\hspace{3mm}5.4  for $iy$ from $0$ to $pb - 1$ do \\
+\hspace{6mm}5.4.1  $\hat r \leftarrow t_{iy + ix} + a_{ix} \cdot b_{iy} + u$ \\
+\hspace{6mm}5.4.2  $t_{iy + ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}5.4.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+\hspace{3mm}5.5  if $ix + pb < digs$ then do \\
+\hspace{6mm}5.5.1  $t_{ix + pb} \leftarrow u$ \\
+6.  Clamp excess digits of $t$. \\
+7.  Swap $c$ with $t$ \\
+8.  Clear $t$ \\
+9.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm s\_mp\_mul\_digs}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_mul\_digs.}
+This algorithm computes the unsigned product of two inputs $a$ and $b$, limited to an output precision of $digs$ digits.  While it may seem
+a bit awkward to modify the function from its simple $O(n^2)$ description, the usefulness of partial multipliers will arise in a subsequent 
+algorithm.  The algorithm is loosely based on algorithm 14.12 from \cite[pp. 595]{HAC} and is similar to Algorithm M of Knuth \cite[pp. 268]{TAOCPV2}.  
+Algorithm s\_mp\_mul\_digs differs from these cited references since it can produce a variable output precision regardless of the precision of the 
+inputs.
+
+The first thing this algorithm checks for is whether a Comba multiplier can be used instead.   If the minimum digit count of either
+input is less than $\delta$, then the Comba method may be used instead.    After the Comba method is ruled out, the baseline algorithm begins.  A 
+temporary mp\_int variable $t$ is used to hold the intermediate result of the product.  This allows the algorithm to be used to 
+compute products when either $a = c$ or $b = c$ without overwriting the inputs.  
+
+All of step 5 is the infamous $O(n^2)$ multiplication loop slightly modified to only produce up to $digs$ digits of output.  The $pb$ variable
+is given the count of digits to read from $b$ inside the nested loop.  If $pb < 1$ then no more output digits can be produced and the algorithm
+will exit the loop.  The best way to think of the loops is as a series of $pb \times 1$ multiplications.    That is, in each pass of the 
+innermost loop $a_{ix}$ is multiplied against $b$ and the result is added (\textit{with an appropriate shift}) to $t$.  
+
+For example, consider multiplying $576$ by $241$.  That is equivalent to computing $10^0(1)(576) + 10^1(4)(576) + 10^2(2)(576)$ which is best
+visualized in the following table.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|c|l|}
+\hline   &&          & 5 & 7 & 6 & \\
+\hline   $\times$&&  & 2 & 4 & 1 & \\
+\hline &&&&&&\\
+  &&          & 5 & 7 & 6 & $10^0(1)(576)$ \\
+  &2 &   3    & 6 & 1 & 6 & $10^1(4)(576) + 10^0(1)(576)$ \\
+  1 & 3 & 8 & 8 & 1 & 6 &   $10^2(2)(576) + 10^1(4)(576) + 10^0(1)(576)$ \\
+\hline  
+\end{tabular}
+\end{center}
+\caption{Long-Hand Multiplication Diagram}
+\end{figure}
+
+Each row of the product is added to the result after being shifted to the left (\textit{multiplied by a power of the radix}) by the appropriate 
+count.  That is in pass $ix$ of the inner loop the product is added starting at the $ix$'th digit of the result.
+
+Step 5.4.1 introduces the hat symbol (\textit{e.g. $\hat r$}) which represents a double precision variable.  The multiplication on that step
+is assumed to be a double wide output single precision multiplication.  That is, two single precision variables are multiplied to produce a
+double precision result.  The step is somewhat optimized from a long-hand multiplication algorithm because the carry from the addition in step
+5.4.1 is propagated through the nested loop.  If the carry was not propagated immediately it would overflow the single precision digit 
+$t_{ix+iy}$ and the result would be lost.  
+
+At step 5.5 the nested loop is finished and any carry that was left over should be forwarded.  The carry does not have to be added to the $ix+pb$'th
+digit since that digit is assumed to be zero at this point.  However, if $ix + pb \ge digs$ the carry is not set as it would make the result
+exceed the precision requested.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_mul\_digs.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* multiplies |a| * |b| and only computes upto digs digits of result
+018    * HAC pp. 595, Algorithm 14.12  Modified so you can control how 
+019    * many digits of output are created.
+020    */
+021   int
+022   s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+023   \{
+024     mp_int  t;
+025     int     res, pa, pb, ix, iy;
+026     mp_digit u;
+027     mp_word r;
+028     mp_digit tmpx, *tmpt, *tmpy;
+029   
+030     /* can we use the fast multiplier? */
+031     if (((digs) < MP_WARRAY) &&
+032         MIN (a->used, b->used) < 
+033             (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) \{
+034       return fast_s_mp_mul_digs (a, b, c, digs);
+035     \}
+036   
+037     if ((res = mp_init_size (&t, digs)) != MP_OKAY) \{
+038       return res;
+039     \}
+040     t.used = digs;
+041   
+042     /* compute the digits of the product directly */
+043     pa = a->used;
+044     for (ix = 0; ix < pa; ix++) \{
+045       /* set the carry to zero */
+046       u = 0;
+047   
+048       /* limit ourselves to making digs digits of output */
+049       pb = MIN (b->used, digs - ix);
+050   
+051       /* setup some aliases */
+052       /* copy of the digit from a used within the nested loop */
+053       tmpx = a->dp[ix];
+054       
+055       /* an alias for the destination shifted ix places */
+056       tmpt = t.dp + ix;
+057       
+058       /* an alias for the digits of b */
+059       tmpy = b->dp;
+060   
+061       /* compute the columns of the output and propagate the carry */
+062       for (iy = 0; iy < pb; iy++) \{
+063         /* compute the column as a mp_word */
+064         r       = ((mp_word)*tmpt) +
+065                   ((mp_word)tmpx) * ((mp_word)*tmpy++) +
+066                   ((mp_word) u);
+067   
+068         /* the new column is the lower part of the result */
+069         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+070   
+071         /* get the carry word from the result */
+072         u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+073       \}
+074       /* set carry if it is placed below digs */
+075       if (ix + iy < digs) \{
+076         *tmpt = u;
+077       \}
+078     \}
+079   
+080     mp_clamp (&t);
+081     mp_exch (&t, c);
+082   
+083     mp_clear (&t);
+084     return MP_OKAY;
+085   \}
+086   #endif
+\end{alltt}
+\end{small}
+
+Lines 31 to 35 determine if the Comba method can be used first.  The conditions for using the Comba routine are that min$(a.used, b.used) < \delta$ and
+the number of digits of output is less than \textbf{MP\_WARRAY}.  This new constant is used to control 
+the stack usage in the Comba routines.  By default it is set to $\delta$ but can be reduced when memory is at a premium.
+
+Of particular importance is the calculation of the $ix+iy$'th column on lines 64, 65 and 66.  Note how all of the
+variables are cast to the type \textbf{mp\_word}, which is also the type of variable $\hat r$.  That is to ensure that double precision operations 
+are used instead of single precision.  The multiplication on line 65 makes use of a specific GCC optimizer behaviour.  On the outset it looks like 
+the compiler will have to use a double precision multiplication to produce the result required.  Such an operation would be horribly slow on most 
+processors and drag this to a crawl.  However, GCC is smart enough to realize that double wide output single precision multipliers can be used.  For 
+example, the instruction ``MUL'' on the x86 processor can multiply two 32-bit values and produce a 64-bit result.  
+
+\subsection{Faster Multiplication by the ``Comba'' Method}
+
+One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be computed and propagated upwards.  This
+makes the nested loop very sequential and hard to unroll and implement in parallel.  The ``Comba'' \cite{COMBA} method is named after little known 
+(\textit{in cryptographic venues}) Paul G. Comba who described a method of implementing fast multipliers that do not require nested 
+carry fixup operations.  As an interesting aside it seems that Paul Barrett describes a similar technique in
+his 1986 paper \cite{BARRETT} written five years before.
+
+At the heart of the Comba technique is once again the long-hand algorithm.  Except in this case a slight twist is placed on how
+the columns of the result are produced.  In the standard long-hand algorithm rows of products are produced then added together to form the 
+final result.  In the baseline algorithm the columns are added together after each iteration to get the result instantaneously.  
+
+In the Comba algorithm the columns of the result are produced entirely independently of each other.  That is at the $O(n^2)$ level a 
+simple multiplication and addition step is performed.  The carries of the columns are propagated after the nested loop to reduce the amount
+of work required. Succinctly the first step of the algorithm is to compute the product vector $\vec x$ as follows. 
+
+\begin{equation}
+\vec x_n = \sum_{i+j = n} a_ib_j, \forall n \in \lbrace 0, 1, 2, \ldots, i + j \rbrace
+\end{equation}
+
+Where $\vec x_n$ is the $n$'th column of the output vector.  Consider the following example which computes the vector $\vec x$ for the multiplication
+of $576$ and $241$.  
+
+\newpage\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|c|}
+  \hline &          & 5 & 7 & 6 & First Input\\
+  \hline $\times$ & & 2 & 4 & 1 & Second Input\\
+\hline            &                        & $1 \cdot 5 = 5$   & $1 \cdot 7 = 7$   & $1 \cdot 6 = 6$ & First pass \\
+                  &  $4 \cdot 5 = 20$      & $4 \cdot 7+5=33$  & $4 \cdot 6+7=31$  & 6               & Second pass \\
+   $2 \cdot 5 = 10$ &  $2 \cdot 7 + 20 = 34$ & $2 \cdot 6+33=45$ & 31                & 6             & Third pass \\
+\hline 10 & 34 & 45 & 31 & 6 & Final Result \\   
+\hline   
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Comba Multiplication Diagram}
+\end{figure}
+
+At this point the vector $x = \left < 10, 34, 45, 31, 6 \right >$ is the result of the first step of the Comba multiplier.  
+Now the columns must be fixed by propagating the carry upwards.  The resultant vector will have one extra dimension over the input vector which is
+congruent to adding a leading zero digit.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Comba Fixup}. \\
+\textbf{Input}.   Vector $\vec x$ of dimension $k$ \\
+\textbf{Output}.  Vector $\vec x$ such that the carries have been propagated. \\
+\hline \\
+1.  for $n$ from $0$ to $k - 1$ do \\
+\hspace{3mm}1.1 $\vec x_{n+1} \leftarrow \vec x_{n+1} + \lfloor \vec x_{n}/\beta \rfloor$ \\
+\hspace{3mm}1.2 $\vec x_{n} \leftarrow \vec x_{n} \mbox{ (mod }\beta\mbox{)}$ \\
+2.  Return($\vec x$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Comba Fixup}
+\end{figure}
+
+With that algorithm and $k = 5$ and $\beta = 10$ the following vector is produced $\vec x= \left < 1, 3, 8, 8, 1, 6 \right >$.  In this case 
+$241 \cdot 576$ is in fact $138816$ and the procedure succeeded.  If the algorithm is correct and as will be demonstrated shortly more
+efficient than the baseline algorithm why not simply always use this algorithm?
+
+\subsubsection{Column Weight.}
+At the nested $O(n^2)$ level the Comba method adds the product of two single precision variables to each column of the output 
+independently.  A serious obstacle is if the carry is lost, due to lack of precision before the algorithm has a chance to fix
+the carries.  For example, in the multiplication of two three-digit numbers the third column of output will be the sum of
+three single precision multiplications.  If the precision of the accumulator for the output digits is less than $3 \cdot (\beta - 1)^2$ then
+an overflow can occur and the carry information will be lost.  For any $m$ and $n$ digit inputs the maximum weight of any column is 
+min$(m, n)$ which is fairly obvious.
+
+The maximum number of terms in any column of a product is known as the ``column weight'' and strictly governs when the algorithm can be used.  Recall
+from earlier that a double precision type has $\alpha$ bits of resolution and a single precision digit has $lg(\beta)$ bits of precision.  Given these
+two quantities we must not violate the following
+
+\begin{equation}
+k \cdot \left (\beta - 1 \right )^2 < 2^{\alpha}
+\end{equation}
+
+Which reduces to 
+
+\begin{equation}
+k \cdot \left ( \beta^2 - 2\beta + 1 \right ) < 2^{\alpha}
+\end{equation}
+
+Let $\rho = lg(\beta)$ represent the number of bits in a single precision digit.  By further re-arrangement of the equation the final solution is
+found.
+
+\begin{equation}
+k  < {{2^{\alpha}} \over {\left (2^{2\rho} - 2^{\rho + 1} + 1 \right )}}
+\end{equation}
+
+The defaults for LibTomMath are $\beta = 2^{28}$ and $\alpha = 64$ which means that $k$ is bounded by $k < 257$.  In this configuration 
+the smaller input may not have more than $256$ digits if the Comba method is to be used.  This is quite satisfactory for most applications since 
+$256$ digits would allow for numbers in the range of $0 \le x < 2^{7168}$ which is much larger than most public key cryptographic algorithms require.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{fast\_s\_mp\_mul\_digs}. \\
+\textbf{Input}.   mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
+\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
+\hline \\
+Place an array of \textbf{MP\_WARRAY} double precision digits named $\hat W$ on the stack. \\
+1.  If $c.alloc < digs$ then grow $c$ to $digs$ digits. (\textit{mp\_grow}) \\
+2.  If step 1 failed return(\textit{MP\_MEM}).\\
+\\
+Zero the temporary array $\hat W$. \\
+3.  for $n$ from $0$ to $digs - 1$ do \\
+\hspace{3mm}3.1  $\hat W_n \leftarrow 0$ \\
+\\
+Compute the columns. \\
+4.  for $ix$ from $0$ to $a.used - 1$ do \\
+\hspace{3mm}4.1  $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\
+\hspace{3mm}4.2  If $pb < 1$ then goto step 5. \\
+\hspace{3mm}4.3  for $iy$ from $0$ to $pb - 1$ do \\
+\hspace{6mm}4.3.1  $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}b_{iy}$ \\
+\\
+Propagate the carries upwards. \\
+5.  $oldused \leftarrow c.used$ \\
+6.  $c.used \leftarrow digs$ \\
+7.  If $digs > 1$ then do \\
+\hspace{3mm}7.1.  for $ix$ from $1$ to $digs - 1$ do \\
+\hspace{6mm}7.1.1  $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix-1} / \beta \rfloor$ \\
+\hspace{6mm}7.1.2  $c_{ix - 1} \leftarrow \hat W_{ix - 1} \mbox{ (mod }\beta\mbox{)}$ \\
+8.  else do \\
+\hspace{3mm}8.1  $ix \leftarrow 0$ \\
+9.  $c_{ix} \leftarrow \hat W_{ix} \mbox{ (mod }\beta\mbox{)}$ \\
+\\
+Zero excess digits. \\
+10.  If $digs < oldused$ then do \\
+\hspace{3mm}10.1  for $n$ from $digs$ to $oldused - 1$ do \\
+\hspace{6mm}10.1.1  $c_n \leftarrow 0$ \\
+11.  Clamp excessive digits of $c$.  (\textit{mp\_clamp}) \\
+12.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm fast\_s\_mp\_mul\_digs}
+\label{fig:COMBAMULT}
+\end{figure}
+
+\textbf{Algorithm fast\_s\_mp\_mul\_digs.}
+This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision.  The algorithm
+essentially performs the same calculation as algorithm s\_mp\_mul\_digs, just much faster.
+
+The array $\hat W$ is meant to be on the stack when the algorithm is used.  The size of the array does not change which is ideal.  Note also that 
+unlike algorithm s\_mp\_mul\_digs no temporary mp\_int is required since the result is calculated directly in $\hat W$.  
+
+The $O(n^2)$ loop on step four is where the Comba method's advantages begin to show through in comparison to the baseline algorithm.  The lack of
+a carry variable or propagation in this loop allows the loop to be performed with only single precision multiplication and additions.  Now that each
+iteration of the inner loop can be performed independent of the others the inner loop can be performed with a high level of parallelism.
+
+To measure the benefits of the Comba method over the baseline method consider the number of operations that are required.  If the 
+cost in terms of time of a multiply and addition is $p$ and the cost of a carry propagation is $q$ then a baseline multiplication would require 
+$O \left ((p + q)n^2 \right )$ time to multiply two $n$-digit numbers.  The Comba method requires only $O(pn^2 + qn)$ time, however in practice, 
+the speed increase is actually much more.  With $O(n)$ space the algorithm can be reduced to $O(pn + qn)$ time by implementing the $n$ multiply
+and addition operations in the nested loop in parallel.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_fast\_s\_mp\_mul\_digs.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* Fast (comba) multiplier
+018    *
+019    * This is the fast column-array [comba] multiplier.  It is 
+020    * designed to compute the columns of the product first 
+021    * then handle the carries afterwards.  This has the effect 
+022    * of making the nested loops that compute the columns very
+023    * simple and schedulable on super-scalar processors.
+024    *
+025    * This has been modified to produce a variable number of 
+026    * digits of output so if say only a half-product is required 
+027    * you don't have to compute the upper half (a feature 
+028    * required for fast Barrett reduction).
+029    *
+030    * Based on Algorithm 14.12 on pp.595 of HAC.
+031    *
+032    */
+033   int
+034   fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+035   \{
+036     int     olduse, res, pa, ix, iz;
+037     mp_digit W[MP_WARRAY];
+038     register mp_word  _W;
+039   
+040     /* grow the destination as required */
+041     if (c->alloc < digs) \{
+042       if ((res = mp_grow (c, digs)) != MP_OKAY) \{
+043         return res;
+044       \}
+045     \}
+046   
+047     /* number of output digits to produce */
+048     pa = MIN(digs, a->used + b->used);
+049   
+050     /* clear the carry */
+051     _W = 0;
+052     for (ix = 0; ix <= pa; ix++) \{ 
+053         int      tx, ty;
+054         int      iy;
+055         mp_digit *tmpx, *tmpy;
+056   
+057         /* get offsets into the two bignums */
+058         ty = MIN(b->used-1, ix);
+059         tx = ix - ty;
+060   
+061         /* setup temp aliases */
+062         tmpx = a->dp + tx;
+063         tmpy = b->dp + ty;
+064   
+065         /* this is the number of times the loop will iterrate, essentially its
+       
+066            while (tx++ < a->used && ty-- >= 0) \{ ... \}
+067          */
+068         iy = MIN(a->used-tx, ty+1);
+069   
+070         /* execute loop */
+071         for (iz = 0; iz < iy; ++iz) \{
+072            _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
+073         \}
+074   
+075         /* store term */
+076         W[ix] = ((mp_digit)_W) & MP_MASK;
+077   
+078         /* make next carry */
+079         _W = _W >> ((mp_word)DIGIT_BIT);
+080     \}
+081   
+082     /* setup dest */
+083     olduse  = c->used;
+084     c->used = digs;
+085   
+086     \{
+087       register mp_digit *tmpc;
+088       tmpc = c->dp;
+089       for (ix = 0; ix < digs; ix++) \{
+090         /* now extract the previous digit [below the carry] */
+091         *tmpc++ = W[ix];
+092       \}
+093   
+094       /* clear unused digits [that existed in the old copy of c] */
+095       for (; ix < olduse; ix++) \{
+096         *tmpc++ = 0;
+097       \}
+098     \}
+099     mp_clamp (c);
+100     return MP_OKAY;
+101   \}
+102   #endif
+\end{alltt}
+\end{small}
+
+The memset on line @47,memset@ clears the initial $\hat W$ array to zero in a single step. Like the slower baseline multiplication
+implementation a series of aliases (\textit{lines 62, 63 and 76}) are used to simplify the inner $O(n^2)$ loop.  
+In this case a new alias $\_\hat W$ has been added which refers to the double precision columns offset by $ix$ in each pass.  
+
+The inner loop on lines 89, 79 and 80 is where the algorithm will spend the majority of the time, which is why it has been 
+stripped to the bones of any extra baggage\footnote{Hence the pointer aliases.}.  On x86 processors the multiplication and additions amount to at the 
+very least five instructions (\textit{two loads, two additions, one multiply}) while on the ARMv4 processors they amount to only three 
+(\textit{one load, one store, one multiply-add}).   For both of the x86 and ARMv4 processors the GCC compiler performs a good job at unrolling the loop 
+and scheduling the instructions so there are very few dependency stalls.
+
+In theory the difference between the baseline and comba algorithms is a mere $O(qn)$ time difference.  However, in the $O(n^2)$ nested loop of the
+baseline method there are dependency stalls as the algorithm must wait for the multiplier to finish before propagating the carry to the next 
+digit.  As a result fewer of the often multiple execution units\footnote{The AMD Athlon has three execution units and the Intel P4 has four.} can
+be simultaneously used.  
+
+\subsection{Polynomial Basis Multiplication}
+To break the $O(n^2)$ barrier in multiplication requires a completely different look at integer multiplication.  In the following algorithms
+the use of polynomial basis representation for two integers $a$ and $b$ as $f(x) = \sum_{i=0}^{n} a_i x^i$ and  
+$g(x) = \sum_{i=0}^{n} b_i x^i$ respectively, is required.  In this system both $f(x)$ and $g(x)$ have $n + 1$ terms and are of the $n$'th degree.
+ 
+The product $a \cdot b \equiv f(x)g(x)$ is the polynomial $W(x) = \sum_{i=0}^{2n} w_i x^i$.  The coefficients $w_i$ will
+directly yield the desired product when $\beta$ is substituted for $x$.  The direct solution to solve for the $2n + 1$ coefficients
+requires $O(n^2)$ time and would in practice be slower than the Comba technique.
+
+However, numerical analysis theory indicates that only $2n + 1$ distinct points in $W(x)$ are required to determine the values of the $2n + 1$ unknown 
+coefficients.   This means by finding $\zeta_y = W(y)$ for $2n + 1$ small values of $y$ the coefficients of $W(x)$ can be found with 
+Gaussian elimination.  This technique is also occasionally referred to as the \textit{interpolation technique} (\textit{references please...}) since in 
+effect an interpolation based on $2n + 1$ points will yield a polynomial equivalent to $W(x)$.  
+
+The coefficients of the polynomial $W(x)$ are unknown which makes finding $W(y)$ for any value of $y$ impossible.  However, since 
+$W(x) = f(x)g(x)$ the equivalent $\zeta_y = f(y) g(y)$ can be used in its place.  The benefit of this technique stems from the 
+fact that $f(y)$ and $g(y)$ are much smaller than either $a$ or $b$ respectively.  As a result finding the $2n + 1$ relations required 
+by multiplying $f(y)g(y)$ involves multiplying integers that are much smaller than either of the inputs.
+
+When picking points to gather relations there are always three obvious points to choose, $y = 0, 1$ and $ \infty$.  The $\zeta_0$ term
+is simply the product $W(0) = w_0 = a_0 \cdot b_0$.  The $\zeta_1$ term is the product 
+$W(1) = \left (\sum_{i = 0}^{n} a_i \right ) \left (\sum_{i = 0}^{n} b_i \right )$.  The third point $\zeta_{\infty}$ is less obvious but rather
+simple to explain.  The $2n + 1$'th coefficient of $W(x)$ is numerically equivalent to the most significant column in an integer multiplication.  
+The point at $\infty$ is used symbolically to represent the most significant column, that is $W(\infty) = w_{2n} = a_nb_n$.  Note that the 
+points at $y = 0$ and $\infty$ yield the coefficients $w_0$ and $w_{2n}$ directly.
+
+If more points are required they should be of small values and powers of two such as $2^q$ and the related \textit{mirror points} 
+$\left (2^q \right )^{2n}  \cdot \zeta_{2^{-q}}$ for small values of $q$.  The term ``mirror point'' stems from the fact that 
+$\left (2^q \right )^{2n}  \cdot \zeta_{2^{-q}}$ can be calculated in the exact opposite fashion as $\zeta_{2^q}$.  For
+example, when $n = 2$ and $q = 1$ then the following two equations are equivalent to the point $\zeta_{2}$ and its mirror.
+
+\begin{eqnarray}
+\zeta_{2}                  = f(2)g(2) = (4a_2 + 2a_1 + a_0)(4b_2 + 2b_1 + b_0) \nonumber \\
+16 \cdot \zeta_{1 \over 2} = 4f({1\over 2}) \cdot 4g({1 \over 2}) = (a_2 + 2a_1 + 4a_0)(b_2 + 2b_1 + 4b_0)
+\end{eqnarray}
+
+Using such points will allow the values of $f(y)$ and $g(y)$ to be independently calculated using only left shifts.  For example, when $n = 2$ the
+polynomial $f(2^q)$ is equal to $2^q((2^qa_2) + a_1) + a_0$.  This technique of polynomial representation is known as Horner's method.  
+
+As a general rule of the algorithm when the inputs are split into $n$ parts each there are $2n - 1$ multiplications.  Each multiplication is of 
+multiplicands that have $n$ times fewer digits than the inputs.  The asymptotic running time of this algorithm is 
+$O \left ( k^{lg_n(2n - 1)} \right )$ for $k$ digit inputs (\textit{assuming they have the same number of digits}).  Figure~\ref{fig:exponent}
+summarizes the exponents for various values of $n$.
+
+\begin{figure}
+\begin{center}
+\begin{tabular}{|c|c|c|}
+\hline \textbf{Split into $n$ Parts} & \textbf{Exponent}  & \textbf{Notes}\\
+\hline $2$ & $1.584962501$ & This is Karatsuba Multiplication. \\
+\hline $3$ & $1.464973520$ & This is Toom-Cook Multiplication. \\
+\hline $4$ & $1.403677461$ &\\
+\hline $5$ & $1.365212389$ &\\
+\hline $10$ & $1.278753601$ &\\
+\hline $100$ & $1.149426538$ &\\
+\hline $1000$ & $1.100270931$ &\\
+\hline $10000$ & $1.075252070$ &\\
+\hline
+\end{tabular}
+\end{center}
+\caption{Asymptotic Running Time of Polynomial Basis Multiplication}
+\label{fig:exponent}
+\end{figure}
+
+At first it may seem like a good idea to choose $n = 1000$ since the exponent is approximately $1.1$.  However, the overhead
+of solving for the 2001 terms of $W(x)$ will certainly consume any savings the algorithm could offer for all but exceedingly large
+numbers.  
+
+\subsubsection{Cutoff Point}
+The polynomial basis multiplication algorithms all require fewer single precision multiplications than a straight Comba approach.  However, 
+the algorithms incur an overhead (\textit{at the $O(n)$ work level}) since they require a system of equations to be solved.  This makes the
+polynomial basis approach more costly to use with small inputs.
+
+Let $m$ represent the number of digits in the multiplicands (\textit{assume both multiplicands have the same number of digits}).  There exists a 
+point $y$ such that when $m < y$ the polynomial basis algorithms are more costly than Comba, when $m = y$ they are roughly the same cost and 
+when $m > y$ the Comba methods are slower than the polynomial basis algorithms.  
+
+The exact location of $y$ depends on several key architectural elements of the computer platform in question.
+
+\begin{enumerate}
+\item  The ratio of clock cycles for single precision multiplication versus other simpler operations such as addition, shifting, etc.  For example
+on the AMD Athlon the ratio is roughly $17 : 1$ while on the Intel P4 it is $29 : 1$.  The higher the ratio in favour of multiplication the lower
+the cutoff point $y$ will be.  
+
+\item  The complexity of the linear system of equations (\textit{for the coefficients of $W(x)$}).  Generally speaking as the number of splits
+grows the complexity grows substantially.  Ideally solving the system will only involve addition, subtraction and shifting of integers.  This
+directly reflects on the ratio previously mentioned.
+
+\item  To a lesser extent memory bandwidth and function call overheads.  Provided the values are in the processor cache this is less of an
+influence over the cutoff point.
+
+\end{enumerate}
+
+A clean cutoff point separation occurs when a point $y$ is found such that all of the cutoff point conditions are met.  For example, if the point
+is too low then there will be values of $m$ such that $m > y$ and the Comba method is still faster.  Finding the cutoff points is fairly simple when
+a high resolution timer is available.  
+
+\subsection{Karatsuba Multiplication}
+Karatsuba \cite{KARA} multiplication when originally proposed in 1962 was among the first set of algorithms to break the $O(n^2)$ barrier for
+general purpose multiplication.  Given two polynomial basis representations $f(x) = ax + b$ and $g(x) = cx + d$, Karatsuba proved with 
+light algebra \cite{KARAP} that the following polynomial is equivalent to multiplication of the two integers the polynomials represent.
+
+\begin{equation}
+f(x) \cdot g(x) = acx^2 + ((ac + bd) - (a - b)(c - d))x + bd
+\end{equation}
+
+Using the observation that $ac$ and $bd$ could be re-used only three half sized multiplications would be required to produce the product.  Applying
+this algorithm recursively, the work factor becomes $O(n^{lg(3)})$ which is substantially better than the work factor $O(n^2)$ of the Comba technique.  It turns 
+out what Karatsuba did not know or at least did not publish was that this is simply polynomial basis multiplication with the points 
+$\zeta_0$, $\zeta_{\infty}$ and $-\zeta_{-1}$.  Consider the resultant system of equations.
+
+\begin{center}
+\begin{tabular}{rcrcrcrc}
+$\zeta_{0}$ &      $=$ &  &  &  & & $w_0$ \\
+$-\zeta_{-1}$ &    $=$ & $-w_2$ & $+$ & $w_1$ & $-$ & $w_0$ \\
+$\zeta_{\infty}$ & $=$ & $w_2$ &  & &  & \\
+\end{tabular}
+\end{center}
+
+By adding the first and last equation to the equation in the middle the term $w_1$ can be isolated and all three coefficients solved for.  The simplicity
+of this system of equations has made Karatsuba fairly popular.  In fact the cutoff point is often fairly low\footnote{With LibTomMath 0.18 it is 70 and 109 digits for the Intel P4 and AMD Athlon respectively.}
+making it an ideal algorithm to speed up certain public key cryptosystems such as RSA and Diffie-Hellman.  It is worth noting that the point 
+$\zeta_1$ could be substituted for $-\zeta_{-1}$.  In this case the first and third row are subtracted instead of added to the second row.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_karatsuba\_mul}. \\
+\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
+\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert$ \\
+\hline \\
+1.  Init the following mp\_int variables: $x0$, $x1$, $y0$, $y1$, $t1$, $x0y0$, $x1y1$.\\
+2.  If step 1 failed then return(\textit{MP\_MEM}). \\
+\\
+Split the input.  e.g. $a = x1 \cdot \beta^B + x0$ \\
+3.  $B \leftarrow \mbox{min}(a.used, b.used)/2$ \\
+4.  $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{mp\_mod\_2d}) \\
+5.  $y0 \leftarrow b \mbox{ (mod }\beta^B\mbox{)}$ \\
+6.  $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{mp\_rshd}) \\
+7.  $y1 \leftarrow \lfloor b / \beta^B \rfloor$ \\
+\\
+Calculate the three products. \\
+8.  $x0y0 \leftarrow x0 \cdot y0$ (\textit{mp\_mul}) \\
+9.  $x1y1 \leftarrow x1 \cdot y1$ \\
+10.  $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\
+11.  $x0 \leftarrow y1 - y0$ \\
+12.  $t1 \leftarrow t1 \cdot x0$ \\
+\\
+Calculate the middle term. \\
+13.  $x0 \leftarrow x0y0 + x1y1$ \\
+14.  $t1 \leftarrow x0 - t1$ \\
+\\
+Calculate the final product. \\
+15.  $t1 \leftarrow t1 \cdot \beta^B$ (\textit{mp\_lshd}) \\
+16.  $x1y1 \leftarrow x1y1 \cdot \beta^{2B}$ \\
+17.  $t1 \leftarrow x0y0 + t1$ \\
+18.  $c \leftarrow t1 + x1y1$ \\
+19.  Clear all of the temporary variables. \\
+20.  Return(\textit{MP\_OKAY}).\\
+\hline 
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_karatsuba\_mul}
+\end{figure}
+
+\textbf{Algorithm mp\_karatsuba\_mul.}
+This algorithm computes the unsigned product of two inputs using the Karatsuba multiplication algorithm.  It is loosely based on the description
+from Knuth \cite[pp. 294-295]{TAOCPV2}.  
+
+\index{radix point}
+In order to split the two inputs into their respective halves, a suitable \textit{radix point} must be chosen.  The radix point chosen must
+be used for both of the inputs meaning that it must be smaller than the smallest input.  Step 3 chooses the radix point $B$ as half of the 
+smallest input \textbf{used} count.  After the radix point is chosen the inputs are split into lower and upper halves.  Step 4 and 5 
+compute the lower halves.  Step 6 and 7 compute the upper halves.  
+
+After the halves have been computed the three intermediate half-size products must be computed.  Step 8 and 9 compute the trivial products
+$x0 \cdot y0$ and $x1 \cdot y1$.  The mp\_int $x0$ is used as a temporary variable after $x1 - x0$ has been computed.  By using $x0$ instead
+of an additional temporary variable, the algorithm can avoid an additional memory allocation operation.
+
+The remaining steps 13 through 18 compute the Karatsuba polynomial through a variety of digit shifting and addition operations.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_karatsuba\_mul.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* c = |a| * |b| using Karatsuba Multiplication using 
+018    * three half size multiplications
+019    *
+020    * Let B represent the radix [e.g. 2**DIGIT_BIT] and 
+021    * let n represent half of the number of digits in 
+022    * the min(a,b)
+023    *
+024    * a = a1 * B**n + a0
+025    * b = b1 * B**n + b0
+026    *
+027    * Then, a * b => 
+028      a1b1 * B**2n + (a0b0 + a1b1 - (a1 - a0)(b1 - b0)) * B**n + a0b0
+029    *
+030    * Note that a1b1 and a0b0 are used twice and only need to be 
+031    * computed once.  So in total three half size (half # of 
+032    * digit) multiplications are performed, a0b0, a1b1 and 
+033    * (a1-a0)(b1-b0)
+034    *
+035    * Note that a multiplication of half the digits requires
+036    * 1/4th the number of single precision multiplications so in 
+037    * total after one call 25% of the single precision multiplications 
+038    * are saved.  Note also that the call to mp_mul can end up back 
+039    * in this function if the a0, a1, b0, or b1 are above the threshold.  
+040    * This is known as divide-and-conquer and leads to the famous 
+041    * O(N**lg(3)) or O(N**1.584) work which is asymptopically lower than 
+042    * the standard O(N**2) that the baseline/comba methods use.  
+043    * Generally though the overhead of this method doesn't pay off 
+044    * until a certain size (N ~ 80) is reached.
+045    */
+046   int mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
+047   \{
+048     mp_int  x0, x1, y0, y1, t1, x0y0, x1y1;
+049     int     B, err;
+050   
+051     /* default the return code to an error */
+052     err = MP_MEM;
+053   
+054     /* min # of digits */
+055     B = MIN (a->used, b->used);
+056   
+057     /* now divide in two */
+058     B = B >> 1;
+059   
+060     /* init copy all the temps */
+061     if (mp_init_size (&x0, B) != MP_OKAY)
+062       goto ERR;
+063     if (mp_init_size (&x1, a->used - B) != MP_OKAY)
+064       goto X0;
+065     if (mp_init_size (&y0, B) != MP_OKAY)
+066       goto X1;
+067     if (mp_init_size (&y1, b->used - B) != MP_OKAY)
+068       goto Y0;
+069   
+070     /* init temps */
+071     if (mp_init_size (&t1, B * 2) != MP_OKAY)
+072       goto Y1;
+073     if (mp_init_size (&x0y0, B * 2) != MP_OKAY)
+074       goto T1;
+075     if (mp_init_size (&x1y1, B * 2) != MP_OKAY)
+076       goto X0Y0;
+077   
+078     /* now shift the digits */
+079     x0.used = y0.used = B;
+080     x1.used = a->used - B;
+081     y1.used = b->used - B;
+082   
+083     \{
+084       register int x;
+085       register mp_digit *tmpa, *tmpb, *tmpx, *tmpy;
+086   
+087       /* we copy the digits directly instead of using higher level functions
+088        * since we also need to shift the digits
+089        */
+090       tmpa = a->dp;
+091       tmpb = b->dp;
+092   
+093       tmpx = x0.dp;
+094       tmpy = y0.dp;
+095       for (x = 0; x < B; x++) \{
+096         *tmpx++ = *tmpa++;
+097         *tmpy++ = *tmpb++;
+098       \}
+099   
+100       tmpx = x1.dp;
+101       for (x = B; x < a->used; x++) \{
+102         *tmpx++ = *tmpa++;
+103       \}
+104   
+105       tmpy = y1.dp;
+106       for (x = B; x < b->used; x++) \{
+107         *tmpy++ = *tmpb++;
+108       \}
+109     \}
+110   
+111     /* only need to clamp the lower words since by definition the 
+112      * upper words x1/y1 must have a known number of digits
+113      */
+114     mp_clamp (&x0);
+115     mp_clamp (&y0);
+116   
+117     /* now calc the products x0y0 and x1y1 */
+118     /* after this x0 is no longer required, free temp [x0==t2]! */
+119     if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)  
+120       goto X1Y1;          /* x0y0 = x0*y0 */
+121     if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
+122       goto X1Y1;          /* x1y1 = x1*y1 */
+123   
+124     /* now calc x1-x0 and y1-y0 */
+125     if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
+126       goto X1Y1;          /* t1 = x1 - x0 */
+127     if (mp_sub (&y1, &y0, &x0) != MP_OKAY)
+128       goto X1Y1;          /* t2 = y1 - y0 */
+129     if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
+130       goto X1Y1;          /* t1 = (x1 - x0) * (y1 - y0) */
+131   
+132     /* add x0y0 */
+133     if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
+134       goto X1Y1;          /* t2 = x0y0 + x1y1 */
+135     if (mp_sub (&x0, &t1, &t1) != MP_OKAY)
+136       goto X1Y1;          /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
+137   
+138     /* shift by B */
+139     if (mp_lshd (&t1, B) != MP_OKAY)
+140       goto X1Y1;          /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
+141     if (mp_lshd (&x1y1, B * 2) != MP_OKAY)
+142       goto X1Y1;          /* x1y1 = x1y1 << 2*B */
+143   
+144     if (mp_add (&x0y0, &t1, &t1) != MP_OKAY)
+145       goto X1Y1;          /* t1 = x0y0 + t1 */
+146     if (mp_add (&t1, &x1y1, c) != MP_OKAY)
+147       goto X1Y1;          /* t1 = x0y0 + t1 + x1y1 */
+148   
+149     /* Algorithm succeeded set the return code to MP_OKAY */
+150     err = MP_OKAY;
+151   
+152   X1Y1:mp_clear (&x1y1);
+153   X0Y0:mp_clear (&x0y0);
+154   T1:mp_clear (&t1);
+155   Y1:mp_clear (&y1);
+156   Y0:mp_clear (&y0);
+157   X1:mp_clear (&x1);
+158   X0:mp_clear (&x0);
+159   ERR:
+160     return err;
+161   \}
+162   #endif
+\end{alltt}
+\end{small}
+
+The new coding element in this routine, not  seen in previous routines, is the usage of goto statements.  The conventional
+wisdom is that goto statements should be avoided.  This is generally true, however when every single function call can fail, it makes sense
+to handle error recovery with a single piece of code.  Lines 61 to 75 handle initializing all of the temporary variables 
+required.  Note how each of the if statements goes to a different label in case of failure.  This allows the routine to correctly free only
+the temporaries that have been successfully allocated so far.
+
+The temporary variables are all initialized using the mp\_init\_size routine since they are expected to be large.  This saves the 
+additional reallocation that would have been necessary.  Also $x0$, $x1$, $y0$ and $y1$ have to be able to hold at least their respective
+number of digits for the next section of code.
+
+The first algebraic portion of the algorithm is to split the two inputs into their halves.  However, instead of using mp\_mod\_2d and mp\_rshd
+to extract the halves, the respective code has been placed inline within the body of the function.  To initialize the halves, the \textbf{used} and 
+\textbf{sign} members are copied first.  The first for loop on line 95 copies the lower halves.  Since they are both the same magnitude it 
+is simpler to calculate both lower halves in a single loop.  The for loops on lines 101 and 106 calculate the upper halves $x1$ and 
+$y1$ respectively.
+
+By inlining the calculation of the halves, the Karatsuba multiplier has a slightly lower overhead and can be used for smaller magnitude inputs.
+
+When line 150 is reached, the algorithm has completed successfully.  The ``error status'' variable $err$ is set to \textbf{MP\_OKAY} so that
+the same code that handles errors can be used to clear the temporary variables and return.  
+
+\subsection{Toom-Cook $3$-Way Multiplication}
+Toom-Cook $3$-Way \cite{TOOM} multiplication is essentially the polynomial basis algorithm for $n = 2$ except that the points  are 
+chosen such that $\zeta$ is easy to compute and the resulting system of equations easy to reduce.  Here, the points $\zeta_{0}$, 
+$16 \cdot \zeta_{1 \over 2}$, $\zeta_1$, $\zeta_2$ and $\zeta_{\infty}$ make up the five required points to solve for the coefficients 
+of the $W(x)$.
+
+With the five relations that Toom-Cook specifies, the following system of equations is formed.
+
+\begin{center}
+\begin{tabular}{rcrcrcrcrcr}
+$\zeta_0$                    & $=$ & $0w_4$ & $+$ & $0w_3$ & $+$ & $0w_2$ & $+$ & $0w_1$ & $+$ & $1w_0$  \\
+$16 \cdot \zeta_{1 \over 2}$ & $=$ & $1w_4$ & $+$ & $2w_3$ & $+$ & $4w_2$ & $+$ & $8w_1$ & $+$ & $16w_0$  \\
+$\zeta_1$                    & $=$ & $1w_4$ & $+$ & $1w_3$ & $+$ & $1w_2$ & $+$ & $1w_1$ & $+$ & $1w_0$  \\
+$\zeta_2$                    & $=$ & $16w_4$ & $+$ & $8w_3$ & $+$ & $4w_2$ & $+$ & $2w_1$ & $+$ & $1w_0$  \\
+$\zeta_{\infty}$             & $=$ & $1w_4$ & $+$ & $0w_3$ & $+$ & $0w_2$ & $+$ & $0w_1$ & $+$ & $0w_0$  \\
+\end{tabular}
+\end{center}
+
+A trivial solution to this matrix requires $12$ subtractions, two multiplications by a small power of two, two divisions by a small power
+of two, two divisions by three and one multiplication by three.  All of these $19$ sub-operations require less than quadratic time, meaning that
+the algorithm can be faster than a baseline multiplication.  However, the greater complexity of this algorithm places the cutoff point
+(\textbf{TOOM\_MUL\_CUTOFF}) where Toom-Cook becomes more efficient much higher than the Karatsuba cutoff point.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_toom\_mul}. \\
+\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
+\textbf{Output}.  $c \leftarrow  a  \cdot  b $ \\
+\hline \\
+Split $a$ and $b$ into three pieces.  E.g. $a = a_2 \beta^{2k} + a_1 \beta^{k} + a_0$ \\
+1.  $k \leftarrow \lfloor \mbox{min}(a.used, b.used) / 3 \rfloor$ \\
+2.  $a_0 \leftarrow a \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+3.  $a_1 \leftarrow \lfloor a / \beta^k \rfloor$, $a_1 \leftarrow a_1 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+4.  $a_2 \leftarrow \lfloor a / \beta^{2k} \rfloor$, $a_2 \leftarrow a_2 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+5.  $b_0 \leftarrow b \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+6.  $b_1 \leftarrow \lfloor b / \beta^k \rfloor$, $b_1 \leftarrow b_1 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+7.  $b_2 \leftarrow \lfloor b / \beta^{2k} \rfloor$, $b_2 \leftarrow b_2 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+\\
+Find the five equations for $w_0, w_1, ..., w_4$. \\
+8.  $w_0 \leftarrow a_0 \cdot b_0$ \\
+9.  $w_4 \leftarrow a_2 \cdot b_2$ \\
+10. $tmp_1 \leftarrow 2 \cdot a_0$, $tmp_1 \leftarrow a_1 + tmp_1$, $tmp_1 \leftarrow 2 \cdot tmp_1$, $tmp_1 \leftarrow tmp_1 + a_2$ \\
+11. $tmp_2 \leftarrow 2 \cdot b_0$, $tmp_2 \leftarrow b_1 + tmp_2$, $tmp_2 \leftarrow 2 \cdot tmp_2$, $tmp_2 \leftarrow tmp_2 + b_2$ \\
+12. $w_1 \leftarrow tmp_1 \cdot tmp_2$ \\
+13. $tmp_1 \leftarrow 2 \cdot a_2$, $tmp_1 \leftarrow a_1 + tmp_1$, $tmp_1 \leftarrow 2 \cdot tmp_1$, $tmp_1 \leftarrow tmp_1 + a_0$ \\
+14. $tmp_2 \leftarrow 2 \cdot b_2$, $tmp_2 \leftarrow b_1 + tmp_2$, $tmp_2 \leftarrow 2 \cdot tmp_2$, $tmp_2 \leftarrow tmp_2 + b_0$ \\
+15. $w_3 \leftarrow tmp_1 \cdot tmp_2$ \\
+16. $tmp_1 \leftarrow a_0 + a_1$, $tmp_1 \leftarrow tmp_1 + a_2$, $tmp_2 \leftarrow b_0 + b_1$, $tmp_2 \leftarrow tmp_2 + b_2$ \\
+17. $w_2 \leftarrow tmp_1 \cdot tmp_2$ \\
+\\
+Continued on the next page.\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_toom\_mul}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_toom\_mul} (continued). \\
+\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
+\textbf{Output}.  $c \leftarrow a \cdot  b $ \\
+\hline \\
+Now solve the system of equations. \\
+18. $w_1 \leftarrow w_1 - w_4$, $w_3 \leftarrow w_3 - w_0$ \\
+19. $w_1 \leftarrow \lfloor w_1 / 2 \rfloor$, $w_3 \leftarrow \lfloor w_3 / 2 \rfloor$ \\
+20. $w_2 \leftarrow w_2 - w_0$, $w_2 \leftarrow w_2 - w_4$ \\
+21. $w_1 \leftarrow w_1 - w_2$, $w_3 \leftarrow w_3 - w_2$ \\
+22. $tmp_1 \leftarrow 8 \cdot w_0$, $w_1 \leftarrow w_1 - tmp_1$, $tmp_1 \leftarrow 8 \cdot w_4$, $w_3 \leftarrow w_3 - tmp_1$ \\
+23. $w_2 \leftarrow 3 \cdot w_2$, $w_2 \leftarrow w_2 - w_1$, $w_2 \leftarrow w_2 - w_3$ \\
+24. $w_1 \leftarrow w_1 - w_2$, $w_3 \leftarrow w_3 - w_2$ \\
+25. $w_1 \leftarrow \lfloor w_1 / 3 \rfloor, w_3 \leftarrow \lfloor w_3 / 3 \rfloor$ \\
+\\
+Now substitute $\beta^k$ for $x$ by shifting $w_0, w_1, ..., w_4$. \\
+26. for $n$ from $1$ to $4$ do \\
+\hspace{3mm}26.1  $w_n \leftarrow w_n \cdot \beta^{nk}$ \\
+27. $c \leftarrow w_0 + w_1$, $c \leftarrow c + w_2$, $c \leftarrow c + w_3$, $c \leftarrow c + w_4$ \\
+28. Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_toom\_mul (continued)}
+\end{figure}
+
+\textbf{Algorithm mp\_toom\_mul.}
+This algorithm computes the product of two mp\_int variables $a$ and $b$ using the Toom-Cook approach.  Compared to the Karatsuba multiplication, this 
+algorithm has a lower asymptotic running time of approximately $O(n^{1.464})$ but at an obvious cost in overhead.  In this
+description, several statements have been compounded to save space.  The intention is that the statements are executed from left to right across
+any given step.
+
+The two inputs $a$ and $b$ are first split into three $k$-digit integers $a_0, a_1, a_2$ and $b_0, b_1, b_2$ respectively.  From these smaller
+integers the coefficients of the polynomial basis representations $f(x)$ and $g(x)$ are known and can be used to find the relations required.
+
+The first two relations $w_0$ and $w_4$ are the points $\zeta_{0}$ and $\zeta_{\infty}$ respectively.  The relations $w_1, w_2$ and $w_3$ correspond
+to the points $16 \cdot \zeta_{1 \over 2}, \zeta_{1}$ and $\zeta_{2}$ respectively.  These are found using logical shifts to independently find
+$f(y)$ and $g(y)$ which significantly speeds up the algorithm.
+
+After the five relations $w_0, w_1, \ldots, w_4$ have been computed, the system they represent must be solved in order for the unknown coefficients 
+$w_1, w_2$ and $w_3$ to be isolated.  The steps 18 through 25 perform the system reduction required as previously described.  Each step of
+the reduction represents the comparable matrix operation that would be performed had this been performed by pencil.  For example, step 18 indicates
+that row $4$ must be subtracted from row $1$ and simultaneously row $0$ subtracted from row $3$.  
+
+Once the coefficients have been isolated, the polynomial $W(x) = \sum_{i=0}^{2n} w_i x^i$ is known.  By substituting $\beta^{k}$ for $x$, the integer 
+result $a \cdot b$ is produced.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_toom\_mul.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* multiplication using the Toom-Cook 3-way algorithm 
+018    *
+019    * Much more complicated than Karatsuba but has a lower asymptotic running t
+      ime of 
+020    * O(N**1.464).  This algorithm is only particularly useful on VERY large
+021    * inputs (we're talking 1000s of digits here...).
+022   */
+023   int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
+024   \{
+025       mp_int w0, w1, w2, w3, w4, tmp1, tmp2, a0, a1, a2, b0, b1, b2;
+026       int res, B;
+027           
+028       /* init temps */
+029       if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, 
+030                                &a0, &a1, &a2, &b0, &b1, 
+031                                &b2, &tmp1, &tmp2, NULL)) != MP_OKAY) \{
+032          return res;
+033       \}
+034       
+035       /* B */
+036       B = MIN(a->used, b->used) / 3;
+037       
+038       /* a = a2 * B**2 + a1 * B + a0 */
+039       if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) \{
+040          goto ERR;
+041       \}
+042   
+043       if ((res = mp_copy(a, &a1)) != MP_OKAY) \{
+044          goto ERR;
+045       \}
+046       mp_rshd(&a1, B);
+047       mp_mod_2d(&a1, DIGIT_BIT * B, &a1);
+048   
+049       if ((res = mp_copy(a, &a2)) != MP_OKAY) \{
+050          goto ERR;
+051       \}
+052       mp_rshd(&a2, B*2);
+053       
+054       /* b = b2 * B**2 + b1 * B + b0 */
+055       if ((res = mp_mod_2d(b, DIGIT_BIT * B, &b0)) != MP_OKAY) \{
+056          goto ERR;
+057       \}
+058   
+059       if ((res = mp_copy(b, &b1)) != MP_OKAY) \{
+060          goto ERR;
+061       \}
+062       mp_rshd(&b1, B);
+063       mp_mod_2d(&b1, DIGIT_BIT * B, &b1);
+064   
+065       if ((res = mp_copy(b, &b2)) != MP_OKAY) \{
+066          goto ERR;
+067       \}
+068       mp_rshd(&b2, B*2);
+069       
+070       /* w0 = a0*b0 */
+071       if ((res = mp_mul(&a0, &b0, &w0)) != MP_OKAY) \{
+072          goto ERR;
+073       \}
+074       
+075       /* w4 = a2 * b2 */
+076       if ((res = mp_mul(&a2, &b2, &w4)) != MP_OKAY) \{
+077          goto ERR;
+078       \}
+079       
+080       /* w1 = (a2 + 2(a1 + 2a0))(b2 + 2(b1 + 2b0)) */
+081       if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) \{
+082          goto ERR;
+083       \}
+084       if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) \{
+085          goto ERR;
+086       \}
+087       if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) \{
+088          goto ERR;
+089       \}
+090       if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) \{
+091          goto ERR;
+092       \}
+093       
+094       if ((res = mp_mul_2(&b0, &tmp2)) != MP_OKAY) \{
+095          goto ERR;
+096       \}
+097       if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) \{
+098          goto ERR;
+099       \}
+100       if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) \{
+101          goto ERR;
+102       \}
+103       if ((res = mp_add(&tmp2, &b2, &tmp2)) != MP_OKAY) \{
+104          goto ERR;
+105       \}
+106       
+107       if ((res = mp_mul(&tmp1, &tmp2, &w1)) != MP_OKAY) \{
+108          goto ERR;
+109       \}
+110       
+111       /* w3 = (a0 + 2(a1 + 2a2))(b0 + 2(b1 + 2b2)) */
+112       if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) \{
+113          goto ERR;
+114       \}
+115       if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) \{
+116          goto ERR;
+117       \}
+118       if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) \{
+119          goto ERR;
+120       \}
+121       if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) \{
+122          goto ERR;
+123       \}
+124       
+125       if ((res = mp_mul_2(&b2, &tmp2)) != MP_OKAY) \{
+126          goto ERR;
+127       \}
+128       if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) \{
+129          goto ERR;
+130       \}
+131       if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) \{
+132          goto ERR;
+133       \}
+134       if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) \{
+135          goto ERR;
+136       \}
+137       
+138       if ((res = mp_mul(&tmp1, &tmp2, &w3)) != MP_OKAY) \{
+139          goto ERR;
+140       \}
+141       
+142   
+143       /* w2 = (a2 + a1 + a0)(b2 + b1 + b0) */
+144       if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) \{
+145          goto ERR;
+146       \}
+147       if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) \{
+148          goto ERR;
+149       \}
+150       if ((res = mp_add(&b2, &b1, &tmp2)) != MP_OKAY) \{
+151          goto ERR;
+152       \}
+153       if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) \{
+154          goto ERR;
+155       \}
+156       if ((res = mp_mul(&tmp1, &tmp2, &w2)) != MP_OKAY) \{
+157          goto ERR;
+158       \}
+159       
+160       /* now solve the matrix 
+161       
+162          0  0  0  0  1
+163          1  2  4  8  16
+164          1  1  1  1  1
+165          16 8  4  2  1
+166          1  0  0  0  0
+167          
+168          using 12 subtractions, 4 shifts, 
+169                 2 small divisions and 1 small multiplication 
+170        */
+171        
+172        /* r1 - r4 */
+173        if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) \{
+174           goto ERR;
+175        \}
+176        /* r3 - r0 */
+177        if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) \{
+178           goto ERR;
+179        \}
+180        /* r1/2 */
+181        if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) \{
+182           goto ERR;
+183        \}
+184        /* r3/2 */
+185        if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) \{
+186           goto ERR;
+187        \}
+188        /* r2 - r0 - r4 */
+189        if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) \{
+190           goto ERR;
+191        \}
+192        if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) \{
+193           goto ERR;
+194        \}
+195        /* r1 - r2 */
+196        if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) \{
+197           goto ERR;
+198        \}
+199        /* r3 - r2 */
+200        if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) \{
+201           goto ERR;
+202        \}
+203        /* r1 - 8r0 */
+204        if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) \{
+205           goto ERR;
+206        \}
+207        if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) \{
+208           goto ERR;
+209        \}
+210        /* r3 - 8r4 */
+211        if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) \{
+212           goto ERR;
+213        \}
+214        if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) \{
+215           goto ERR;
+216        \}
+217        /* 3r2 - r1 - r3 */
+218        if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) \{
+219           goto ERR;
+220        \}
+221        if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) \{
+222           goto ERR;
+223        \}
+224        if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) \{
+225           goto ERR;
+226        \}
+227        /* r1 - r2 */
+228        if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) \{
+229           goto ERR;
+230        \}
+231        /* r3 - r2 */
+232        if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) \{
+233           goto ERR;
+234        \}
+235        /* r1/3 */
+236        if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) \{
+237           goto ERR;
+238        \}
+239        /* r3/3 */
+240        if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) \{
+241           goto ERR;
+242        \}
+243        
+244        /* at this point shift W[n] by B*n */
+245        if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) \{
+246           goto ERR;
+247        \}
+248        if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) \{
+249           goto ERR;
+250        \}
+251        if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) \{
+252           goto ERR;
+253        \}
+254        if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) \{
+255           goto ERR;
+256        \}     
+257        
+258        if ((res = mp_add(&w0, &w1, c)) != MP_OKAY) \{
+259           goto ERR;
+260        \}
+261        if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) \{
+262           goto ERR;
+263        \}
+264        if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) \{
+265           goto ERR;
+266        \}
+267        if ((res = mp_add(&tmp1, c, c)) != MP_OKAY) \{
+268           goto ERR;
+269        \}     
+270        
+271   ERR:
+272        mp_clear_multi(&w0, &w1, &w2, &w3, &w4, 
+273                       &a0, &a1, &a2, &b0, &b1, 
+274                       &b2, &tmp1, &tmp2, NULL);
+275        return res;
+276   \}     
+277        
+278   #endif
+\end{alltt}
+\end{small}
+
+-- Comments to be added during editing phase.
+
+\subsection{Signed Multiplication}
+Now that algorithms to handle multiplications of every useful dimension have been developed, a rather simple finishing touch is required.  So far all
+of the multiplication algorithms have been unsigned multiplications which leaves only a signed multiplication algorithm to be established.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul}. \\
+\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
+\textbf{Output}.  $c \leftarrow a \cdot b$ \\
+\hline \\
+1.  If $a.sign = b.sign$ then \\
+\hspace{3mm}1.1  $sign = MP\_ZPOS$ \\
+2.  else \\
+\hspace{3mm}2.1  $sign = MP\_NEG$ \\
+3.  If min$(a.used, b.used) \ge TOOM\_MUL\_CUTOFF$ then  \\
+\hspace{3mm}3.1  $c \leftarrow a \cdot b$ using algorithm mp\_toom\_mul \\
+4.  else if min$(a.used, b.used) \ge KARATSUBA\_MUL\_CUTOFF$ then \\
+\hspace{3mm}4.1  $c \leftarrow a \cdot b$ using algorithm mp\_karatsuba\_mul \\
+5.  else \\
+\hspace{3mm}5.1  $digs \leftarrow a.used + b.used + 1$ \\
+\hspace{3mm}5.2  If $digs < MP\_WARRAY$ and min$(a.used, b.used) \le \delta$ then \\
+\hspace{6mm}5.2.1  $c \leftarrow a \cdot b \mbox{ (mod }\beta^{digs}\mbox{)}$ using algorithm fast\_s\_mp\_mul\_digs.  \\
+\hspace{3mm}5.3  else \\
+\hspace{6mm}5.3.1  $c \leftarrow a \cdot b \mbox{ (mod }\beta^{digs}\mbox{)}$ using algorithm s\_mp\_mul\_digs.  \\
+6.  $c.sign \leftarrow sign$ \\
+7.  Return the result of the unsigned multiplication performed. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul}
+\end{figure}
+
+\textbf{Algorithm mp\_mul.}
+This algorithm performs the signed multiplication of two inputs.  It will make use of any of the three unsigned multiplication algorithms 
+available when the input is of appropriate size.  The \textbf{sign} of the result is not set until the end of the algorithm since algorithm
+s\_mp\_mul\_digs will clear it.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_mul.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* high level multiplication (handles sign) */
+018   int mp_mul (mp_int * a, mp_int * b, mp_int * c)
+019   \{
+020     int     res, neg;
+021     neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
+022   
+023     /* use Toom-Cook? */
+024   #ifdef BN_MP_TOOM_MUL_C
+025     if (MIN (a->used, b->used) >= TOOM_MUL_CUTOFF) \{
+026       res = mp_toom_mul(a, b, c);
+027     \} else 
+028   #endif
+029   #ifdef BN_MP_KARATSUBA_MUL_C
+030     /* use Karatsuba? */
+031     if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) \{
+032       res = mp_karatsuba_mul (a, b, c);
+033     \} else 
+034   #endif
+035     \{
+036       /* can we use the fast multiplier?
+037        *
+038        * The fast multiplier can be used if the output will 
+039        * have less than MP_WARRAY digits and the number of 
+040        * digits won't affect carry propagation
+041        */
+042       int     digs = a->used + b->used + 1;
+043   
+044   #ifdef BN_FAST_S_MP_MUL_DIGS_C
+045       if ((digs < MP_WARRAY) &&
+046           MIN(a->used, b->used) <= 
+047           (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) \{
+048         res = fast_s_mp_mul_digs (a, b, c, digs);
+049       \} else 
+050   #endif
+051   #ifdef BN_S_MP_MUL_DIGS_C
+052         res = s_mp_mul (a, b, c); /* uses s_mp_mul_digs */
+053   #else
+054         res = MP_VAL;
+055   #endif
+056   
+057     \}
+058     c->sign = (c->used > 0) ? neg : MP_ZPOS;
+059     return res;
+060   \}
+061   #endif
+\end{alltt}
+\end{small}
+
+The implementation is rather simplistic and is not particularly noteworthy.  Line 21 computes the sign of the result using the ``?'' 
+operator from the C programming language.  Line 47 computes $\delta$ using the fact that $1 << k$ is equal to $2^k$.  
+
+\section{Squaring}
+\label{sec:basesquare}
+
+Squaring is a special case of multiplication where both multiplicands are equal.  At first it may seem like there is no significant optimization
+available but in fact there is.  Consider the multiplication of $576$ against $241$.  In total there will be nine single precision multiplications
+performed which are $1\cdot 6$, $1 \cdot 7$, $1 \cdot 5$, $4 \cdot 6$, $4 \cdot 7$, $4 \cdot 5$, $2 \cdot  6$, $2 \cdot 7$ and $2 \cdot 5$.  Now consider 
+the multiplication of $123$ against $123$.  The nine products are $3 \cdot 3$, $3 \cdot 2$, $3 \cdot 1$, $2 \cdot 3$, $2 \cdot 2$, $2 \cdot 1$, 
+$1 \cdot 3$, $1 \cdot 2$ and $1 \cdot 1$.  On closer inspection some of the products are equivalent.  For example, $3 \cdot 2 = 2 \cdot 3$ 
+and $3 \cdot 1 = 1 \cdot 3$. 
+
+For any $n$-digit input, there are ${{\left (n^2 + n \right)}\over 2}$ possible unique single precision multiplications required compared to the $n^2$
+required for multiplication.  The following diagram gives an example of the operations required.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{ccccc|c}
+&&1&2&3&\\
+$\times$ &&1&2&3&\\
+\hline && $3 \cdot 1$ & $3 \cdot 2$ & $3 \cdot 3$ & Row 0\\
+       & $2 \cdot 1$  & $2 \cdot 2$ & $2 \cdot 3$ && Row 1 \\
+         $1 \cdot 1$  & $1 \cdot 2$ & $1 \cdot 3$ &&& Row 2 \\
+\end{tabular}
+\end{center}
+\caption{Squaring Optimization Diagram}
+\end{figure}
+
+Starting from zero and numbering the columns from right to left a very simple pattern becomes obvious.  For the purposes of this discussion let $x$
+represent the number being squared.  The first observation is that in row $k$ the $2k$'th column of the product has a $\left (x_k \right)^2$ term in it.  
+
+The second observation is that every column $j$ in row $k$ where $j \ne 2k$ is part of a double product.  Every non-square term of a column will
+appear twice hence the name ``double product''.  Every odd column is made up entirely of double products.  In fact every column is made up of double 
+products and at most one square (\textit{see the exercise section}).  
+
+The third and final observation is that for row $k$ the first unique non-square term, that is, one that hasn't already appeared in an earlier row, 
+occurs at column $2k + 1$.  For example, on row $1$ of the previous squaring, column one is part of the double product with column one from row zero. 
+Column two of row one is a square and column three is the first unique column.
+
+\subsection{The Baseline Squaring Algorithm}
+The baseline squaring algorithm is meant to be a catch-all squaring algorithm.  It will handle any of the input sizes that the faster routines
+will not handle.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_sqr}. \\
+\textbf{Input}.   mp\_int $a$ \\
+\textbf{Output}.  $b \leftarrow a^2$ \\
+\hline \\
+1.  Init a temporary mp\_int of at least $2 \cdot a.used +1$ digits.  (\textit{mp\_init\_size}) \\
+2.  If step 1 failed return(\textit{MP\_MEM}) \\
+3.  $t.used \leftarrow 2 \cdot a.used + 1$ \\
+4.  For $ix$ from 0 to $a.used - 1$ do \\
+\hspace{3mm}Calculate the square. \\
+\hspace{3mm}4.1  $\hat r \leftarrow t_{2ix} + \left (a_{ix} \right )^2$ \\
+\hspace{3mm}4.2  $t_{2ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}Calculate the double products after the square. \\
+\hspace{3mm}4.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+\hspace{3mm}4.4  For $iy$ from $ix + 1$ to $a.used - 1$ do \\
+\hspace{6mm}4.4.1  $\hat r \leftarrow 2 \cdot a_{ix}a_{iy} + t_{ix + iy} + u$ \\
+\hspace{6mm}4.4.2  $t_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}4.4.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+\hspace{3mm}Set the last carry. \\
+\hspace{3mm}4.5  While $u > 0$ do \\
+\hspace{6mm}4.5.1  $iy \leftarrow iy + 1$ \\
+\hspace{6mm}4.5.2  $\hat r \leftarrow t_{ix + iy} + u$ \\
+\hspace{6mm}4.5.3  $t_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}4.5.4  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+5.  Clamp excess digits of $t$.  (\textit{mp\_clamp}) \\
+6.  Exchange $b$ and $t$. \\
+7.  Clear $t$ (\textit{mp\_clear}) \\
+8.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm s\_mp\_sqr}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_sqr.}
+This algorithm computes the square of an input using the three observations on squaring.  It is based fairly faithfully on  algorithm 14.16 of HAC
+\cite[pp.596-597]{HAC}.  Similar to algorithm s\_mp\_mul\_digs, a temporary mp\_int is allocated to hold the result of the squaring.  This allows the 
+destination mp\_int to be the same as the source mp\_int.
+
+The outer loop of this algorithm begins on step 4. It is best to think of the outer loop as walking down the rows of the partial results, while
+the inner loop computes the columns of the partial result.  Step 4.1 and 4.2 compute the square term for each row, and step 4.3 and 4.4 propagate
+the carry and compute the double products.  
+
+The requirement that a mp\_word be able to represent the range $0 \le x < 2 \beta^2$ arises from this
+very algorithm.  The product $a_{ix}a_{iy}$ will lie in the range $0 \le x \le \beta^2 - 2\beta + 1$ which is obviously less than $\beta^2$ meaning that
+when it is multiplied by two, it can be properly represented by a mp\_word.
+
+Similar to algorithm s\_mp\_mul\_digs, after every pass of the inner loop, the destination is correctly set to the sum of all of the partial 
+results calculated so far.  This involves expensive carry propagation which will be eliminated in the next algorithm.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_sqr.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
+018   int
+019   s_mp_sqr (mp_int * a, mp_int * b)
+020   \{
+021     mp_int  t;
+022     int     res, ix, iy, pa;
+023     mp_word r;
+024     mp_digit u, tmpx, *tmpt;
+025   
+026     pa = a->used;
+027     if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) \{
+028       return res;
+029     \}
+030   
+031     /* default used is maximum possible size */
+032     t.used = 2*pa + 1;
+033   
+034     for (ix = 0; ix < pa; ix++) \{
+035       /* first calculate the digit at 2*ix */
+036       /* calculate double precision result */
+037       r = ((mp_word) t.dp[2*ix]) +
+038           ((mp_word)a->dp[ix])*((mp_word)a->dp[ix]);
+039   
+040       /* store lower part in result */
+041       t.dp[ix+ix] = (mp_digit) (r & ((mp_word) MP_MASK));
+042   
+043       /* get the carry */
+044       u           = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+045   
+046       /* left hand side of A[ix] * A[iy] */
+047       tmpx        = a->dp[ix];
+048   
+049       /* alias for where to store the results */
+050       tmpt        = t.dp + (2*ix + 1);
+051       
+052       for (iy = ix + 1; iy < pa; iy++) \{
+053         /* first calculate the product */
+054         r       = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
+055   
+056         /* now calculate the double precision result, note we use
+057          * addition instead of *2 since it's easier to optimize
+058          */
+059         r       = ((mp_word) *tmpt) + r + r + ((mp_word) u);
+060   
+061         /* store lower part */
+062         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+063   
+064         /* get carry */
+065         u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+066       \}
+067       /* propagate upwards */
+068       while (u != ((mp_digit) 0)) \{
+069         r       = ((mp_word) *tmpt) + ((mp_word) u);
+070         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+071         u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+072       \}
+073     \}
+074   
+075     mp_clamp (&t);
+076     mp_exch (&t, b);
+077     mp_clear (&t);
+078     return MP_OKAY;
+079   \}
+080   #endif
+\end{alltt}
+\end{small}
+
+Inside the outer loop (\textit{see line 34}) the square term is calculated on line 37.  Line 44 extracts the carry from the square
+term.  Aliases for $a_{ix}$ and $t_{ix+iy}$ are initialized on lines 47 and 50 respectively.  The doubling is performed using two
+additions (\textit{see line 59}) since it is usually faster than shifting, or at least as fast.  
+
+\subsection{Faster Squaring by the ``Comba'' Method}
+A major drawback to the baseline method is the requirement for single precision shifting inside the $O(n^2)$ nested loop.  Squaring has an additional
+drawback that it must double the product inside the inner loop as well.  As for multiplication, the Comba technique can be used to eliminate these
+performance hazards.
+
+The first obvious solution is to make an array of mp\_words which will hold all of the columns.  This will indeed eliminate all of the carry
+propagation operations from the inner loop.  However, the inner product must still be doubled $O(n^2)$ times.  The solution stems from the simple fact
+that $2a + 2b + 2c = 2(a + b + c)$.  That is the sum of all of the double products is equal to double the sum of all the products.  For example,
+$ab + ba + ac + ca = 2ab + 2ac = 2(ab + ac)$.  
+
+However, we cannot simply double all of the columns, since the squares appear only once per row.  The most practical solution is to have two mp\_word
+arrays.  One array will hold the squares and the other array will hold the double products.  With both arrays the doubling and carry propagation can be 
+moved to a $O(n)$ work level outside the $O(n^2)$ level.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{fast\_s\_mp\_sqr}. \\
+\textbf{Input}.   mp\_int $a$ \\
+\textbf{Output}.  $b \leftarrow a^2$ \\
+\hline \\
+Place two arrays of \textbf{MP\_WARRAY} mp\_words named $\hat W$ and $\hat {X}$ on the stack. \\
+1.  If $b.alloc < 2a.used + 1$ then grow $b$ to $2a.used + 1$ digits.  (\textit{mp\_grow}). \\
+2.  If step 1 failed return(\textit{MP\_MEM}). \\
+3.  for $ix$ from $0$ to $2a.used + 1$ do \\
+\hspace{3mm}3.1  $\hat W_{ix} \leftarrow 0$ \\
+\hspace{3mm}3.2  $\hat {X}_{ix} \leftarrow 0$ \\
+4.  for $ix$ from $0$ to $a.used - 1$ do \\
+\hspace{3mm}Compute the square.\\
+\hspace{3mm}4.1  $\hat {X}_{ix+ix} \leftarrow \left ( a_{ix} \right )^2$ \\
+\\
+\hspace{3mm}Compute the double products.\\
+\hspace{3mm}4.2  for $iy$ from $ix + 1$ to $a.used - 1$ do \\
+\hspace{6mm}4.2.1  $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}a_{iy}$ \\
+5.  $oldused \leftarrow b.used$ \\
+6.  $b.used \leftarrow 2a.used + 1$ \\
+\\
+Double the products and propagate the carries simultaneously. \\
+7.  $\hat W_0 \leftarrow 2 \hat W_0 + \hat {X}_0$ \\
+8.  for $ix$ from $1$ to $2a.used$ do \\
+\hspace{3mm}8.1 $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ \\
+\hspace{3mm}8.2 $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix - 1} / \beta \rfloor$ \\
+\hspace{3mm}8.3 $b_{ix-1} \leftarrow \hat W_{ix-1} \mbox{ (mod }\beta\mbox{)}$ \\
+9.  $b_{2a.used} \leftarrow \hat W_{2a.used} \mbox{ (mod }\beta\mbox{)}$ \\
+10.  if $2a.used + 1 < oldused$ then do \\
+\hspace{3mm}10.1  for $ix$ from $2a.used + 1$ to $oldused$ do \\
+\hspace{6mm}10.1.1  $b_{ix} \leftarrow 0$ \\
+11.  Clamp excess digits from $b$.  (\textit{mp\_clamp}) \\
+12.  Return(\textit{MP\_OKAY}). \\ 
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm fast\_s\_mp\_sqr}
+\end{figure}
+
+\textbf{Algorithm fast\_s\_mp\_sqr.}
+This algorithm computes the square of an input using the Comba technique.  It is designed to be a replacement for algorithm s\_mp\_sqr when
+the number of input digits is less than \textbf{MP\_WARRAY} and less than $\delta \over 2$.  
+
+This routine requires two arrays of mp\_words to be placed on the stack.  The first array $\hat W$ will hold the double products and the second
+array $\hat X$ will hold the squares.  Though only at most $MP\_WARRAY \over 2$ words of $\hat X$ are used, it has proven faster on most 
+processors to simply make it a full size array.
+
+The loop on step 3 will zero the two arrays to prepare them for the squaring step.  Step 4.1 computes the squares of the product.  Note how 
+it simply assigns the value into the $\hat X$ array.  The nested loop on step 4.2 computes the doubles of the products.  This loop
+computes the sum of the products for each column.  They are not doubled until later.
+
+After the squaring loop, the products stored in $\hat W$ must be doubled and the carries propagated forwards.  It makes sense to do both
+operations at the same time.  The expression $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ computes the sum of the double product and the
+squares in place.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_fast\_s\_mp\_sqr.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* fast squaring
+018    *
+019    * This is the comba method where the columns of the product
+020    * are computed first then the carries are computed.  This
+021    * has the effect of making a very simple inner loop that
+022    * is executed the most
+023    *
+024    * W2 represents the outer products and W the inner.
+025    *
+026    * A further optimizations is made because the inner
+027    * products are of the form "A * B * 2".  The *2 part does
+028    * not need to be computed until the end which is good
+029    * because 64-bit shifts are slow!
+030    *
+031    * Based on Algorithm 14.16 on pp.597 of HAC.
+032    *
+033    */
+034   /* the jist of squaring...
+035   
+036   you do like mult except the offset of the tmpx [one that starts closer to ze
+      ro]
+037   can't equal the offset of tmpy.  So basically you set up iy like before then
+       you min it with
+038   (ty-tx) so that it never happens.  You double all those you add in the inner
+       loop
+039   
+040   After that loop you do the squares and add them in.
+041   
+042   Remove W2 and don't memset W
+043   
+044   */
+045   
+046   int fast_s_mp_sqr (mp_int * a, mp_int * b)
+047   \{
+048     int       olduse, res, pa, ix, iz;
+049     mp_digit   W[MP_WARRAY], *tmpx;
+050     mp_word   W1;
+051   
+052     /* grow the destination as required */
+053     pa = a->used + a->used;
+054     if (b->alloc < pa) \{
+055       if ((res = mp_grow (b, pa)) != MP_OKAY) \{
+056         return res;
+057       \}
+058     \}
+059   
+060     /* number of output digits to produce */
+061     W1 = 0;
+062     for (ix = 0; ix <= pa; ix++) \{ 
+063         int      tx, ty, iy;
+064         mp_word  _W;
+065         mp_digit *tmpy;
+066   
+067         /* clear counter */
+068         _W = 0;
+069   
+070         /* get offsets into the two bignums */
+071         ty = MIN(a->used-1, ix);
+072         tx = ix - ty;
+073   
+074         /* setup temp aliases */
+075         tmpx = a->dp + tx;
+076         tmpy = a->dp + ty;
+077   
+078         /* this is the number of times the loop will iterrate, essentially its
+       
+079            while (tx++ < a->used && ty-- >= 0) \{ ... \}
+080          */
+081         iy = MIN(a->used-tx, ty+1);
+082   
+083         /* now for squaring tx can never equal ty 
+084          * we halve the distance since they approach at a rate of 2x
+085          * and we have to round because odd cases need to be executed
+086          */
+087         iy = MIN(iy, (ty-tx+1)>>1);
+088   
+089         /* execute loop */
+090         for (iz = 0; iz < iy; iz++) \{
+091            _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
+092         \}
+093   
+094         /* double the inner product and add carry */
+095         _W = _W + _W + W1;
+096   
+097         /* even columns have the square term in them */
+098         if ((ix&1) == 0) \{
+099            _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
+100         \}
+101   
+102         /* store it */
+103         W[ix] = _W;
+104   
+105         /* make next carry */
+106         W1 = _W >> ((mp_word)DIGIT_BIT);
+107     \}
+108   
+109     /* setup dest */
+110     olduse  = b->used;
+111     b->used = a->used+a->used;
+112   
+113     \{
+114       mp_digit *tmpb;
+115       tmpb = b->dp;
+116       for (ix = 0; ix < pa; ix++) \{
+117         *tmpb++ = W[ix] & MP_MASK;
+118       \}
+119   
+120       /* clear unused digits [that existed in the old copy of c] */
+121       for (; ix < olduse; ix++) \{
+122         *tmpb++ = 0;
+123       \}
+124     \}
+125     mp_clamp (b);
+126     return MP_OKAY;
+127   \}
+128   #endif
+\end{alltt}
+\end{small}
+
+-- Write something deep and insightful later, Tom.
+
+\subsection{Polynomial Basis Squaring}
+The same algorithm that performs optimal polynomial basis multiplication can be used to perform polynomial basis squaring.  The minor exception
+is that $\zeta_y = f(y)g(y)$ is actually equivalent to $\zeta_y = f(y)^2$ since $f(y) = g(y)$.  Rather than performing $2n + 1$
+multiplications to find the $\zeta$ relations, squaring operations are performed.  
+
+\subsection{Karatsuba Squaring}
+Let $f(x) = ax + b$ represent the polynomial basis representation of a number to square.  
+Let $h(x) = \left ( f(x) \right )^2$ represent the square of the polynomial.  The Karatsuba equation can be modified to square a 
+number with the following equation.
+
+\begin{equation}
+h(x) = a^2x^2 + \left (a^2 + b^2 - (a - b)^2 \right )x + b^2
+\end{equation}
+
+Upon closer inspection this equation only requires the calculation of three half-sized squares: $a^2$, $b^2$ and $(a - b)^2$.  As in 
+Karatsuba multiplication, this algorithm can be applied recursively on the input and will achieve an asymptotic running time of 
+$O \left ( n^{lg(3)} \right )$.
+
+If the asymptotic times of Karatsuba squaring and multiplication are the same, why not simply use the multiplication algorithm 
+instead?  The answer to this arises from the cutoff point for squaring.  As in multiplication there exists a cutoff point, at which the 
+time required for a Comba based squaring and a Karatsuba based squaring meet.  Due to the overhead inherent in the Karatsuba method, the cutoff 
+point is fairly high.  For example, on an AMD Athlon XP processor with $\beta = 2^{28}$, the cutoff point is around 127 digits.  
+
+Consider squaring a 200 digit number with this technique.  It will be split into two 100 digit halves which are subsequently squared.  
+The 100 digit halves will not be squared using Karatsuba, but instead using the faster Comba based squaring algorithm.  If Karatsuba multiplication
+were used instead, the 100 digit numbers would be squared with a slower Comba based multiplication.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_karatsuba\_sqr}. \\
+\textbf{Input}.   mp\_int $a$ \\
+\textbf{Output}.  $b \leftarrow a^2$ \\
+\hline \\
+1.  Initialize the following temporary mp\_ints:  $x0$, $x1$, $t1$, $t2$, $x0x0$ and $x1x1$. \\
+2.  If any of the initializations on step 1 failed return(\textit{MP\_MEM}). \\
+\\
+Split the input.  e.g. $a = x1\beta^B + x0$ \\
+3.  $B \leftarrow \lfloor a.used / 2 \rfloor$ \\
+4.  $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{mp\_mod\_2d}) \\
+5.  $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{mp\_rshd}) \\
+\\
+Calculate the three squares. \\
+6.  $x0x0 \leftarrow x0^2$ (\textit{mp\_sqr}) \\
+7.  $x1x1 \leftarrow x1^2$ \\
+8.  $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\
+9.  $t1 \leftarrow t1^2$ \\
+\\
+Compute the middle term. \\
+10.  $t2 \leftarrow x0x0 + x1x1$ (\textit{s\_mp\_add}) \\
+11.  $t1 \leftarrow t2 - t1$ \\
+\\
+Compute final product. \\
+12.  $t1 \leftarrow t1\beta^B$ (\textit{mp\_lshd}) \\
+13.  $x1x1 \leftarrow x1x1\beta^{2B}$ \\
+14.  $t1 \leftarrow t1 + x0x0$ \\
+15.  $b \leftarrow t1 + x1x1$ \\
+16.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_karatsuba\_sqr}
+\end{figure}
+
+\textbf{Algorithm mp\_karatsuba\_sqr.}
+This algorithm computes the square of an input $a$ using the Karatsuba technique.  This algorithm is very similar to the Karatsuba based
+multiplication algorithm with the exception that the three half-size multiplications have been replaced with three half-size squarings.
+
+The radix point for squaring is simply placed exactly in the middle of the digits when the input has an even number of digits, otherwise it is
+placed just below the middle.  Step 3, 4 and 5 compute the two halves required using $B$
+as the radix point.  The first two squares in steps 6 and 7 are rather straightforward while the last square is of a more compact form.
+
+By expanding $\left (x1 - x0 \right )^2$, the $x1^2$ and $x0^2$ terms in the middle disappear, that is $x1^2 + x0^2 - (x1 - x0)^2 = 2 \cdot x0 \cdot x1$.
+Now if $5n$ single precision additions and a squaring of $n$-digits is faster than multiplying two $n$-digit numbers and doubling then
+this method is faster.  Assuming no further recursions occur, the difference can be estimated with the following inequality.
+
+Let $p$ represent the cost of a single precision addition and $q$ the cost of a single precision multiplication both in terms of time\footnote{Or
+machine clock cycles.}. 
+
+\begin{equation}
+5pn +{{q(n^2 + n)} \over 2} \le pn + qn^2
+\end{equation}
+
+For example, on an AMD Athlon XP processor $p = {1 \over 3}$ and $q = 6$.  This implies that the following inequality should hold.
+\begin{center}
+\begin{tabular}{rcl}
+${5n \over 3} + 3n^2 + 3n$     & $<$ & ${n \over 3} + 6n^2$ \\
+${5 \over 3} + 3n + 3$     & $<$ & ${1 \over 3} + 6n$ \\
+${13 \over 9}$     & $<$ & $n$ \\
+\end{tabular}
+\end{center}
+
+This results in a cutoff point around $n = 2$.  As a consequence it is actually faster to compute the middle term the ``long way'' on processors
+where multiplication is substantially slower\footnote{On the Athlon there is a 1:17 ratio between clock cycles for addition and multiplication.  On
+the Intel P4 processor this ratio is 1:29 making this method even more beneficial.  The only common exception is the ARMv4 processor which has a
+ratio of 1:7.  } than simpler operations such as addition.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_karatsuba\_sqr.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* Karatsuba squaring, computes b = a*a using three 
+018    * half size squarings
+019    *
+020    * See comments of karatsuba_mul for details.  It 
+021    * is essentially the same algorithm but merely 
+022    * tuned to perform recursive squarings.
+023    */
+024   int mp_karatsuba_sqr (mp_int * a, mp_int * b)
+025   \{
+026     mp_int  x0, x1, t1, t2, x0x0, x1x1;
+027     int     B, err;
+028   
+029     err = MP_MEM;
+030   
+031     /* min # of digits */
+032     B = a->used;
+033   
+034     /* now divide in two */
+035     B = B >> 1;
+036   
+037     /* init copy all the temps */
+038     if (mp_init_size (&x0, B) != MP_OKAY)
+039       goto ERR;
+040     if (mp_init_size (&x1, a->used - B) != MP_OKAY)
+041       goto X0;
+042   
+043     /* init temps */
+044     if (mp_init_size (&t1, a->used * 2) != MP_OKAY)
+045       goto X1;
+046     if (mp_init_size (&t2, a->used * 2) != MP_OKAY)
+047       goto T1;
+048     if (mp_init_size (&x0x0, B * 2) != MP_OKAY)
+049       goto T2;
+050     if (mp_init_size (&x1x1, (a->used - B) * 2) != MP_OKAY)
+051       goto X0X0;
+052   
+053     \{
+054       register int x;
+055       register mp_digit *dst, *src;
+056   
+057       src = a->dp;
+058   
+059       /* now shift the digits */
+060       dst = x0.dp;
+061       for (x = 0; x < B; x++) \{
+062         *dst++ = *src++;
+063       \}
+064   
+065       dst = x1.dp;
+066       for (x = B; x < a->used; x++) \{
+067         *dst++ = *src++;
+068       \}
+069     \}
+070   
+071     x0.used = B;
+072     x1.used = a->used - B;
+073   
+074     mp_clamp (&x0);
+075   
+076     /* now calc the products x0*x0 and x1*x1 */
+077     if (mp_sqr (&x0, &x0x0) != MP_OKAY)
+078       goto X1X1;           /* x0x0 = x0*x0 */
+079     if (mp_sqr (&x1, &x1x1) != MP_OKAY)
+080       goto X1X1;           /* x1x1 = x1*x1 */
+081   
+082     /* now calc (x1-x0)**2 */
+083     if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
+084       goto X1X1;           /* t1 = x1 - x0 */
+085     if (mp_sqr (&t1, &t1) != MP_OKAY)
+086       goto X1X1;           /* t1 = (x1 - x0) * (x1 - x0) */
+087   
+088     /* add x0y0 */
+089     if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
+090       goto X1X1;           /* t2 = x0x0 + x1x1 */
+091     if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
+092       goto X1X1;           /* t1 = x0x0 + x1x1 - (x1-x0)*(x1-x0) */
+093   
+094     /* shift by B */
+095     if (mp_lshd (&t1, B) != MP_OKAY)
+096       goto X1X1;           /* t1 = (x0x0 + x1x1 - (x1-x0)*(x1-x0))<<B */
+097     if (mp_lshd (&x1x1, B * 2) != MP_OKAY)
+098       goto X1X1;           /* x1x1 = x1x1 << 2*B */
+099   
+100     if (mp_add (&x0x0, &t1, &t1) != MP_OKAY)
+101       goto X1X1;           /* t1 = x0x0 + t1 */
+102     if (mp_add (&t1, &x1x1, b) != MP_OKAY)
+103       goto X1X1;           /* t1 = x0x0 + t1 + x1x1 */
+104   
+105     err = MP_OKAY;
+106   
+107   X1X1:mp_clear (&x1x1);
+108   X0X0:mp_clear (&x0x0);
+109   T2:mp_clear (&t2);
+110   T1:mp_clear (&t1);
+111   X1:mp_clear (&x1);
+112   X0:mp_clear (&x0);
+113   ERR:
+114     return err;
+115   \}
+116   #endif
+\end{alltt}
+\end{small}
+
+This implementation is largely based on the implementation of algorithm mp\_karatsuba\_mul.  It uses the same inline style to copy and 
+shift the input into the two halves.  The loop from line 53 to line 69 has been modified since only one input exists.  The \textbf{used}
+count of both $x0$ and $x1$ is fixed up and $x0$ is clamped before the calculations begin.  At this point $x1$ and $x0$ are valid equivalents
+to the respective halves as if mp\_rshd and mp\_mod\_2d had been used.  
+
+By inlining the copy and shift operations the cutoff point for Karatsuba squaring can be lowered.  On the Athlon the cutoff point
+is exactly at the point where Comba squaring can no longer be used (\textit{128 digits}).  On slower processors such as the Intel P4
+it is actually below the Comba limit (\textit{at 110 digits}).
+
+This routine uses the same error trap coding style as mp\_karatsuba\_mul.  As the temporary variables are initialized errors are redirected to
+the error trap higher up.  If the algorithm completes without error the error code is set to \textbf{MP\_OKAY} and mp\_clears are executed normally.
+
+\textit{Last paragraph sucks.  re-write! -- Tom}
+
+\subsection{Toom-Cook Squaring}
+The Toom-Cook squaring algorithm mp\_toom\_sqr is heavily based on the algorithm mp\_toom\_mul with the exception that squarings are used
+instead of multiplication to find the five relations.  The reader is encouraged to read the description of the latter algorithm and try to 
+derive their own Toom-Cook squaring algorithm.  
+
+\subsection{High Level Squaring}
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_sqr}. \\
+\textbf{Input}.   mp\_int $a$ \\
+\textbf{Output}.  $b \leftarrow a^2$ \\
+\hline \\
+1.  If $a.used \ge TOOM\_SQR\_CUTOFF$ then  \\
+\hspace{3mm}1.1  $b \leftarrow a^2$ using algorithm mp\_toom\_sqr \\
+2.  else if $a.used \ge KARATSUBA\_SQR\_CUTOFF$ then \\
+\hspace{3mm}2.1  $b \leftarrow a^2$ using algorithm mp\_karatsuba\_sqr \\
+3.  else \\
+\hspace{3mm}3.1  $digs \leftarrow 2 \cdot a.used + 1$ \\
+\hspace{3mm}3.2  If $digs < MP\_WARRAY$ and $a.used < \delta / 2$ then \\
+\hspace{6mm}3.2.1  $b \leftarrow a^2$ using algorithm fast\_s\_mp\_sqr.  \\
+\hspace{3mm}3.3  else \\
+\hspace{6mm}3.3.1  $b \leftarrow a^2$ using algorithm s\_mp\_sqr.  \\
+4.  $b.sign \leftarrow MP\_ZPOS$ \\
+5.  Return the result of the unsigned squaring performed. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_sqr}
+\end{figure}
+
+\textbf{Algorithm mp\_sqr.}
+This algorithm computes the square of the input using one of four different algorithms.  If the input is very large and has at least
+\textbf{TOOM\_SQR\_CUTOFF} or \textbf{KARATSUBA\_SQR\_CUTOFF} digits then either the Toom-Cook or the Karatsuba Squaring algorithm is used.  If
+neither of the polynomial basis algorithms should be used then either the Comba or baseline algorithm is used.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_sqr.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* computes b = a*a */
+018   int
+019   mp_sqr (mp_int * a, mp_int * b)
+020   \{
+021     int     res;
+022   
+023   #ifdef BN_MP_TOOM_SQR_C
+024     /* use Toom-Cook? */
+025     if (a->used >= TOOM_SQR_CUTOFF) \{
+026       res = mp_toom_sqr(a, b);
+027     /* Karatsuba? */
+028     \} else 
+029   #endif
+030   #ifdef BN_MP_KARATSUBA_SQR_C
+031   if (a->used >= KARATSUBA_SQR_CUTOFF) \{
+032       res = mp_karatsuba_sqr (a, b);
+033     \} else 
+034   #endif
+035     \{
+036   #ifdef BN_FAST_S_MP_SQR_C
+037       /* can we use the fast comba multiplier? */
+038       if ((a->used * 2 + 1) < MP_WARRAY && 
+039            a->used < 
+040            (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) \{
+041         res = fast_s_mp_sqr (a, b);
+042       \} else
+043   #endif
+044   #ifdef BN_S_MP_SQR_C
+045         res = s_mp_sqr (a, b);
+046   #else
+047         res = MP_VAL;
+048   #endif
+049     \}
+050     b->sign = MP_ZPOS;
+051     return res;
+052   \}
+053   #endif
+\end{alltt}
+\end{small}
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 3 \right ] $ & Devise an efficient algorithm for selection of the radix point to handle inputs \\
+                      & that have different number of digits in Karatsuba multiplication. \\
+                      & \\
+$\left [ 3 \right ] $ & In section 5.3 the fact that every column of a squaring is made up \\
+                      & of double products and at most one square is stated.  Prove this statement. \\
+                      & \\                      
+$\left [ 2 \right ] $ & In the Comba squaring algorithm half of the $\hat X$ variables are not used. \\
+                      & Revise algorithm fast\_s\_mp\_sqr to shrink the $\hat X$ array. \\
+                      & \\
+$\left [ 3 \right ] $ & Prove the equation for Karatsuba squaring. \\
+                      & \\
+$\left [ 1 \right ] $ & Prove that Karatsuba squaring requires $O \left (n^{lg(3)} \right )$ time. \\
+                      & \\ 
+$\left [ 2 \right ] $ & Determine the minimal ratio between addition and multiplication clock cycles \\
+                      & required for equation $6.7$ to be true.  \\
+                      & \\
+\end{tabular}
+
+\chapter{Modular Reduction}
+\section{Basics of Modular Reduction}
+\index{modular residue}
+Modular reduction is an operation that arises quite often within public key cryptography algorithms and various number theoretic algorithms, 
+such as factoring.  Modular reduction algorithms are the third class of algorithms of the ``multipliers'' set.  A number $a$ is said to be \textit{reduced}
+modulo another number $b$ by finding the remainder of the division $a/b$.  Full integer division with remainder is a topic to be covered 
+in~\ref{sec:division}.
+
+Modular reduction is equivalent to solving for $r$ in the following equation.  $a = bq + r$ where $q = \lfloor a/b \rfloor$.  The result 
+$r$ is said to be ``congruent to $a$ modulo $b$'' which is also written as $r \equiv a \mbox{ (mod }b\mbox{)}$.  In other vernacular $r$ is known as the 
+``modular residue'' which leads to ``quadratic residue''\footnote{That's fancy talk for $b \equiv a^2 \mbox{ (mod }p\mbox{)}$.} and
+other forms of residues.  
+
+Modular reductions are normally used to create either finite groups, rings or fields.  The most common usage for performance driven modular reductions 
+is in modular exponentiation algorithms.  That is to compute $d = a^b \mbox{ (mod }c\mbox{)}$ as fast as possible.  This operation is used in the 
+RSA and Diffie-Hellman public key algorithms, for example.  Modular multiplication and squaring also appear as fundamental operations in 
+Elliptic Curve cryptographic algorithms.  As will be discussed in the subsequent chapter there exist fast algorithms for computing modular 
+exponentiations without having to perform (\textit{in this example}) $b - 1$ multiplications.  These algorithms will produce partial results in the 
+range $0 \le x < c^2$ which can be taken advantage of to create several efficient algorithms.   They have also been used to create redundancy check 
+algorithms known as CRCs, error correction codes such as Reed-Solomon and solve a variety of number theoretic problems.  
+
+\section{The Barrett Reduction}
+The Barrett reduction algorithm \cite{BARRETT} was inspired by fast division algorithms which multiply by the reciprocal to emulate
+division.  Barrett's observation was that the residue $c$ of $a$ modulo $b$ is equal to 
+
+\begin{equation}
+c = a - b \cdot \lfloor a/b \rfloor
+\end{equation}
+
+Since algorithms such as modular exponentiation would be using the same modulus extensively, typical DSP\footnote{It is worth noting that Barrett's paper 
+targeted the DSP56K processor.}  intuition would indicate the next step would be to replace $a/b$ by a multiplication by the reciprocal.  However, 
+DSP intuition on its own will not work as these numbers are considerably larger than the precision of common DSP floating point data types.  
+It would take another common optimization to optimize the algorithm.
+
+\subsection{Fixed Point Arithmetic}
+The trick used to optimize the above equation is based on a technique of emulating floating point data types with fixed precision integers.  Fixed
+point arithmetic would become very popular as it greatly optimized the ``3d-shooter'' genre of games in the mid 1990s when floating point units were 
+fairly slow if not unavailable.   The idea behind fixed point arithmetic is to take a normal $k$-bit integer data type and break it into a $p$-bit 
+integer and a $q$-bit fraction part (\textit{where $p+q = k$}).  
+
+In this system a $k$-bit integer $n$ would actually represent $n/2^q$.  For example, with $q = 4$ the integer $n = 37$ would actually represent the
+value $2.3125$.  To multiply two fixed point numbers the integers are multiplied using traditional arithmetic and subsequently normalized by 
+moving the implied decimal point back to where it should be.  For example, with $q = 4$ to multiply the integers $9$ and $5$ they must be converted 
+to fixed point first by multiplying by $2^q$.  Let $a = 9(2^q)$ represent the fixed point representation of $9$ and $b = 5(2^q)$ represent the 
+fixed point representation of $5$.  The product $ab$ is equal to $45(2^{2q})$ which when normalized by dividing by $2^q$ produces $45(2^q)$.  
+
+This technique became popular since a normal integer multiplication and logical shift right are the only required operations to perform a multiplication
+of two fixed point numbers.  Using fixed point arithmetic, division can be easily approximated by multiplying by the reciprocal.  If $2^q$ is 
+equivalent to one then $2^q/b$ is equivalent to the fixed point approximation of $1/b$ using real arithmetic.  Using this fact dividing an integer 
+$a$ by another integer $b$ can be achieved with the following expression.
+
+\begin{equation}
+\lfloor a / b \rfloor \mbox{ }\approx\mbox{ } \lfloor (a \cdot \lfloor 2^q / b \rfloor)/2^q \rfloor
+\end{equation}
+
+The precision of the division is proportional to the value of $q$.  If the divisor $b$ is used frequently as is the case with 
+modular exponentiation pre-computing $2^q/b$ will allow a division to be performed with a multiplication and a right shift.  Both operations
+are considerably faster than division on most processors.  
+
+Consider dividing $19$ by $5$.  The correct result is $\lfloor 19/5 \rfloor = 3$.  With $q = 3$ the reciprocal is $\lfloor 2^q/5 \rfloor = 1$ which
+leads to a product of $19$ which when divided by $2^q$ produces $2$.  However, with $q = 4$ the reciprocal is $\lfloor 2^q/5 \rfloor = 3$ and
+the result of the emulated division is $\lfloor 3 \cdot 19 / 2^q \rfloor = 3$ which is correct.  The value of $2^q$ must be close to or ideally
+larger than the dividend.  In effect if $a$ is the dividend then $q$ should allow $0 \le \lfloor a/2^q \rfloor \le 1$ in order for this approach
+to work correctly.  Plugging this form of division into the original equation the following modular residue equation arises.
+
+\begin{equation}
+c = a - b \cdot \lfloor (a \cdot \lfloor 2^q / b \rfloor)/2^q \rfloor
+\end{equation}
+
+Using the notation from \cite{BARRETT} the value of $\lfloor 2^q / b \rfloor$ will be represented by the $\mu$ symbol.  Using the $\mu$
+variable also helps re-inforce the idea that it is meant to be computed once and re-used.
+
+\begin{equation}
+c = a - b \cdot \lfloor (a \cdot \mu)/2^q \rfloor
+\end{equation}
+
+Provided that $2^q \ge a$ this algorithm will produce a quotient that is either exactly correct or off by a value of one.  In the context of Barrett
+reduction the value of $a$ is bound by $0 \le a \le (b - 1)^2$ meaning that $2^q \ge b^2$ is sufficient to ensure the reciprocal will have enough
+precision.  
+
+Let $n$ represent the number of digits in $b$.  This algorithm requires approximately $2n^2$ single precision multiplications to produce the quotient and 
+another $n^2$ single precision multiplications to find the residue.  In total $3n^2$ single precision multiplications are required to 
+reduce the number.  
+
+For example, if $b = 1179677$ and $q = 41$ ($2^q > b^2$), then the reciprocal $\mu$ is equal to $\lfloor 2^q / b \rfloor = 1864089$.  Consider reducing
+$a = 180388626447$ modulo $b$ using the above reduction equation.  The quotient using the new formula is $\lfloor (a \cdot \mu) / 2^q \rfloor = 152913$.
+By subtracting $152913b$ from $a$ the correct residue $a \equiv 677346 \mbox{ (mod }b\mbox{)}$ is found.
+
+\subsection{Choosing a Radix Point}
+Using the fixed point representation a modular reduction can be performed with $3n^2$ single precision multiplications.  If that were the best
+that could be achieved a full division\footnote{A division requires approximately $O(2cn^2)$ single precision multiplications for a small value of $c$.  
+See~\ref{sec:division} for further details.} might as well be used in its place.  The key to optimizing the reduction is to reduce the precision of
+the initial multiplication that finds the quotient.  
+
+Let $a$ represent the number of which the residue is sought.  Let $b$ represent the modulus used to find the residue.  Let $m$ represent
+the number of digits in $b$.  For the purposes of this discussion we will assume that the number of digits in $a$ is $2m$, which is generally true if 
+two $m$-digit numbers have been multiplied.  Dividing $a$ by $b$ is the same as dividing a $2m$ digit integer by a $m$ digit integer.  Digits below the 
+$m - 1$'th digit of $a$ will contribute at most a value of $1$ to the quotient because $\beta^k < b$ for any $0 \le k \le m - 1$.  Another way to
+express this is by re-writing $a$ as two parts.  If $a' \equiv a \mbox{ (mod }\beta^{m-1}\mbox{)}$ and $a'' = a - a'$ then 
+${a \over b} \equiv {{a' + a''} \over b}$ which is equivalent to ${a' \over b} + {a'' \over b}$.  Since $a'$ is bound to be less than $b$ the quotient
+is bound by $0 \le {a' \over b} < 1$.
+
+Since the digits of $a'$ do not contribute much to the quotient the observation is that they might as well be zero.  However, if the digits 
+``might as well be zero'' they might as well not be there in the first place.  Let $q_0 = \lfloor a/\beta^{m-1} \rfloor$ represent the input
+with the irrelevant digits trimmed.  Now the modular reduction is trimmed to the almost equivalent equation
+
+\begin{equation}
+c = a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor
+\end{equation}
+
+Note that the original divisor $2^q$ has been replaced with $\beta^{m+1}$ where in this case $q$ is a multiple of $lg(\beta)$. Also note that the 
+exponent on the divisor when added to the amount $q_0$ was shifted by equals $2m$.  If the optimization had not been performed the divisor 
+would have the exponent $2m$ so in the end the exponents do ``add up''. Using the above equation the quotient 
+$\lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ can be off from the true quotient by at most two.  The original fixed point quotient can be off
+by as much as one (\textit{provided the radix point is chosen suitably}) and now that the lower irrelevant digits have been trimmed the quotient
+can be off by an additional value of one for a total of at most two.  This implies that 
+$0 \le a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor < 3b$.  By first subtracting $b$ times the quotient and then conditionally subtracting 
+$b$ once or twice the residue is found.
+
+The quotient is now found using $(m + 1)(m) = m^2 + m$ single precision multiplications and the residue with an additional $m^2$ single
+precision multiplications, ignoring the subtractions required.  In total $2m^2 + m$ single precision multiplications are required to find the residue.  
+This is considerably faster than the original attempt.
+
+For example, let $\beta = 10$ represent the radix of the digits.  Let $b = 9999$ represent the modulus which implies $m = 4$. Let $a = 99929878$ 
+represent the value of which the residue is desired.  In this case $q = 8$ since $10^7 < 9999^2$ meaning that $\mu = \lfloor \beta^{q}/b \rfloor = 10001$.  
+With the new observation the multiplicand for the quotient is equal to $q_0 = \lfloor a / \beta^{m - 1} \rfloor = 99929$.  The quotient is then 
+$\lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor = 9993$.  Subtracting $9993b$ from $a$ and the correct residue $a \equiv 9871 \mbox{ (mod }b\mbox{)}$ 
+is found.  
+
+\subsection{Trimming the Quotient}
+So far the reduction algorithm has been optimized from $3m^2$ single precision multiplications down to $2m^2 + m$ single precision multiplications.  As 
+it stands now the algorithm is already fairly fast compared to a full integer division algorithm.  However, there is still room for
+optimization.  
+
+After the first multiplication inside the quotient ($q_0 \cdot \mu$) the value is shifted right by $m + 1$ places effectively nullifying the lower
+half of the product.  It would be nice to be able to remove those digits from the product to effectively cut down the number of single precision 
+multiplications.  If the number of digits in the modulus $m$ is far less than $\beta$ a full product is not required for the algorithm to work properly.  
+In fact the lower $m - 2$ digits will not affect the upper half of the product at all and do not need to be computed.  
+
+The value of $\mu$ is a $m$-digit number and $q_0$ is a $m + 1$ digit number.  Using a full multiplier $(m + 1)(m) = m^2 + m$ single precision
+multiplications would be required.  Using a multiplier that will only produce digits at and above the $m - 1$'th digit reduces the number
+of single precision multiplications to ${m^2 + m} \over 2$ single precision multiplications.  
+
+\subsection{Trimming the Residue}
+After the quotient has been calculated it is used to reduce the input.  As previously noted the algorithm is not exact and it can be off by a small
+multiple of the modulus, that is $0 \le a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor < 3b$.  If $b$ is $m$ digits then the 
+result of the reduction equation is a value of at most $m + 1$ digits (\textit{provided $3 < \beta$}) implying that the upper $m - 1$ digits are
+implicitly zero.  
+
+The next optimization arises from this very fact.  Instead of computing $b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ using a full
+$O(m^2)$ multiplication algorithm only the lower $m+1$ digits of the product have to be computed.  Similarly the value of $a$ can
+be reduced modulo $\beta^{m+1}$ before the multiple of $b$ is subtracted which simplifies the subtraction as well.  A multiplication that produces 
+only the lower $m+1$ digits requires ${m^2 + 3m - 2} \over 2$ single precision multiplications.  
+
+With both optimizations in place the algorithm is the algorithm Barrett proposed.  It requires $m^2 + 2m - 1$ single precision multiplications which
+is considerably faster than the straightforward $3m^2$ method.  
+
+\subsection{The Barrett Algorithm}
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_reduce}. \\
+\textbf{Input}.   mp\_int $a$, mp\_int $b$ and $\mu = \lfloor \beta^{2m}/b \rfloor, m = \lceil lg_{\beta}(b) \rceil, (0 \le a < b^2, b > 1)$ \\
+\textbf{Output}.  $a \mbox{ (mod }b\mbox{)}$ \\
+\hline \\
+Let $m$ represent the number of digits in $b$.  \\
+1.  Make a copy of $a$ and store it in $q$.  (\textit{mp\_init\_copy}) \\
+2.  $q \leftarrow \lfloor q / \beta^{m - 1} \rfloor$ (\textit{mp\_rshd}) \\
+\\
+Produce the quotient. \\
+3.  $q \leftarrow q \cdot \mu$  (\textit{note: only produce digits at or above $m-1$}) \\
+4.  $q \leftarrow \lfloor q / \beta^{m + 1} \rfloor$ \\
+\\
+Subtract the multiple of modulus from the input. \\
+5.  $a \leftarrow a \mbox{ (mod }\beta^{m+1}\mbox{)}$ (\textit{mp\_mod\_2d}) \\
+6.  $q \leftarrow q \cdot b \mbox{ (mod }\beta^{m+1}\mbox{)}$ (\textit{s\_mp\_mul\_digs}) \\
+7.  $a \leftarrow a - q$ (\textit{mp\_sub}) \\
+\\
+Add $\beta^{m+1}$ if a carry occurred. \\
+8.  If $a < 0$ then (\textit{mp\_cmp\_d}) \\
+\hspace{3mm}8.1  $q \leftarrow 1$ (\textit{mp\_set}) \\
+\hspace{3mm}8.2  $q \leftarrow q \cdot \beta^{m+1}$ (\textit{mp\_lshd}) \\
+\hspace{3mm}8.3  $a \leftarrow a + q$ \\
+\\
+Now subtract the modulus if the residue is too large (e.g. quotient too small). \\
+9.  While $a \ge b$ do (\textit{mp\_cmp}) \\
+\hspace{3mm}9.1  $a \leftarrow a - b$ \\
+10.  Clear $q$. \\
+11.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_reduce}
+\end{figure}
+
+\textbf{Algorithm mp\_reduce.}
+This algorithm will reduce the input $a$ modulo $b$ in place using the Barrett algorithm.  It is loosely based on algorithm 14.42 of HAC
+\cite[pp.  604]{HAC} which is based on the paper from Paul Barrett \cite{BARRETT}.  The algorithm has several restrictions and assumptions which must 
+be adhered to for the algorithm to work.
+
+First the modulus $b$ is assumed to be positive and greater than one.  If the modulus were less than or equal to one then subtracting
+a multiple of it would either accomplish nothing or actually enlarge the input.  The input $a$ must be in the range $0 \le a < b^2$ in order
+for the quotient to have enough precision.  If $a$ is the product of two numbers that were already reduced modulo $b$, this will not be a problem.
+Technically the algorithm will still work if $a \ge b^2$ but it will take much longer to finish.  The value of $\mu$ is passed as an argument to this 
+algorithm and is assumed to be calculated and stored before the algorithm is used.  
+
+Recall that the multiplication for the quotient on step 3 must only produce digits at or above the $m-1$'th position.  An algorithm called 
+$s\_mp\_mul\_high\_digs$ which has not been presented is used to accomplish this task.  The algorithm is based on $s\_mp\_mul\_digs$ except that
+instead of stopping at a given level of precision it starts at a given level of precision.  This optimal algorithm can only be used if the number
+of digits in $b$ is very much smaller than $\beta$.  
+
+While it is known that 
+$a \ge b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ only the lower $m+1$ digits are being used to compute the residue, so an implied 
+``borrow'' from the higher digits might leave a negative result.  After the multiple of the modulus has been subtracted from $a$ the residue must be 
+fixed up in case it is negative.  The invariant $\beta^{m+1}$ must be added to the residue to make it positive again.  
+
+The while loop at step 9 will subtract $b$ until the residue is less than $b$.  If the algorithm is performed correctly this step is 
+performed at most twice, and on average once. However, if $a \ge b^2$ then it will iterate substantially more times than it should.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* reduces x mod m, assumes 0 < x < m**2, mu is 
+018    * precomputed via mp_reduce_setup.
+019    * From HAC pp.604 Algorithm 14.42
+020    */
+021   int
+022   mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
+023   \{
+024     mp_int  q;
+025     int     res, um = m->used;
+026   
+027     /* q = x */
+028     if ((res = mp_init_copy (&q, x)) != MP_OKAY) \{
+029       return res;
+030     \}
+031   
+032     /* q1 = x / b**(k-1)  */
+033     mp_rshd (&q, um - 1);         
+034   
+035     /* according to HAC this optimization is ok */
+036     if (((unsigned long) um) > (((mp_digit)1) << (DIGIT_BIT - 1))) \{
+037       if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) \{
+038         goto CLEANUP;
+039       \}
+040     \} else \{
+041   #ifdef BN_S_MP_MUL_HIGH_DIGS_C
+042       if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) \{
+043         goto CLEANUP;
+044       \}
+045   #elif defined(BN_FAST_S_MP_MUL_HIGH_DIGS_C)
+046       if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) \{
+047         goto CLEANUP;
+048       \}
+049   #else 
+050       \{ 
+051         res = MP_VAL;
+052         goto CLEANUP;
+053       \}
+054   #endif
+055     \}
+056   
+057     /* q3 = q2 / b**(k+1) */
+058     mp_rshd (&q, um + 1);         
+059   
+060     /* x = x mod b**(k+1), quick (no division) */
+061     if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) \{
+062       goto CLEANUP;
+063     \}
+064   
+065     /* q = q * m mod b**(k+1), quick (no division) */
+066     if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) \{
+067       goto CLEANUP;
+068     \}
+069   
+070     /* x = x - q */
+071     if ((res = mp_sub (x, &q, x)) != MP_OKAY) \{
+072       goto CLEANUP;
+073     \}
+074   
+075     /* If x < 0, add b**(k+1) to it */
+076     if (mp_cmp_d (x, 0) == MP_LT) \{
+077       mp_set (&q, 1);
+078       if ((res = mp_lshd (&q, um + 1)) != MP_OKAY)
+079         goto CLEANUP;
+080       if ((res = mp_add (x, &q, x)) != MP_OKAY)
+081         goto CLEANUP;
+082     \}
+083   
+084     /* Back off if it's too big */
+085     while (mp_cmp (x, m) != MP_LT) \{
+086       if ((res = s_mp_sub (x, m, x)) != MP_OKAY) \{
+087         goto CLEANUP;
+088       \}
+089     \}
+090     
+091   CLEANUP:
+092     mp_clear (&q);
+093   
+094     return res;
+095   \}
+096   #endif
+\end{alltt}
+\end{small}
+
+The first multiplication that determines the quotient can be performed by only producing the digits from $m - 1$ and up.  This essentially halves
+the number of single precision multiplications required.  However, the optimization is only safe if $\beta$ is much larger than the number of digits
+in the modulus.  In the source code this is evaluated on lines 36 to 44 where algorithm s\_mp\_mul\_high\_digs is used when it is
+safe to do so.  
+
+\subsection{The Barrett Setup Algorithm}
+In order to use algorithm mp\_reduce the value of $\mu$ must be calculated in advance.  Ideally this value should be computed once and stored for
+future use so that the Barrett algorithm can be used without delay.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_reduce\_setup}. \\
+\textbf{Input}.   mp\_int $b$ ($b > 1$), $m$ the number of digits in $b$  \\
+\textbf{Output}.  $\mu \leftarrow \lfloor \beta^{2m}/b \rfloor$ \\
+\hline \\
+1.  $\mu \leftarrow 2^{2 \cdot lg(\beta) \cdot  m}$ (\textit{mp\_2expt}) \\
+2.  $\mu \leftarrow \lfloor \mu / b \rfloor$ (\textit{mp\_div}) \\
+3.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_reduce\_setup}
+\end{figure}
+
+\textbf{Algorithm mp\_reduce\_setup.}
+This algorithm computes the reciprocal $\mu$ required for Barrett reduction.  First $\beta^{2m}$ is calculated as $2^{2 \cdot lg(\beta) \cdot  m}$ which
+is equivalent and much faster.  The final value is computed by taking the integer quotient of $\lfloor \mu / b \rfloor$.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_setup.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* pre-calculate the value required for Barrett reduction
+018    * For a given modulus "b" it calulates the value required in "a"
+019    */
+020   int mp_reduce_setup (mp_int * a, mp_int * b)
+021   \{
+022     int     res;
+023     
+024     if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) \{
+025       return res;
+026     \}
+027     return mp_div (a, b, a, NULL);
+028   \}
+029   #endif
+\end{alltt}
+\end{small}
+
+This simple routine calculates the reciprocal $\mu$ required by Barrett reduction.  Note the extended usage of algorithm mp\_div where the variable
+which would receive the remainder is passed as NULL.  As will be discussed in~\ref{sec:division} the division routine allows both the quotient and the 
+remainder to be passed as NULL meaning to ignore the value.  
+
+\section{The Montgomery Reduction}
+Montgomery reduction\footnote{Thanks to Niels Ferguson for his insightful explanation of the algorithm.} \cite{MONT} is by far the most interesting 
+form of reduction in common use.  It computes a modular residue which is not actually equal to the residue of the input yet instead equal to a 
+residue times a constant.  However, as perplexing as this may sound the algorithm is relatively simple and very efficient.  
+
+Throughout this entire section the variable $n$ will represent the modulus used to form the residue.  As will be discussed shortly the value of
+$n$ must be odd.  The variable $x$ will represent the quantity of which the residue is sought.  Similar to the Barrett algorithm the input
+is restricted to $0 \le x < n^2$.  To begin the description some simple number theory facts must be established.
+
+\textbf{Fact 1.}  Adding $n$ to $x$ does not change the residue since in effect it adds one to the quotient $\lfloor x / n \rfloor$.  Another way
+to explain this is that $n$ is (\textit{or multiples of $n$ are}) congruent to zero modulo $n$.  Adding zero will not change the value of the residue.  
+
+\textbf{Fact 2.}  If $x$ is even then performing a division by two in $\Z$ is congruent to $x \cdot 2^{-1} \mbox{ (mod }n\mbox{)}$.  Actually
+this is an application of the fact that if $x$ is evenly divisible by any $k \in \Z$ then division in $\Z$ will be congruent to 
+multiplication by $k^{-1}$ modulo $n$.  
+
+From these two simple facts the following simple algorithm can be derived.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Montgomery Reduction}. \\
+\textbf{Input}.   Integer $x$, $n$ and $k$ \\
+\textbf{Output}.  $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\
+\hline \\
+1.  for $t$ from $1$ to $k$ do \\
+\hspace{3mm}1.1  If $x$ is odd then \\
+\hspace{6mm}1.1.1  $x \leftarrow x + n$ \\
+\hspace{3mm}1.2  $x \leftarrow x/2$ \\
+2.  Return $x$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Montgomery Reduction}
+\end{figure}
+
+The algorithm reduces the input one bit at a time using the two congruencies stated previously.  Inside the loop $n$, which is odd, is
+added to $x$ if $x$ is odd.  This forces $x$ to be even which allows the division by two in $\Z$ to be congruent to a modular division by two.  Since
+$x$ is assumed to be initially much larger than $n$ the addition of $n$ will contribute an insignificant magnitude to $x$.  Let $r$ represent the 
+final result of the Montgomery algorithm.  If $k > lg(n)$ and $0 \le x < n^2$ then the final result is limited to 
+$0 \le r < \lfloor x/2^k \rfloor + n$.  As a result at most a single subtraction is required to get the residue desired.
+
+\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|l|}
+\hline \textbf{Step number ($t$)} & \textbf{Result ($x$)} \\
+\hline $1$ & $x + n = 5812$, $x/2 = 2906$ \\
+\hline $2$ & $x/2 = 1453$ \\
+\hline $3$ & $x + n = 1710$, $x/2 = 855$ \\
+\hline $4$ & $x + n = 1112$, $x/2 = 556$ \\
+\hline $5$ & $x/2 = 278$ \\
+\hline $6$ & $x/2 = 139$ \\
+\hline $7$ & $x + n = 396$, $x/2 = 198$ \\
+\hline $8$ & $x/2 = 99$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Example of Montgomery Reduction (I)}
+\label{fig:MONT1}
+\end{figure}
+
+Consider the example in figure~\ref{fig:MONT1} which reduces $x = 5555$ modulo $n = 257$ when $k = 8$.  The result of the algorithm $r = 99$ is
+congruent to the value of $2^{-8} \cdot 5555 \mbox{ (mod }257\mbox{)}$.  When $r$ is multiplied by $2^8$ modulo $257$ the correct residue 
+$r \equiv 158$ is produced.  
+
+Let $k = \lfloor lg(n) \rfloor + 1$ represent the number of bits in $n$.  The current algorithm requires $2k^2$ single precision shifts
+and $k^2$ single precision additions.  At this rate the algorithm is most certainly slower than Barrett reduction and not terribly useful.  
+Fortunately there exists an alternative representation of the algorithm.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Montgomery Reduction} (modified I). \\
+\textbf{Input}.   Integer $x$, $n$ and $k$ \\
+\textbf{Output}.  $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\
+\hline \\
+1.  for $t$ from $0$ to $k - 1$ do \\
+\hspace{3mm}1.1  If the $t$'th bit of $x$ is one then \\
+\hspace{6mm}1.1.1  $x \leftarrow x + 2^tn$ \\
+2.  Return $x/2^k$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Montgomery Reduction (modified I)}
+\end{figure}
+
+This algorithm is equivalent since $2^tn$ is a multiple of $n$ and the lower $k$ bits of $x$ are zero by step 2.  The number of single
+precision shifts has now been reduced from $2k^2$ to $k^2 + k$ which is only a small improvement.
+
+\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|l|r|}
+\hline \textbf{Step number ($t$)} & \textbf{Result ($x$)} & \textbf{Result ($x$) in Binary} \\
+\hline -- & $5555$ & $1010110110011$ \\
+\hline $1$ & $x + 2^{0}n = 5812$ &  $1011010110100$ \\
+\hline $2$ & $5812$ & $1011010110100$ \\
+\hline $3$ & $x + 2^{2}n = 6840$ & $1101010111000$ \\
+\hline $4$ & $x + 2^{3}n = 8896$ & $10001011000000$ \\
+\hline $5$ & $8896$ & $10001011000000$ \\
+\hline $6$ & $8896$ & $10001011000000$ \\
+\hline $7$ & $x + 2^{6}n = 25344$ & $110001100000000$ \\
+\hline $8$ & $25344$ & $110001100000000$ \\
+\hline -- & $x/2^k = 99$ & \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Example of Montgomery Reduction (II)}
+\label{fig:MONT2}
+\end{figure}
+
+Figure~\ref{fig:MONT2} demonstrates the modified algorithm reducing $x = 5555$ modulo $n = 257$ with $k = 8$. 
+With this algorithm a single shift right at the end is the only right shift required to reduce the input instead of $k$ right shifts inside the 
+loop.  Note that for the iterations $t = 2, 5, 6$ and $8$ the result $x$ is not changed.  In those iterations the $t$'th bit of $x$ is 
+zero and the appropriate multiple of $n$ does not need to be added to force the $t$'th bit of the result to zero.  
+
+\subsection{Digit Based Montgomery Reduction}
+Instead of computing the reduction on a bit-by-bit basis it is actually much faster to compute it on digit-by-digit basis.  Consider the
+previous algorithm re-written to compute the Montgomery reduction in this new fashion.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Montgomery Reduction} (modified II). \\
+\textbf{Input}.   Integer $x$, $n$ and $k$ \\
+\textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
+\hline \\
+1.  for $t$ from $0$ to $k - 1$ do \\
+\hspace{3mm}1.1  $x \leftarrow x + \mu n \beta^t$ \\
+2.  Return $x/\beta^k$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Montgomery Reduction (modified II)}
+\end{figure}
+
+The value $\mu n \beta^t$ is a multiple of the modulus $n$ meaning that it will not change the residue.  If the first digit of 
+the value $\mu n \beta^t$ equals the negative (modulo $\beta$) of the $t$'th digit of $x$ then the addition will result in a zero digit.  This
+problem breaks down to solving the following congruency.  
+
+\begin{center}
+\begin{tabular}{rcl}
+$x_t + \mu n_0$ & $\equiv$ & $0 \mbox{ (mod }\beta\mbox{)}$ \\
+$\mu n_0$ & $\equiv$ & $-x_t \mbox{ (mod }\beta\mbox{)}$ \\
+$\mu$ & $\equiv$ & $-x_t/n_0 \mbox{ (mod }\beta\mbox{)}$ \\
+\end{tabular}
+\end{center}
+
+In each iteration of the loop on step 1 a new value of $\mu$ must be calculated.  The value of $-1/n_0 \mbox{ (mod }\beta\mbox{)}$ is used 
+extensively in this algorithm and should be precomputed.  Let $\rho$ represent the negative of the modular inverse of $n_0$ modulo $\beta$.  
+
+For example, let $\beta = 10$ represent the radix.  Let $n = 17$ represent the modulus which implies $k = 2$ and $\rho \equiv 7$.  Let $x = 33$ 
+represent the value to reduce.
+
+\newpage\begin{figure}
+\begin{center}
+\begin{tabular}{|c|c|c|}
+\hline \textbf{Step ($t$)} & \textbf{Value of $x$} & \textbf{Value of $\mu$} \\
+\hline --                 & $33$ & --\\
+\hline $0$                 & $33 + \mu n = 50$ & $1$ \\
+\hline $1$                 & $50 + \mu n \beta = 900$ & $5$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Example of Montgomery Reduction}
+\end{figure}
+
+The final result $900$ is then divided by $\beta^k$ to produce the final result $9$.  The first observation is that $9 \nequiv x \mbox{ (mod }n\mbox{)}$ 
+which implies the result is not the modular residue of $x$ modulo $n$.  However, recall that the residue is actually multiplied by $\beta^{-k}$ in
+the algorithm.  To get the true residue the value must be multiplied by $\beta^k$.  In this case $\beta^k \equiv 15 \mbox{ (mod }n\mbox{)}$ and
+the correct residue is $9 \cdot 15 \equiv 16 \mbox{ (mod }n\mbox{)}$.  
+
+\subsection{Baseline Montgomery Reduction}
+The baseline Montgomery reduction algorithm will produce the residue for any size input.  It is designed to be a catch-all algorithm for 
+Montgomery reductions.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_montgomery\_reduce}. \\
+\textbf{Input}.   mp\_int $x$, mp\_int $n$ and a digit $\rho \equiv -1/n_0 \mbox{ (mod }\beta\mbox{)}$. \\
+\hspace{11.5mm}($0 \le x < n^2, n > 1, (n, \beta) = 1, \beta^k > n$) \\
+\textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
+\hline \\
+1.  $digs \leftarrow 2n.used + 1$ \\
+2.  If $digs < MP\_WARRAY$ and $n.used < \delta$ then \\
+\hspace{3mm}2.1  Use algorithm fast\_mp\_montgomery\_reduce instead. \\
+\\
+Setup $x$ for the reduction. \\
+3.  If $x.alloc < digs$ then grow $x$ to $digs$ digits. \\
+4.  $x.used \leftarrow digs$ \\
+\\
+Eliminate the lower $k$ digits. \\
+5.  For $ix$ from $0$ to $k - 1$ do \\
+\hspace{3mm}5.1  $\mu \leftarrow x_{ix} \cdot \rho \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}5.2  $u \leftarrow 0$ \\
+\hspace{3mm}5.3  For $iy$ from $0$ to $k - 1$ do \\
+\hspace{6mm}5.3.1  $\hat r \leftarrow \mu n_{iy} + x_{ix + iy} + u$ \\
+\hspace{6mm}5.3.2  $x_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}5.3.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+\hspace{3mm}5.4  While $u > 0$ do \\
+\hspace{6mm}5.4.1  $iy \leftarrow iy + 1$ \\
+\hspace{6mm}5.4.2  $x_{ix + iy} \leftarrow x_{ix + iy} + u$ \\
+\hspace{6mm}5.4.3  $u \leftarrow \lfloor x_{ix+iy} / \beta \rfloor$ \\
+\hspace{6mm}5.4.4  $x_{ix + iy} \leftarrow x_{ix+iy} \mbox{ (mod }\beta\mbox{)}$ \\
+\\
+Divide by $\beta^k$ and fix up as required. \\
+6.  $x \leftarrow \lfloor x / \beta^k \rfloor$ \\
+7.  If $x \ge n$ then \\
+\hspace{3mm}7.1  $x \leftarrow x - n$ \\
+8.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_montgomery\_reduce}
+\end{figure}
+
+\textbf{Algorithm mp\_montgomery\_reduce.}
+This algorithm reduces the input $x$ modulo $n$ in place using the Montgomery reduction algorithm.  The algorithm is loosely based
+on algorithm 14.32 of \cite[pp.601]{HAC} except it merges the multiplication of $\mu n \beta^t$ with the addition in the inner loop.  The
+restrictions on this algorithm are fairly easy to adapt to.  First $0 \le x < n^2$ bounds the input to numbers in the same range as 
+for the Barrett algorithm.  Additionally if $n > 1$ and $n$ is odd there will exist a modular inverse $\rho$.  $\rho$ must be calculated in
+advance of this algorithm.  Finally the variable $k$ is fixed and a pseudonym for $n.used$.  
+
+Step 2 decides whether a faster Montgomery algorithm can be used.  It is based on the Comba technique meaning that there are limits on
+the size of the input.  This algorithm is discussed in sub-section 6.3.3.
+
+Step 5 is the main reduction loop of the algorithm.  The value of $\mu$ is calculated once per iteration in the outer loop.  The inner loop
+calculates $x + \mu n \beta^{ix}$ by multiplying $\mu n$ and adding the result to $x$ shifted by $ix$ digits.  Both the addition and
+multiplication are performed in the same loop to save time and memory.  Step 5.4 will handle any additional carries that escape the inner loop.
+
+Using a quick inspection this algorithm requires $n$ single precision multiplications for the outer loop and $n^2$ single precision multiplications 
+in the inner loop.  In total $n^2 + n$ single precision multiplications which compares favourably to Barrett at $n^2 + 2n - 1$ single precision
+multiplications.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_montgomery\_reduce.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* computes xR**-1 == x (mod N) via Montgomery Reduction */
+018   int
+019   mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
+020   \{
+021     int     ix, res, digs;
+022     mp_digit mu;
+023   
+024     /* can the fast reduction [comba] method be used?
+025      *
+026      * Note that unlike in mul you're safely allowed *less*
+027      * than the available columns [255 per default] since carries
+028      * are fixed up in the inner loop.
+029      */
+030     digs = n->used * 2 + 1;
+031     if ((digs < MP_WARRAY) &&
+032         n->used <
+033         (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) \{
+034       return fast_mp_montgomery_reduce (x, n, rho);
+035     \}
+036   
+037     /* grow the input as required */
+038     if (x->alloc < digs) \{
+039       if ((res = mp_grow (x, digs)) != MP_OKAY) \{
+040         return res;
+041       \}
+042     \}
+043     x->used = digs;
+044   
+045     for (ix = 0; ix < n->used; ix++) \{
+046       /* mu = ai * rho mod b
+047        *
+048        * The value of rho must be precalculated via
+049        * montgomery_setup() such that
+050        * it equals -1/n0 mod b this allows the
+051        * following inner loop to reduce the
+052        * input one digit at a time
+053        */
+054       mu = (mp_digit) (((mp_word)x->dp[ix]) * ((mp_word)rho) & MP_MASK);
+055   
+056       /* a = a + mu * m * b**i */
+057       \{
+058         register int iy;
+059         register mp_digit *tmpn, *tmpx, u;
+060         register mp_word r;
+061   
+062         /* alias for digits of the modulus */
+063         tmpn = n->dp;
+064   
+065         /* alias for the digits of x [the input] */
+066         tmpx = x->dp + ix;
+067   
+068         /* set the carry to zero */
+069         u = 0;
+070   
+071         /* Multiply and add in place */
+072         for (iy = 0; iy < n->used; iy++) \{
+073           /* compute product and sum */
+074           r       = ((mp_word)mu) * ((mp_word)*tmpn++) +
+075                     ((mp_word) u) + ((mp_word) * tmpx);
+076   
+077           /* get carry */
+078           u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+079   
+080           /* fix digit */
+081           *tmpx++ = (mp_digit)(r & ((mp_word) MP_MASK));
+082         \}
+083         /* At this point the ix'th digit of x should be zero */
+084   
+085   
+086         /* propagate carries upwards as required*/
+087         while (u) \{
+088           *tmpx   += u;
+089           u        = *tmpx >> DIGIT_BIT;
+090           *tmpx++ &= MP_MASK;
+091         \}
+092       \}
+093     \}
+094   
+095     /* at this point the n.used'th least
+096      * significant digits of x are all zero
+097      * which means we can shift x to the
+098      * right by n.used digits and the
+099      * residue is unchanged.
+100      */
+101   
+102     /* x = x/b**n.used */
+103     mp_clamp(x);
+104     mp_rshd (x, n->used);
+105   
+106     /* if x >= n then x = x - n */
+107     if (mp_cmp_mag (x, n) != MP_LT) \{
+108       return s_mp_sub (x, n, x);
+109     \}
+110   
+111     return MP_OKAY;
+112   \}
+113   #endif
+\end{alltt}
+\end{small}
+
+This is the baseline implementation of the Montgomery reduction algorithm.  Lines 30 to 35 determine if the Comba based
+routine can be used instead.  Line 54 computes the value of $\mu$ for that particular iteration of the outer loop.  
+
+The multiplication $\mu n \beta^{ix}$ is performed in one step in the inner loop.  The alias $tmpx$ refers to the $ix$'th digit of $x$ and
+the alias $tmpn$ refers to the modulus $n$.  
+
+\subsection{Faster ``Comba'' Montgomery Reduction}
+
+The Montgomery reduction requires fewer single precision multiplications than a Barrett reduction, however it is much slower due to the serial
+nature of the inner loop.  The Barrett reduction algorithm requires two slightly modified multipliers which can be implemented with the Comba
+technique.  The Montgomery reduction algorithm cannot directly use the Comba technique to any significant advantage since the inner loop calculates
+a $k \times 1$ product $k$ times. 
+
+The biggest obstacle is that at the $ix$'th iteration of the outer loop the value of $x_{ix}$ is required to calculate $\mu$.  This means the 
+carries from $0$ to $ix - 1$ must have been propagated upwards to form a valid $ix$'th digit.  The solution as it turns out is very simple.  
+Perform a Comba like multiplier and inside the outer loop just after the inner loop fix up the $ix + 1$'th digit by forwarding the carry.  
+
+With this change in place the Montgomery reduction algorithm can be performed with a Comba style multiplication loop which substantially increases
+the speed of the algorithm.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{fast\_mp\_montgomery\_reduce}. \\
+\textbf{Input}.   mp\_int $x$, mp\_int $n$ and a digit $\rho \equiv -1/n_0 \mbox{ (mod }\beta\mbox{)}$. \\
+\hspace{11.5mm}($0 \le x < n^2, n > 1, (n, \beta) = 1, \beta^k > n$) \\
+\textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
+\hline \\
+Place an array of \textbf{MP\_WARRAY} mp\_word variables called $\hat W$ on the stack. \\
+1.  if $x.alloc < n.used + 1$ then grow $x$ to $n.used + 1$ digits. \\
+Copy the digits of $x$ into the array $\hat W$ \\
+2.  For $ix$ from $0$ to $x.used - 1$ do \\
+\hspace{3mm}2.1  $\hat W_{ix} \leftarrow x_{ix}$ \\
+3.  For $ix$ from $x.used$ to $2n.used - 1$ do \\
+\hspace{3mm}3.1  $\hat W_{ix} \leftarrow 0$ \\
+Eliminate the lower $k$ digits. \\
+4.  for $ix$ from $0$ to $n.used - 1$ do \\
+\hspace{3mm}4.1  $\mu \leftarrow \hat W_{ix} \cdot \rho \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}4.2  For $iy$ from $0$ to $n.used - 1$ do \\
+\hspace{6mm}4.2.1  $\hat W_{iy + ix} \leftarrow \hat W_{iy + ix} + \mu \cdot n_{iy}$ \\
+\hspace{3mm}4.3  $\hat W_{ix + 1} \leftarrow \hat W_{ix + 1} + \lfloor \hat W_{ix} / \beta \rfloor$ \\
+Propagate carries upwards. \\
+5.  for $ix$ from $n.used$ to $2n.used + 1$ do \\
+\hspace{3mm}5.1  $\hat W_{ix + 1} \leftarrow \hat W_{ix + 1} + \lfloor \hat W_{ix} / \beta \rfloor$ \\
+Shift right and reduce modulo $\beta$ simultaneously. \\
+6.  for $ix$ from $0$ to $n.used$ do \\
+\hspace{3mm}6.1  $x_{ix} \leftarrow \hat W_{ix + n.used} \mbox{ (mod }\beta\mbox{)}$ \\
+Zero excess digits and fixup $x$. \\
+7.  if $x.used > n.used + 1$ then do \\
+\hspace{3mm}7.1  for $ix$ from $n.used + 1$ to $x.used - 1$ do \\
+\hspace{6mm}7.1.1  $x_{ix} \leftarrow 0$ \\
+8.  $x.used \leftarrow n.used + 1$ \\
+9.  Clamp excessive digits of $x$. \\
+10.  If $x \ge n$ then \\
+\hspace{3mm}10.1  $x \leftarrow x - n$ \\
+11.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm fast\_mp\_montgomery\_reduce}
+\end{figure}
+
+\textbf{Algorithm fast\_mp\_montgomery\_reduce.}
+This algorithm will compute the Montgomery reduction of $x$ modulo $n$ using the Comba technique.  It is on most computer platforms significantly
+faster than algorithm mp\_montgomery\_reduce and algorithm mp\_reduce (\textit{Barrett reduction}).  The algorithm has the same restrictions
+on the input as the baseline reduction algorithm.  An additional two restrictions are imposed on this algorithm.  The number of digits $k$ in
+the modulus $n$ must not violate $MP\_WARRAY > 2k +1$ and $k < \delta$.   When $\beta = 2^{28}$ this algorithm can be used to reduce modulo
+a modulus of at most $3,556$ bits in length.  
+
+As in the other Comba reduction algorithms there is a $\hat W$ array which stores the columns of the product.  It is initially filled with the
+contents of $x$ with the excess digits zeroed.  The reduction loop is very similar to the baseline loop at heart.  The multiplication on step
+4.1 can be single precision only since $ab \mbox{ (mod }\beta\mbox{)} \equiv (a \mbox{ mod }\beta)(b \mbox{ mod }\beta)$.  Some multipliers such
+as those on the ARM processors take a variable length time to complete depending on the number of bytes of result it must produce.  By performing
+a single precision multiplication instead half the amount of time is spent.
+
+Also note that digit $\hat W_{ix}$ must have the carry from the $ix - 1$'th digit propagated upwards in order for this to work.  That is what step
+4.3 will do.  In effect over the $n.used$ iterations of the outer loop the lower $n.used$ columns all have their carries propagated forwards.  Note
+how the upper bits of those same words are not reduced modulo $\beta$.  This is because those values will be discarded shortly and there is no
+point.
+
+Step 5 will propagate the remainder of the carries upwards.  On step 6 the columns are reduced modulo $\beta$ and shifted simultaneously as they are
+stored in the destination $x$.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_fast\_mp\_montgomery\_reduce.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* computes xR**-1 == x (mod N) via Montgomery Reduction
+018    *
+019    * This is an optimized implementation of montgomery_reduce
+020    * which uses the comba method to quickly calculate the columns of the
+021    * reduction.
+022    *
+023    * Based on Algorithm 14.32 on pp.601 of HAC.
+024   */
+025   int
+026   fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
+027   \{
+028     int     ix, res, olduse;
+029     mp_word W[MP_WARRAY];
+030   
+031     /* get old used count */
+032     olduse = x->used;
+033   
+034     /* grow a as required */
+035     if (x->alloc < n->used + 1) \{
+036       if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) \{
+037         return res;
+038       \}
+039     \}
+040   
+041     /* first we have to get the digits of the input into
+042      * an array of double precision words W[...]
+043      */
+044     \{
+045       register mp_word *_W;
+046       register mp_digit *tmpx;
+047   
+048       /* alias for the W[] array */
+049       _W   = W;
+050   
+051       /* alias for the digits of  x*/
+052       tmpx = x->dp;
+053   
+054       /* copy the digits of a into W[0..a->used-1] */
+055       for (ix = 0; ix < x->used; ix++) \{
+056         *_W++ = *tmpx++;
+057       \}
+058   
+059       /* zero the high words of W[a->used..m->used*2] */
+060       for (; ix < n->used * 2 + 1; ix++) \{
+061         *_W++ = 0;
+062       \}
+063     \}
+064   
+065     /* now we proceed to zero successive digits
+066      * from the least significant upwards
+067      */
+068     for (ix = 0; ix < n->used; ix++) \{
+069       /* mu = ai * m' mod b
+070        *
+071        * We avoid a double precision multiplication (which isn't required)
+072        * by casting the value down to a mp_digit.  Note this requires
+073        * that W[ix-1] have  the carry cleared (see after the inner loop)
+074        */
+075       register mp_digit mu;
+076       mu = (mp_digit) (((W[ix] & MP_MASK) * rho) & MP_MASK);
+077   
+078       /* a = a + mu * m * b**i
+079        *
+080        * This is computed in place and on the fly.  The multiplication
+081        * by b**i is handled by offseting which columns the results
+082        * are added to.
+083        *
+084        * Note the comba method normally doesn't handle carries in the
+085        * inner loop In this case we fix the carry from the previous
+086        * column since the Montgomery reduction requires digits of the
+087        * result (so far) [see above] to work.  This is
+088        * handled by fixing up one carry after the inner loop.  The
+089        * carry fixups are done in order so after these loops the
+090        * first m->used words of W[] have the carries fixed
+091        */
+092       \{
+093         register int iy;
+094         register mp_digit *tmpn;
+095         register mp_word *_W;
+096   
+097         /* alias for the digits of the modulus */
+098         tmpn = n->dp;
+099   
+100         /* Alias for the columns set by an offset of ix */
+101         _W = W + ix;
+102   
+103         /* inner loop */
+104         for (iy = 0; iy < n->used; iy++) \{
+105             *_W++ += ((mp_word)mu) * ((mp_word)*tmpn++);
+106         \}
+107       \}
+108   
+109       /* now fix carry for next digit, W[ix+1] */
+110       W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
+111     \}
+112   
+113     /* now we have to propagate the carries and
+114      * shift the words downward [all those least
+115      * significant digits we zeroed].
+116      */
+117     \{
+118       register mp_digit *tmpx;
+119       register mp_word *_W, *_W1;
+120   
+121       /* nox fix rest of carries */
+122   
+123       /* alias for current word */
+124       _W1 = W + ix;
+125   
+126       /* alias for next word, where the carry goes */
+127       _W = W + ++ix;
+128   
+129       for (; ix <= n->used * 2 + 1; ix++) \{
+130         *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
+131       \}
+132   
+133       /* copy out, A = A/b**n
+134        *
+135        * The result is A/b**n but instead of converting from an
+136        * array of mp_word to mp_digit than calling mp_rshd
+137        * we just copy them in the right order
+138        */
+139   
+140       /* alias for destination word */
+141       tmpx = x->dp;
+142   
+143       /* alias for shifted double precision result */
+144       _W = W + n->used;
+145   
+146       for (ix = 0; ix < n->used + 1; ix++) \{
+147         *tmpx++ = (mp_digit)(*_W++ & ((mp_word) MP_MASK));
+148       \}
+149   
+150       /* zero oldused digits, if the input a was larger than
+151        * m->used+1 we'll have to clear the digits
+152        */
+153       for (; ix < olduse; ix++) \{
+154         *tmpx++ = 0;
+155       \}
+156     \}
+157   
+158     /* set the max used and clamp */
+159     x->used = n->used + 1;
+160     mp_clamp (x);
+161   
+162     /* if A >= m then A = A - m */
+163     if (mp_cmp_mag (x, n) != MP_LT) \{
+164       return s_mp_sub (x, n, x);
+165     \}
+166     return MP_OKAY;
+167   \}
+168   #endif
+\end{alltt}
+\end{small}
+
+The $\hat W$ array is first filled with digits of $x$ on line 55 then the rest of the digits are zeroed on line 60.  Both loops share
+the same alias variables to make the code easier to read.  
+
+The value of $\mu$ is calculated in an interesting fashion.  First the value $\hat W_{ix}$ is reduced modulo $\beta$ and cast to a mp\_digit.  This
+forces the compiler to use a single precision multiplication and prevents any concerns about loss of precision.   Line 110 fixes the carry 
+for the next iteration of the loop by propagating the carry from $\hat W_{ix}$ to $\hat W_{ix+1}$.
+
+The for loop on line 129 propagates the rest of the carries upwards through the columns.  The for loop on line 146 reduces the columns
+modulo $\beta$ and shifts them $k$ places at the same time.  The alias $\_ \hat W$ actually refers to the array $\hat W$ starting at the $n.used$'th
+digit, that is $\_ \hat W_{t} = \hat W_{n.used + t}$.  
+
+\subsection{Montgomery Setup}
+To calculate the variable $\rho$ a relatively simple algorithm will be required.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_montgomery\_setup}. \\
+\textbf{Input}.   mp\_int $n$ ($n > 1$ and $(n, 2) = 1$) \\
+\textbf{Output}.  $\rho \equiv -1/n_0 \mbox{ (mod }\beta\mbox{)}$ \\
+\hline \\
+1.  $b \leftarrow n_0$ \\
+2.  If $b$ is even return(\textit{MP\_VAL}) \\
+3.  $x \leftarrow (((b + 2) \mbox{ AND } 4) << 1) + b$ \\
+4.  for $k$ from 0 to $\lceil lg(lg(\beta)) \rceil - 2$ do \\
+\hspace{3mm}4.1  $x \leftarrow x \cdot (2 - bx)$ \\
+5.  $\rho \leftarrow \beta - x \mbox{ (mod }\beta\mbox{)}$ \\
+6.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_montgomery\_setup} 
+\end{figure}
+
+\textbf{Algorithm mp\_montgomery\_setup.}
+This algorithm will calculate the value of $\rho$ required within the Montgomery reduction algorithms.  It uses a very interesting trick 
+to calculate $1/n_0$ when $\beta$ is a power of two.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_montgomery\_setup.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* setups the montgomery reduction stuff */
+018   int
+019   mp_montgomery_setup (mp_int * n, mp_digit * rho)
+020   \{
+021     mp_digit x, b;
+022   
+023   /* fast inversion mod 2**k
+024    *
+025    * Based on the fact that
+026    *
+027    * XA = 1 (mod 2**n)  =>  (X(2-XA)) A = 1 (mod 2**2n)
+028    *                    =>  2*X*A - X*X*A*A = 1
+029    *                    =>  2*(1) - (1)     = 1
+030    */
+031     b = n->dp[0];
+032   
+033     if ((b & 1) == 0) \{
+034       return MP_VAL;
+035     \}
+036   
+037     x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
+038     x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
+039   #if !defined(MP_8BIT)
+040     x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
+041   #endif
+042   #if defined(MP_64BIT) || !(defined(MP_8BIT) || defined(MP_16BIT))
+043     x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
+044   #endif
+045   #ifdef MP_64BIT
+046     x *= 2 - b * x;               /* here x*a==1 mod 2**64 */
+047   #endif
+048   
+049     /* rho = -1/m mod b */
+050     *rho = (((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
+051   
+052     return MP_OKAY;
+053   \}
+054   #endif
+\end{alltt}
+\end{small}
+
+This source code computes the value of $\rho$ required to perform Montgomery reduction.  It has been modified to avoid performing excess
+multiplications when $\beta$ is not the default 28-bits.  
+
+\section{The Diminished Radix Algorithm}
+The Diminished Radix method of modular reduction \cite{DRMET} is a fairly clever technique which can be more efficient than either the Barrett
+or Montgomery methods for certain forms of moduli.  The technique is based on the following simple congruence.
+
+\begin{equation}
+(x \mbox{ mod } n) + k \lfloor x / n \rfloor \equiv x \mbox{ (mod }(n - k)\mbox{)}
+\end{equation}
+
+This observation was used in the MMB \cite{MMB} block cipher to create a diffusion primitive.  It used the fact that if $n = 2^{31}$ and $k=1$
+then an x86 multiplier could produce the 62-bit product and use the ``shrd'' instruction to perform a double-precision right shift.  The proof
+of the above equation is very simple.  First write $x$ in the product form.
+
+\begin{equation}
+x = qn + r
+\end{equation}
+
+Now reduce both sides modulo $(n - k)$.
+
+\begin{equation}
+x \equiv qk + r  \mbox{ (mod }(n-k)\mbox{)}
+\end{equation}
+
+The variable $n$ reduces modulo $n - k$ to $k$.  By putting $q = \lfloor x/n \rfloor$ and $r = x \mbox{ mod } n$ 
+into the equation the original congruence is reproduced, thus concluding the proof.  The following algorithm is based on this observation.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Diminished Radix Reduction}. \\
+\textbf{Input}.   Integer $x$, $n$, $k$ \\
+\textbf{Output}.  $x \mbox{ mod } (n - k)$ \\
+\hline \\
+1.  $q \leftarrow \lfloor x / n \rfloor$ \\
+2.  $q \leftarrow k \cdot q$ \\
+3.  $x \leftarrow x \mbox{ (mod }n\mbox{)}$ \\
+4.  $x \leftarrow x + q$ \\
+5.  If $x \ge (n - k)$ then \\
+\hspace{3mm}5.1  $x \leftarrow x - (n - k)$ \\
+\hspace{3mm}5.2  Goto step 1. \\
+6.  Return $x$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Diminished Radix Reduction}
+\label{fig:DR}
+\end{figure}
+
+This algorithm will reduce $x$ modulo $n - k$ and return the residue.  If $0 \le x < (n - k)^2$ then the algorithm will loop almost always
+once or twice and occasionally three times.  For simplicity's sake the value of $x$ is bounded by the following simple polynomial.
+
+\begin{equation} 
+0 \le x < n^2 + k^2 - 2nk
+\end{equation}
+
+The true bound is  $0 \le x < (n - k - 1)^2$ but this has quite a few more terms.  The value of $q$ after step 1 is bounded by the following.
+
+\begin{equation}
+q < n - 2k - k^2/n
+\end{equation}
+
+Since $k^2$ is going to be considerably smaller than $n$ that term will always be zero.  The value of $x$ after step 3 is bounded trivially as
+$0 \le x < n$.  By step four the sum $x + q$ is bounded by 
+
+\begin{equation}
+0 \le q + x < (k + 1)n - 2k^2 - 1
+\end{equation}
+
+With a second pass $q$ will be loosely bounded by $0 \le q < k^2$ after step 2 while $x$ will still be loosely bounded by $0 \le x < n$ after step 3.  After the second pass it is highly unlikely that the
+sum in step 4 will exceed $n - k$.  In practice fewer than three passes of the algorithm are required to reduce virtually every input in the 
+range $0 \le x < (n - k - 1)^2$.  
+
+\begin{figure}
+\begin{small}
+\begin{center}
+\begin{tabular}{|l|}
+\hline
+$x = 123456789, n = 256, k = 3$ \\
+\hline $q \leftarrow \lfloor x/n \rfloor = 482253$ \\
+$q \leftarrow q*k = 1446759$ \\
+$x \leftarrow x \mbox{ mod } n = 21$ \\
+$x \leftarrow x + q = 1446780$ \\
+$x \leftarrow x - (n - k) = 1446527$ \\
+\hline 
+$q \leftarrow \lfloor x/n \rfloor = 5650$ \\
+$q \leftarrow q*k = 16950$ \\
+$x \leftarrow x \mbox{ mod } n = 127$ \\
+$x \leftarrow x + q = 17077$ \\
+$x \leftarrow x - (n - k) = 16824$ \\
+\hline 
+$q \leftarrow \lfloor x/n \rfloor = 65$ \\
+$q \leftarrow q*k = 195$ \\
+$x \leftarrow x \mbox{ mod } n = 184$ \\
+$x \leftarrow x + q = 379$ \\
+$x \leftarrow x - (n - k) = 126$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Example Diminished Radix Reduction}
+\label{fig:EXDR}
+\end{figure}
+
+Figure~\ref{fig:EXDR} demonstrates the reduction of $x = 123456789$ modulo $n - k = 253$ when $n = 256$ and $k = 3$.  Note that even while $x$
+is considerably larger than $(n - k - 1)^2 = 63504$ the algorithm still converges on the modular residue exceedingly fast.  In this case only
+three passes were required to find the residue $x \equiv 126$.
+
+
+\subsection{Choice of Moduli}
+On the surface this algorithm looks like a very expensive algorithm.  It requires a couple of subtractions followed by multiplication and other
+modular reductions.  The usefulness of this algorithm becomes exceedingly clear when an appropriate modulus is chosen.
+
+Division in general is a very expensive operation to perform.  The one exception is when the division is by a power of the radix of representation used.  
+Division by ten for example is simple for pencil and paper mathematics since it amounts to shifting the decimal place to the right.  Similarly division 
+by two (\textit{or powers of two}) is very simple for binary computers to perform.  It would therefore seem logical to choose $n$ of the form $2^p$ 
+which would imply that $\lfloor x / n \rfloor$ is a simple shift of $x$ right $p$ bits.  
+
+However, there is one operation related to division by powers of two that is even faster than this.  If $n = \beta^p$ then the division may be 
+performed by moving whole digits to the right $p$ places.  In practice division by $\beta^p$ is much faster than division by $2^p$ for any $p$.  
+Also with the choice of $n = \beta^p$ reducing $x$ modulo $n$ merely requires zeroing the digits above the $p-1$'th digit of $x$.  
+
+Throughout the next section the term ``restricted modulus'' will refer to a modulus of the form $\beta^p - k$ whereas the term ``unrestricted
+modulus'' will refer to a modulus of the form $2^p - k$.  The word ``restricted'' in this case refers to the fact that it is based on the 
+$2^p$ logic except $p$ must be a multiple of $lg(\beta)$.  
+
+\subsection{Choice of $k$}
+Now that division and reduction (\textit{step 1 and 3 of figure~\ref{fig:DR}}) have been optimized to simple digit operations the multiplication by $k$
+in step 2 is the most expensive operation.  Fortunately the choice of $k$ is not terribly limited.  For all intents and purposes it might
+as well be a single digit.  The smaller the value of $k$ is the faster the algorithm will be.  
+
+\subsection{Restricted Diminished Radix Reduction}
+The restricted Diminished Radix algorithm can quickly reduce an input modulo a modulus of the form $n = \beta^p - k$.  This algorithm can reduce 
+an input $x$ within the range $0 \le x < n^2$ using only a couple passes of the algorithm demonstrated in figure~\ref{fig:DR}.  The implementation
+of this algorithm has been optimized to avoid additional overhead associated with a division by $\beta^p$, the multiplication by $k$ or the addition 
+of $x$ and $q$.  The resulting algorithm is very efficient and can lead to substantial improvements over Barrett and Montgomery reduction when modular 
+exponentiations are performed.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_dr\_reduce}. \\
+\textbf{Input}.   mp\_int $x$, $n$ and a mp\_digit $k = \beta - n_0$ \\
+\hspace{11.5mm}($0 \le x < n^2$, $n > 1$, $0 < k < \beta$) \\
+\textbf{Output}.  $x \mbox{ mod } n$ \\
+\hline \\
+1.  $m \leftarrow n.used$ \\
+2.  If $x.alloc < 2m$ then grow $x$ to $2m$ digits. \\
+3.  $\mu \leftarrow 0$ \\
+4.  for $i$ from $0$ to $m - 1$ do \\
+\hspace{3mm}4.1  $\hat r \leftarrow k \cdot x_{m+i} + x_{i} + \mu$ \\
+\hspace{3mm}4.2  $x_{i} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}4.3  $\mu \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+5.  $x_{m} \leftarrow \mu$ \\
+6.  for $i$ from $m + 1$ to $x.used - 1$ do \\
+\hspace{3mm}6.1  $x_{i} \leftarrow 0$ \\
+7.  Clamp excess digits of $x$. \\
+8.  If $x \ge n$ then \\
+\hspace{3mm}8.1  $x \leftarrow x - n$ \\
+\hspace{3mm}8.2  Goto step 3. \\
+9.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_dr\_reduce}
+\end{figure}
+
+\textbf{Algorithm mp\_dr\_reduce.}
+This algorithm will perform the Diminished Radix reduction of $x$ modulo $n$.  It has similar restrictions to that of the Barrett reduction
+with the addition that $n$ must be of the form $n = \beta^m - k$ where $0 < k <\beta$.  
+
+This algorithm essentially implements the pseudo-code in figure~\ref{fig:DR} except with a slight optimization.  The division by $\beta^m$, multiplication by $k$
+and addition of $x \mbox{ mod }\beta^m$ are all performed simultaneously inside the loop on step 4.  The division by $\beta^m$ is emulated by accessing
+the term at the $m+i$'th position which is subsequently multiplied by $k$ and added to the term at the $i$'th position.  After the loop the $m$'th
+digit is set to the carry and the upper digits are zeroed.  Steps 5 and 6 emulate the reduction modulo $\beta^m$ that should have happened to 
+$x$ before the addition of the multiple of the upper half.  
+
+At step 8 if $x$ is still larger than $n$ another pass of the algorithm is required.  First $n$ is subtracted from $x$ and then the algorithm resumes
+at step 3.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_dr\_reduce.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* reduce "x" in place modulo "n" using the Diminished Radix algorithm.
+018    *
+019    * Based on algorithm from the paper
+020    *
+021    * "Generating Efficient Primes for Discrete Log Cryptosystems"
+022    *                 Chae Hoon Lim, Pil Loong Lee,
+023    *          POSTECH Information Research Laboratories
+024    *
+025    * The modulus must be of a special format [see manual]
+026    *
+027    * Has been modified to use algorithm 7.10 from the LTM book instead
+028    *
+029    * Input x must be in the range 0 <= x <= (n-1)**2
+030    */
+031   int
+032   mp_dr_reduce (mp_int * x, mp_int * n, mp_digit k)
+033   \{
+034     int      err, i, m;
+035     mp_word  r;
+036     mp_digit mu, *tmpx1, *tmpx2;
+037   
+038     /* m = digits in modulus */
+039     m = n->used;
+040   
+041     /* ensure that "x" has at least 2m digits */
+042     if (x->alloc < m + m) \{
+043       if ((err = mp_grow (x, m + m)) != MP_OKAY) \{
+044         return err;
+045       \}
+046     \}
+047   
+048   /* top of loop, this is where the code resumes if
+049    * another reduction pass is required.
+050    */
+051   top:
+052     /* aliases for digits */
+053     /* alias for lower half of x */
+054     tmpx1 = x->dp;
+055   
+056     /* alias for upper half of x, or x/B**m */
+057     tmpx2 = x->dp + m;
+058   
+059     /* set carry to zero */
+060     mu = 0;
+061   
+062     /* compute (x mod B**m) + k * [x/B**m] inline and inplace */
+063     for (i = 0; i < m; i++) \{
+064         r         = ((mp_word)*tmpx2++) * ((mp_word)k) + *tmpx1 + mu;
+065         *tmpx1++  = (mp_digit)(r & MP_MASK);
+066         mu        = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
+067     \}
+068   
+069     /* set final carry */
+070     *tmpx1++ = mu;
+071   
+072     /* zero words above m */
+073     for (i = m + 1; i < x->used; i++) \{
+074         *tmpx1++ = 0;
+075     \}
+076   
+077     /* clamp, sub and return */
+078     mp_clamp (x);
+079   
+080     /* if x >= n then subtract and reduce again
+081      * Each successive "recursion" makes the input smaller and smaller.
+082      */
+083     if (mp_cmp_mag (x, n) != MP_LT) \{
+084       s_mp_sub(x, n, x);
+085       goto top;
+086     \}
+087     return MP_OKAY;
+088   \}
+089   #endif
+\end{alltt}
+\end{small}
+
+The first step is to grow $x$ as required to $2m$ digits since the reduction is performed in place on $x$.  The label on line 51 is where
+the algorithm will resume if further reduction passes are required.  In theory it could be placed at the top of the function; however, the size of
+the modulus and question of whether $x$ is large enough are invariant after the first pass meaning that it would be a waste of time.  
+
+The aliases $tmpx1$ and $tmpx2$ refer to the digits of $x$ where the latter is offset by $m$ digits.  By reading digits from $x$ offset by $m$ digits
+a division by $\beta^m$ can be simulated virtually for free.  The loop on line 63 performs the bulk of the work (\textit{corresponds to step 4 of algorithm 7.11})
+in this algorithm.
+
+By line 70 the pointer $tmpx1$ points to the $m$'th digit of $x$ which is where the final carry will be placed.  Similarly by line 73 the 
+same pointer will point to the $m+1$'th digit where the zeroes will be placed.  
+
+Since the algorithm is only valid if both $x$ and $n$ are greater than zero an unsigned comparison suffices to determine if another pass is required.  
+With the same logic at line 84 the value of $x$ is known to be greater than or equal to $n$ meaning that an unsigned subtraction can be used
+as well.  Since the destination of the subtraction is the larger of the inputs the call to algorithm s\_mp\_sub cannot fail and the return code
+does not need to be checked.
+
+\subsubsection{Setup}
+To set up the restricted Diminished Radix algorithm the value $k = \beta - n_0$ is required.  This algorithm is not really complicated but is provided for
+completeness.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_dr\_setup}. \\
+\textbf{Input}.   mp\_int $n$ \\
+\textbf{Output}.  $k = \beta - n_0$ \\
+\hline \\
+1.  $k \leftarrow \beta - n_0$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_dr\_setup}
+\end{figure}
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_dr\_setup.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* determines the setup value */
+018   void mp_dr_setup(mp_int *a, mp_digit *d)
+019   \{
+020      /* the casts are required if DIGIT_BIT is one less than
+021       * the number of bits in a mp_digit [e.g. DIGIT_BIT==31]
+022       */
+023      *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) - 
+024           ((mp_word)a->dp[0]));
+025   \}
+026   
+027   #endif
+\end{alltt}
+\end{small}
+
+\subsubsection{Modulus Detection}
+Another algorithm which will be useful is the ability to detect a restricted Diminished Radix modulus.  An integer is said to be
+of restricted Diminished Radix form if all of the digits are equal to $\beta - 1$ except the trailing digit which may be any value.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_dr\_is\_modulus}. \\
+\textbf{Input}.   mp\_int $n$ \\
+\textbf{Output}.  $1$ if $n$ is in D.R form, $0$ otherwise \\
+\hline
+1.  If $n.used < 2$ then return($0$). \\
+2.  for $ix$ from $1$ to $n.used - 1$ do \\
+\hspace{3mm}2.1  If $n_{ix} \ne \beta - 1$ return($0$). \\
+3.  Return($1$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_dr\_is\_modulus}
+\end{figure}
+
+\textbf{Algorithm mp\_dr\_is\_modulus.}
+This algorithm determines if a value is in Diminished Radix form.  Step 1 rejects obvious cases where fewer than two digits are
+in the mp\_int.  Step 2 tests all but the first digit to see if they are equal to $\beta - 1$.  If the algorithm manages to get to
+step 3 then $n$ must be of Diminished Radix form.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_dr\_is\_modulus.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* determines if a number is a valid DR modulus */
+018   int mp_dr_is_modulus(mp_int *a)
+019   \{
+020      int ix;
+021   
+022      /* must be at least two digits */
+023      if (a->used < 2) \{
+024         return 0;
+025      \}
+026   
+027      /* must be of the form b**k - a [a <= b] so all
+028       * but the first digit must be equal to -1 (mod b).
+029       */
+030      for (ix = 1; ix < a->used; ix++) \{
+031          if (a->dp[ix] != MP_MASK) \{
+032             return 0;
+033          \}
+034      \}
+035      return 1;
+036   \}
+037   
+038   #endif
+\end{alltt}
+\end{small}
+
+\subsection{Unrestricted Diminished Radix Reduction}
+The unrestricted Diminished Radix algorithm allows modular reductions to be performed when the modulus is of the form $2^p - k$.  This algorithm
+is a straightforward adaptation of algorithm~\ref{fig:DR}.
+
+In general the restricted Diminished Radix reduction algorithm is much faster since it has considerably lower overhead.  However, this new
+algorithm is much faster than either Montgomery or Barrett reduction when the moduli are of the appropriate form.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_reduce\_2k}. \\
+\textbf{Input}.   mp\_int $a$ and $n$.  mp\_digit $k$  \\
+\hspace{11.5mm}($a \ge 0$, $n > 1$, $0 < k < \beta$, $n + k$ is a power of two) \\
+\textbf{Output}.  $a \mbox{ (mod }n\mbox{)}$ \\
+\hline
+1.  $p \leftarrow \lceil lg(n) \rceil$  (\textit{mp\_count\_bits}) \\
+2.  While $a \ge n$ do \\
+\hspace{3mm}2.1  $q \leftarrow \lfloor a / 2^p \rfloor$ (\textit{mp\_div\_2d}) \\
+\hspace{3mm}2.2  $a \leftarrow a \mbox{ (mod }2^p\mbox{)}$ (\textit{mp\_mod\_2d}) \\
+\hspace{3mm}2.3  $q \leftarrow q \cdot k$ (\textit{mp\_mul\_d}) \\
+\hspace{3mm}2.4  $a \leftarrow a + q$ (\textit{s\_mp\_add}) \\
+\hspace{3mm}2.5  If $a \ge n$ then do \\
+\hspace{6mm}2.5.1  $a \leftarrow a - n$ \\
+3.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_reduce\_2k}
+\end{figure}
+
+\textbf{Algorithm mp\_reduce\_2k.}
+This algorithm quickly reduces an input $a$ modulo an unrestricted Diminished Radix modulus $n$.  Division by $2^p$ is emulated with a right
+shift which makes the algorithm fairly inexpensive to use.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_2k.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* reduces a modulo n where n is of the form 2**p - d */
+018   int
+019   mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d)
+020   \{
+021      mp_int q;
+022      int    p, res;
+023      
+024      if ((res = mp_init(&q)) != MP_OKAY) \{
+025         return res;
+026      \}
+027      
+028      p = mp_count_bits(n);    
+029   top:
+030      /* q = a/2**p, a = a mod 2**p */
+031      if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) \{
+032         goto ERR;
+033      \}
+034      
+035      if (d != 1) \{
+036         /* q = q * d */
+037         if ((res = mp_mul_d(&q, d, &q)) != MP_OKAY) \{ 
+038            goto ERR;
+039         \}
+040      \}
+041      
+042      /* a = a + q */
+043      if ((res = s_mp_add(a, &q, a)) != MP_OKAY) \{
+044         goto ERR;
+045      \}
+046      
+047      if (mp_cmp_mag(a, n) != MP_LT) \{
+048         s_mp_sub(a, n, a);
+049         goto top;
+050      \}
+051      
+052   ERR:
+053      mp_clear(&q);
+054      return res;
+055   \}
+056   
+057   #endif
+\end{alltt}
+\end{small}
+
+The algorithm mp\_count\_bits calculates the number of bits in an mp\_int which is used to find the initial value of $p$.  The call to mp\_div\_2d
+on line 31 calculates both the quotient $q$ and the remainder $a$ required.  By doing both in a single function call the code size
+is kept fairly small.  The multiplication by $k$ is only performed if $k > 1$. This allows reductions modulo $2^p - 1$ to be performed without
+any multiplications.  
+
+The unsigned s\_mp\_add, mp\_cmp\_mag and s\_mp\_sub are used in place of their full sign counterparts since the inputs are only valid if they are 
+positive.  By using the unsigned versions the overhead is kept to a minimum.  
+
+\subsubsection{Unrestricted Setup}
+To set up this reduction algorithm the value of $k = 2^p - n$ is required.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_reduce\_2k\_setup}. \\
+\textbf{Input}.   mp\_int $n$   \\
+\textbf{Output}.  $k = 2^p - n$ \\
+\hline
+1.  $p \leftarrow \lceil lg(n) \rceil$  (\textit{mp\_count\_bits}) \\
+2.  $x \leftarrow 2^p$ (\textit{mp\_2expt}) \\
+3.  $x \leftarrow x - n$ (\textit{mp\_sub}) \\
+4.  $k \leftarrow x_0$ \\
+5.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_reduce\_2k\_setup}
+\end{figure}
+
+\textbf{Algorithm mp\_reduce\_2k\_setup.}
+This algorithm computes the value of $k$ required for the algorithm mp\_reduce\_2k.  By making a temporary variable $x$ equal to $2^p$ a subtraction
+is sufficient to solve for $k$.  Alternatively if $n$ has more than one digit the value of $k$ is simply $\beta - n_0$.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_2k\_setup.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* determines the setup value */
+018   int 
+019   mp_reduce_2k_setup(mp_int *a, mp_digit *d)
+020   \{
+021      int res, p;
+022      mp_int tmp;
+023      
+024      if ((res = mp_init(&tmp)) != MP_OKAY) \{
+025         return res;
+026      \}
+027      
+028      p = mp_count_bits(a);
+029      if ((res = mp_2expt(&tmp, p)) != MP_OKAY) \{
+030         mp_clear(&tmp);
+031         return res;
+032      \}
+033      
+034      if ((res = s_mp_sub(&tmp, a, &tmp)) != MP_OKAY) \{
+035         mp_clear(&tmp);
+036         return res;
+037      \}
+038      
+039      *d = tmp.dp[0];
+040      mp_clear(&tmp);
+041      return MP_OKAY;
+042   \}
+043   #endif
+\end{alltt}
+\end{small}
+
+\subsubsection{Unrestricted Detection}
+An integer $n$ is a valid unrestricted Diminished Radix modulus if either of the following are true.
+
+\begin{enumerate}
+\item  The number has only one digit.
+\item  The number has more than one digit and every bit from the $lg(\beta)$'th to the most significant is one.
+\end{enumerate}
+
+If either condition is true then there is a power of two $2^p$ such that $0 < 2^p - n < \beta$.   If the input is only
+one digit then it will always be of the correct form.  Otherwise all of the bits above the first digit must be one.  This arises from the fact
+that there will be a value of $k$ that when added to the modulus causes a carry in the first digit which propagates all the way to the most
+significant bit.  The resulting sum will be a power of two.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_reduce\_is\_2k}. \\
+\textbf{Input}.   mp\_int $n$   \\
+\textbf{Output}.  $1$ if of proper form, $0$ otherwise \\
+\hline
+1.  If $n.used = 0$ then return($0$). \\
+2.  If $n.used = 1$ then return($1$). \\
+3.  $p \leftarrow \lceil lg(n) \rceil$  (\textit{mp\_count\_bits}) \\
+4.  for $x$ from $lg(\beta)$ to $p$ do \\
+\hspace{3mm}4.1  If the ($x \mbox{ mod }lg(\beta)$)'th bit of the $\lfloor x / lg(\beta) \rfloor$'th digit of $n$ is zero then return($0$). \\
+5.  Return($1$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_reduce\_is\_2k}
+\end{figure}
+
+\textbf{Algorithm mp\_reduce\_is\_2k.}
+This algorithm quickly determines if a modulus is of the form required for algorithm mp\_reduce\_2k to function properly.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_is\_2k.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* determines if mp_reduce_2k can be used */
+018   int mp_reduce_is_2k(mp_int *a)
+019   \{
+020      int ix, iy, iw;
+021      mp_digit iz;
+022      
+023      if (a->used == 0) \{
+024         return 0;
+025      \} else if (a->used == 1) \{
+026         return 1;
+027      \} else if (a->used > 1) \{
+028         iy = mp_count_bits(a);
+029         iz = 1;
+030         iw = 1;
+031       
+032         /* Test every bit from the second digit up, must be 1 */
+033         for (ix = DIGIT_BIT; ix < iy; ix++) \{
+034             if ((a->dp[iw] & iz) == 0) \{
+035                return 0;
+036             \}
+037             iz <<= 1;
+038             if (iz > (mp_digit)MP_MASK) \{
+039                ++iw;
+040                iz = 1;
+041             \}
+042         \}
+043      \}
+044      return 1;
+045   \}
+046   
+047   #endif
+\end{alltt}
+\end{small}
+
+
+
+\section{Algorithm Comparison}
+So far three very different algorithms for modular reduction have been discussed.  Each of the algorithms has its own strengths and weaknesses
+that make having such a selection very useful.  The following table summarizes the three algorithms along with comparisons of work factors.  Since
+all three algorithms have the restriction that $0 \le x < n^2$ and $n > 1$ those limitations are not included in the table.  
+
+\begin{center}
+\begin{small}
+\begin{tabular}{|c|c|c|c|c|c|}
+\hline \textbf{Method} & \textbf{Work Required} & \textbf{Limitations} & \textbf{$m = 8$} & \textbf{$m = 32$} & \textbf{$m = 64$} \\
+\hline Barrett    & $m^2 + 2m - 1$ & None              & $79$ & $1087$ & $4223$ \\
+\hline Montgomery & $m^2 + m$      & $n$ must be odd   & $72$ & $1056$ & $4160$ \\
+\hline D.R.       & $2m$           & $n = \beta^m - k$ & $16$ & $64$   & $128$  \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+
+In theory Montgomery and Barrett reductions would require roughly the same amount of time to complete.  However, in practice since Montgomery
+reduction can be written as a single function with the Comba technique it is much faster.  Barrett reduction suffers from the overhead of
+calling the half precision multipliers, addition and division by $\beta$ algorithms.
+
+For almost every cryptographic algorithm Montgomery reduction is the algorithm of choice.  The one set of algorithms where Diminished Radix reduction truly
+shines is those based on the discrete logarithm problem, such as Diffie-Hellman \cite{DH} and ElGamal \cite{ELGAMAL}.  In these algorithms
+primes of the form $\beta^m - k$ can be found and shared amongst users.  These primes will allow the Diminished Radix algorithm to be used in
+modular exponentiation to greatly speed up the operation.
+
+
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 3 \right ]$ & Prove that the ``trick'' in algorithm mp\_montgomery\_setup actually \\
+                     & calculates the correct value of $\rho$. \\
+                     & \\
+$\left [ 2 \right ]$ & Devise an algorithm to reduce modulo $n + k$ for small $k$ quickly.  \\
+                     & \\
+$\left [ 4 \right ]$ & Prove that the pseudo-code algorithm ``Diminished Radix Reduction'' \\
+                     & (\textit{figure~\ref{fig:DR}}) terminates.  Also prove the probability that it will \\
+                     & terminate within $1 \le k \le 10$ iterations. \\
+                     & \\
+\end{tabular}                     
+
+
+\chapter{Exponentiation}
+Exponentiation is the operation of raising one variable to the power of another, for example, $a^b$.  A variant of exponentiation, computed
+in a finite field or ring, is called modular exponentiation.  This latter style of operation is typically used in public key 
+cryptosystems such as RSA and Diffie-Hellman.  The ability to quickly compute modular exponentiations is of great benefit to any
+such cryptosystem and many methods have been sought to speed it up.
+
+\section{Exponentiation Basics}
+A trivial algorithm would simply multiply $a$ against itself $b - 1$ times to compute the exponentiation desired.  However, as $b$ grows in size
+the number of multiplications becomes prohibitive.  Imagine what would happen if $b$ $\approx$ $2^{1024}$ as is the case when computing an RSA signature
+with a $1024$-bit key.  Such a calculation could never be completed as it would take simply far too long.
+
+Fortunately there is a very simple algorithm based on the laws of exponents.  Recall that $lg_a(a^b) = b$ and that $lg_a(a^ba^c) = b + c$ which
+are two trivial relationships between the base and the exponent.  Let $b_i$ represent the $i$'th bit of $b$ starting from the least 
+significant bit.  If $b$ is a $k$-bit integer then the following equation is true.
+
+\begin{equation}
+a^b = \prod_{i=0}^{k-1} a^{2^i \cdot b_i}
+\end{equation}
+
+By taking the base $a$ logarithm of both sides of the equation the following equation is the result.
+
+\begin{equation}
+b = \sum_{i=0}^{k-1}2^i \cdot b_i
+\end{equation}
+
+The term $a^{2^i}$ can be found from the $i - 1$'th term by squaring the term since $\left ( a^{2^i} \right )^2$ is equal to
+$a^{2^{i+1}}$.  This observation forms the basis of essentially all fast exponentiation algorithms.  It requires $k$ squarings and on average
+$k \over 2$ multiplications to compute the result.  This is indeed quite an improvement over simply multiplying by $a$ a total of $b-1$ times.
+
+While this current method is a considerable speed up there are further improvements to be made.  For example, the $a^{2^i}$ term does not need to 
+be computed in an auxiliary variable.  Consider the following equivalent algorithm.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Left to Right Exponentiation}. \\
+\textbf{Input}.   Integer $a$, $b$ and $k$ \\
+\textbf{Output}.  $c = a^b$ \\
+\hline \\
+1.  $c \leftarrow 1$ \\
+2.  for $i$ from $k - 1$ to $0$ do \\
+\hspace{3mm}2.1  $c \leftarrow c^2$ \\
+\hspace{3mm}2.2  $c \leftarrow c \cdot a^{b_i}$ \\
+3.  Return $c$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Left to Right Exponentiation}
+\label{fig:LTOR}
+\end{figure}
+
+This algorithm starts from the most significant bit and works towards the least significant bit.  When the $i$'th bit of $b$ is set $a$ is
+multiplied against the current product.  In each iteration the product is squared which doubles the exponent of the individual terms of the
+product.  
+
+For example, let $b = 101100_2 \equiv 44_{10}$.  The following chart demonstrates the actions of the algorithm.
+
+\newpage\begin{figure}
+\begin{center}
+\begin{tabular}{|c|c|}
+\hline \textbf{Value of $i$} & \textbf{Value of $c$} \\
+\hline - & $1$ \\
+\hline $5$ & $a$ \\
+\hline $4$ & $a^2$ \\
+\hline $3$ & $a^4 \cdot a$ \\
+\hline $2$ & $a^8 \cdot a^2 \cdot a$ \\
+\hline $1$ & $a^{16} \cdot a^4 \cdot a^2$ \\
+\hline $0$ & $a^{32} \cdot a^8 \cdot a^4$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Example of Left to Right Exponentiation}
+\end{figure}
+
+When the product $a^{32} \cdot a^8 \cdot a^4$ is simplified it is equal to $a^{44}$ which is the desired exponentiation.  This particular algorithm is 
+called ``Left to Right'' because it reads the exponent in that order.  All of the exponentiation algorithms that will be presented are of this nature.  
+
+\subsection{Single Digit Exponentiation}
+The first algorithm in the series of exponentiation algorithms will be an unbounded algorithm where the exponent is a single digit.  It is intended 
+to be used when a small power of an input is required (\textit{e.g. $a^5$}).  It is faster than simply multiplying $b - 1$ times for all values of 
+$b$ that are greater than three.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_expt\_d}. \\
+\textbf{Input}.   mp\_int $a$ and mp\_digit $b$ \\
+\textbf{Output}.  $c = a^b$ \\
+\hline \\
+1.  $g \leftarrow a$ (\textit{mp\_init\_copy}) \\
+2.  $c \leftarrow 1$ (\textit{mp\_set}) \\
+3.  for $x$ from 1 to $lg(\beta)$ do \\
+\hspace{3mm}3.1  $c \leftarrow c^2$ (\textit{mp\_sqr}) \\
+\hspace{3mm}3.2  If $b$ AND $2^{lg(\beta) - 1} \ne 0$ then \\
+\hspace{6mm}3.2.1  $c \leftarrow c \cdot g$ (\textit{mp\_mul}) \\
+\hspace{3mm}3.3  $b \leftarrow b << 1$ \\
+4.  Clear $g$. \\
+5.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_expt\_d}
+\end{figure}
+
+\textbf{Algorithm mp\_expt\_d.}
+This algorithm computes the value of $a$ raised to the power of a single digit $b$.  It uses the left to right exponentiation algorithm to
+quickly compute the exponentiation.  It is loosely based on algorithm 14.79 of HAC \cite[pp. 615]{HAC} with the difference that the 
+exponent is a fixed width.  
+
+A copy of $a$ is made first to allow the destination variable $c$ to be the same as the source variable $a$.  The result is set to the initial value of 
+$1$ in the subsequent step.
+
+Inside the loop the exponent is read from the most significant bit first down to the least significant bit.  First $c$ is invariably squared
+on step 3.1.  In the following step if the most significant bit of $b$ is one the copy of $a$ is multiplied against $c$.  The value
+of $b$ is shifted left one bit to make the next bit down from the most significant bit the new most significant bit.  In effect each
+iteration of the loop moves the bits of the exponent $b$ upwards to the most significant location.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_expt\_d.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* calculate c = a**b  using a square-multiply algorithm */
+018   int mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
+019   \{
+020     int     res, x;
+021     mp_int  g;
+022   
+023     if ((res = mp_init_copy (&g, a)) != MP_OKAY) \{
+024       return res;
+025     \}
+026   
+027     /* set initial result */
+028     mp_set (c, 1);
+029   
+030     for (x = 0; x < (int) DIGIT_BIT; x++) \{
+031       /* square */
+032       if ((res = mp_sqr (c, c)) != MP_OKAY) \{
+033         mp_clear (&g);
+034         return res;
+035       \}
+036   
+037       /* if the bit is set multiply */
+038       if ((b & (mp_digit) (((mp_digit)1) << (DIGIT_BIT - 1))) != 0) \{
+039         if ((res = mp_mul (c, &g, c)) != MP_OKAY) \{
+040            mp_clear (&g);
+041            return res;
+042         \}
+043       \}
+044   
+045       /* shift to next bit */
+046       b <<= 1;
+047     \}
+048   
+049     mp_clear (&g);
+050     return MP_OKAY;
+051   \}
+052   #endif
+\end{alltt}
+\end{small}
+
+Line 28 sets the initial value of the result to $1$.  Next the loop on line 30 steps through each bit of the exponent starting from
+the most significant down towards the least significant. The invariant squaring operation placed on line 32 is performed first.  After 
+the squaring the result $c$ is multiplied by the base $g$ if and only if the most significant bit of the exponent is set.  The shift on line
+46 moves all of the bits of the exponent upwards towards the most significant location.  
+
+\section{$k$-ary Exponentiation}
+When calculating an exponentiation the most time consuming bottleneck is the multiplications which are in general a small factor
+slower than squaring.  Recall from the previous algorithm that $b_{i}$ refers to the $i$'th bit of the exponent $b$.  Suppose instead it referred to
+the $i$'th $k$-bit digit of the exponent of $b$.  For $k = 1$ the definitions are synonymous and for $k > 1$ algorithm~\ref{fig:KARY}
+computes the same exponentiation.  A group of $k$ bits from the exponent is called a \textit{window}.  That is it is a small window on only a
+portion of the entire exponent.  Consider the following modification to the basic left to right exponentiation algorithm.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{$k$-ary Exponentiation}. \\
+\textbf{Input}.   Integer $a$, $b$, $k$ and $t$ \\
+\textbf{Output}.  $c = a^b$ \\
+\hline \\
+1.  $c \leftarrow 1$ \\
+2.  for $i$ from $t - 1$ to $0$ do \\
+\hspace{3mm}2.1  $c \leftarrow c^{2^k} $ \\
+\hspace{3mm}2.2  Extract the $i$'th $k$-bit word from $b$ and store it in $g$. \\
+\hspace{3mm}2.3  $c \leftarrow c \cdot a^g$ \\
+3.  Return $c$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{$k$-ary Exponentiation}
+\label{fig:KARY}
+\end{figure}
+
+The squaring on step 2.1 can be calculated by squaring the value $c$ successively $k$ times.  If the values of $a^g$ for $0 < g < 2^k$ have been
+precomputed this algorithm requires only $t$ multiplications and $tk$ squarings.  The table can be generated with $2^{k - 1} - 1$ squarings and
+$2^{k - 1} + 1$ multiplications.  This algorithm assumes that the number of bits in the exponent is evenly divisible by $k$.  
+However, when it is not the remaining $0 < x \le k - 1$ bits can be handled with algorithm~\ref{fig:LTOR}.
+
+Suppose $k = 4$ and $t = 100$.  This modified algorithm will require $109$ multiplications and $407$ squarings to compute the exponentiation.  The
+original algorithm would on average have required $200$ multiplications and $400$ squarings to compute the same value.  The total number of squarings
+has increased slightly but the number of multiplications has nearly halved.
+
+\subsection{Optimal Values of $k$}
+An optimal value of $k$ will minimize $2^{k} + \lceil n / k \rceil + n - 1$ for a fixed number of bits in the exponent $n$.  The simplest
+approach is to brute force search amongst the values $k = 2, 3, \ldots, 8$ for the lowest result.  Table~\ref{fig:OPTK} lists optimal values of $k$
+for various exponent sizes and compares the number of multiplication and squarings required against algorithm~\ref{fig:LTOR}.  
+
+\begin{figure}[here]
+\begin{center}
+\begin{small}
+\begin{tabular}{|c|c|c|c|c|c|}
+\hline \textbf{Exponent (bits)} & \textbf{Optimal $k$} & \textbf{Work at $k$} & \textbf{Work with ~\ref{fig:LTOR}} \\
+\hline $16$ & $2$ & $27$ & $24$ \\
+\hline $32$ & $3$ & $49$ & $48$ \\
+\hline $64$ & $3$ & $92$ & $96$ \\
+\hline $128$ & $4$ & $175$ & $192$ \\
+\hline $256$ & $4$ & $335$ & $384$ \\
+\hline $512$ & $5$ & $645$ & $768$ \\
+\hline $1024$ & $6$ & $1257$ & $1536$ \\
+\hline $2048$ & $6$ & $2452$ & $3072$ \\
+\hline $4096$ & $7$ & $4808$ & $6144$ \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Optimal Values of $k$ for $k$-ary Exponentiation}
+\label{fig:OPTK}
+\end{figure}
+
+\subsection{Sliding-Window Exponentiation}
+A simple modification to the previous algorithm is to only generate the upper half of the table in the range $2^{k-1} \le g < 2^k$.  Essentially
+this is a table for all values of $g$ where the most significant bit of $g$ is a one.  However, in order for this to be allowed in the 
+algorithm values of $g$ in the range $0 \le g < 2^{k-1}$ must be avoided.  
+
+Table~\ref{fig:OPTK2} lists optimal values of $k$ for various exponent sizes and compares the work required against algorithm~\ref{fig:KARY}.  
+
+\begin{figure}[here]
+\begin{center}
+\begin{small}
+\begin{tabular}{|c|c|c|c|c|c|}
+\hline \textbf{Exponent (bits)} & \textbf{Optimal $k$} & \textbf{Work at $k$} & \textbf{Work with ~\ref{fig:KARY}} \\
+\hline $16$ & $3$ & $24$ & $27$ \\
+\hline $32$ & $3$ & $45$ & $49$ \\
+\hline $64$ & $4$ & $87$ & $92$ \\
+\hline $128$ & $4$ & $167$ & $175$ \\
+\hline $256$ & $5$ & $322$ & $335$ \\
+\hline $512$ & $6$ & $628$ & $645$ \\
+\hline $1024$ & $6$ & $1225$ & $1257$ \\
+\hline $2048$ & $7$ & $2403$ & $2452$ \\
+\hline $4096$ & $8$ & $4735$ & $4808$ \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Optimal Values of $k$ for Sliding Window Exponentiation}
+\label{fig:OPTK2}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Sliding Window $k$-ary Exponentiation}. \\
+\textbf{Input}.   Integer $a$, $b$, $k$ and $t$ \\
+\textbf{Output}.  $c = a^b$ \\
+\hline \\
+1.  $c \leftarrow 1$ \\
+2.  for $i$ from $t - 1$ to $0$ do \\
+\hspace{3mm}2.1  If the $i$'th bit of $b$ is a zero then \\
+\hspace{6mm}2.1.1   $c \leftarrow c^2$ \\
+\hspace{3mm}2.2  else do \\
+\hspace{6mm}2.2.1  $c \leftarrow c^{2^k}$ \\
+\hspace{6mm}2.2.2  Extract the $k$ bits from $(b_{i}b_{i-1}\ldots b_{i-(k-1)})$ and store it in $g$. \\
+\hspace{6mm}2.2.3  $c \leftarrow c \cdot a^g$ \\
+\hspace{6mm}2.2.4  $i \leftarrow i - k$ \\
+3.  Return $c$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Sliding Window $k$-ary Exponentiation}
+\end{figure}
+
+Similar to the previous algorithm this algorithm must have a special handler when fewer than $k$ bits are left in the exponent.  While this
+algorithm requires the same number of squarings it can potentially have fewer multiplications.  The pre-computed table $a^g$ is also half
+the size as the previous table.  
+
+Consider the exponent $b = 111101011001000_2 \equiv 31432_{10}$ with $k = 3$ using both algorithms.  The first algorithm will divide the exponent up as 
+the following five $3$-bit words $b \equiv \left ( 111, 101, 011, 001, 000 \right )_{2}$.  The second algorithm will break the 
+exponent as $b \equiv \left ( 111, 101, 0, 110, 0, 100, 0 \right )_{2}$.  The single $0$ digits in the second representation are where
+a single squaring took place instead of a squaring and multiplication.  In total the first method requires $10$ multiplications and $18$ 
+squarings.  The second method requires $8$ multiplications and $18$ squarings.  
+
+In general the sliding window method is never slower than the generic $k$-ary method and often it is slightly faster.  
+
+\section{Modular Exponentiation}
+
+Modular exponentiation is essentially computing the power of a base within a finite field or ring.  For example, computing 
+$d \equiv a^b \mbox{ (mod }c\mbox{)}$ is a modular exponentiation.  Instead of first computing $a^b$ and then reducing it 
+modulo $c$ the intermediate result is reduced modulo $c$ after every squaring or multiplication operation.  
+
+This guarantees that any intermediate result is bounded by $0 \le d \le c^2 - 2c + 1$ and can be reduced modulo $c$ quickly using
+one of the algorithms presented in chapter six.  
+
+Before the actual modular exponentiation algorithm can be written a wrapper algorithm must be written first.  This algorithm
+will allow the exponent $b$ to be negative which is computed as $d \equiv \left (1 / a \right )^{\vert b \vert} \mbox{ (mod }c\mbox{)}$. The
+value of $(1/a) \mbox{ mod }c$ is computed using the modular inverse (\textit{see \ref{sec;modinv}}).  If no inverse exists the algorithm
+terminates with an error.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_exptmod}. \\
+\textbf{Input}.   mp\_int $g$, $x$ and $p$ \\
+\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
+\hline \\
+1.  If $p.sign = MP\_NEG$ return(\textit{MP\_VAL}). \\
+2.  If $x.sign = MP\_NEG$ then \\
+\hspace{3mm}2.1  $g' \leftarrow g^{-1} \mbox{ (mod }p\mbox{)}$ \\
+\hspace{3mm}2.2  $x' \leftarrow \vert x \vert$ \\
+\hspace{3mm}2.3  Compute $y \equiv g'^{x'} \mbox{ (mod }p\mbox{)}$ via recursion. \\
+3.  if $p$ is odd \textbf{OR} $p$ is a D.R. modulus then \\
+\hspace{3mm}3.1  Compute $y \equiv g^{x} \mbox{ (mod }p\mbox{)}$ via algorithm mp\_exptmod\_fast. \\
+4.  else \\
+\hspace{3mm}4.1  Compute $y \equiv g^{x} \mbox{ (mod }p\mbox{)}$ via algorithm s\_mp\_exptmod. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_exptmod}
+\end{figure}
+
+\textbf{Algorithm mp\_exptmod.}
+The first algorithm which actually performs modular exponentiation is algorithm s\_mp\_exptmod.  It is a sliding window $k$-ary algorithm 
+which uses Barrett reduction to reduce the product modulo $p$.  The second algorithm mp\_exptmod\_fast performs the same operation 
+except it uses either Montgomery or Diminished Radix reduction.  The two latter reduction algorithms are clumped in the same exponentiation
+algorithm since their arguments are essentially the same (\textit{two mp\_ints and one mp\_digit}).  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_exptmod.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   
+018   /* computes Y = G**X (mod P).  This is a shell function that calls either
+019    * the normal or Montgomery exptmod functions.  Originally the call to the
+020    * montgomery code was embedded in the normal function but that wasted a
+021    * lot of stack space (99% of the time the Montgomery code is called)
+022    */
+023   int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+024   \{
+025     int dr;
+026   
+027     /* modulus P must be positive */
+028     if (P->sign == MP_NEG) \{
+029        return MP_VAL;
+030     \}
+031   
+032     /* if exponent X is negative we have to recurse */
+033     if (X->sign == MP_NEG) \{
+034   #ifdef BN_MP_INVMOD_C
+035        mp_int tmpG, tmpX;
+036        int err;
+037   
+038        /* first compute 1/G mod P */
+039        if ((err = mp_init(&tmpG)) != MP_OKAY) \{
+040           return err;
+041        \}
+042        if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) \{
+043           mp_clear(&tmpG);
+044           return err;
+045        \}
+046   
+047        /* now get |X| */
+048        if ((err = mp_init(&tmpX)) != MP_OKAY) \{
+049           mp_clear(&tmpG);
+050           return err;
+051        \}
+052        if ((err = mp_abs(X, &tmpX)) != MP_OKAY) \{
+053           mp_clear_multi(&tmpG, &tmpX, NULL);
+054           return err;
+055        \}
+056   
+057        /* and now compute (1/G)**|X| instead of G**X [X < 0] */
+058        err = mp_exptmod(&tmpG, &tmpX, P, Y);
+059        mp_clear_multi(&tmpG, &tmpX, NULL);
+060        return err;
+061   #else 
+062        /* no invmod, so a negative exponent cannot be handled */
+063        return MP_VAL;
+064   #endif
+065     \}
+066   
+067   #ifdef BN_MP_DR_IS_MODULUS_C
+068     /* is it a DR modulus? */
+069     dr = mp_dr_is_modulus(P);
+070   #else
+071     dr = 0;
+072   #endif
+073   
+074   #ifdef BN_MP_REDUCE_IS_2K_C
+075     /* if not, is it a uDR modulus? (dr becomes 2 in that case) */
+076     if (dr == 0) \{
+077        dr = mp_reduce_is_2k(P) << 1;
+078     \}
+079   #endif
+080       
+081     /* if the modulus is odd or dr != 0 use the fast method */
+082   #ifdef BN_MP_EXPTMOD_FAST_C
+083     if (mp_isodd (P) == 1 || dr !=  0) \{
+084       return mp_exptmod_fast (G, X, P, Y, dr);
+085     \} else \{
+086   #endif
+087   #ifdef BN_S_MP_EXPTMOD_C
+088       /* otherwise use the generic Barrett reduction technique */
+089       return s_mp_exptmod (G, X, P, Y);
+090   #else
+091       /* no exptmod for evens */
+092       return MP_VAL;
+093   #endif
+094   #ifdef BN_MP_EXPTMOD_FAST_C
+095     \}
+096   #endif
+097   \}
+098   
+099   #endif
+\end{alltt}
+\end{small}
+
+In order to keep the algorithms in a known state the first step on line 28 is to reject any negative modulus as input.  If the exponent is
+negative the algorithm tries to perform a modular exponentiation with the modular inverse of the base $G$.  The temporary variable $tmpG$ is assigned
+the modular inverse of $G$ and $tmpX$ is assigned the absolute value of $X$.  The algorithm will recurse with these new values with a positive
+exponent.
+
+If the exponent is positive the algorithm resumes the exponentiation.  Line 69 determines if the modulus is of the restricted Diminished Radix 
+form.  If it is not line 77 attempts to determine if it is of an unrestricted Diminished Radix form.  The integer $dr$ will take on one
+of three values.
+
+\begin{enumerate}
+\item $dr = 0$ means that the modulus is not of either restricted or unrestricted Diminished Radix form.
+\item $dr = 1$ means that the modulus is of restricted Diminished Radix form.
+\item $dr = 2$ means that the modulus is of unrestricted Diminished Radix form.
+\end{enumerate}
+
+Line 83 determines if the fast modular exponentiation algorithm can be used.  It is allowed if $dr \ne 0$ or if the modulus is odd.  Otherwise,
+the slower s\_mp\_exptmod algorithm is used which uses Barrett reduction.  
+
+\subsection{Barrett Modular Exponentiation}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_exptmod}. \\
+\textbf{Input}.   mp\_int $g$, $x$ and $p$ \\
+\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
+\hline \\
+1.  $k \leftarrow lg(x)$ \\
+2.  $winsize \leftarrow  \left \lbrace \begin{array}{ll}
+                              2 &  \mbox{if }k \le 7 \\
+                              3 &  \mbox{if }7 < k \le 36 \\
+                              4 &  \mbox{if }36 < k \le 140 \\
+                              5 &  \mbox{if }140 < k \le 450 \\
+                              6 &  \mbox{if }450 < k \le 1303 \\
+                              7 &  \mbox{if }1303 < k \le 3529 \\
+                              8 &  \mbox{if }3529 < k \\
+                              \end{array} \right .$ \\
+3.  Initialize $2^{winsize}$ mp\_ints in an array named $M$ and one mp\_int named $\mu$ \\
+4.  Calculate the $\mu$ required for Barrett Reduction (\textit{mp\_reduce\_setup}). \\
+5.  $M_1 \leftarrow g \mbox{ (mod }p\mbox{)}$ \\
+\\
+Setup the table of small powers of $g$.  First find $g^{2^{winsize - 1}}$ and then the successive powers up to $g^{2^{winsize} - 1}$. \\
+6.  $k \leftarrow 2^{winsize - 1}$ \\
+7.  $M_{k} \leftarrow M_1$ \\
+8.  for $ix$ from 0 to $winsize - 2$ do \\
+\hspace{3mm}8.1  $M_k \leftarrow \left ( M_k \right )^2$ (\textit{mp\_sqr})  \\
+\hspace{3mm}8.2  $M_k \leftarrow M_k \mbox{ (mod }p\mbox{)}$ (\textit{mp\_reduce}) \\
+9.  for $ix$ from $2^{winsize - 1} + 1$ to $2^{winsize} - 1$ do \\
+\hspace{3mm}9.1  $M_{ix} \leftarrow M_{ix - 1} \cdot M_{1}$ (\textit{mp\_mul}) \\
+\hspace{3mm}9.2  $M_{ix} \leftarrow M_{ix} \mbox{ (mod }p\mbox{)}$ (\textit{mp\_reduce}) \\
+10.  $res \leftarrow 1$ \\
+\\
+Start Sliding Window. \\
+11.  $mode \leftarrow 0, bitcnt \leftarrow 1, buf \leftarrow 0, digidx \leftarrow x.used - 1, bitcpy \leftarrow 0, bitbuf \leftarrow 0$ \\
+12.  Loop \\
+\hspace{3mm}12.1  $bitcnt \leftarrow bitcnt - 1$ \\
+\hspace{3mm}12.2  If $bitcnt = 0$ then do \\
+\hspace{6mm}12.2.1  If $digidx = -1$ goto step 13. \\
+\hspace{6mm}12.2.2  $buf \leftarrow x_{digidx}$ \\
+\hspace{6mm}12.2.3  $digidx \leftarrow digidx - 1$ \\
+\hspace{6mm}12.2.4  $bitcnt \leftarrow lg(\beta)$ \\
+Continued on next page. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm s\_mp\_exptmod}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_exptmod} (\textit{continued}). \\
+\textbf{Input}.   mp\_int $g$, $x$ and $p$ \\
+\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
+\hline \\
+\hspace{3mm}12.3  $y \leftarrow (buf >> (lg(\beta) - 1))$ AND $1$ \\
+\hspace{3mm}12.4  $buf \leftarrow buf << 1$ \\
+\hspace{3mm}12.5  if $mode = 0$ and $y = 0$ then goto step 12. \\
+\hspace{3mm}12.6  if $mode = 1$ and $y = 0$ then do \\
+\hspace{6mm}12.6.1  $res \leftarrow res^2$ \\
+\hspace{6mm}12.6.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+\hspace{6mm}12.6.3  Goto step 12. \\
+\hspace{3mm}12.7  $bitcpy \leftarrow bitcpy + 1$ \\
+\hspace{3mm}12.8  $bitbuf \leftarrow bitbuf + (y << (winsize - bitcpy))$ \\
+\hspace{3mm}12.9  $mode \leftarrow 2$ \\
+\hspace{3mm}12.10  If $bitcpy = winsize$ then do \\
+\hspace{6mm}Window is full so perform the squarings and single multiplication. \\
+\hspace{6mm}12.10.1  for $ix$ from $0$ to $winsize -1$ do \\
+\hspace{9mm}12.10.1.1  $res \leftarrow res^2$ \\
+\hspace{9mm}12.10.1.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+\hspace{6mm}12.10.2  $res \leftarrow res \cdot M_{bitbuf}$ \\
+\hspace{6mm}12.10.3  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+\hspace{6mm}Reset the window. \\
+\hspace{6mm}12.10.4  $bitcpy \leftarrow 0, bitbuf \leftarrow 0, mode \leftarrow 1$ \\
+\\
+No more windows left.  Check for residual bits of exponent. \\
+13.  If $mode = 2$ and $bitcpy > 0$ then do \\
+\hspace{3mm}13.1  for $ix$ from $0$ to $bitcpy - 1$ do \\
+\hspace{6mm}13.1.1  $res \leftarrow res^2$ \\
+\hspace{6mm}13.1.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+\hspace{6mm}13.1.3  $bitbuf \leftarrow bitbuf << 1$ \\
+\hspace{6mm}13.1.4  If $bitbuf$ AND $2^{winsize} \ne 0$ then do \\
+\hspace{9mm}13.1.4.1  $res \leftarrow res \cdot M_{1}$ \\
+\hspace{9mm}13.1.4.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+14.  $y \leftarrow res$ \\
+15.  Clear $res$, $mu$ and the $M$ array. \\
+16.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm s\_mp\_exptmod (continued)}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_exptmod.}
+This algorithm computes the $x$'th power of $g$ modulo $p$ and stores the result in $y$.  It takes advantage of the Barrett reduction
+algorithm to keep the product small throughout the algorithm.
+
+The first two steps determine the optimal window size based on the number of bits in the exponent.  The larger the exponent the 
+larger the window size becomes.  After a window size $winsize$ has been chosen an array of $2^{winsize}$ mp\_int variables is allocated.  This
+table will hold the values of $g^x \mbox{ (mod }p\mbox{)}$ for $2^{winsize - 1} \le x < 2^{winsize}$.  
+
+After the table is allocated the first power of $g$ is found.  Since $g \ge p$ is allowed it must be first reduced modulo $p$ to make
+the rest of the algorithm more efficient.  The first element of the table at $2^{winsize - 1}$ is found by squaring $M_1$ successively $winsize - 1$
+times.  The rest of the table elements are found by multiplying the previous element by $M_1$ modulo $p$.
+
+Now that the table is available the sliding window may begin.  The following list describes the functions of all the variables in the window.
+\begin{enumerate}
+\item The variable $mode$ dictates how the bits of the exponent are interpreted.  
+\begin{enumerate}
+   \item When $mode = 0$ the bits are ignored since no non-zero bit of the exponent has been seen yet.  For example, if the exponent were simply 
+         $1$ then there would be $lg(\beta) - 1$ zero bits before the first non-zero bit.  In this case bits are ignored until a non-zero bit is found.  
+   \item When $mode = 1$ a non-zero bit has been seen before and a new $winsize$-bit window has not been formed yet.  In this mode leading $0$ bits 
+         are read and a single squaring is performed.  If a non-zero bit is read a new window is created.  
+   \item When $mode = 2$ the algorithm is in the middle of forming a window and new bits are appended to the window from the most significant bit
+         downwards.
+\end{enumerate}
+\item The variable $bitcnt$ indicates how many bits of the current digit of the exponent are left to be read.  When it reaches zero a new digit
+      is fetched from the exponent.
+\item The variable $buf$ holds the currently read digit of the exponent. 
+\item The variable $digidx$ is an index into the exponents digits.  It starts at the leading digit $x.used - 1$ and moves towards the trailing digit.
+\item The variable $bitcpy$ indicates how many bits are in the currently formed window.  When it reaches $winsize$ the window is flushed and
+      the appropriate operations performed.
+\item The variable $bitbuf$ holds the current bits of the window being formed.  
+\end{enumerate}
+
+All of step 12 is the window processing loop.  It will iterate while there are digits available from the exponent to read.  The first step
+inside this loop is to extract a new digit if no more bits are available in the current digit.  If there are no bits left a new digit is
+read and if there are no digits left then the loop terminates.  
+
+After a digit is made available step 12.3 will extract the most significant bit of the current digit and move all other bits in the digit
+upwards.  In effect the digit is read from most significant bit to least significant bit and since the digits are read from leading to 
+trailing edges the entire exponent is read from most significant bit to least significant bit.
+
+At step 12.5 if the $mode$ and currently extracted bit $y$ are both zero the bit is ignored and the next bit is read.  This prevents the 
+algorithm from having to perform trivial squaring and reduction operations before the first non-zero bit is read.  Step 12.6 and 12.7-10 handle
+the two cases of $mode = 1$ and $mode = 2$ respectively.  
+
+\begin{center}
+\begin{figure}[here]
+\includegraphics{pics/expt_state.ps}
+\caption{Sliding Window State Diagram}
+\label{pic:expt_state}
+\end{figure}
+\end{center}
+
+By step 13 there are no more digits left in the exponent.  However, there may be partial bits in the window left.  If $mode = 2$ then 
+a Left-to-Right algorithm is used to process the remaining few bits.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_exptmod.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   #ifdef MP_LOW_MEM
+018      #define TAB_SIZE 32
+019   #else
+020      #define TAB_SIZE 256
+021   #endif
+022   /* computes Y = G**X mod P with a sliding window and Barrett reduction */
+023   int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+024   \{
+025     mp_int  M[TAB_SIZE], res, mu;
+026     mp_digit buf;
+027     int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+028   
+029     /* find window size */
+030     x = mp_count_bits (X);
+031     if (x <= 7) \{
+032       winsize = 2;
+033     \} else if (x <= 36) \{
+034       winsize = 3;
+035     \} else if (x <= 140) \{
+036       winsize = 4;
+037     \} else if (x <= 450) \{
+038       winsize = 5;
+039     \} else if (x <= 1303) \{
+040       winsize = 6;
+041     \} else if (x <= 3529) \{
+042       winsize = 7;
+043     \} else \{
+044       winsize = 8;
+045     \}
+046   
+047   #ifdef MP_LOW_MEM
+048       if (winsize > 5) \{
+049          winsize = 5;
+050       \}
+051   #endif
+052   
+053     /* init M array */
+054     /* init first cell */
+055     if ((err = mp_init(&M[1])) != MP_OKAY) \{
+056        return err; 
+057     \}
+058   
+059     /* now init the second half of the array */
+060     for (x = 1<<(winsize-1); x < (1 << winsize); x++) \{
+061       if ((err = mp_init(&M[x])) != MP_OKAY) \{
+062         for (y = 1<<(winsize-1); y < x; y++) \{
+063           mp_clear (&M[y]);
+064         \}
+065         mp_clear(&M[1]);
+066         return err;
+067       \}
+068     \}
+069   
+070     /* create mu, used for Barrett reduction */
+071     if ((err = mp_init (&mu)) != MP_OKAY) \{
+072       goto __M;
+073     \}
+074     if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) \{
+075       goto __MU;
+076     \}
+077   
+078     /* create M table
+079      *
+080      * The M table contains powers of the base, 
+081      * e.g. M[x] = G**x mod P
+082      *
+083      * The first half of the table is not 
+084      * computed though, except for M[1] (M[0] is never initialized)
+085      */
+086     if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) \{
+087       goto __MU;
+088     \}
+089   
+090     /* compute the value at M[1<<(winsize-1)] by squaring 
+091      * M[1] (winsize-1) times 
+092      */
+093     if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) \{
+094       goto __MU;
+095     \}
+096   
+097     for (x = 0; x < (winsize - 1); x++) \{
+098       if ((err = mp_sqr (&M[1 << (winsize - 1)], 
+099                          &M[1 << (winsize - 1)])) != MP_OKAY) \{
+100         goto __MU;
+101       \}
+102       if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) \{
+103         goto __MU;
+104       \}
+105     \}
+106   
+107     /* create upper table, that is M[x] = M[x-1] * M[1] (mod P)
+108      * for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
+109      */
+110     for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) \{
+111       if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) \{
+112         goto __MU;
+113       \}
+114       if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) \{
+115         goto __MU;
+116       \}
+117     \}
+118   
+119     /* setup result */
+120     if ((err = mp_init (&res)) != MP_OKAY) \{
+121       goto __MU;
+122     \}
+123     mp_set (&res, 1);
+124   
+125     /* set initial mode and bit cnt */
+126     mode   = 0;
+127     bitcnt = 1;
+128     buf    = 0;
+129     digidx = X->used - 1;
+130     bitcpy = 0;
+131     bitbuf = 0;
+132   
+133     for (;;) \{
+134       /* grab next digit as required */
+135       if (--bitcnt == 0) \{
+136         /* if digidx == -1 we are out of digits */
+137         if (digidx == -1) \{
+138           break;
+139         \}
+140         /* read next digit and reset the bitcnt */
+141         buf    = X->dp[digidx--];
+142         bitcnt = (int) DIGIT_BIT;
+143       \}
+144   
+145       /* grab the next msb from the exponent */
+146       y     = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
+147       buf <<= (mp_digit)1;
+148   
+149       /* if the bit is zero and mode == 0 then we ignore it
+150        * These represent the leading zero bits before the first 1 bit
+151        * in the exponent.  Technically this opt is not required but it
+152        * does lower the # of trivial squaring/reductions used
+153        */
+154       if (mode == 0 && y == 0) \{
+155         continue;
+156       \}
+157   
+158       /* if the bit is zero and mode == 1 then we square */
+159       if (mode == 1 && y == 0) \{
+160         if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
+161           goto __RES;
+162         \}
+163         if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{
+164           goto __RES;
+165         \}
+166         continue;
+167       \}
+168   
+169       /* else we add it to the window */
+170       bitbuf |= (y << (winsize - ++bitcpy));
+171       mode    = 2;
+172   
+173       if (bitcpy == winsize) \{
+174         /* ok window is filled so square as required and multiply  */
+175         /* square first */
+176         for (x = 0; x < winsize; x++) \{
+177           if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
+178             goto __RES;
+179           \}
+180           if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{
+181             goto __RES;
+182           \}
+183         \}
+184   
+185         /* then multiply */
+186         if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) \{
+187           goto __RES;
+188         \}
+189         if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{
+190           goto __RES;
+191         \}
+192   
+193         /* empty window and reset */
+194         bitcpy = 0;
+195         bitbuf = 0;
+196         mode   = 1;
+197       \}
+198     \}
+199   
+200     /* if bits remain then square/multiply */
+201     if (mode == 2 && bitcpy > 0) \{
+202       /* square then multiply if the bit is set */
+203       for (x = 0; x < bitcpy; x++) \{
+204         if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
+205           goto __RES;
+206         \}
+207         if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{
+208           goto __RES;
+209         \}
+210   
+211         bitbuf <<= 1;
+212         if ((bitbuf & (1 << winsize)) != 0) \{
+213           /* then multiply */
+214           if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) \{
+215             goto __RES;
+216           \}
+217           if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{
+218             goto __RES;
+219           \}
+220         \}
+221       \}
+222     \}
+223   
+224     mp_exch (&res, Y);
+225     err = MP_OKAY;
+226   __RES:mp_clear (&res);
+227   __MU:mp_clear (&mu);
+228   __M:
+229     mp_clear(&M[1]);
+230     for (x = 1<<(winsize-1); x < (1 << winsize); x++) \{
+231       mp_clear (&M[x]);
+232     \}
+233     return err;
+234   \}
+235   #endif
+\end{alltt}
+\end{small}
+
+Lines 31 through 41 determine the optimal window size based on the length of the exponent in bits.  The window divisions are sorted
+from smallest to greatest so that in each \textbf{if} statement only one condition must be tested.  For example, by the \textbf{if} statement 
+on line 37 the value of $x$ is already known to be greater than $140$.  
+
+The conditional piece of code beginning on line 47 allows the window size to be restricted to five bits.  This logic is used to ensure
+the table of precomputed powers of $G$ remains relatively small.  
+
+The for loop on line 60 initializes the $M$ array while lines 71 and 74 compute the value of $\mu$ required for
+Barrett reduction.  
+
+-- More later.
+
+\section{Quick Power of Two}
+Calculating $b = 2^a$ can be performed much quicker than with any of the previous algorithms.  Recall that a logical shift left $m << k$ is
+equivalent to $m \cdot 2^k$.  By this logic when $m = 1$ a quick power of two can be achieved.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_2expt}. \\
+\textbf{Input}.   integer $b$ \\
+\textbf{Output}.  $a \leftarrow 2^b$ \\
+\hline \\
+1.  $a \leftarrow 0$ \\
+2.  If $a.alloc < \lfloor b / lg(\beta) \rfloor + 1$ then grow $a$ appropriately. \\
+3.  $a.used \leftarrow \lfloor b / lg(\beta) \rfloor + 1$ \\
+4.  $a_{\lfloor b / lg(\beta) \rfloor} \leftarrow 1 << (b \mbox{ mod } lg(\beta))$ \\
+5.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_2expt}
+\end{figure}
+
+\textbf{Algorithm mp\_2expt.}
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_2expt.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* computes a = 2**b 
+018    *
+019    * Simple algorithm which zeroes the int, grows it then just sets one bit
+020    * as required.
+021    */
+022   int
+023   mp_2expt (mp_int * a, int b)
+024   \{
+025     int     res;
+026   
+027     /* zero a as per default */
+028     mp_zero (a);
+029   
+030     /* grow a to accommodate the single bit */
+031     if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) \{
+032       return res;
+033     \}
+034   
+035     /* set the used count of where the bit will go */
+036     a->used = b / DIGIT_BIT + 1;
+037   
+038     /* put the single bit in its place */
+039     a->dp[b / DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);
+040   
+041     return MP_OKAY;
+042   \}
+043   #endif
+\end{alltt}
+\end{small}
+
+\chapter{Higher Level Algorithms}
+
+This chapter discusses the various higher level algorithms that are required to complete a well rounded multiple precision integer package.  These
+routines are less performance oriented than the algorithms of chapters five, six and seven but are no less important.  
+
+The first section describes a method of integer division with remainder that is universally well known.  It provides the signed division logic
+for the package.  The subsequent section discusses a set of algorithms which allow a single digit to be the 2nd operand for a variety of operations.  
+These algorithms serve mostly to simplify other algorithms where small constants are required.  The last two sections discuss how to manipulate 
+various representations of integers.  For example, converting from an mp\_int to a string of characters.
+
+\section{Integer Division with Remainder}
+\label{sec:division}
+
+Integer division aside from modular exponentiation is the most intensive algorithm to compute.  Like addition, subtraction and multiplication
+the basis of this algorithm is the long-hand division algorithm taught to school children.  Throughout this discussion several common variables
+will be used.  Let $x$ represent the divisor and $y$ represent the dividend.  Let $q$ represent the integer quotient $\lfloor y / x \rfloor$ and 
+let $r$ represent the remainder $r = y - x \lfloor y / x \rfloor$.  The following simple algorithm will be used to start the discussion.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Radix-$\beta$ Integer Division}. \\
+\textbf{Input}.   integer $x$ and $y$ \\
+\textbf{Output}.  $q = \lfloor y/x\rfloor, r = y - xq$ \\
+\hline \\
+1.  $q \leftarrow 0$ \\
+2.  $n \leftarrow \vert \vert y \vert \vert - \vert \vert x \vert \vert$ \\
+3.  for $t$ from $n$ down to $0$ do \\
+\hspace{3mm}3.1  Maximize $k$ such that $kx\beta^t$ is less than or equal to $y$ and $(k + 1)x\beta^t$ is greater. \\
+\hspace{3mm}3.2  $q \leftarrow q + k\beta^t$ \\
+\hspace{3mm}3.3  $y \leftarrow y - kx\beta^t$ \\
+4.  $r \leftarrow y$ \\
+5.  Return($q, r$) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Radix-$\beta$ Integer Division}
+\label{fig:raddiv}
+\end{figure}
+
+As children we are taught this very simple algorithm for the case of $\beta = 10$.  Almost instinctively several optimizations are taught for which
+the reasons for their existence are never explained.  For this example let $y = 5471$ represent the dividend and $x = 23$ represent the divisor.
+
+To find the first digit of the quotient the value of $k$ must be maximized such that $kx\beta^t$ is less than or equal to $y$ and 
+simultaneously $(k + 1)x\beta^t$ is greater than $y$.  Implicitly $k$ is the maximum value the $t$'th digit of the quotient may have.  The habitual method
+used to find the maximum is to ``eyeball'' the two numbers, typically only the leading digits and quickly estimate a quotient.  By only using leading
+digits a much simpler division may be used to form an educated guess at what the value must be.  In this case $k = \lfloor 54/23\rfloor = 2$ quickly 
+arises as a possible  solution.  Indeed $2x\beta^2 = 4600$ is less than $y = 5471$ and simultaneously $(k + 1)x\beta^2 = 6900$ is larger than $y$.  
+As a  result $k\beta^2$ is added to the quotient which now equals $q = 200$ and $4600$ is subtracted from $y$ to give a remainder of $y = 841$.
+
+Again this process is repeated to produce the quotient digit $k = 3$ which makes the quotient $q = 200 + 3\beta = 230$ and the remainder 
+$y = 841 - 3x\beta = 181$.  Finally the last iteration of the loop produces $k = 7$ which leads to the quotient $q = 230 + 7 = 237$ and the
+remainder $y = 181 - 7x = 20$.  The final quotient and remainder found are $q = 237$ and $r = y = 20$ which are indeed correct since 
+$237 \cdot 23 + 20 = 5471$ is true.  
+
+\subsection{Quotient Estimation}
+\label{sec:divest}
+As alluded to earlier the quotient digit $k$ can be estimated from only the leading digits of both the divisor and dividend.  When $p$ leading
+digits are used from both the divisor and dividend to form an estimation the accuracy of the estimation rises as $p$ grows.  Technically
+speaking the estimation is based on assuming the lower $\vert \vert y \vert \vert - p$ and $\vert \vert x \vert \vert - p$ lower digits of the
+dividend and divisor are zero.  
+
+The value of the estimation may be off by a few values in either direction and in general is fairly correct.  A simplification \cite[pp. 271]{TAOCPV2}
+of the estimation technique is to use $t + 1$ digits of the dividend and $t$ digits of the divisor, in particular when $t = 1$.  The estimate 
+using this technique is never too small.  For the following proof let $t = \vert \vert y \vert \vert - 1$ and $s = \vert \vert x \vert \vert - 1$ 
+represent the most significant digits of the dividend and divisor respectively.
+
+\textbf{Proof.}\textit{  The quotient $\hat k = \lfloor (y_t\beta + y_{t-1}) / x_s \rfloor$ is greater than or equal to 
+$k = \lfloor y / (x \cdot \beta^{\vert \vert y \vert \vert - \vert \vert x \vert \vert - 1}) \rfloor$. }
+The first obvious case is when $\hat k = \beta - 1$ in which case the proof is concluded since the real quotient cannot be larger.  For all other 
+cases $\hat k = \lfloor (y_t\beta + y_{t-1}) / x_s \rfloor$ and $\hat k x_s \ge y_t\beta + y_{t-1} - x_s + 1$.  The latter portion of the inequality
+$-x_s + 1$ arises from the fact that a truncated integer division will give the same quotient for at most $x_s - 1$ values.  Next a series of 
+inequalities will prove the hypothesis.
+
+\begin{equation}
+y - \hat k x \le y - \hat k x_s\beta^s
+\end{equation}
+
+This is trivially true since $x \ge x_s\beta^s$.  Next we replace $\hat kx_s\beta^s$ by the previous inequality for $\hat kx_s$.  
+
+\begin{equation}
+y - \hat k x \le y_t\beta^t + \ldots + y_0 - (y_t\beta^t + y_{t-1}\beta^{t-1} - x_s\beta^t + \beta^s)
+\end{equation}
+
+By simplifying the previous inequality the following inequality is formed.
+
+\begin{equation}
+y - \hat k x \le y_{t-2}\beta^{t-2} + \ldots + y_0 + x_s\beta^s - \beta^s
+\end{equation}
+
+Subsequently,
+
+\begin{equation}
+y_{t-2}\beta^{t-2} + \ldots +  y_0  + x_s\beta^s - \beta^s < x_s\beta^s \le x
+\end{equation}
+
+Which proves that $y - \hat kx \le x$ and by consequence $\hat k \ge k$ which concludes the proof.  \textbf{QED}
+
+
+\subsection{Normalized Integers}
+For the purposes of division a normalized input is when the divisors leading digit $x_n$ is greater than or equal to $\beta / 2$.  By multiplying both
+$x$ and $y$ by $j = \lfloor (\beta / 2) / x_n \rfloor$ the quotient remains unchanged and the remainder is simply $j$ times the original
+remainder.  The purpose of normalization is to ensure the leading digit of the divisor is sufficiently large such that the estimated quotient will
+lie in the domain of a single digit.  Consider the maximum dividend $(\beta - 1) \cdot \beta + (\beta - 1)$ and the minimum divisor $\beta / 2$.  
+
+\begin{equation} 
+{{\beta^2 - 1} \over { \beta / 2}} \le 2\beta - {2 \over \beta} 
+\end{equation}
+
+At most the quotient approaches $2\beta$, however, in practice this will not occur since that would imply the previous quotient digit was too small.  
+
+\subsection{Radix-$\beta$ Division with Remainder}
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div}. \\
+\textbf{Input}.   mp\_int $a, b$ \\
+\textbf{Output}.  $c = \lfloor a/b \rfloor$, $d = a - bc$ \\
+\hline \\
+1.  If $b = 0$ return(\textit{MP\_VAL}). \\
+2.  If $\vert a \vert < \vert b \vert$ then do \\
+\hspace{3mm}2.1  $d \leftarrow a$ \\
+\hspace{3mm}2.2  $c \leftarrow 0$ \\
+\hspace{3mm}2.3  Return(\textit{MP\_OKAY}). \\
+\\
+Setup the quotient to receive the digits. \\
+3.  Grow $q$ to $a.used + 2$ digits. \\
+4.  $q \leftarrow 0$ \\
+5.  $x \leftarrow \vert a \vert , y \leftarrow \vert b \vert$ \\
+6.  $sign \leftarrow  \left \lbrace \begin{array}{ll}
+                              MP\_ZPOS &  \mbox{if }a.sign = b.sign \\
+                              MP\_NEG  &  \mbox{otherwise} \\
+                              \end{array} \right .$ \\
+\\
+Normalize the inputs such that the leading digit of $y$ is greater than or equal to $\beta / 2$. \\
+7.  $norm \leftarrow (lg(\beta) - 1) - (\lceil lg(y) \rceil \mbox{ (mod }lg(\beta)\mbox{)})$ \\
+8.  $x \leftarrow x \cdot 2^{norm}, y \leftarrow y \cdot 2^{norm}$ \\
+\\
+Find the leading digit of the quotient. \\
+9.  $n \leftarrow x.used - 1, t \leftarrow y.used - 1$ \\
+10.  $y \leftarrow y \cdot \beta^{n - t}$ \\
+11.  While ($x \ge y$) do \\
+\hspace{3mm}11.1  $q_{n - t} \leftarrow q_{n - t} + 1$ \\
+\hspace{3mm}11.2  $x \leftarrow x - y$ \\
+12.  $y \leftarrow \lfloor y / \beta^{n-t} \rfloor$ \\
+\\
+Continued on the next page. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div} (continued). \\
+\textbf{Input}.   mp\_int $a, b$ \\
+\textbf{Output}.  $c = \lfloor a/b \rfloor$, $d = a - bc$ \\
+\hline \\
+Now find the remainder of the digits. \\
+13.  for $i$ from $n$ down to $(t + 1)$ do \\
+\hspace{3mm}13.1  If $i > x.used$ then jump to the next iteration of this loop. \\
+\hspace{3mm}13.2  If $x_{i} = y_{t}$ then \\
+\hspace{6mm}13.2.1  $q_{i - t - 1} \leftarrow \beta - 1$ \\
+\hspace{3mm}13.3  else \\
+\hspace{6mm}13.3.1  $\hat r \leftarrow x_{i} \cdot \beta + x_{i - 1}$ \\
+\hspace{6mm}13.3.2  $\hat r \leftarrow \lfloor \hat r / y_{t} \rfloor$ \\
+\hspace{6mm}13.3.3  $q_{i - t - 1} \leftarrow \hat r$ \\
+\hspace{3mm}13.4  $q_{i - t - 1} \leftarrow q_{i - t - 1} + 1$ \\
+\\
+Fixup quotient estimation. \\
+\hspace{3mm}13.5  Loop \\
+\hspace{6mm}13.5.1  $q_{i - t - 1} \leftarrow q_{i - t - 1} - 1$ \\
+\hspace{6mm}13.5.2  t$1 \leftarrow 0$ \\
+\hspace{6mm}13.5.3  t$1_0 \leftarrow y_{t - 1}, $ t$1_1 \leftarrow y_t,$ t$1.used \leftarrow 2$ \\
+\hspace{6mm}13.5.4  $t1 \leftarrow t1 \cdot q_{i - t - 1}$ \\
+\hspace{6mm}13.5.5  t$2_0 \leftarrow x_{i - 2}, $ t$2_1 \leftarrow x_{i - 1}, $ t$2_2 \leftarrow x_i, $ t$2.used \leftarrow 3$ \\
+\hspace{6mm}13.5.6  If $\vert t1 \vert > \vert t2 \vert$ then goto step 13.5. \\
+\hspace{3mm}13.6  t$1 \leftarrow y \cdot q_{i - t - 1}$ \\
+\hspace{3mm}13.7  t$1 \leftarrow $ t$1 \cdot \beta^{i - t - 1}$ \\
+\hspace{3mm}13.8  $x \leftarrow x - $ t$1$ \\
+\hspace{3mm}13.9  If $x.sign = MP\_NEG$ then \\
+\hspace{6mm}13.10  t$1 \leftarrow y$ \\
+\hspace{6mm}13.11  t$1 \leftarrow $ t$1 \cdot \beta^{i - t - 1}$ \\
+\hspace{6mm}13.12  $x \leftarrow x + $ t$1$ \\
+\hspace{6mm}13.13  $q_{i - t - 1} \leftarrow q_{i - t - 1} - 1$ \\
+\\
+Finalize the result. \\
+14.  Clamp excess digits of $q$ \\
+15.  $c \leftarrow q, c.sign \leftarrow sign$ \\
+16.  $x.sign \leftarrow a.sign$ \\
+17.  $d \leftarrow \lfloor x / 2^{norm} \rfloor$ \\
+18.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div (continued)}
+\end{figure}
+\textbf{Algorithm mp\_div.}
+This algorithm will calculate quotient and remainder from an integer division given a dividend and divisor.  The algorithm is a signed
+division and will produce a fully qualified quotient and remainder.
+
+First the divisor $b$ must be non-zero which is enforced in step one.  If the divisor is larger than the dividend then the quotient is implicitly 
+zero and the remainder is the dividend.  
+
+After the first two trivial cases of inputs are handled the variable $q$ is setup to receive the digits of the quotient.  Two unsigned copies of the
+divisor $y$ and dividend $x$ are made as well.  The core of the division algorithm is an unsigned division and will only work if the values are
+positive.  Now the two values $x$ and $y$ must be normalized such that the leading digit of $y$ is greater than or equal to $\beta / 2$.  
+This is performed by shifting both to the left by enough bits to get the desired normalization.  
+
+At this point the division algorithm can begin producing digits of the quotient.  Recall that the maximum value of the estimation used is 
+$2\beta - {2 \over \beta}$ which means that a digit of the quotient must be first produced by another means.  In this case $y$ is shifted
+to the left (\textit{step ten}) so that it has the same number of digits as $x$.  The loop on step eleven will subtract multiples of the 
+shifted copy of $y$ until $x$ is smaller.  Since the leading digit of $y$ is greater than or equal to $\beta/2$ this loop will iterate at most two
+times to produce the desired leading digit of the quotient.  
+
+Now the remainder of the digits can be produced.  The equation $\hat q = \lfloor {{x_i \beta + x_{i-1}}\over y_t} \rfloor$ is used to fairly
+accurately approximate the true quotient digit.  The estimation can in theory produce an estimation as high as $2\beta - {2 \over \beta}$ but by
+induction the upper quotient digit is correct (\textit{as established on step eleven}) and the estimate must be less than $\beta$.  
+
+Recall from section~\ref{sec:divest} that the estimation is never too low but may be too high.  The next step of the estimation process is
+to refine the estimation.  The loop on step 13.5 uses $x_i\beta^2 + x_{i-1}\beta + x_{i-2}$ and $q_{i - t - 1}(y_t\beta + y_{t-1})$ as a higher
+order approximation to adjust the quotient digit.
+
+After both phases of estimation the quotient digit may still be off by a value of one\footnote{This is similar to the error introduced
+by optimizing Barrett reduction.}.  Steps 13.6 through 13.8 subtract the multiple of the divisor from the dividend (\textit{Similar to step 3.3 of
+algorithm~\ref{fig:raddiv}}) and steps 13.9 through 13.13 subsequently add a multiple of the divisor back if the quotient digit was too large.  
+
+Now that the quotient has been determined, finalizing the result is a matter of clamping the quotient, fixing the signs and de-normalizing the 
+remainder.  An important aspect of this algorithm seemingly overlooked in other descriptions such as that of Algorithm 14.20 HAC \cite[pp. 598]{HAC}
+is that when the estimations are being made (\textit{inside the loop on step 13.5}) the digits $y_{t-1}$, $x_{i-2}$ and $x_{i-1}$ may lie 
+outside their respective boundaries.  For example, if $t = 0$ or $i \le 1$ then the digits would be undefined.  In those cases the digits should
+respectively be replaced with a zero.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_div.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   #ifdef BN_MP_DIV_SMALL
+018   
+019   /* slower bit-bang division... also smaller */
+020   int mp_div(mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+021   \{
+022      mp_int ta, tb, tq, q;
+023      int    res, n, n2;
+024   
+025     /* is divisor zero ? */
+026     if (mp_iszero (b) == 1) \{
+027       return MP_VAL;
+028     \}
+029   
+030     /* if a < b then q=0, r = a */
+031     if (mp_cmp_mag (a, b) == MP_LT) \{
+032       if (d != NULL) \{
+033         res = mp_copy (a, d);
+034       \} else \{
+035         res = MP_OKAY;
+036       \}
+037       if (c != NULL) \{
+038         mp_zero (c);
+039       \}
+040       return res;
+041     \}
+042       
+043     /* init our temps */
+044     if ((res = mp_init_multi(&ta, &tb, &tq, &q, NULL) != MP_OKAY)) \{
+045        return res;
+046     \}
+047   
+048   
+049     mp_set(&tq, 1);
+050     n = mp_count_bits(a) - mp_count_bits(b);
+051     if (((res = mp_copy(a, &ta)) != MP_OKAY) ||
+052         ((res = mp_copy(b, &tb)) != MP_OKAY) || 
+053         ((res = mp_mul_2d(&tb, n, &tb)) != MP_OKAY) ||
+054         ((res = mp_mul_2d(&tq, n, &tq)) != MP_OKAY)) \{
+055         goto __ERR;
+056     \}
+057   
+058     while (n-- >= 0) \{
+059        if (mp_cmp(&tb, &ta) != MP_GT) \{
+060           if (((res = mp_sub(&ta, &tb, &ta)) != MP_OKAY) ||
+061               ((res = mp_add(&q, &tq, &q)) != MP_OKAY)) \{
+062              goto __ERR;
+063           \}
+064        \}
+065        if (((res = mp_div_2d(&tb, 1, &tb, NULL)) != MP_OKAY) ||
+066            ((res = mp_div_2d(&tq, 1, &tq, NULL)) != MP_OKAY)) \{
+067              goto __ERR;
+068        \}
+069     \}
+070   
+071     /* now q == quotient and ta == remainder */
+072     n  = a->sign;
+073     n2 = (a->sign == b->sign ? MP_ZPOS : MP_NEG);
+074     if (c != NULL) \{
+075        mp_exch(c, &q);
+076        c->sign  = n2;
+077     \}
+078     if (d != NULL) \{
+079        mp_exch(d, &ta);
+080        d->sign = n;
+081     \}
+082   __ERR:
+083      mp_clear_multi(&ta, &tb, &tq, &q, NULL);
+084      return res;
+085   \}
+086   
+087   #else
+088   
+089   /* integer signed division. 
+090    * c*b + d == a [e.g. a/b, c=quotient, d=remainder]
+091    * HAC pp.598 Algorithm 14.20
+092    *
+093    * Note that the description in HAC is horribly 
+094    * incomplete.  For example, it doesn't consider 
+095    * the case where digits are removed from 'x' in 
+096    * the inner loop.  It also doesn't consider the 
+097    * case that y has fewer than three digits, etc..
+098    *
+099    * The overall algorithm is as described as 
+100    * 14.20 from HAC but fixed to treat these cases.
+101   */
+102   int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+103   \{
+104     mp_int  q, x, y, t1, t2;
+105     int     res, n, t, i, norm, neg;
+106   
+107     /* is divisor zero ? */
+108     if (mp_iszero (b) == 1) \{
+109       return MP_VAL;
+110     \}
+111   
+112     /* if a < b then q=0, r = a */
+113     if (mp_cmp_mag (a, b) == MP_LT) \{
+114       if (d != NULL) \{
+115         res = mp_copy (a, d);
+116       \} else \{
+117         res = MP_OKAY;
+118       \}
+119       if (c != NULL) \{
+120         mp_zero (c);
+121       \}
+122       return res;
+123     \}
+124   
+125     if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) \{
+126       return res;
+127     \}
+128     q.used = a->used + 2;
+129   
+130     if ((res = mp_init (&t1)) != MP_OKAY) \{
+131       goto __Q;
+132     \}
+133   
+134     if ((res = mp_init (&t2)) != MP_OKAY) \{
+135       goto __T1;
+136     \}
+137   
+138     if ((res = mp_init_copy (&x, a)) != MP_OKAY) \{
+139       goto __T2;
+140     \}
+141   
+142     if ((res = mp_init_copy (&y, b)) != MP_OKAY) \{
+143       goto __X;
+144     \}
+145   
+146     /* fix the sign */
+147     neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
+148     x.sign = y.sign = MP_ZPOS;
+149   
+150     /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
+151     norm = mp_count_bits(&y) % DIGIT_BIT;
+152     if (norm < (int)(DIGIT_BIT-1)) \{
+153        norm = (DIGIT_BIT-1) - norm;
+154        if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) \{
+155          goto __Y;
+156        \}
+157        if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) \{
+158          goto __Y;
+159        \}
+160     \} else \{
+161        norm = 0;
+162     \}
+163   
+164     /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
+165     n = x.used - 1;
+166     t = y.used - 1;
+167   
+168     /* while (x >= y*b**n-t) do \{ q[n-t] += 1; x -= y*b**\{n-t\} \} */
+169     if ((res = mp_lshd (&y, n - t)) != MP_OKAY) \{ /* y = y*b**\{n-t\} */
+170       goto __Y;
+171     \}
+172   
+173     while (mp_cmp (&x, &y) != MP_LT) \{
+174       ++(q.dp[n - t]);
+175       if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) \{
+176         goto __Y;
+177       \}
+178     \}
+179   
+180     /* reset y by shifting it back down */
+181     mp_rshd (&y, n - t);
+182   
+183     /* step 3. for i from n down to (t + 1) */
+184     for (i = n; i >= (t + 1); i--) \{
+185       if (i > x.used) \{
+186         continue;
+187       \}
+188   
+189       /* step 3.1 if xi == yt then set q\{i-t-1\} to b-1, 
+190        * otherwise set q\{i-t-1\} to (xi*b + x\{i-1\})/yt */
+191       if (x.dp[i] == y.dp[t]) \{
+192         q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
+193       \} else \{
+194         mp_word tmp;
+195         tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
+196         tmp |= ((mp_word) x.dp[i - 1]);
+197         tmp /= ((mp_word) y.dp[t]);
+198         if (tmp > (mp_word) MP_MASK)
+199           tmp = MP_MASK;
+200         q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
+201       \}
+202   
+203       /* while (q\{i-t-1\} * (yt * b + y\{t-1\})) > 
+204                xi * b**2 + xi-1 * b + xi-2 
+205        
+206          do q\{i-t-1\} -= 1; 
+207       */
+208       q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
+209       do \{
+210         q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
+211   
+212         /* find left hand */
+213         mp_zero (&t1);
+214         t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
+215         t1.dp[1] = y.dp[t];
+216         t1.used = 2;
+217         if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) \{
+218           goto __Y;
+219         \}
+220   
+221         /* find right hand */
+222         t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
+223         t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
+224         t2.dp[2] = x.dp[i];
+225         t2.used = 3;
+226       \} while (mp_cmp_mag(&t1, &t2) == MP_GT);
+227   
+228       /* step 3.3 x = x - q\{i-t-1\} * y * b**\{i-t-1\} */
+229       if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) \{
+230         goto __Y;
+231       \}
+232   
+233       if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) \{
+234         goto __Y;
+235       \}
+236   
+237       if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) \{
+238         goto __Y;
+239       \}
+240   
+241       /* if x < 0 then \{ x = x + y*b**\{i-t-1\}; q\{i-t-1\} -= 1; \} */
+242       if (x.sign == MP_NEG) \{
+243         if ((res = mp_copy (&y, &t1)) != MP_OKAY) \{
+244           goto __Y;
+245         \}
+246         if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) \{
+247           goto __Y;
+248         \}
+249         if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) \{
+250           goto __Y;
+251         \}
+252   
+253         q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
+254       \}
+255     \}
+256   
+257     /* now q is the quotient and x is the remainder 
+258      * [which we have to normalize] 
+259      */
+260     
+261     /* get sign before writing to c */
+262     x.sign = x.used == 0 ? MP_ZPOS : a->sign;
+263   
+264     if (c != NULL) \{
+265       mp_clamp (&q);
+266       mp_exch (&q, c);
+267       c->sign = neg;
+268     \}
+269   
+270     if (d != NULL) \{
+271       mp_div_2d (&x, norm, &x, NULL);
+272       mp_exch (&x, d);
+273     \}
+274   
+275     res = MP_OKAY;
+276   
+277   __Y:mp_clear (&y);
+278   __X:mp_clear (&x);
+279   __T2:mp_clear (&t2);
+280   __T1:mp_clear (&t1);
+281   __Q:mp_clear (&q);
+282     return res;
+283   \}
+284   
+285   #endif
+286   
+287   #endif
+\end{alltt}
+\end{small}
+
+The implementation of this algorithm differs slightly from the pseudo code presented previously.  In this algorithm either of the quotient $c$ or
+remainder $d$ may be passed as a \textbf{NULL} pointer which indicates their value is not desired.  For example, the C code to call the division
+algorithm with only the quotient is 
+
+\begin{verbatim}
+mp_div(&a, &b, &c, NULL);  /* c = [a/b] */
+\end{verbatim}
+
+Lines 108 and 113 handle the two trivial cases of inputs which are division by zero and dividend smaller than the divisor 
+respectively.  After the two trivial cases all of the temporary variables are initialized.  Line 147 determines the sign of 
+the quotient and line 148 ensures that both $x$ and $y$ are positive.  
+
+The number of bits in the leading digit is calculated on line 151.  Implicitly an mp\_int with $r$ digits will require $lg(\beta)(r-1) + k$ bits
+of precision which when reduced modulo $lg(\beta)$ produces the value of $k$.  In this case $k$ is the number of bits in the leading digit which is
+exactly what is required.  For the algorithm to operate $k$ must equal $lg(\beta) - 1$ and when it does not the inputs must be normalized by shifting
+them to the left by $lg(\beta) - 1 - k$ bits.
+
+Throughout the variables $n$ and $t$ will represent the highest digit of $x$ and $y$ respectively.  These are first used to produce the 
+leading digit of the quotient.  The loop beginning on line 184 will produce the remainder of the quotient digits.
+
+The conditional ``continue'' on line 185 is used to prevent the algorithm from reading past the leading edge of $x$ which can occur when the
+algorithm eliminates multiple non-zero digits in a single iteration.  This ensures that $x_i$ is always non-zero since by definition the digits
+above the $i$'th position of $x$ must be zero in order for the quotient to be precise\footnote{Precise as far as integer division is concerned.}.  
+
+Lines 214 through 216 and 222 through 225 manually construct the high accuracy estimations by setting the digits of the two mp\_int 
+variables directly.  
+
+\section{Single Digit Helpers}
+
+This section briefly describes a series of single digit helper algorithms which come in handy when working with small constants.  All of 
+the helper functions assume the single digit input is positive and will treat them as such.
+
+\subsection{Single Digit Addition and Subtraction}
+
+Both addition and subtraction are performed by ``cheating'' and using mp\_set followed by the higher level addition or subtraction 
+algorithms.   As a result these algorithms are substantially simpler with a slight cost in performance.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_add\_d}. \\
+\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
+\textbf{Output}.  $c = a + b$ \\
+\hline \\
+1.  $t \leftarrow b$ (\textit{mp\_set}) \\
+2.  $c \leftarrow a + t$ \\
+3.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_add\_d}
+\end{figure}
+
+\textbf{Algorithm mp\_add\_d.}
+This algorithm initializes a temporary mp\_int with the value of the single digit and uses algorithm mp\_add to add the two values together.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_add\_d.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* single digit addition */
+018   int
+019   mp_add_d (mp_int * a, mp_digit b, mp_int * c)
+020   \{
+021     int     res, ix, oldused;
+022     mp_digit *tmpa, *tmpc, mu;
+023   
+024     /* grow c as required */
+025     if (c->alloc < a->used + 1) \{
+026        if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) \{
+027           return res;
+028        \}
+029     \}
+030   
+031     /* if a is negative and |a| >= b, call c = |a| - b */
+032     if (a->sign == MP_NEG && (a->used > 1 || a->dp[0] >= b)) \{
+033        /* temporarily fix sign of a */
+034        a->sign = MP_ZPOS;
+035   
+036        /* c = |a| - b */
+037        res = mp_sub_d(a, b, c);
+038   
+039        /* fix sign  */
+040        a->sign = c->sign = MP_NEG;
+041   
+042        return res;
+043     \}
+044   
+045     /* old number of used digits in c */
+046     oldused = c->used;
+047   
+048     /* sign always positive */
+049     c->sign = MP_ZPOS;
+050   
+051     /* source alias */
+052     tmpa    = a->dp;
+053   
+054     /* destination alias */
+055     tmpc    = c->dp;
+056   
+057     /* if a is positive */
+058     if (a->sign == MP_ZPOS) \{
+059        /* add digit, after this we're propagating
+060         * the carry.
+061         */
+062        *tmpc   = *tmpa++ + b;
+063        mu      = *tmpc >> DIGIT_BIT;
+064        *tmpc++ &= MP_MASK;
+065   
+066        /* now handle rest of the digits */
+067        for (ix = 1; ix < a->used; ix++) \{
+068           *tmpc   = *tmpa++ + mu;
+069           mu      = *tmpc >> DIGIT_BIT;
+070           *tmpc++ &= MP_MASK;
+071        \}
+072        /* set final carry */
+073        ix++;
+074        *tmpc++  = mu;
+075   
+076        /* setup size */
+077        c->used = a->used + 1;
+078     \} else \{
+079        /* a was negative and |a| < b */
+080        c->used  = 1;
+081   
+082        /* the result is a single digit */
+083        if (a->used == 1) \{
+084           *tmpc++  =  b - a->dp[0];
+085        \} else \{
+086           *tmpc++  =  b;
+087        \}
+088   
+089        /* setup count so the clearing of oldused
+090         * can fall through correctly
+091         */
+092        ix       = 1;
+093     \}
+094   
+095     /* now zero to oldused */
+096     while (ix++ < oldused) \{
+097        *tmpc++ = 0;
+098     \}
+099     mp_clamp(c);
+100   
+101     return MP_OKAY;
+102   \}
+103   
+104   #endif
+\end{alltt}
+\end{small}
+
+Clever use of the letter 't'.
+
+\subsubsection{Subtraction}
+The single digit subtraction algorithm mp\_sub\_d is essentially the same except it uses mp\_sub to subtract the digit from the mp\_int.
+
+\subsection{Single Digit Multiplication}
+Single digit multiplication arises enough in division and radix conversion that it ought to be implemented as a special case of the baseline
+multiplication algorithm.  Essentially this algorithm is a modified version of algorithm s\_mp\_mul\_digs where one of the multiplicands
+only has one digit.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul\_d}. \\
+\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
+\textbf{Output}.  $c = ab$ \\
+\hline \\
+1.  $pa \leftarrow a.used$ \\
+2.  Grow $c$ to at least $pa + 1$ digits. \\
+3.  $oldused \leftarrow c.used$ \\
+4.  $c.used \leftarrow pa + 1$ \\
+5.  $c.sign \leftarrow a.sign$ \\
+6.  $\mu \leftarrow 0$ \\
+7.  for $ix$ from $0$ to $pa - 1$ do \\
+\hspace{3mm}7.1  $\hat r \leftarrow \mu + a_{ix}b$ \\
+\hspace{3mm}7.2  $c_{ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}7.3  $\mu \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+8.  $c_{pa} \leftarrow \mu$ \\
+9.  for $ix$ from $pa + 1$ to $oldused$ do \\
+\hspace{3mm}9.1  $c_{ix} \leftarrow 0$ \\
+10.  Clamp excess digits of $c$. \\
+11.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul\_d}
+\end{figure}
+\textbf{Algorithm mp\_mul\_d.}
+This algorithm quickly multiplies an mp\_int by a small single digit value.  It is specially tailored to the job and has minimal overhead.  
+Unlike the full multiplication algorithms this algorithm does not require any significant temporary storage or memory allocations.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_mul\_d.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* multiply by a digit */
+018   int
+019   mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
+020   \{
+021     mp_digit u, *tmpa, *tmpc;
+022     mp_word  r;
+023     int      ix, res, olduse;
+024   
+025     /* make sure c is big enough to hold a*b */
+026     if (c->alloc < a->used + 1) \{
+027       if ((res = mp_grow (c, a->used + 1)) != MP_OKAY) \{
+028         return res;
+029       \}
+030     \}
+031   
+032     /* get the original destinations used count */
+033     olduse = c->used;
+034   
+035     /* set the sign */
+036     c->sign = a->sign;
+037   
+038     /* alias for a->dp [source] */
+039     tmpa = a->dp;
+040   
+041     /* alias for c->dp [dest] */
+042     tmpc = c->dp;
+043   
+044     /* zero carry */
+045     u = 0;
+046   
+047     /* compute columns */
+048     for (ix = 0; ix < a->used; ix++) \{
+049       /* compute product and carry sum for this term */
+050       r       = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);
+051   
+052       /* mask off higher bits to get a single digit */
+053       *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
+054   
+055       /* send carry into next iteration */
+056       u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+057     \}
+058   
+059     /* store final carry [if any] */
+060     *tmpc++ = u;
+061   
+062     /* now zero digits above the top */
+063     while (ix++ < olduse) \{
+064        *tmpc++ = 0;
+065     \}
+066   
+067     /* set used count */
+068     c->used = a->used + 1;
+069     mp_clamp(c);
+070   
+071     return MP_OKAY;
+072   \}
+073   #endif
+\end{alltt}
+\end{small}
+
+In this implementation the destination $c$ may point to the same mp\_int as the source $a$ since the result is written after the digit is 
+read from the source.  This function uses pointer aliases $tmpa$ and $tmpc$ for the digits of $a$ and $c$ respectively.  
+
+\subsection{Single Digit Division}
+Like the single digit multiplication algorithm, single digit division is also a fairly common algorithm used in radix conversion.  Since the
+divisor is only a single digit a specialized variant of the division algorithm can be used to compute the quotient.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div\_d}. \\
+\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
+\textbf{Output}.  $c = \lfloor a / b \rfloor, d = a - cb$ \\
+\hline \\
+1.  If $b = 0$ then return(\textit{MP\_VAL}).\\
+2.  If $b = 3$ then use algorithm mp\_div\_3 instead. \\
+3.  Init $q$ to $a.used$ digits.  \\
+4.  $q.used \leftarrow a.used$ \\
+5.  $q.sign \leftarrow a.sign$ \\
+6.  $\hat w \leftarrow 0$ \\
+7.  for $ix$ from $a.used - 1$ down to $0$ do \\
+\hspace{3mm}7.1  $\hat w \leftarrow \hat w \beta + a_{ix}$ \\
+\hspace{3mm}7.2  If $\hat w \ge b$ then \\
+\hspace{6mm}7.2.1  $t \leftarrow \lfloor \hat w / b \rfloor$ \\
+\hspace{6mm}7.2.2  $\hat w \leftarrow \hat w \mbox{ (mod }b\mbox{)}$ \\
+\hspace{3mm}7.3  else\\
+\hspace{6mm}7.3.1  $t \leftarrow 0$ \\
+\hspace{3mm}7.4  $q_{ix} \leftarrow t$ \\
+8.  $d \leftarrow \hat w$ \\
+9.  Clamp excess digits of $q$. \\
+10.  $c \leftarrow q$ \\
+11.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div\_d}
+\end{figure}
+\textbf{Algorithm mp\_div\_d.}
+This algorithm divides the mp\_int $a$ by the single mp\_digit $b$ using an optimized approach.  Essentially in every iteration of the
+algorithm another digit of the dividend is reduced and another digit of quotient produced.  Provided $b < \beta$ the value of $\hat w$
+after step 7.1 will be limited such that $0 \le \lfloor \hat w / b \rfloor < \beta$.  
+
+If the divisor $b$ is equal to three a variant of this algorithm is used which is called mp\_div\_3.  It replaces the division by three with
+a multiplication by $\lfloor \beta / 3 \rfloor$ and the appropriate shift and residual fixup.  In essence it is much like the Barrett reduction
+from chapter seven.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_div\_d.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   static int s_is_power_of_two(mp_digit b, int *p)
+018   \{
+019      int x;
+020   
+021      for (x = 1; x < DIGIT_BIT; x++) \{
+022         if (b == (((mp_digit)1)<<x)) \{
+023            *p = x;
+024            return 1;
+025         \}
+026      \}
+027      return 0;
+028   \}
+029   
+030   /* single digit division (based on routine from MPI) */
+031   int mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
+032   \{
+033     mp_int  q;
+034     mp_word w;
+035     mp_digit t;
+036     int     res, ix;
+037   
+038     /* cannot divide by zero */
+039     if (b == 0) \{
+040        return MP_VAL;
+041     \}
+042   
+043     /* quick outs */
+044     if (b == 1 || mp_iszero(a) == 1) \{
+045        if (d != NULL) \{
+046           *d = 0;
+047        \}
+048        if (c != NULL) \{
+049           return mp_copy(a, c);
+050        \}
+051        return MP_OKAY;
+052     \}
+053   
+054     /* power of two ? */
+055     if (s_is_power_of_two(b, &ix) == 1) \{
+056        if (d != NULL) \{
+057           *d = a->dp[0] & ((((mp_digit)1)<<ix) - 1);
+058        \}
+059        if (c != NULL) \{
+060           return mp_div_2d(a, ix, c, NULL);
+061        \}
+062        return MP_OKAY;
+063     \}
+064   
+065   #ifdef BN_MP_DIV_3_C
+066     /* three? */
+067     if (b == 3) \{
+068        return mp_div_3(a, c, d);
+069     \}
+070   #endif
+071   
+072     /* no easy answer [c'est la vie].  Just division */
+073     if ((res = mp_init_size(&q, a->used)) != MP_OKAY) \{
+074        return res;
+075     \}
+076     
+077     q.used = a->used;
+078     q.sign = a->sign;
+079     w = 0;
+080     for (ix = a->used - 1; ix >= 0; ix--) \{
+081        w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
+082        
+083        if (w >= b) \{
+084           t = (mp_digit)(w / b);
+085           w -= ((mp_word)t) * ((mp_word)b);
+086         \} else \{
+087           t = 0;
+088         \}
+089         q.dp[ix] = (mp_digit)t;
+090     \}
+091     
+092     if (d != NULL) \{
+093        *d = (mp_digit)w;
+094     \}
+095     
+096     if (c != NULL) \{
+097        mp_clamp(&q);
+098        mp_exch(&q, c);
+099     \}
+100     mp_clear(&q);
+101     
+102     return res;
+103   \}
+104   
+105   #endif
+\end{alltt}
+\end{small}
+
+Like the implementation of algorithm mp\_div this algorithm allows either of the quotient or remainder to be passed as a \textbf{NULL} pointer to
+indicate the respective value is not required.  This allows a trivial single digit modular reduction algorithm, mp\_mod\_d to be created.
+
+The division and remainder on lines 84 and 85 can be replaced often by a single division on most processors.  For example, the 32-bit x86 based 
+processors can divide a 64-bit quantity by a 32-bit quantity and produce the quotient and remainder simultaneously.  Unfortunately the GCC 
+compiler does not recognize that optimization and will actually produce two function calls to find the quotient and remainder respectively.  
+
+\subsection{Single Digit Root Extraction}
+
+Finding the $n$'th root of an integer is fairly easy as far as numerical analysis is concerned.  Algorithms such as the Newton-Raphson approximation 
+(\ref{eqn:newton}) series will converge very quickly to a root for any continuous function $f(x)$.  
+
+\begin{equation}
+x_{i+1} = x_i - {f(x_i) \over f'(x_i)}
+\label{eqn:newton}
+\end{equation}
+
+In this case the $n$'th root is desired and $f(x) = x^n - a$ where $a$ is the integer of which the root is desired.  The derivative of $f(x)$ is 
+simply $f'(x) = nx^{n - 1}$.  Of particular importance is that this algorithm will be used over the integers, not over a more continuous domain
+such as the real numbers.  As a result the root found can be above the true root by a few and must be manually adjusted.  Ideally at the end of the 
+algorithm the $n$'th root $b$ of an integer $a$ is desired such that $b^n \le a$.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_n\_root}. \\
+\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
+\textbf{Output}.  $c^b \le a$ \\
+\hline \\
+1.  If $b$ is even and $a.sign = MP\_NEG$ return(\textit{MP\_VAL}). \\
+2.  $sign \leftarrow a.sign$ \\
+3.  $a.sign \leftarrow MP\_ZPOS$ \\
+4.  t$2 \leftarrow 2$ \\
+5.  Loop \\
+\hspace{3mm}5.1  t$1 \leftarrow $ t$2$ \\
+\hspace{3mm}5.2  t$3 \leftarrow $ t$1^{b - 1}$ \\
+\hspace{3mm}5.3  t$2 \leftarrow $ t$3 $ $\cdot$ t$1$ \\
+\hspace{3mm}5.4  t$2 \leftarrow $ t$2 - a$ \\
+\hspace{3mm}5.5  t$3 \leftarrow $ t$3 \cdot b$ \\
+\hspace{3mm}5.6  t$3 \leftarrow \lfloor $t$2 / $t$3 \rfloor$ \\
+\hspace{3mm}5.7  t$2 \leftarrow $ t$1 - $ t$3$ \\
+\hspace{3mm}5.8  If t$1 \ne $ t$2$ then goto step 5.  \\
+6.  Loop \\
+\hspace{3mm}6.1  t$2 \leftarrow $ t$1^b$ \\
+\hspace{3mm}6.2  If t$2 > a$ then \\
+\hspace{6mm}6.2.1  t$1 \leftarrow $ t$1 - 1$ \\
+\hspace{6mm}6.2.2  Goto step 6. \\
+7.  $a.sign \leftarrow sign$ \\
+8.  $c \leftarrow $ t$1$ \\
+9.  $c.sign \leftarrow sign$  \\
+10.  Return(\textit{MP\_OKAY}).  \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_n\_root}
+\end{figure}
+\textbf{Algorithm mp\_n\_root.}
+This algorithm finds the integer $n$'th root of an input using the Newton-Raphson approach.  It is partially optimized based on the observation
+that the numerator of ${f(x) \over f'(x)}$ can be derived from a partial denominator.  That is at first the denominator is calculated by finding
+$x^{b - 1}$.  This value can then be multiplied by $x$ and have $a$ subtracted from it to find the numerator.  This saves a total of $b - 1$ 
+multiplications by t$1$ inside the loop.  
+
+The initial value of the approximation is t$2 = 2$ which allows the algorithm to start with very small values and quickly converge on the
+root.  Ideally this algorithm is meant to find the $n$'th root of an input where $n$ is bounded by $2 \le n \le 5$.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_n\_root.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* find the n'th root of an integer 
+018    *
+019    * Result found such that (c)**b <= a and (c+1)**b > a 
+020    *
+021    * This algorithm uses Newton's approximation 
+022    * x[i+1] = x[i] - f(x[i])/f'(x[i]) 
+023    * which will find the root in log(N) time where 
+024    * each step involves a fair bit.  This is not meant to 
+025    * find huge roots [square and cube, etc].
+026    */
+027   int mp_n_root (mp_int * a, mp_digit b, mp_int * c)
+028   \{
+029     mp_int  t1, t2, t3;
+030     int     res, neg;
+031   
+032     /* input must be positive if b is even */
+033     if ((b & 1) == 0 && a->sign == MP_NEG) \{
+034       return MP_VAL;
+035     \}
+036   
+037     if ((res = mp_init (&t1)) != MP_OKAY) \{
+038       return res;
+039     \}
+040   
+041     if ((res = mp_init (&t2)) != MP_OKAY) \{
+042       goto __T1;
+043     \}
+044   
+045     if ((res = mp_init (&t3)) != MP_OKAY) \{
+046       goto __T2;
+047     \}
+048   
+049     /* if a is negative fudge the sign but keep track */
+050     neg     = a->sign;
+051     a->sign = MP_ZPOS;
+052   
+053     /* t2 = 2 */
+054     mp_set (&t2, 2);
+055   
+056     do \{
+057       /* t1 = t2 */
+058       if ((res = mp_copy (&t2, &t1)) != MP_OKAY) \{
+059         goto __T3;
+060       \}
+061   
+062       /* t2 = t1 - ((t1**b - a) / (b * t1**(b-1))) */
+063       
+064       /* t3 = t1**(b-1) */
+065       if ((res = mp_expt_d (&t1, b - 1, &t3)) != MP_OKAY) \{   
+066         goto __T3;
+067       \}
+068   
+069       /* numerator */
+070       /* t2 = t1**b */
+071       if ((res = mp_mul (&t3, &t1, &t2)) != MP_OKAY) \{    
+072         goto __T3;
+073       \}
+074   
+075       /* t2 = t1**b - a */
+076       if ((res = mp_sub (&t2, a, &t2)) != MP_OKAY) \{  
+077         goto __T3;
+078       \}
+079   
+080       /* denominator */
+081       /* t3 = t1**(b-1) * b  */
+082       if ((res = mp_mul_d (&t3, b, &t3)) != MP_OKAY) \{    
+083         goto __T3;
+084       \}
+085   
+086       /* t3 = (t1**b - a)/(b * t1**(b-1)) */
+087       if ((res = mp_div (&t2, &t3, &t3, NULL)) != MP_OKAY) \{  
+088         goto __T3;
+089       \}
+090   
+091       if ((res = mp_sub (&t1, &t3, &t2)) != MP_OKAY) \{
+092         goto __T3;
+093       \}
+094     \}  while (mp_cmp (&t1, &t2) != MP_EQ);
+095   
+096     /* result can be off by a few so check */
+097     for (;;) \{
+098       if ((res = mp_expt_d (&t1, b, &t2)) != MP_OKAY) \{
+099         goto __T3;
+100       \}
+101   
+102       if (mp_cmp (&t2, a) == MP_GT) \{
+103         if ((res = mp_sub_d (&t1, 1, &t1)) != MP_OKAY) \{
+104            goto __T3;
+105         \}
+106       \} else \{
+107         break;
+108       \}
+109     \}
+110   
+111     /* reset the sign of a first */
+112     a->sign = neg;
+113   
+114     /* set the result */
+115     mp_exch (&t1, c);
+116   
+117     /* set the sign of the result */
+118     c->sign = neg;
+119   
+120     res = MP_OKAY;
+121   
+122   __T3:mp_clear (&t3);
+123   __T2:mp_clear (&t2);
+124   __T1:mp_clear (&t1);
+125     return res;
+126   \}
+127   #endif
+\end{alltt}
+\end{small}
+
+\section{Random Number Generation}
+
+Random numbers come up in a variety of activities from public key cryptography to simple simulations and various randomized algorithms.  Pollard-Rho 
+factoring for example, can make use of random values as starting points to find factors of a composite integer.  In this case the algorithm presented
+is solely for simulations and not intended for cryptographic use.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_rand}. \\
+\textbf{Input}.   An integer $b$ \\
+\textbf{Output}.  A pseudo-random number of $b$ digits \\
+\hline \\
+1.  $a \leftarrow 0$ \\
+2.  If $b \le 0$ return(\textit{MP\_OKAY}) \\
+3.  Pick a non-zero random digit $d$. \\
+4.  $a \leftarrow a + d$ \\
+5.  for $ix$ from 1 to $b - 1$ do \\
+\hspace{3mm}5.1  $a \leftarrow a \cdot \beta$ \\
+\hspace{3mm}5.2  Pick a random digit $d$. \\
+\hspace{3mm}5.3  $a \leftarrow a + d$ \\
+6.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_rand}
+\end{figure}
+\textbf{Algorithm mp\_rand.}
+This algorithm produces a pseudo-random integer of $b$ digits.  By ensuring that the first digit is non-zero the algorithm also guarantees that the
+final result has at least $b$ digits.  It relies heavily on a third-party random number generator which should ideally generate uniformly all of
+the integers from $0$ to $\beta - 1$.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_rand.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* makes a pseudo-random int of a given size */
+018   int
+019   mp_rand (mp_int * a, int digits)
+020   \{
+021     int     res;
+022     mp_digit d;
+023   
+024     mp_zero (a);
+025     if (digits <= 0) \{
+026       return MP_OKAY;
+027     \}
+028   
+029     /* first place a random non-zero digit */
+030     do \{
+031       d = ((mp_digit) abs (rand ()));
+032     \} while (d == 0);
+033   
+034     if ((res = mp_add_d (a, d, a)) != MP_OKAY) \{
+035       return res;
+036     \}
+037   
+038     while (digits-- > 0) \{
+039       if ((res = mp_lshd (a, 1)) != MP_OKAY) \{
+040         return res;
+041       \}
+042   
+043       if ((res = mp_add_d (a, ((mp_digit) abs (rand ())), a)) != MP_OKAY) \{
+044         return res;
+045       \}
+046     \}
+047   
+048     return MP_OKAY;
+049   \}
+050   #endif
+\end{alltt}
+\end{small}
+
+\section{Formatted Representations}
+The ability to emit a radix-$n$ textual representation of an integer is useful for interacting with human parties.  For example, the ability to
+be given a string of characters such as ``114585'' and turn it into the radix-$\beta$ equivalent would make it easier to enter numbers
+into a program.
+
+\subsection{Reading Radix-n Input}
+For the purposes of this text we will assume that a simple lower ASCII map (\ref{fig:ASC}) is used to map the values from $0$ to $63$ to 
+printable characters.  For example, when the character ``N'' is read it represents the integer $23$.  The first $16$ characters of the
+map are for the common representations up to hexadecimal.  After that they match the ``base64'' encoding scheme, whose characters are suitably chosen
+such that they are printable.  While outputting as base64 may not be too helpful for human operators it does allow communication via non-binary
+mediums.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{cc|cc|cc|cc}
+\hline \textbf{Value} & \textbf{Char} & \textbf{Value} & \textbf{Char} & \textbf{Value} & \textbf{Char} &  \textbf{Value} & \textbf{Char} \\
+\hline 
+0 & 0 & 1 & 1 & 2 & 2 & 3 & 3 \\
+4 & 4 & 5 & 5 & 6 & 6 & 7 & 7 \\
+8 & 8 & 9 & 9 & 10 & A & 11 & B \\
+12 & C & 13 & D & 14 & E & 15 & F \\
+16 & G & 17 & H & 18 & I & 19 & J \\
+20 & K & 21 & L & 22 & M & 23 & N \\
+24 & O & 25 & P & 26 & Q & 27 & R \\
+28 & S & 29 & T & 30 & U & 31 & V \\
+32 & W & 33 & X & 34 & Y & 35 & Z \\
+36 & a & 37 & b & 38 & c & 39 & d \\
+40 & e & 41 & f & 42 & g & 43 & h \\
+44 & i & 45 & j & 46 & k & 47 & l \\
+48 & m & 49 & n & 50 & o & 51 & p \\
+52 & q & 53 & r & 54 & s & 55 & t \\
+56 & u & 57 & v & 58 & w & 59 & x \\
+60 & y & 61 & z & 62 & $+$ & 63 & $/$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Lower ASCII Map}
+\label{fig:ASC}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_read\_radix}. \\
+\textbf{Input}.   A string $str$ of length $sn$ and radix $r$. \\
+\textbf{Output}.  The radix-$\beta$ equivalent mp\_int. \\
+\hline \\
+1.  If $r < 2$ or $r > 64$ return(\textit{MP\_VAL}). \\
+2.  $ix \leftarrow 0$ \\
+3.  If $str_0 =$ ``-'' then do \\
+\hspace{3mm}3.1  $ix \leftarrow ix + 1$ \\
+\hspace{3mm}3.2  $sign \leftarrow MP\_NEG$ \\
+4.  else \\
+\hspace{3mm}4.1  $sign \leftarrow MP\_ZPOS$ \\
+5.  $a \leftarrow 0$ \\
+6.  for $iy$ from $ix$ to $sn - 1$ do \\
+\hspace{3mm}6.1  Let $y$ denote the position in the map of $str_{iy}$. \\
+\hspace{3mm}6.2  If $str_{iy}$ is not in the map or $y \ge r$ then goto step 7. \\
+\hspace{3mm}6.3  $a \leftarrow a \cdot r$ \\
+\hspace{3mm}6.4  $a \leftarrow a + y$ \\
+7.  If $a \ne 0$ then $a.sign \leftarrow sign$ \\
+8.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_read\_radix}
+\end{figure}
+\textbf{Algorithm mp\_read\_radix.}
+This algorithm will read an ASCII string and produce the radix-$\beta$ mp\_int representation of the same integer.  A minus symbol ``-'' may precede the 
+string  to indicate the value is negative, otherwise it is assumed to be positive.  The algorithm will read up to $sn$ characters from the input
+and will stop reading as soon as it encounters a character it cannot map.  This allows numbers to be embedded
+as part of larger input without any significant problem.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_read\_radix.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* read a string [ASCII] in a given radix */
+018   int mp_read_radix (mp_int * a, char *str, int radix)
+019   \{
+020     int     y, res, neg;
+021     char    ch;
+022   
+023     /* make sure the radix is ok */
+024     if (radix < 2 || radix > 64) \{
+025       return MP_VAL;
+026     \}
+027   
+028     /* if the leading digit is a 
+029      * minus set the sign to negative. 
+030      */
+031     if (*str == '-') \{
+032       ++str;
+033       neg = MP_NEG;
+034     \} else \{
+035       neg = MP_ZPOS;
+036     \}
+037   
+038     /* set the integer to the default of zero */
+039     mp_zero (a);
+040     
+041     /* process each digit of the string */
+042     while (*str) \{
+043       /* if the radix < 36 the conversion is case insensitive
+044        * this allows numbers like 1AB and 1ab to represent the same  value
+045        * [e.g. in hex]
+046        */
+047       ch = (char) ((radix < 36) ? toupper (*str) : *str);
+048       for (y = 0; y < 64; y++) \{
+049         if (ch == mp_s_rmap[y]) \{
+050            break;
+051         \}
+052       \}
+053   
+054       /* if the char was found in the map 
+055        * and is less than the given radix add it
+056        * to the number, otherwise exit the loop. 
+057        */
+058       if (y < radix) \{
+059         if ((res = mp_mul_d (a, (mp_digit) radix, a)) != MP_OKAY) \{
+060            return res;
+061         \}
+062         if ((res = mp_add_d (a, (mp_digit) y, a)) != MP_OKAY) \{
+063            return res;
+064         \}
+065       \} else \{
+066         break;
+067       \}
+068       ++str;
+069     \}
+070     
+071     /* set the sign only if a != 0 */
+072     if (mp_iszero(a) != 1) \{
+073        a->sign = neg;
+074     \}
+075     return MP_OKAY;
+076   \}
+077   #endif
+\end{alltt}
+\end{small}
+
+\subsection{Generating Radix-$n$ Output}
+Generating radix-$n$ output is fairly trivial with a division and remainder algorithm.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_toradix}. \\
+\textbf{Input}.   A mp\_int $a$ and an integer $r$\\
+\textbf{Output}.  The radix-$r$ representation of $a$ \\
+\hline \\
+1.  If $r < 2$ or $r > 64$ return(\textit{MP\_VAL}). \\
+2.  If $a = 0$ then $str = $ ``$0$'' and return(\textit{MP\_OKAY}).  \\
+3.  $t \leftarrow a$ \\
+4.  $str \leftarrow$ ``'' \\
+5.  if $t.sign = MP\_NEG$ then \\
+\hspace{3mm}5.1  $str \leftarrow str + $ ``-'' \\
+\hspace{3mm}5.2  $t.sign = MP\_ZPOS$ \\
+6.  While ($t \ne 0$) do \\
+\hspace{3mm}6.1  $d \leftarrow t \mbox{ (mod }r\mbox{)}$ \\
+\hspace{3mm}6.2  $t \leftarrow \lfloor t / r \rfloor$ \\
+\hspace{3mm}6.3  Look up $d$ in the map and store the equivalent character in $y$. \\
+\hspace{3mm}6.4  $str \leftarrow str + y$ \\
+7.  If $str_0 = $``$-$'' then \\
+\hspace{3mm}7.1  Reverse the digits $str_1, str_2, \ldots str_n$. \\
+8.  Otherwise \\
+\hspace{3mm}8.1  Reverse the digits $str_0, str_1, \ldots str_n$. \\
+9.  Return(\textit{MP\_OKAY}).\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_toradix}
+\end{figure}
+\textbf{Algorithm mp\_toradix.}
+This algorithm computes the radix-$r$ representation of an mp\_int $a$.  The ``digits'' of the representation are extracted by reducing 
+the successive quotients $\lfloor a / r^k \rfloor$ of the input modulo $r$ until $r^k > a$.  Note that instead of actually dividing by $r^k$ in
+each iteration the quotient $\lfloor a / r \rfloor$ is saved for the next iteration.  As a result a series of trivial $n \times 1$ divisions
+are required instead of a series of $n \times k$ divisions.  One design flaw of this approach is that the digits are produced in the reverse order 
+(see~\ref{fig:mpradix}).  To remedy this flaw the digits must be swapped or simply ``reversed''.
+
+\begin{figure}
+\begin{center}
+\begin{tabular}{|c|c|c|}
+\hline \textbf{Value of $a$} & \textbf{Value of $d$} & \textbf{Value of $str$} \\
+\hline $1234$ & -- & -- \\
+\hline $123$  & $4$ & ``4'' \\
+\hline $12$   & $3$ & ``43'' \\
+\hline $1$    & $2$ & ``432'' \\
+\hline $0$    & $1$ & ``4321'' \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Example of Algorithm mp\_toradix.}
+\label{fig:mpradix}
+\end{figure}
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_toradix.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* stores a bignum as a ASCII string in a given radix (2..64) */
+018   int mp_toradix (mp_int * a, char *str, int radix)
+019   \{
+020     int     res, digs;
+021     mp_int  t;
+022     mp_digit d;
+023     char   *_s = str;
+024   
+025     /* check range of the radix */
+026     if (radix < 2 || radix > 64) \{
+027       return MP_VAL;
+028     \}
+029   
+030     /* quick out if its zero */
+031     if (mp_iszero(a) == 1) \{
+032        *str++ = '0';
+033        *str = '\symbol{92}0';
+034        return MP_OKAY;
+035     \}
+036   
+037     if ((res = mp_init_copy (&t, a)) != MP_OKAY) \{
+038       return res;
+039     \}
+040   
+041     /* if it is negative output a - */
+042     if (t.sign == MP_NEG) \{
+043       ++_s;
+044       *str++ = '-';
+045       t.sign = MP_ZPOS;
+046     \}
+047   
+048     digs = 0;
+049     while (mp_iszero (&t) == 0) \{
+050       if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) \{
+051         mp_clear (&t);
+052         return res;
+053       \}
+054       *str++ = mp_s_rmap[d];
+055       ++digs;
+056     \}
+057   
+058     /* reverse the digits of the string.  In this case _s points
+059      * to the first digit [exluding the sign] of the number]
+060      */
+061     bn_reverse ((unsigned char *)_s, digs);
+062   
+063     /* append a NULL so the string is properly terminated */
+064     *str = '\symbol{92}0';
+065   
+066     mp_clear (&t);
+067     return MP_OKAY;
+068   \}
+069   
+070   #endif
+\end{alltt}
+\end{small}
+
+\chapter{Number Theoretic Algorithms}
+This chapter discusses several fundamental number theoretic algorithms such as the greatest common divisor, least common multiple and Jacobi 
+symbol computation.  These algorithms arise as essential components in several key cryptographic algorithms such as the RSA public key algorithm and
+various Sieve based factoring algorithms.
+
+\section{Greatest Common Divisor}
+The greatest common divisor of two integers $a$ and $b$, often denoted as $(a, b)$, is the largest integer $k$ that is a divisor of
+both $a$ and $b$.  That is, $k$ is the largest integer such that $0 \equiv a \mbox{ (mod }k\mbox{)}$ and $0 \equiv b \mbox{ (mod }k\mbox{)}$ occur
+simultaneously.
+
+The most common approach (cite) is to reduce one input modulo another.  That is if $a$ and $b$ are divisible by some integer $k$ and if $qb + r = a$ then
+$r$ is also divisible by $k$.  The reduction pattern follows $\left < a , b \right > \rightarrow \left < b, a \mbox{ mod } b \right >$.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Greatest Common Divisor (I)}. \\
+\textbf{Input}.   Two positive integers $a$ and $b$ greater than zero. \\
+\textbf{Output}.  The greatest common divisor $(a, b)$.  \\
+\hline \\
+1.  While ($b > 0$) do \\
+\hspace{3mm}1.1  $r \leftarrow a \mbox{ (mod }b\mbox{)}$ \\
+\hspace{3mm}1.2  $a \leftarrow b$ \\
+\hspace{3mm}1.3  $b \leftarrow r$ \\
+2.  Return($a$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Greatest Common Divisor (I)}
+\label{fig:gcd1}
+\end{figure}
+
+This algorithm will quickly converge on the greatest common divisor since the residue $r$ tends to diminish rapidly.  However, divisions are
+relatively expensive operations to perform and should ideally be avoided.  There is another approach based on a similar relationship of 
+greatest common divisors.  The faster approach is based on the observation that if $k$ divides both $a$ and $b$ it will also divide $a - b$.  
+In particular, we would like $a - b$ to decrease in magnitude which implies that $b \ge a$.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Greatest Common Divisor (II)}. \\
+\textbf{Input}.   Two positive integers $a$ and $b$ greater than zero. \\
+\textbf{Output}.  The greatest common divisor $(a, b)$.  \\
+\hline \\
+1.  While ($b > 0$) do \\
+\hspace{3mm}1.1  Swap $a$ and $b$ such that $a$ is the smallest of the two. \\
+\hspace{3mm}1.2  $b \leftarrow b - a$ \\
+2.  Return($a$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Greatest Common Divisor (II)}
+\label{fig:gcd2}
+\end{figure}
+
+\textbf{Proof} \textit{Algorithm~\ref{fig:gcd2} will return the greatest common divisor of $a$ and $b$.}
+The algorithm in figure~\ref{fig:gcd2} will eventually terminate: since $b \ge a$, the subtraction in step 1.2 will produce a value less than $b$.  In other
+words in every iteration the tuple $\left < a, b \right >$ decreases in magnitude until eventually $a = b$.  Since both $a$ and $b$ are always 
+divisible by the greatest common divisor (\textit{until the last iteration}) and in the last iteration of the algorithm $b = 0$, therefore, in the 
+second to last iteration of the algorithm $b = a$ and clearly $(a, a) = a$ which concludes the proof.  \textbf{QED}.
+
+As a matter of practicality algorithm \ref{fig:gcd2} decreases far too slowly to be useful.  Especially if $b$ is much larger than $a$ such that 
+$b - a$ is still very much larger than $a$.  A simple addition to the algorithm is to divide $b - a$ by a power of some integer $p$ which does
+not divide the greatest common divisor but will divide $b - a$.  In this case ${b - a} \over p$ is also an integer and still divisible by
+the greatest common divisor.
+
+However, instead of factoring $b - a$ to find a suitable value of $p$ the powers of $p$ can be removed from $a$ and $b$ that are in common first.  
+Then inside the loop whenever $b - a$ is divisible by some power of $p$ it can be safely removed.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Greatest Common Divisor (III)}. \\
+\textbf{Input}.   Two positive integers $a$ and $b$ greater than zero. \\
+\textbf{Output}.  The greatest common divisor $(a, b)$.  \\
+\hline \\
+1.  $k \leftarrow 0$ \\
+2.  While $a$ and $b$ are both divisible by $p$ do \\
+\hspace{3mm}2.1  $a \leftarrow \lfloor a / p \rfloor$ \\
+\hspace{3mm}2.2  $b \leftarrow \lfloor b / p \rfloor$ \\
+\hspace{3mm}2.3  $k \leftarrow k + 1$ \\
+3.  While $a$ is divisible by $p$ do \\
+\hspace{3mm}3.1  $a \leftarrow \lfloor a / p \rfloor$ \\
+4.  While $b$ is divisible by $p$ do \\
+\hspace{3mm}4.1  $b \leftarrow \lfloor b / p \rfloor$ \\
+5.  While ($b > 0$) do \\
+\hspace{3mm}5.1  Swap $a$ and $b$ such that $a$ is the smallest of the two. \\
+\hspace{3mm}5.2  $b \leftarrow b - a$ \\
+\hspace{3mm}5.3  While $b$ is divisible by $p$ do \\
+\hspace{6mm}5.3.1  $b \leftarrow \lfloor b / p \rfloor$ \\
+6.  Return($a \cdot p^k$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Greatest Common Divisor (III)}
+\label{fig:gcd3}
+\end{figure}
+
+This algorithm is based on the previous one except it removes powers of $p$ first and inside the main loop to ensure the tuple $\left < a, b \right >$ 
+decreases more rapidly.  The first loop on step two removes powers of $p$ that are in common.  A count, $k$, is kept which will represent a common
+divisor of $p^k$.  After step two the remaining common divisor of $a$ and $b$ cannot be divisible by $p$.  This means that $p$ can be safely 
+divided out of the difference $b - a$ so long as the division leaves no remainder.  
+
+In particular the value of $p$ should be chosen such that the division on step 5.3.1 occurs often.  It also helps that division by $p$ be easy
+to compute.  The ideal choice of $p$ is two since division by two amounts to a right logical shift.  Another important observation is that by
+step five both $a$ and $b$ are odd.  Therefore, the difference $b - a$ must be even which means that each iteration removes one bit from the 
+largest of the pair.
+
+\subsection{Complete Greatest Common Divisor}
+The algorithms presented so far cannot handle inputs which are zero or negative.  The following algorithm can handle all input cases properly
+and will produce the greatest common divisor.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_gcd}. \\
+\textbf{Input}.   mp\_int $a$ and $b$ \\
+\textbf{Output}.  The greatest common divisor $c = (a, b)$.  \\
+\hline \\
+1.  If $a = 0$ and $b \ne 0$ then \\
+\hspace{3mm}1.1  $c \leftarrow b$ \\
+\hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
+2.  If $a \ne 0$ and $b = 0$ then \\
+\hspace{3mm}2.1  $c \leftarrow a$ \\
+\hspace{3mm}2.2  Return(\textit{MP\_OKAY}). \\
+3.  If $a = b = 0$ then \\
+\hspace{3mm}3.1  $c \leftarrow 0$ \\
+\hspace{3mm}3.2  Return(\textit{MP\_OKAY}). \\
+4.  $u \leftarrow \vert a \vert, v \leftarrow \vert b \vert$ \\
+5.  $k \leftarrow 0$ \\
+6.  While $u.used > 0$ and $v.used > 0$ and $u_0 \equiv v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}6.1  $k \leftarrow k + 1$ \\
+\hspace{3mm}6.2  $u \leftarrow \lfloor u / 2 \rfloor$ \\
+\hspace{3mm}6.3  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+7.  While $u.used > 0$ and $u_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}7.1  $u \leftarrow \lfloor u / 2 \rfloor$ \\
+8.  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}8.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+9.  While $v.used > 0$ \\
+\hspace{3mm}9.1  If $\vert u \vert > \vert v \vert$ then \\
+\hspace{6mm}9.1.1  Swap $u$ and $v$. \\
+\hspace{3mm}9.2  $v \leftarrow \vert v \vert - \vert u \vert$ \\
+\hspace{3mm}9.3  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{6mm}9.3.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+10.  $c \leftarrow u \cdot 2^k$ \\
+11.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_gcd}
+\end{figure}
+\textbf{Algorithm mp\_gcd.}
+This algorithm will produce the greatest common divisor of two mp\_ints $a$ and $b$.  The algorithm was originally based on Algorithm B of
+Knuth \cite[pp. 338]{TAOCPV2} but has been modified to be simpler to explain.  In theory it achieves the same asymptotic working time as
+Algorithm B and in practice this appears to be true.  
+
+The first three steps handle the cases where either one of or both inputs are zero.  If either input is zero the greatest common divisor is the 
+largest input or zero if they are both zero.  If the inputs are not trivial then $u$ and $v$ are assigned the absolute values of 
+$a$ and $b$ respectively and the algorithm will proceed to reduce the pair.
+
+Step six will divide out any common factors of two and keep track of the count in the variable $k$.  After this step two is no longer a
+factor of the remaining greatest common divisor between $u$ and $v$ and can be safely evenly divided out of either whenever they are even.  Step 
+seven and eight ensure that the $u$ and $v$ respectively have no more factors of two.  At most only one of the while loops will iterate since 
+they cannot both be even.
+
+By step nine both of $u$ and $v$ are odd which is required for the inner logic.  First the pair are swapped such that $v$ is equal to
+or greater than $u$.  This ensures that the subtraction on step 9.2 will always produce a positive and even result.  Step 9.3 removes any
+factors of two from the difference $v$ to ensure that in the next iteration of the loop both are once again odd.
+
+After $v = 0$ occurs the variable $u$ has the greatest common divisor of the pair $\left < u, v \right >$ just after step six.  The result
+must be adjusted by multiplying by the common factors of two ($2^k$) removed earlier.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_gcd.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* Greatest Common Divisor using the binary method */
+018   int mp_gcd (mp_int * a, mp_int * b, mp_int * c)
+019   \{
+020     mp_int  u, v;
+021     int     k, u_lsb, v_lsb, res;
+022   
+023     /* either zero than gcd is the largest */
+024     if (mp_iszero (a) == 1 && mp_iszero (b) == 0) \{
+025       return mp_abs (b, c);
+026     \}
+027     if (mp_iszero (a) == 0 && mp_iszero (b) == 1) \{
+028       return mp_abs (a, c);
+029     \}
+030   
+031     /* optimized.  At this point if a == 0 then
+032      * b must equal zero too
+033      */
+034     if (mp_iszero (a) == 1) \{
+035       mp_zero(c);
+036       return MP_OKAY;
+037     \}
+038   
+039     /* get copies of a and b we can modify */
+040     if ((res = mp_init_copy (&u, a)) != MP_OKAY) \{
+041       return res;
+042     \}
+043   
+044     if ((res = mp_init_copy (&v, b)) != MP_OKAY) \{
+045       goto __U;
+046     \}
+047   
+048     /* must be positive for the remainder of the algorithm */
+049     u.sign = v.sign = MP_ZPOS;
+050   
+051     /* B1.  Find the common power of two for u and v */
+052     u_lsb = mp_cnt_lsb(&u);
+053     v_lsb = mp_cnt_lsb(&v);
+054     k     = MIN(u_lsb, v_lsb);
+055   
+056     if (k > 0) \{
+057        /* divide the power of two out */
+058        if ((res = mp_div_2d(&u, k, &u, NULL)) != MP_OKAY) \{
+059           goto __V;
+060        \}
+061   
+062        if ((res = mp_div_2d(&v, k, &v, NULL)) != MP_OKAY) \{
+063           goto __V;
+064        \}
+065     \}
+066   
+067     /* divide any remaining factors of two out */
+068     if (u_lsb != k) \{
+069        if ((res = mp_div_2d(&u, u_lsb - k, &u, NULL)) != MP_OKAY) \{
+070           goto __V;
+071        \}
+072     \}
+073   
+074     if (v_lsb != k) \{
+075        if ((res = mp_div_2d(&v, v_lsb - k, &v, NULL)) != MP_OKAY) \{
+076           goto __V;
+077        \}
+078     \}
+079   
+080     while (mp_iszero(&v) == 0) \{
+081        /* make sure v is the largest */
+082        if (mp_cmp_mag(&u, &v) == MP_GT) \{
+083           /* swap u and v to make sure v is >= u */
+084           mp_exch(&u, &v);
+085        \}
+086        
+087        /* subtract smallest from largest */
+088        if ((res = s_mp_sub(&v, &u, &v)) != MP_OKAY) \{
+089           goto __V;
+090        \}
+091        
+092        /* Divide out all factors of two */
+093        if ((res = mp_div_2d(&v, mp_cnt_lsb(&v), &v, NULL)) != MP_OKAY) \{
+094           goto __V;
+095        \} 
+096     \} 
+097   
+098     /* multiply by 2**k which we divided out at the beginning */
+099     if ((res = mp_mul_2d (&u, k, c)) != MP_OKAY) \{
+100        goto __V;
+101     \}
+102     c->sign = MP_ZPOS;
+103     res = MP_OKAY;
+104   __V:mp_clear (&u);
+105   __U:mp_clear (&v);
+106     return res;
+107   \}
+108   #endif
+\end{alltt}
+\end{small}
+
+This function makes use of the macros mp\_iszero and mp\_iseven.  The former evaluates to $1$ if the input mp\_int is equivalent to the 
+integer zero otherwise it evaluates to $0$.  The latter evaluates to $1$ if the input mp\_int represents a non-zero even integer otherwise
+it evaluates to $0$.  Note that just because mp\_iseven may evaluate to $0$ does not mean the input is odd, it could also be zero.  The three 
+trivial cases of inputs are handled on lines 24 through 37.  After those lines the inputs are assumed to be non-zero.
+
+Lines 40 and 44 make local copies $u$ and $v$ of the inputs $a$ and $b$ respectively.  At this point the common factors of two 
+must be divided out of the two inputs.  Lines 52 through 65 find the common power of two with mp\_cnt\_lsb and divide it out of both values.  The local integer $k$ is used to
+keep track of how many factors of $2$ are pulled out of both values.  It is assumed that the number of factors will not exceed the maximum 
+value of a C ``int'' data type\footnote{Strictly speaking no array in C may have more entries than are accessible by an ``int'' so this is not 
+a limitation.}.  
+
+At this point there are no more common factors of two in the two values.  The conditional divisions on lines 68 and 74 remove any independent
+factors of two such that both $u$ and $v$ are guaranteed to be an odd integer before hitting the main body of the algorithm.  The while loop
+on line 80 performs the reduction of the pair until $v$ is equal to zero.  The unsigned comparison and subtraction algorithms are used in
+place of the full signed routines since both values are guaranteed to be positive and the result of the subtraction is guaranteed to be non-negative.
+
+\section{Least Common Multiple}
+The least common multiple of a pair of integers is their product divided by their greatest common divisor.  For two integers $a$ and $b$ the
+least common multiple is normally denoted as $[ a, b ]$ and numerically equivalent to ${ab} \over {(a, b)}$.  For example, if $a = 2 \cdot 2 \cdot 3 = 12$
+and $b = 2 \cdot 3 \cdot 3 \cdot 7 = 126$ the least common multiple is ${{12 \cdot 126} \over {(12, 126)}} = {1512 \over 6} = 252$.
+
+The least common multiple arises often in coding theory as well as number theory.  If two functions have periods of $a$ and $b$ respectively they will
+collide, that is be in synchronous states, after only $[ a, b ]$ iterations.  This is why, for example, random number generators based on 
+Linear Feedback Shift Registers (LFSR) tend to use registers with periods which are co-prime (\textit{i.e. the greatest common divisor is one.}).  
+Similarly in number theory if a composite $n$ has two prime factors $p$ and $q$ then the maximal order of any unit of $\Z/n\Z$ will be $[ p - 1, q - 1] $.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_lcm}. \\
+\textbf{Input}.   mp\_int $a$ and $b$ \\
+\textbf{Output}.  The least common multiple $c = [a, b]$.  \\
+\hline \\
+1.  $c \leftarrow (a, b)$ \\
+2.  $t \leftarrow a \cdot b$ \\
+3.  $c \leftarrow \lfloor t / c \rfloor$ \\
+4.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_lcm}
+\end{figure}
+\textbf{Algorithm mp\_lcm.}
+This algorithm computes the least common multiple of two mp\_int inputs $a$ and $b$.  It computes the least common multiple directly by
+dividing the product of the two inputs by their greatest common divisor.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_lcm.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* computes least common multiple as |a*b|/(a, b) */
+018   int mp_lcm (mp_int * a, mp_int * b, mp_int * c)
+019   \{
+020     int     res;
+021     mp_int  t1, t2;
+022   
+023   
+024     if ((res = mp_init_multi (&t1, &t2, NULL)) != MP_OKAY) \{
+025       return res;
+026     \}
+027   
+028     /* t1 = get the GCD of the two inputs */
+029     if ((res = mp_gcd (a, b, &t1)) != MP_OKAY) \{
+030       goto __T;
+031     \}
+032   
+033     /* divide the smallest by the GCD */
+034     if (mp_cmp_mag(a, b) == MP_LT) \{
+035        /* store quotient in t2 such that t2 * b is the LCM */
+036        if ((res = mp_div(a, &t1, &t2, NULL)) != MP_OKAY) \{
+037           goto __T;
+038        \}
+039        res = mp_mul(b, &t2, c);
+040     \} else \{
+041        /* store quotient in t2 such that t2 * a is the LCM */
+042        if ((res = mp_div(b, &t1, &t2, NULL)) != MP_OKAY) \{
+043           goto __T;
+044        \}
+045        res = mp_mul(a, &t2, c);
+046     \}
+047   
+048     /* fix the sign to positive */
+049     c->sign = MP_ZPOS;
+050   
+051   __T:
+052     mp_clear_multi (&t1, &t2, NULL);
+053     return res;
+054   \}
+055   #endif
+\end{alltt}
+\end{small}
+
+\section{Jacobi Symbol Computation}
+To explain the Jacobi Symbol we shall first discuss the Legendre function\footnote{Arrg.  What is the name of this?} off which the Jacobi symbol is 
+defined.  The Legendre function computes whether or not an integer $a$ is a quadratic residue modulo an odd prime $p$.  Numerically it is
+equivalent to equation \ref{eqn:legendre}.
+
+\begin{equation}
+a^{(p-1)/2} \equiv \begin{array}{rl}
+                              -1 &  \mbox{if }a\mbox{ is a quadratic non-residue.} \\
+                              0  &  \mbox{if }p\mbox{ divides }a\mbox{.} \\
+                              1  &  \mbox{if }a\mbox{ is a quadratic residue}. 
+                              \end{array} \mbox{ (mod }p\mbox{)}
+\label{eqn:legendre}                              
+\end{equation}
+
+\textbf{Proof.} \textit{Equation \ref{eqn:legendre} correctly identifies the residue status of an integer $a$ modulo a prime $p$.}
+An integer $a$ is a quadratic residue if the following equation has a solution.
+
+\begin{equation}
+x^2 \equiv a \mbox{ (mod }p\mbox{)}
+\label{eqn:root}
+\end{equation}
+
+Consider the following equation.
+
+\begin{equation}
+0 \equiv x^{p-1} - 1 \equiv \left \lbrace \left (x^2 \right )^{(p-1)/2} - a^{(p-1)/2} \right \rbrace + \left ( a^{(p-1)/2} - 1 \right ) \mbox{ (mod }p\mbox{)}
+\label{eqn:rooti}
+\end{equation}
+
+Whether equation \ref{eqn:root} has a solution or not equation \ref{eqn:rooti} is always true.  If $a^{(p-1)/2} - 1 \equiv 0 \mbox{ (mod }p\mbox{)}$
+then the quantity in the braces must be zero.  By reduction,
+
+\begin{eqnarray}
+\left (x^2 \right )^{(p-1)/2} - a^{(p-1)/2} \equiv 0  \nonumber \\
+\left (x^2 \right )^{(p-1)/2} \equiv a^{(p-1)/2} \nonumber \\
+x^2 \equiv a \mbox{ (mod }p\mbox{)} 
+\end{eqnarray}
+
+As a result there must be a solution to the quadratic equation and in turn $a$ must be a quadratic residue.  If $p$ does not divide $a$ and $a$
+is not a quadratic residue then the only other value $a^{(p-1)/2}$ may be congruent to is $-1$ since
+\begin{equation}
+0 \equiv a^{p - 1} - 1 \equiv (a^{(p-1)/2} + 1)(a^{(p-1)/2} - 1) \mbox{ (mod }p\mbox{)}
+\end{equation}
+One of the terms on the right hand side must be zero.  \textbf{QED}
+
+\subsection{Jacobi Symbol}
+The Jacobi symbol is a generalization of the Legendre function for any odd non prime moduli $p$ greater than 2.  If $p = \prod_{i=0}^n p_i$ then
+the Jacobi symbol $\left ( { a \over p } \right )$ is equal to the following equation.
+
+\begin{equation}
+\left ( { a \over p } \right ) = \left ( { a \over p_0} \right ) \left ( { a \over p_1} \right ) \ldots \left ( { a \over p_n} \right )
+\end{equation}
+
+By inspection if $p$ is prime the Jacobi symbol is equivalent to the Legendre function.  The following facts\footnote{See HAC \cite[pp. 72-74]{HAC} for
+further details.} will be used to derive an efficient Jacobi symbol algorithm.  Where $p$ is an odd integer greater than two and $a, b \in \Z$ the
+following are true.  
+
+\begin{enumerate}
+\item $\left ( { a \over p} \right )$ equals $-1$, $0$ or $1$. 
+\item $\left ( { ab \over p} \right ) = \left ( { a \over p} \right )\left ( { b \over p} \right )$.
+\item If $a \equiv b \mbox{ (mod }p\mbox{)}$ then $\left ( { a \over p} \right ) = \left ( { b \over p} \right )$.
+\item $\left ( { 2 \over p} \right )$ equals $1$ if $p \equiv 1$ or $7 \mbox{ (mod }8\mbox{)}$.  Otherwise, it equals $-1$.
+\item If $a$ is also odd then $\left ( { a \over p} \right ) = \left ( { p \over a} \right ) \cdot (-1)^{(p-1)(a-1)/4}$.  More specifically 
+$\left ( { a \over p} \right ) = \left ( { p \over a} \right )$ if $p \equiv a \equiv 1 \mbox{ (mod }4\mbox{)}$.  
+\end{enumerate}
+
+Using these facts if $a = 2^k \cdot a'$ then
+
+\begin{eqnarray}
+\left ( { a \over p } \right ) = \left ( {{2^k} \over p } \right ) \left ( {a' \over p} \right ) \nonumber \\
+                               = \left ( {2 \over p } \right )^k \left ( {a' \over p} \right ) 
+\label{eqn:jacobi}
+\end{eqnarray}
+
+By fact five, 
+
+\begin{equation}
+\left ( { a \over p } \right ) = \left ( { p \over a } \right ) \cdot (-1)^{(p-1)(a-1)/4} 
+\end{equation}
+
+Subsequently by fact three since $p \equiv (p \mbox{ mod }a) \mbox{ (mod }a\mbox{)}$ then 
+
+\begin{equation}
+\left ( { a \over p } \right ) = \left ( { {p \mbox{ mod } a} \over a } \right ) \cdot (-1)^{(p-1)(a-1)/4} 
+\end{equation}
+
+By putting both observations into equation \ref{eqn:jacobi} the following simplified equation is formed.
+
+\begin{equation}
+\left ( { a \over p } \right ) = \left ( {2 \over p } \right )^k \left ( {{p\mbox{ mod }a'} \over a'} \right )  \cdot (-1)^{(p-1)(a'-1)/4} 
+\end{equation}
+
+The value of $\left ( {{p \mbox{ mod }a'} \over a'} \right )$ can be found by using the same equation recursively.  The value of 
+$\left ( {2 \over p } \right )^k$ equals $1$ if $k$ is even otherwise it equals $\left ( {2 \over p } \right )$.  Using this approach the 
+factors of $p$ do not have to be known.  Furthermore, if $(a, p) = 1$ then the algorithm will terminate when the recursion requests the 
+Jacobi symbol computation of $\left ( {1 \over a'} \right )$ which is simply $1$.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_jacobi}. \\
+\textbf{Input}.   mp\_int $a$ and $p$, $a \ge 0$, $p \ge 3$, $p \equiv 1 \mbox{ (mod }2\mbox{)}$ \\
+\textbf{Output}.  The Jacobi symbol $c = \left ( {a \over p } \right )$. \\
+\hline \\
+1.  If $a = 0$ then \\
+\hspace{3mm}1.1  $c \leftarrow 0$ \\
+\hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
+2.  If $a = 1$ then \\
+\hspace{3mm}2.1  $c \leftarrow 1$ \\
+\hspace{3mm}2.2  Return(\textit{MP\_OKAY}). \\
+3.  $a' \leftarrow a$ \\
+4.  $k \leftarrow 0$ \\
+5.  While $a'.used > 0$ and $a'_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}5.1  $k \leftarrow k + 1$ \\
+\hspace{3mm}5.2  $a' \leftarrow \lfloor a' / 2 \rfloor$ \\
+6.  If $k \equiv 0 \mbox{ (mod }2\mbox{)}$ then \\
+\hspace{3mm}6.1  $s \leftarrow 1$ \\
+7.  else \\
+\hspace{3mm}7.1  $r \leftarrow p_0 \mbox{ (mod }8\mbox{)}$ \\
+\hspace{3mm}7.2  If $r = 1$ or $r = 7$ then \\
+\hspace{6mm}7.2.1  $s \leftarrow 1$ \\
+\hspace{3mm}7.3  else \\
+\hspace{6mm}7.3.1  $s \leftarrow -1$ \\
+8.  If $p_0 \equiv a'_0 \equiv 3 \mbox{ (mod }4\mbox{)}$ then \\
+\hspace{3mm}8.1  $s \leftarrow -s$ \\
+9.  If $a' \ne 1$ then \\
+\hspace{3mm}9.1  $p' \leftarrow p \mbox{ (mod }a'\mbox{)}$ \\
+\hspace{3mm}9.2  $s \leftarrow s \cdot \mbox{mp\_jacobi}(p', a')$ \\
+10.  $c \leftarrow s$ \\
+11.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_jacobi}
+\end{figure}
+\textbf{Algorithm mp\_jacobi.}
+This algorithm computes the Jacobi symbol for an arbitrary non-negative integer $a$ with respect to an odd integer $p$ greater than or equal to three.  The algorithm
+is based on algorithm 2.149 of HAC \cite[pp. 73]{HAC}.  
+
+Step numbers one and two handle the trivial cases of $a = 0$ and $a = 1$ respectively.  Step five determines the number of two factors in the
+input $a$.  If $k$ is even then the term $\left ( { 2 \over p } \right )^k$ must always evaluate to one.  If $k$ is odd then the term evaluates to one 
+if $p_0$ is congruent to one or seven modulo eight, otherwise it evaluates to $-1$. After the $\left ( { 2 \over p } \right )^k$ term is handled 
+the $(-1)^{(p-1)(a'-1)/4}$ term is computed and multiplied against the current product $s$.  The latter term evaluates to negative one if both $p$ and $a'$ 
+are congruent to three modulo four, otherwise it evaluates to one.
+
+By step nine if $a'$ does not equal one a recursion is required.  Step 9.1 computes $p' \equiv p \mbox{ (mod }a'\mbox{)}$ and will recurse to compute
+$\left ( {p' \over a'} \right )$ which is multiplied against the current Jacobi product.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_jacobi.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* computes the jacobi c = (a | n) (or Legendre if n is prime)
+018    * HAC pp. 73 Algorithm 2.149
+019    */
+020   int mp_jacobi (mp_int * a, mp_int * p, int *c)
+021   \{
+022     mp_int  a1, p1;
+023     int     k, s, r, res;
+024     mp_digit residue;
+025   
+026     /* if p <= 0 return MP_VAL */
+027     if (mp_cmp_d(p, 0) != MP_GT) \{
+028        return MP_VAL;
+029     \}
+030   
+031     /* step 1.  if a == 0, return 0 */
+032     if (mp_iszero (a) == 1) \{
+033       *c = 0;
+034       return MP_OKAY;
+035     \}
+036   
+037     /* step 2.  if a == 1, return 1 */
+038     if (mp_cmp_d (a, 1) == MP_EQ) \{
+039       *c = 1;
+040       return MP_OKAY;
+041     \}
+042   
+043     /* default */
+044     s = 0;
+045   
+046     /* step 3.  write a = a1 * 2**k  */
+047     if ((res = mp_init_copy (&a1, a)) != MP_OKAY) \{
+048       return res;
+049     \}
+050   
+051     if ((res = mp_init (&p1)) != MP_OKAY) \{
+052       goto __A1;
+053     \}
+054   
+055     /* divide out larger power of two */
+056     k = mp_cnt_lsb(&a1);
+057     if ((res = mp_div_2d(&a1, k, &a1, NULL)) != MP_OKAY) \{
+058        goto __P1;
+059     \}
+060   
+061     /* step 4.  if e is even set s=1 */
+062     if ((k & 1) == 0) \{
+063       s = 1;
+064     \} else \{
+065       /* else set s=1 if p = 1/7 (mod 8) or s=-1 if p = 3/5 (mod 8) */
+066       residue = p->dp[0] & 7;
+067   
+068       if (residue == 1 || residue == 7) \{
+069         s = 1;
+070       \} else if (residue == 3 || residue == 5) \{
+071         s = -1;
+072       \}
+073     \}
+074   
+075     /* step 5.  if p == 3 (mod 4) *and* a1 == 3 (mod 4) then s = -s */
+076     if ( ((p->dp[0] & 3) == 3) && ((a1.dp[0] & 3) == 3)) \{
+077       s = -s;
+078     \}
+079   
+080     /* if a1 == 1 we're done */
+081     if (mp_cmp_d (&a1, 1) == MP_EQ) \{
+082       *c = s;
+083     \} else \{
+084       /* n1 = n mod a1 */
+085       if ((res = mp_mod (p, &a1, &p1)) != MP_OKAY) \{
+086         goto __P1;
+087       \}
+088       if ((res = mp_jacobi (&p1, &a1, &r)) != MP_OKAY) \{
+089         goto __P1;
+090       \}
+091       *c = s * r;
+092     \}
+093   
+094     /* done */
+095     res = MP_OKAY;
+096   __P1:mp_clear (&p1);
+097   __A1:mp_clear (&a1);
+098     return res;
+099   \}
+100   #endif
+\end{alltt}
+\end{small}
+
+As a matter of practicality the variable $a'$ as per the pseudo-code is represented by the variable $a1$ since the $'$ symbol is not valid for a C 
+variable name character. 
+
+The two simple cases of $a = 0$ and $a = 1$ are handled at the very beginning to simplify the algorithm.  If the input is non-trivial the algorithm
+has to proceed to compute the Jacobi symbol.  The variable $s$ is used to hold the current Jacobi product.  Note that $s$ is merely a C ``int'' data type since
+the values it may obtain are merely $-1$, $0$ and $1$.  
+
+After a local copy of $a$ is made all of the factors of two are divided out and the total stored in $k$.  Technically only the least significant
+bit of $k$ is required, however, it makes the algorithm simpler to follow to perform an addition. In practice an exclusive-or and addition have the same 
+processor requirements and neither is faster than the other.
+
+Lines 61 through 73 determine the value of $\left ( { 2 \over p } \right )^k$.  If the least significant bit of $k$ is zero then
+$k$ is even and the value is one.  Otherwise, the value of $s$ depends on which residue class $p$ belongs to modulo eight.  The value of
+$(-1)^{(p-1)(a'-1)/4}$ is computed and multiplied against $s$ on lines 75 through 78.  
+
+Finally, if $a1$ does not equal one the algorithm must recurse and compute $\left ( {p' \over a'} \right )$.  
+
+\textit{-- Comment about default $s$ and such...}
+
+\section{Modular Inverse}
+\label{sec:modinv}
+The modular inverse of a number actually refers to the modular multiplicative inverse.  Essentially for any integer $a$ such that $(a, p) = 1$ there
+exists another integer $b$ such that $ab \equiv 1 \mbox{ (mod }p\mbox{)}$.  The integer $b$ is called the multiplicative inverse of $a$ which is
+denoted as $b = a^{-1}$.  Technically speaking modular inversion is a well defined operation for any finite ring or field not just for rings and 
+fields of integers.  However, the former will be the matter of discussion.
+
+The simplest approach is to compute the algebraic inverse of the input.  That is to compute $b \equiv a^{\Phi(p) - 1}$.  If $\Phi(p)$ is the 
+order of the multiplicative subgroup modulo $p$ then $b$ must be the multiplicative inverse of $a$.  The proof of which is trivial.
+
+\begin{equation}
+ab \equiv a \left (a^{\Phi(p) - 1} \right ) \equiv a^{\Phi(p)} \equiv a^0 \equiv 1 \mbox{ (mod }p\mbox{)}
+\end{equation}
+
+However, as simple as this approach may be it has two serious flaws.  It requires that the value of $\Phi(p)$ be known which if $p$ is composite 
+requires all of the prime factors.  This approach also is very slow as the size of $p$ grows.  
+
+A more practical approach is based on the observation that solving for the multiplicative inverse is equivalent to solving the linear 
+Diophantine\footnote{See LeVeque \cite[pp. 40-43]{LeVeque} for more information.} equation.
+
+\begin{equation}
+ab + pq = 1
+\end{equation}
+
+Where $a$, $b$, $p$ and $q$ are all integers.  If such a pair of integers $ \left < b, q \right >$ exists then $b$ is the multiplicative inverse of 
+$a$ modulo $p$.  The extended Euclidean algorithm (Knuth \cite[pp. 342]{TAOCPV2}) can be used to solve such equations provided $(a, p) = 1$.  
+However, instead of using that algorithm directly a variant known as the binary Extended Euclidean algorithm will be used in its place.  The
+binary approach is very similar to the binary greatest common divisor algorithm except it will produce a full solution to the Diophantine 
+equation.  
+
+\subsection{General Case}
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_invmod}. \\
+\textbf{Input}.   mp\_int $a$ and $b$, $(a, b) = 1$, $b \ge 2$, $0 < a < b$.  \\
+\textbf{Output}.  The modular inverse $c \equiv a^{-1} \mbox{ (mod }b\mbox{)}$. \\
+\hline \\
+1.  If $b \le 0$ then return(\textit{MP\_VAL}). \\
+2.  If $b_0 \equiv 1 \mbox{ (mod }2\mbox{)}$ then use algorithm fast\_mp\_invmod. \\
+3.  $x \leftarrow \vert a \vert, y \leftarrow b$ \\
+4.  If $x_0 \equiv y_0  \equiv 0 \mbox{ (mod }2\mbox{)}$ then return(\textit{MP\_VAL}). \\
+5.  $u \leftarrow x, v \leftarrow y, B \leftarrow 0, C \leftarrow 0, A \leftarrow 1, D \leftarrow 1$ \\
+6.  While $u.used > 0$ and $u_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}6.1  $u \leftarrow \lfloor u / 2 \rfloor$ \\
+\hspace{3mm}6.2  If ($A.used > 0$ and $A_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) or ($B.used > 0$ and $B_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) then \\
+\hspace{6mm}6.2.1  $A \leftarrow A + y$ \\
+\hspace{6mm}6.2.2  $B \leftarrow B - x$ \\
+\hspace{3mm}6.3  $A \leftarrow \lfloor A / 2 \rfloor$ \\
+\hspace{3mm}6.4  $B \leftarrow \lfloor B / 2 \rfloor$ \\
+7.  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}7.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+\hspace{3mm}7.2  If ($C.used > 0$ and $C_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) or ($D.used > 0$ and $D_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) then \\
+\hspace{6mm}7.2.1  $C \leftarrow C + y$ \\
+\hspace{6mm}7.2.2  $D \leftarrow D - x$ \\
+\hspace{3mm}7.3  $C \leftarrow \lfloor C / 2 \rfloor$ \\
+\hspace{3mm}7.4  $D \leftarrow \lfloor D / 2 \rfloor$ \\
+8.  If $u \ge v$ then \\
+\hspace{3mm}8.1  $u \leftarrow u - v$ \\
+\hspace{3mm}8.2  $A \leftarrow A - C$ \\
+\hspace{3mm}8.3  $B \leftarrow B - D$ \\
+9.  else \\
+\hspace{3mm}9.1  $v \leftarrow v - u$ \\
+\hspace{3mm}9.2  $C \leftarrow C - A$ \\
+\hspace{3mm}9.3  $D \leftarrow D - B$ \\
+10.  If $u \ne 0$ goto step 6. \\
+11.  If $v \ne 1$ return(\textit{MP\_VAL}). \\
+12.  While $C \le 0$ do \\
+\hspace{3mm}12.1  $C \leftarrow C + b$ \\
+13.  While $C \ge b$ do \\
+\hspace{3mm}13.1  $C \leftarrow C - b$ \\
+14.  $c \leftarrow C$ \\
+15.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\end{figure}
+\textbf{Algorithm mp\_invmod.}
+This algorithm computes the modular multiplicative inverse of an integer $a$ modulo an integer $b$.  This algorithm is a variation of the 
+extended binary Euclidean algorithm from HAC \cite[pp. 608]{HAC}.  It has been modified to only compute the modular inverse and not a complete
+Diophantine solution.  
+
+If $b \le 0$ then the modulus is invalid and MP\_VAL is returned.  Similarly if both $a$ and $b$ are even then there cannot be a multiplicative
+inverse for $a$ and the error is reported.  
+
+The astute reader will observe that steps six through nine are very similar to the binary greatest common divisor algorithm mp\_gcd.  In this case
+the other variables to the Diophantine equation are solved.  The algorithm terminates when $u = 0$ in which case the solution is
+
+\begin{equation}
+Ca + Db = v
+\end{equation}
+
+If $v$, the greatest common divisor of $a$ and $b$ is not equal to one then the algorithm will report an error as no inverse exists.  Otherwise, $C$
+is the modular inverse of $a$.  The actual value of $C$ is congruent to, but not necessarily equal to, the ideal modular inverse which should lie 
+within $1 \le a^{-1} < b$.  Step numbers twelve and thirteen adjust the inverse until it is in range.  If the original input $a$ is within $0 < a < b$ 
+then only a couple of additions or subtractions will be required to adjust the inverse.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_invmod.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* hac 14.61, pp608 */
+018   int mp_invmod (mp_int * a, mp_int * b, mp_int * c)
+019   \{
+020     /* b cannot be negative */
+021     if (b->sign == MP_NEG || mp_iszero(b) == 1) \{
+022       return MP_VAL;
+023     \}
+024   
+025   #ifdef BN_FAST_MP_INVMOD_C
+026     /* if the modulus is odd we can use a faster routine instead */
+027     if (mp_isodd (b) == 1) \{
+028       return fast_mp_invmod (a, b, c);
+029     \}
+030   #endif
+031   
+032   #ifdef BN_MP_INVMOD_SLOW_C
+033     return mp_invmod_slow(a, b, c);
+034   #endif
+035   
+036     return MP_VAL;
+037   \}
+038   #endif
+\end{alltt}
+\end{small}
+
+\subsubsection{Odd Moduli}
+
+When the modulus $b$ is odd the variables $A$ and $C$ are fixed and are not required to compute the inverse.  In particular by attempting to solve
+the Diophantine $Cb + Da = 1$ only $B$ and $D$ are required to find the inverse of $a$.  
+
+The algorithm fast\_mp\_invmod is a direct adaptation of algorithm mp\_invmod with all steps involving either $A$ or $C$ removed.  This 
+optimization will halve the time required to compute the modular inverse.
+
+\section{Primality Tests}
+
+An integer $a$ greater than one is said to be prime if it is not divisible by any positive integer other than one and itself.  For example, $a = 7$ is prime 
+since the integers $2 \ldots 6$ do not evenly divide $a$.  By contrast, $a = 6$ is not prime since $a = 6 = 2 \cdot 3$. 
+
+Prime numbers arise in cryptography considerably as they allow finite fields to be formed.  The ability to determine whether an integer is prime or
+not quickly has been a viable subject in cryptography and number theory for considerable time.  The algorithms that will be presented are all
+probabilistic algorithms in that when they report an integer is composite it must be composite.  However, when the algorithms report an integer is
+prime the algorithm may be incorrect.  
+
+As will be discussed it is possible to limit the probability of error so well that for practical purposes the probability of error might as 
+well be zero.  For the purposes of these discussions let $n$ represent the candidate integer of which the primality is in question.
+
+\subsection{Trial Division}
+
+Trial division means to attempt to evenly divide a candidate integer by small prime integers.  If the candidate can be evenly divided it obviously
+cannot be prime.  By dividing by all primes $1 < p \le \sqrt{n}$ this test can actually prove whether an integer is prime.  However, such a test
+would require a prohibitive amount of time as $n$ grows.
+
+Instead of dividing by every prime, a smaller, more manageable set of primes may be used instead.  By performing trial division with only a subset
+of the primes less than $\sqrt{n} + 1$ the algorithm cannot prove if a candidate is prime.  However, often it can prove a candidate is not prime.
+
+The benefit of this test is that trial division by small values is fairly efficient, especially compared to the other algorithms that will be
+discussed shortly.  The probability that this approach correctly identifies a composite candidate when tested with all primes up to $q$ is given by
+$1 - {1.12 \over \ln(q)}$.  The graph (\ref{pic:primality}, will be added later) demonstrates the probability of success for the range 
+$3 \le q \le 100$.  
+
+At approximately $q = 30$ the gain of performing further tests diminishes fairly quickly.  At $q = 90$ further testing is generally not going to 
+be of any practical use.  In the case of LibTomMath the default limit $q = 256$ was chosen since it is not too high and will eliminate 
+approximately $80\%$ of all candidate integers.  The constant \textbf{PRIME\_SIZE} is equal to the number of primes in the test base.  The 
+array \_\_prime\_tab is an array of the first \textbf{PRIME\_SIZE} prime numbers.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_prime\_is\_divisible}. \\
+\textbf{Input}.   mp\_int $n$ \\
+\textbf{Output}.  $c = 1$ if $n$ is divisible by a small prime, otherwise $c = 0$.  \\
+\hline \\
+1.  for $ix$ from $0$ to $PRIME\_SIZE - 1$ do \\
+\hspace{3mm}1.1  $d \leftarrow n \mbox{ (mod }\_\_prime\_tab_{ix}\mbox{)}$ \\
+\hspace{3mm}1.2  If $d = 0$ then \\
+\hspace{6mm}1.2.1  $c \leftarrow 1$ \\
+\hspace{6mm}1.2.2  Return(\textit{MP\_OKAY}). \\
+2.  $c \leftarrow 0$ \\
+3.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_prime\_is\_divisible}
+\end{figure}
+\textbf{Algorithm mp\_prime\_is\_divisible.}
+This algorithm attempts to determine if a candidate integer $n$ is composite by performing trial divisions.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_prime\_is\_divisible.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* determines if an integers is divisible by one 
+018    * of the first PRIME_SIZE primes or not
+019    *
+020    * sets result to 0 if not, 1 if yes
+021    */
+022   int mp_prime_is_divisible (mp_int * a, int *result)
+023   \{
+024     int     err, ix;
+025     mp_digit res;
+026   
+027     /* default to not */
+028     *result = MP_NO;
+029   
+030     for (ix = 0; ix < PRIME_SIZE; ix++) \{
+031       /* what is a mod __prime_tab[ix] */
+032       if ((err = mp_mod_d (a, __prime_tab[ix], &res)) != MP_OKAY) \{
+033         return err;
+034       \}
+035   
+036       /* is the residue zero? */
+037       if (res == 0) \{
+038         *result = MP_YES;
+039         return MP_OKAY;
+040       \}
+041     \}
+042   
+043     return MP_OKAY;
+044   \}
+045   #endif
+\end{alltt}
+\end{small}
+
+The algorithm defaults to a return of $0$ in case an error occurs.  The values in the prime table are all specified to be in the range of a 
+mp\_digit.  The table \_\_prime\_tab is defined in the following file.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_prime\_tab.c
+\vspace{-3mm}
+\begin{alltt}
+016   const mp_digit __prime_tab[] = \{
+017     0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
+018     0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
+019     0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
+020     0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F,
+021   #ifndef MP_8BIT
+022     0x0083,
+023     0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
+024     0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
+025     0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
+026     0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
+027   
+028     0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
+029     0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
+030     0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
+031     0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
+032     0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
+033     0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
+034     0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
+035     0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
+036   
+037     0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
+038     0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
+039     0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
+040     0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
+041     0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
+042     0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
+043     0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
+044     0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
+045   
+046     0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
+047     0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
+048     0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
+049     0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
+050     0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
+051     0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
+052     0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
+053     0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
+054   #endif
+055   \};
+056   #endif
+\end{alltt}
+\end{small}
+
+Note that there are two possible tables.  When an mp\_digit is 7-bits long only the primes up to $127$ may be included, otherwise the primes
+up to $1619$ are used.  Note that the value of \textbf{PRIME\_SIZE} is a constant dependent on the size of a mp\_digit. 
+
+\subsection{The Fermat Test}
+The Fermat test is probably one of the oldest tests to have a non-trivial probability of success.  It is based on the fact that if $n$ is in 
+fact prime then $a^{n} \equiv a \mbox{ (mod }n\mbox{)}$ for all $0 < a < n$.  The reason being that if $n$ is prime then the order of
+the multiplicative subgroup is $n - 1$.  Any base $a$ must have an order which divides $n - 1$ and as such $a^n$ is equivalent to 
+$a^1 = a$.  
+
+If $n$ is composite then any given base $a$ does not have to have a period which divides $n - 1$.  In which case 
+it is possible that $a^n \nequiv a \mbox{ (mod }n\mbox{)}$.  However, this test is not absolute as it is possible that the order
+of a base will divide $n - 1$ which would then be reported as prime.  Such a base yields what is known as a Fermat pseudo-prime.  Several 
+integers known as Carmichael numbers will be a pseudo-prime to all valid bases.  Fortunately such numbers are extremely rare as $n$ grows
+in size.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_prime\_fermat}. \\
+\textbf{Input}.   mp\_int $a$ and $b$, $a \ge 2$, $1 < b < a$.  \\
+\textbf{Output}.  $c = 1$ if $b^a \equiv b \mbox{ (mod }a\mbox{)}$, otherwise $c = 0$.  \\
+\hline \\
+1.  $t \leftarrow b^a \mbox{ (mod }a\mbox{)}$ \\
+2.  If $t = b$ then \\
+\hspace{3mm}2.1  $c \leftarrow 1$ \\
+3.  else \\
+\hspace{3mm}3.1  $c \leftarrow 0$ \\
+4.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_prime\_fermat}
+\end{figure}
+\textbf{Algorithm mp\_prime\_fermat.}
+This algorithm determines whether an mp\_int $a$ passes the Fermat test to the base $b$ or not.  It uses a single modular exponentiation to
+determine the result.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_prime\_fermat.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* performs one Fermat test.
+018    * 
+019    * If "a" were prime then b**a == b (mod a) since the order of
+020    * the multiplicative sub-group would be phi(a) = a-1.  That means
+021    * it would be the same as b**(a mod (a-1)) == b**1 == b (mod a).
+022    *
+023    * Sets result to 1 if the congruence holds, or zero otherwise.
+024    */
+025   int mp_prime_fermat (mp_int * a, mp_int * b, int *result)
+026   \{
+027     mp_int  t;
+028     int     err;
+029   
+030     /* default to composite  */
+031     *result = MP_NO;
+032   
+033     /* ensure b > 1 */
+034     if (mp_cmp_d(b, 1) != MP_GT) \{
+035        return MP_VAL;
+036     \}
+037   
+038     /* init t */
+039     if ((err = mp_init (&t)) != MP_OKAY) \{
+040       return err;
+041     \}
+042   
+043     /* compute t = b**a mod a */
+044     if ((err = mp_exptmod (b, a, a, &t)) != MP_OKAY) \{
+045       goto __T;
+046     \}
+047   
+048     /* is it equal to b? */
+049     if (mp_cmp (&t, b) == MP_EQ) \{
+050       *result = MP_YES;
+051     \}
+052   
+053     err = MP_OKAY;
+054   __T:mp_clear (&t);
+055     return err;
+056   \}
+057   #endif
+\end{alltt}
+\end{small}
+
+\subsection{The Miller-Rabin Test}
+The Miller-Rabin (citation) test is another primality test which has tighter error bounds than the Fermat test specifically with sequentially chosen 
+candidate  integers.  The algorithm is based on the observation that if $n - 1 = 2^kr$ and if $b^r \nequiv \pm 1$ then after up to $k - 1$ squarings the 
+value must be equal to $-1$.  The squarings are stopped as soon as $-1$ is observed.  If the value of $1$ is observed first it means that
+some value not congruent to $\pm 1$ when squared equals one which cannot occur if $n$ is prime.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_prime\_miller\_rabin}. \\
+\textbf{Input}.   mp\_int $a$ and $b$, $a \ge 2$, $1 < b < a$.  \\
+\textbf{Output}.  $c = 1$ if $a$ is a Miller-Rabin prime to the base $b$, otherwise $c = 0$.  \\
+\hline
+1.  $a' \leftarrow a - 1$ \\
+2.  $r  \leftarrow a'$    \\
+3.  $c \leftarrow 0, s  \leftarrow 0$ \\
+4.  While $r.used > 0$ and $r_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}4.1  $s \leftarrow s + 1$ \\
+\hspace{3mm}4.2  $r \leftarrow \lfloor r / 2 \rfloor$ \\
+5.  $y \leftarrow b^r \mbox{ (mod }a\mbox{)}$ \\
+6.  If $y \nequiv \pm 1$ then \\
+\hspace{3mm}6.1  $j \leftarrow 1$ \\
+\hspace{3mm}6.2  While $j \le (s - 1)$ and $y \nequiv a'$ \\
+\hspace{6mm}6.2.1  $y \leftarrow y^2 \mbox{ (mod }a\mbox{)}$ \\
+\hspace{6mm}6.2.2  If $y = 1$ then goto step 8. \\
+\hspace{6mm}6.2.3  $j \leftarrow j + 1$ \\
+\hspace{3mm}6.3  If $y \nequiv a'$ goto step 8. \\
+7.  $c \leftarrow 1$\\
+8.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_prime\_miller\_rabin}
+\end{figure}
+\textbf{Algorithm mp\_prime\_miller\_rabin.}
+This algorithm performs one trial round of the Miller-Rabin algorithm to the base $b$.  It will set $c = 1$ if the algorithm cannot determine
+if $a$ is composite or $c = 0$ if $a$ is provably composite.  The values of $s$ and $r$ are computed such that $a' = a - 1 = 2^sr$.  
+
+If the value $y \equiv b^r$ is congruent to $\pm 1$ then the algorithm cannot prove if $a$ is composite or not.  Otherwise, the algorithm will
+square $y$ up to $s - 1$ times stopping only when $y \equiv -1$.  If $y^2 \equiv 1$ and $y \nequiv \pm 1$ then the algorithm can report that $a$
+is provably composite.  If the algorithm performs $s - 1$ squarings and $y \nequiv -1$ then $a$ is provably composite.  If $a$ is not provably 
+composite then it is \textit{probably} prime.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_prime\_miller\_rabin.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* Miller-Rabin test of "a" to the base of "b" as described in 
+018    * HAC pp. 139 Algorithm 4.24
+019    *
+020    * Sets result to 0 if definitely composite or 1 if probably prime.
+021    * Randomly the chance of error is no more than 1/4 and often 
+022    * very much lower.
+023    */
+024   int mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
+025   \{
+026     mp_int  n1, y, r;
+027     int     s, j, err;
+028   
+029     /* default */
+030     *result = MP_NO;
+031   
+032     /* ensure b > 1 */
+033     if (mp_cmp_d(b, 1) != MP_GT) \{
+034        return MP_VAL;
+035     \}     
+036   
+037     /* get n1 = a - 1 */
+038     if ((err = mp_init_copy (&n1, a)) != MP_OKAY) \{
+039       return err;
+040     \}
+041     if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) \{
+042       goto __N1;
+043     \}
+044   
+045     /* set 2**s * r = n1 */
+046     if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) \{
+047       goto __N1;
+048     \}
+049   
+050     /* count the number of least significant bits
+051      * which are zero
+052      */
+053     s = mp_cnt_lsb(&r);
+054   
+055     /* now divide n - 1 by 2**s */
+056     if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) \{
+057       goto __R;
+058     \}
+059   
+060     /* compute y = b**r mod a */
+061     if ((err = mp_init (&y)) != MP_OKAY) \{
+062       goto __R;
+063     \}
+064     if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) \{
+065       goto __Y;
+066     \}
+067   
+068     /* if y != 1 and y != n1 do */
+069     if (mp_cmp_d (&y, 1) != MP_EQ && mp_cmp (&y, &n1) != MP_EQ) \{
+070       j = 1;
+071       /* while j <= s-1 and y != n1 */
+072       while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) \{
+073         if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) \{
+074            goto __Y;
+075         \}
+076   
+077         /* if y == 1 then composite */
+078         if (mp_cmp_d (&y, 1) == MP_EQ) \{
+079            goto __Y;
+080         \}
+081   
+082         ++j;
+083       \}
+084   
+085       /* if y != n1 then composite */
+086       if (mp_cmp (&y, &n1) != MP_EQ) \{
+087         goto __Y;
+088       \}
+089     \}
+090   
+091     /* probably prime now */
+092     *result = MP_YES;
+093   __Y:mp_clear (&y);
+094   __R:mp_clear (&r);
+095   __N1:mp_clear (&n1);
+096     return err;
+097   \}
+098   #endif
+\end{alltt}
+\end{small}
+
+
+
+
+\backmatter
+\appendix
+\begin{thebibliography}{ABCDEF}
+\bibitem[1]{TAOCPV2}
+Donald Knuth, \textit{The Art of Computer Programming}, Third Edition, Volume Two, Seminumerical Algorithms, Addison-Wesley, 1998
+
+\bibitem[2]{HAC}
+A. Menezes, P. van Oorschot, S. Vanstone, \textit{Handbook of Applied Cryptography}, CRC Press, 1996
+
+\bibitem[3]{ROSE}
+Michael Rosing, \textit{Implementing Elliptic Curve Cryptography}, Manning Publications, 1999
+
+\bibitem[4]{COMBA}
+Paul G. Comba, \textit{Exponentiation Cryptosystems on the IBM PC}. IBM Systems Journal 29(4): 526-538 (1990)
+
+\bibitem[5]{KARA}
+A. Karatsuba, Doklady Akad. Nauk SSSR 145 (1962), pp.293-294
+
+\bibitem[6]{KARAP}
+Andre Weimerskirch and Christof Paar, \textit{Generalizations of the Karatsuba Algorithm for Polynomial Multiplication}, Submitted to Design, Codes and Cryptography, March 2002
+
+\bibitem[7]{BARRETT}
+Paul Barrett, \textit{Implementing the Rivest Shamir and Adleman Public Key Encryption Algorithm on a Standard Digital Signal Processor}, Advances in Cryptology, Crypto '86, Springer-Verlag.
+
+\bibitem[8]{MONT}
+P.L.Montgomery. \textit{Modular multiplication without trial division}. Mathematics of Computation, 44(170):519-521, April 1985.
+
+\bibitem[9]{DRMET}
+Chae Hoon Lim and Pil Joong Lee, \textit{Generating Efficient Primes for Discrete Log Cryptosystems}, POSTECH Information Research Laboratories
+
+\bibitem[10]{MMB}
+J. Daemen and R. Govaerts and J. Vandewalle, \textit{Block ciphers based on Modular Arithmetic}, State and {P}rogress in the {R}esearch of {C}ryptography, 1993, pp. 80-89
+
+\bibitem[11]{RSAREF}
+R.L. Rivest, A. Shamir, L. Adleman, \textit{A Method for Obtaining Digital Signatures and Public-Key Cryptosystems}
+
+\bibitem[12]{DHREF}
+Whitfield Diffie, Martin E. Hellman, \textit{New Directions in Cryptography}, IEEE Transactions on Information Theory, 1976
+
+\bibitem[13]{IEEE}
+IEEE Standard for Binary Floating-Point Arithmetic (ANSI/IEEE Std 754-1985)
+
+\bibitem[14]{GMP}
+GNU Multiple Precision (GMP), \url{http://www.swox.com/gmp/}
+
+\bibitem[15]{MPI}
+Multiple Precision Integer Library (MPI), Michael Fromberger, \url{http://thayer.dartmouth.edu/~sting/mpi/}
+
+\bibitem[16]{OPENSSL}
+OpenSSL Cryptographic Toolkit, \url{http://openssl.org}
+
+\bibitem[17]{LIP}
+Large Integer Package, \url{http://home.hetnet.nl/~ecstr/LIP.zip}
+
+\bibitem[18]{ISOC}
+JTC1/SC22/WG14, ISO/IEC 9899:1999, ``A draft rationale for the C99 standard.''
+
+\bibitem[19]{JAVA}
+The Sun Java Website, \url{http://java.sun.com/}
+
+\end{thebibliography}
+
+\input{tommath.ind}
+
+\end{document}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tommath_class.h	Sun Dec 19 15:57:19 2004 +0000
@@ -0,0 +1,951 @@
+#if !(defined(LTM1) && defined(LTM2) && defined(LTM3))
+#if defined(LTM2)
+#define LTM3
+#endif
+#if defined(LTM1)
+#define LTM2
+#endif
+#define LTM1
+
+#if defined(LTM_ALL)
+#define BN_ERROR_C
+#define BN_FAST_MP_INVMOD_C
+#define BN_FAST_MP_MONTGOMERY_REDUCE_C
+#define BN_FAST_S_MP_MUL_DIGS_C
+#define BN_FAST_S_MP_MUL_HIGH_DIGS_C
+#define BN_FAST_S_MP_SQR_C
+#define BN_MP_2EXPT_C
+#define BN_MP_ABS_C
+#define BN_MP_ADD_C
+#define BN_MP_ADD_D_C
+#define BN_MP_ADDMOD_C
+#define BN_MP_AND_C
+#define BN_MP_CLAMP_C
+#define BN_MP_CLEAR_C
+#define BN_MP_CLEAR_MULTI_C
+#define BN_MP_CMP_C
+#define BN_MP_CMP_D_C
+#define BN_MP_CMP_MAG_C
+#define BN_MP_CNT_LSB_C
+#define BN_MP_COPY_C
+#define BN_MP_COUNT_BITS_C
+#define BN_MP_DIV_C
+#define BN_MP_DIV_2_C
+#define BN_MP_DIV_2D_C
+#define BN_MP_DIV_3_C
+#define BN_MP_DIV_D_C
+#define BN_MP_DR_IS_MODULUS_C
+#define BN_MP_DR_REDUCE_C
+#define BN_MP_DR_SETUP_C
+#define BN_MP_EXCH_C
+#define BN_MP_EXPT_D_C
+#define BN_MP_EXPTMOD_C
+#define BN_MP_EXPTMOD_FAST_C
+#define BN_MP_EXTEUCLID_C
+#define BN_MP_FREAD_C
+#define BN_MP_FWRITE_C
+#define BN_MP_GCD_C
+#define BN_MP_GET_INT_C
+#define BN_MP_GROW_C
+#define BN_MP_INIT_C
+#define BN_MP_INIT_COPY_C
+#define BN_MP_INIT_MULTI_C
+#define BN_MP_INIT_SET_C
+#define BN_MP_INIT_SET_INT_C
+#define BN_MP_INIT_SIZE_C
+#define BN_MP_INVMOD_C
+#define BN_MP_INVMOD_SLOW_C
+#define BN_MP_IS_SQUARE_C
+#define BN_MP_JACOBI_C
+#define BN_MP_KARATSUBA_MUL_C
+#define BN_MP_KARATSUBA_SQR_C
+#define BN_MP_LCM_C
+#define BN_MP_LSHD_C
+#define BN_MP_MOD_C
+#define BN_MP_MOD_2D_C
+#define BN_MP_MOD_D_C
+#define BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+#define BN_MP_MONTGOMERY_REDUCE_C
+#define BN_MP_MONTGOMERY_SETUP_C
+#define BN_MP_MUL_C
+#define BN_MP_MUL_2_C
+#define BN_MP_MUL_2D_C
+#define BN_MP_MUL_D_C
+#define BN_MP_MULMOD_C
+#define BN_MP_N_ROOT_C
+#define BN_MP_NEG_C
+#define BN_MP_OR_C
+#define BN_MP_PRIME_FERMAT_C
+#define BN_MP_PRIME_IS_DIVISIBLE_C
+#define BN_MP_PRIME_IS_PRIME_C
+#define BN_MP_PRIME_MILLER_RABIN_C
+#define BN_MP_PRIME_NEXT_PRIME_C
+#define BN_MP_PRIME_RABIN_MILLER_TRIALS_C
+#define BN_MP_PRIME_RANDOM_EX_C
+#define BN_MP_RADIX_SIZE_C
+#define BN_MP_RADIX_SMAP_C
+#define BN_MP_RAND_C
+#define BN_MP_READ_RADIX_C
+#define BN_MP_READ_SIGNED_BIN_C
+#define BN_MP_READ_UNSIGNED_BIN_C
+#define BN_MP_REDUCE_C
+#define BN_MP_REDUCE_2K_C
+#define BN_MP_REDUCE_2K_SETUP_C
+#define BN_MP_REDUCE_IS_2K_C
+#define BN_MP_REDUCE_SETUP_C
+#define BN_MP_RSHD_C
+#define BN_MP_SET_C
+#define BN_MP_SET_INT_C
+#define BN_MP_SHRINK_C
+#define BN_MP_SIGNED_BIN_SIZE_C
+#define BN_MP_SQR_C
+#define BN_MP_SQRMOD_C
+#define BN_MP_SQRT_C
+#define BN_MP_SUB_C
+#define BN_MP_SUB_D_C
+#define BN_MP_SUBMOD_C
+#define BN_MP_TO_SIGNED_BIN_C
+#define BN_MP_TO_UNSIGNED_BIN_C
+#define BN_MP_TOOM_MUL_C
+#define BN_MP_TOOM_SQR_C
+#define BN_MP_TORADIX_C
+#define BN_MP_TORADIX_N_C
+#define BN_MP_UNSIGNED_BIN_SIZE_C
+#define BN_MP_XOR_C
+#define BN_MP_ZERO_C
+#define BN_PRIME_TAB_C
+#define BN_REVERSE_C
+#define BN_S_MP_ADD_C
+#define BN_S_MP_EXPTMOD_C
+#define BN_S_MP_MUL_DIGS_C
+#define BN_S_MP_MUL_HIGH_DIGS_C
+#define BN_S_MP_SQR_C
+#define BN_S_MP_SUB_C
+#define BNCORE_C
+#endif
+
+#if defined(BN_ERROR_C)
+   #define BN_MP_ERROR_TO_STRING_C
+#endif
+
+#if defined(BN_FAST_MP_INVMOD_C)
+   #define BN_MP_ISEVEN_C
+   #define BN_MP_INIT_MULTI_C
+   #define BN_MP_COPY_C
+   #define BN_MP_ABS_C
+   #define BN_MP_SET_C
+   #define BN_MP_DIV_2_C
+   #define BN_MP_ISODD_C
+   #define BN_MP_SUB_C
+   #define BN_MP_CMP_C
+   #define BN_MP_ISZERO_C
+   #define BN_MP_CMP_D_C
+   #define BN_MP_ADD_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_MULTI_C
+#endif
+
+#if defined(BN_FAST_MP_MONTGOMERY_REDUCE_C)
+   #define BN_MP_GROW_C
+   #define BN_MP_RSHD_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_S_MP_SUB_C
+#endif
+
+#if defined(BN_FAST_S_MP_MUL_DIGS_C)
+   #define BN_MP_GROW_C
+   #define BN_MP_CLAMP_C
+#endif
+
+#if defined(BN_FAST_S_MP_MUL_HIGH_DIGS_C)
+   #define BN_MP_GROW_C
+   #define BN_MP_CLAMP_C
+#endif
+
+#if defined(BN_FAST_S_MP_SQR_C)
+   #define BN_MP_GROW_C
+   #define BN_MP_CLAMP_C
+#endif
+
+#if defined(BN_MP_2EXPT_C)
+   #define BN_MP_ZERO_C
+   #define BN_MP_GROW_C
+#endif
+
+#if defined(BN_MP_ABS_C)
+   #define BN_MP_COPY_C
+#endif
+
+#if defined(BN_MP_ADD_C)
+   #define BN_S_MP_ADD_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_S_MP_SUB_C
+#endif
+
+#if defined(BN_MP_ADD_D_C)
+   #define BN_MP_GROW_C
+   #define BN_MP_SUB_D_C
+   #define BN_MP_CLAMP_C
+#endif
+
+#if defined(BN_MP_ADDMOD_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_ADD_C
+   #define BN_MP_CLEAR_C
+   #define BN_MP_MOD_C
+#endif
+
+#if defined(BN_MP_AND_C)
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_CLAMP_C)
+#endif
+
+#if defined(BN_MP_CLEAR_C)
+#endif
+
+#if defined(BN_MP_CLEAR_MULTI_C)
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_CMP_C)
+   #define BN_MP_CMP_MAG_C
+#endif
+
+#if defined(BN_MP_CMP_D_C)
+#endif
+
+#if defined(BN_MP_CMP_MAG_C)
+#endif
+
+#if defined(BN_MP_CNT_LSB_C)
+   #define BN_MP_ISZERO_C
+#endif
+
+#if defined(BN_MP_COPY_C)
+   #define BN_MP_GROW_C
+#endif
+
+#if defined(BN_MP_COUNT_BITS_C)
+#endif
+
+#if defined(BN_MP_DIV_C)
+   #define BN_MP_ISZERO_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_MP_COPY_C
+   #define BN_MP_ZERO_C
+   #define BN_MP_INIT_MULTI_C
+   #define BN_MP_SET_C
+   #define BN_MP_COUNT_BITS_C
+   #define BN_MP_MUL_2D_C
+   #define BN_MP_CMP_C
+   #define BN_MP_SUB_C
+   #define BN_MP_ADD_C
+   #define BN_MP_DIV_2D_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_MULTI_C
+   #define BN_MP_INIT_SIZE_C
+   #define BN_MP_INIT_C
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_LSHD_C
+   #define BN_MP_RSHD_C
+   #define BN_MP_MUL_D_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_DIV_2_C)
+   #define BN_MP_GROW_C
+   #define BN_MP_CLAMP_C
+#endif
+
+#if defined(BN_MP_DIV_2D_C)
+   #define BN_MP_COPY_C
+   #define BN_MP_ZERO_C
+   #define BN_MP_INIT_C
+   #define BN_MP_MOD_2D_C
+   #define BN_MP_CLEAR_C
+   #define BN_MP_RSHD_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_EXCH_C
+#endif
+
+#if defined(BN_MP_DIV_3_C)
+   #define BN_MP_INIT_SIZE_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_DIV_D_C)
+   #define BN_MP_ISZERO_C
+   #define BN_MP_COPY_C
+   #define BN_MP_DIV_2D_C
+   #define BN_MP_DIV_3_C
+   #define BN_MP_INIT_SIZE_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_DR_IS_MODULUS_C)
+#endif
+
+#if defined(BN_MP_DR_REDUCE_C)
+   #define BN_MP_GROW_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_S_MP_SUB_C
+#endif
+
+#if defined(BN_MP_DR_SETUP_C)
+#endif
+
+#if defined(BN_MP_EXCH_C)
+#endif
+
+#if defined(BN_MP_EXPT_D_C)
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_SET_C
+   #define BN_MP_SQR_C
+   #define BN_MP_CLEAR_C
+   #define BN_MP_MUL_C
+#endif
+
+#if defined(BN_MP_EXPTMOD_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_INVMOD_C
+   #define BN_MP_CLEAR_C
+   #define BN_MP_ABS_C
+   #define BN_MP_CLEAR_MULTI_C
+   #define BN_MP_DR_IS_MODULUS_C
+   #define BN_MP_REDUCE_IS_2K_C
+   #define BN_MP_ISODD_C
+   #define BN_MP_EXPTMOD_FAST_C
+   #define BN_S_MP_EXPTMOD_C
+#endif
+
+#if defined(BN_MP_EXPTMOD_FAST_C)
+   #define BN_MP_COUNT_BITS_C
+   #define BN_MP_INIT_C
+   #define BN_MP_CLEAR_C
+   #define BN_MP_MONTGOMERY_SETUP_C
+   #define BN_FAST_MP_MONTGOMERY_REDUCE_C
+   #define BN_MP_MONTGOMERY_REDUCE_C
+   #define BN_MP_DR_SETUP_C
+   #define BN_MP_DR_REDUCE_C
+   #define BN_MP_REDUCE_2K_SETUP_C
+   #define BN_MP_REDUCE_2K_C
+   #define BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+   #define BN_MP_MULMOD_C
+   #define BN_MP_SET_C
+   #define BN_MP_MOD_C
+   #define BN_MP_COPY_C
+   #define BN_MP_SQR_C
+   #define BN_MP_MUL_C
+   #define BN_MP_EXCH_C
+#endif
+
+#if defined(BN_MP_EXTEUCLID_C)
+   #define BN_MP_INIT_MULTI_C
+   #define BN_MP_SET_C
+   #define BN_MP_COPY_C
+   #define BN_MP_ISZERO_C
+   #define BN_MP_DIV_C
+   #define BN_MP_MUL_C
+   #define BN_MP_SUB_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_MULTI_C
+#endif
+
+#if defined(BN_MP_FREAD_C)
+   #define BN_MP_ZERO_C
+   #define BN_MP_S_RMAP_C
+   #define BN_MP_MUL_D_C
+   #define BN_MP_ADD_D_C
+   #define BN_MP_CMP_D_C
+#endif
+
+#if defined(BN_MP_FWRITE_C)
+   #define BN_MP_RADIX_SIZE_C
+   #define BN_MP_TORADIX_C
+#endif
+
+#if defined(BN_MP_GCD_C)
+   #define BN_MP_ISZERO_C
+   #define BN_MP_ABS_C
+   #define BN_MP_ZERO_C
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_CNT_LSB_C
+   #define BN_MP_DIV_2D_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_MP_EXCH_C
+   #define BN_S_MP_SUB_C
+   #define BN_MP_MUL_2D_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_GET_INT_C)
+#endif
+
+#if defined(BN_MP_GROW_C)
+#endif
+
+#if defined(BN_MP_INIT_C)
+#endif
+
+#if defined(BN_MP_INIT_COPY_C)
+   #define BN_MP_COPY_C
+#endif
+
+#if defined(BN_MP_INIT_MULTI_C)
+   #define BN_MP_ERR_C
+   #define BN_MP_INIT_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_INIT_SET_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_SET_C
+#endif
+
+#if defined(BN_MP_INIT_SET_INT_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_SET_INT_C
+#endif
+
+#if defined(BN_MP_INIT_SIZE_C)
+   #define BN_MP_INIT_C
+#endif
+
+#if defined(BN_MP_INVMOD_C)
+   #define BN_MP_ISZERO_C
+   #define BN_MP_ISODD_C
+   #define BN_FAST_MP_INVMOD_C
+   #define BN_MP_INVMOD_SLOW_C
+#endif
+
+#if defined(BN_MP_INVMOD_SLOW_C)
+   #define BN_MP_ISZERO_C
+   #define BN_MP_INIT_MULTI_C
+   #define BN_MP_COPY_C
+   #define BN_MP_ISEVEN_C
+   #define BN_MP_SET_C
+   #define BN_MP_DIV_2_C
+   #define BN_MP_ISODD_C
+   #define BN_MP_ADD_C
+   #define BN_MP_SUB_C
+   #define BN_MP_CMP_C
+   #define BN_MP_CMP_D_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_MULTI_C
+#endif
+
+#if defined(BN_MP_IS_SQUARE_C)
+   #define BN_MP_MOD_D_C
+   #define BN_MP_INIT_SET_INT_C
+   #define BN_MP_MOD_C
+   #define BN_MP_GET_INT_C
+   #define BN_MP_SQRT_C
+   #define BN_MP_SQR_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_JACOBI_C)
+   #define BN_MP_CMP_D_C
+   #define BN_MP_ISZERO_C
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_CNT_LSB_C
+   #define BN_MP_DIV_2D_C
+   #define BN_MP_MOD_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_KARATSUBA_MUL_C)
+   #define BN_MP_MUL_C
+   #define BN_MP_INIT_SIZE_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_SUB_C
+   #define BN_MP_ADD_C
+   #define BN_MP_LSHD_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_KARATSUBA_SQR_C)
+   #define BN_MP_INIT_SIZE_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_SQR_C
+   #define BN_MP_SUB_C
+   #define BN_S_MP_ADD_C
+   #define BN_MP_LSHD_C
+   #define BN_MP_ADD_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_LCM_C)
+   #define BN_MP_INIT_MULTI_C
+   #define BN_MP_GCD_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_MP_DIV_C
+   #define BN_MP_MUL_C
+   #define BN_MP_CLEAR_MULTI_C
+#endif
+
+#if defined(BN_MP_LSHD_C)
+   #define BN_MP_GROW_C
+   #define BN_MP_RSHD_C
+#endif
+
+#if defined(BN_MP_MOD_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_DIV_C
+   #define BN_MP_CLEAR_C
+   #define BN_MP_ADD_C
+   #define BN_MP_EXCH_C
+#endif
+
+#if defined(BN_MP_MOD_2D_C)
+   #define BN_MP_ZERO_C
+   #define BN_MP_COPY_C
+   #define BN_MP_CLAMP_C
+#endif
+
+#if defined(BN_MP_MOD_D_C)
+   #define BN_MP_DIV_D_C
+#endif
+
+#if defined(BN_MP_MONTGOMERY_CALC_NORMALIZATION_C)
+   #define BN_MP_COUNT_BITS_C
+   #define BN_MP_2EXPT_C
+   #define BN_MP_SET_C
+   #define BN_MP_MUL_2_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_S_MP_SUB_C
+#endif
+
+#if defined(BN_MP_MONTGOMERY_REDUCE_C)
+   #define BN_FAST_MP_MONTGOMERY_REDUCE_C
+   #define BN_MP_GROW_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_RSHD_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_S_MP_SUB_C
+#endif
+
+#if defined(BN_MP_MONTGOMERY_SETUP_C)
+#endif
+
+#if defined(BN_MP_MUL_C)
+   #define BN_MP_TOOM_MUL_C
+   #define BN_MP_KARATSUBA_MUL_C
+   #define BN_FAST_S_MP_MUL_DIGS_C
+   #define BN_S_MP_MUL_C
+   #define BN_S_MP_MUL_DIGS_C
+#endif
+
+#if defined(BN_MP_MUL_2_C)
+   #define BN_MP_GROW_C
+#endif
+
+#if defined(BN_MP_MUL_2D_C)
+   #define BN_MP_COPY_C
+   #define BN_MP_GROW_C
+   #define BN_MP_LSHD_C
+   #define BN_MP_CLAMP_C
+#endif
+
+#if defined(BN_MP_MUL_D_C)
+   #define BN_MP_GROW_C
+   #define BN_MP_CLAMP_C
+#endif
+
+#if defined(BN_MP_MULMOD_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_MUL_C
+   #define BN_MP_CLEAR_C
+   #define BN_MP_MOD_C
+#endif
+
+#if defined(BN_MP_N_ROOT_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_SET_C
+   #define BN_MP_COPY_C
+   #define BN_MP_EXPT_D_C
+   #define BN_MP_MUL_C
+   #define BN_MP_SUB_C
+   #define BN_MP_MUL_D_C
+   #define BN_MP_DIV_C
+   #define BN_MP_CMP_C
+   #define BN_MP_SUB_D_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_NEG_C)
+   #define BN_MP_COPY_C
+   #define BN_MP_ISZERO_C
+#endif
+
+#if defined(BN_MP_OR_C)
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_PRIME_FERMAT_C)
+   #define BN_MP_CMP_D_C
+   #define BN_MP_INIT_C
+   #define BN_MP_EXPTMOD_C
+   #define BN_MP_CMP_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_PRIME_IS_DIVISIBLE_C)
+   #define BN_MP_MOD_D_C
+#endif
+
+#if defined(BN_MP_PRIME_IS_PRIME_C)
+   #define BN_MP_CMP_D_C
+   #define BN_MP_PRIME_IS_DIVISIBLE_C
+   #define BN_MP_INIT_C
+   #define BN_MP_SET_C
+   #define BN_MP_PRIME_MILLER_RABIN_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_PRIME_MILLER_RABIN_C)
+   #define BN_MP_CMP_D_C
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_SUB_D_C
+   #define BN_MP_CNT_LSB_C
+   #define BN_MP_DIV_2D_C
+   #define BN_MP_EXPTMOD_C
+   #define BN_MP_CMP_C
+   #define BN_MP_SQRMOD_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_PRIME_NEXT_PRIME_C)
+   #define BN_MP_CMP_D_C
+   #define BN_MP_SET_C
+   #define BN_MP_SUB_D_C
+   #define BN_MP_ISEVEN_C
+   #define BN_MP_MOD_D_C
+   #define BN_MP_INIT_C
+   #define BN_MP_ADD_D_C
+   #define BN_MP_PRIME_MILLER_RABIN_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_PRIME_RABIN_MILLER_TRIALS_C)
+#endif
+
+#if defined(BN_MP_PRIME_RANDOM_EX_C)
+   #define BN_MP_READ_UNSIGNED_BIN_C
+   #define BN_MP_PRIME_IS_PRIME_C
+   #define BN_MP_SUB_D_C
+   #define BN_MP_DIV_2_C
+   #define BN_MP_MUL_2_C
+   #define BN_MP_ADD_D_C
+#endif
+
+#if defined(BN_MP_RADIX_SIZE_C)
+   #define BN_MP_COUNT_BITS_C
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_ISZERO_C
+   #define BN_MP_DIV_D_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_RADIX_SMAP_C)
+   #define BN_MP_S_RMAP_C
+#endif
+
+#if defined(BN_MP_RAND_C)
+   #define BN_MP_ZERO_C
+   #define BN_MP_ADD_D_C
+   #define BN_MP_LSHD_C
+#endif
+
+#if defined(BN_MP_READ_RADIX_C)
+   #define BN_MP_ZERO_C
+   #define BN_MP_S_RMAP_C
+   #define BN_MP_MUL_D_C
+   #define BN_MP_ADD_D_C
+   #define BN_MP_ISZERO_C
+#endif
+
+#if defined(BN_MP_READ_SIGNED_BIN_C)
+   #define BN_MP_READ_UNSIGNED_BIN_C
+#endif
+
+#if defined(BN_MP_READ_UNSIGNED_BIN_C)
+   #define BN_MP_GROW_C
+   #define BN_MP_ZERO_C
+   #define BN_MP_MUL_2D_C
+   #define BN_MP_CLAMP_C
+#endif
+
+#if defined(BN_MP_REDUCE_C)
+   #define BN_MP_REDUCE_SETUP_C
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_RSHD_C
+   #define BN_MP_MUL_C
+   #define BN_S_MP_MUL_HIGH_DIGS_C
+   #define BN_FAST_S_MP_MUL_HIGH_DIGS_C
+   #define BN_MP_MOD_2D_C
+   #define BN_S_MP_MUL_DIGS_C
+   #define BN_MP_SUB_C
+   #define BN_MP_CMP_D_C
+   #define BN_MP_SET_C
+   #define BN_MP_LSHD_C
+   #define BN_MP_ADD_C
+   #define BN_MP_CMP_C
+   #define BN_S_MP_SUB_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_REDUCE_2K_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_COUNT_BITS_C
+   #define BN_MP_DIV_2D_C
+   #define BN_MP_MUL_D_C
+   #define BN_S_MP_ADD_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_S_MP_SUB_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_REDUCE_2K_SETUP_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_COUNT_BITS_C
+   #define BN_MP_2EXPT_C
+   #define BN_MP_CLEAR_C
+   #define BN_S_MP_SUB_C
+#endif
+
+#if defined(BN_MP_REDUCE_IS_2K_C)
+   #define BN_MP_REDUCE_2K_C
+   #define BN_MP_COUNT_BITS_C
+#endif
+
+#if defined(BN_MP_REDUCE_SETUP_C)
+   #define BN_MP_2EXPT_C
+   #define BN_MP_DIV_C
+#endif
+
+#if defined(BN_MP_RSHD_C)
+   #define BN_MP_ZERO_C
+#endif
+
+#if defined(BN_MP_SET_C)
+   #define BN_MP_ZERO_C
+#endif
+
+#if defined(BN_MP_SET_INT_C)
+   #define BN_MP_ZERO_C
+   #define BN_MP_MUL_2D_C
+   #define BN_MP_CLAMP_C
+#endif
+
+#if defined(BN_MP_SHRINK_C)
+#endif
+
+#if defined(BN_MP_SIGNED_BIN_SIZE_C)
+   #define BN_MP_UNSIGNED_BIN_SIZE_C
+#endif
+
+#if defined(BN_MP_SQR_C)
+   #define BN_MP_TOOM_SQR_C
+   #define BN_MP_KARATSUBA_SQR_C
+   #define BN_FAST_S_MP_SQR_C
+   #define BN_S_MP_SQR_C
+#endif
+
+#if defined(BN_MP_SQRMOD_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_SQR_C
+   #define BN_MP_CLEAR_C
+   #define BN_MP_MOD_C
+#endif
+
+#if defined(BN_MP_SQRT_C)
+   #define BN_MP_N_ROOT_C
+   #define BN_MP_ISZERO_C
+   #define BN_MP_ZERO_C
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_RSHD_C
+   #define BN_MP_DIV_C
+   #define BN_MP_ADD_C
+   #define BN_MP_DIV_2_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_SUB_C)
+   #define BN_S_MP_ADD_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_S_MP_SUB_C
+#endif
+
+#if defined(BN_MP_SUB_D_C)
+   #define BN_MP_GROW_C
+   #define BN_MP_ADD_D_C
+   #define BN_MP_CLAMP_C
+#endif
+
+#if defined(BN_MP_SUBMOD_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_SUB_C
+   #define BN_MP_CLEAR_C
+   #define BN_MP_MOD_C
+#endif
+
+#if defined(BN_MP_TO_SIGNED_BIN_C)
+   #define BN_MP_TO_UNSIGNED_BIN_C
+#endif
+
+#if defined(BN_MP_TO_UNSIGNED_BIN_C)
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_ISZERO_C
+   #define BN_MP_DIV_2D_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_TOOM_MUL_C)
+   #define BN_MP_INIT_MULTI_C
+   #define BN_MP_MOD_2D_C
+   #define BN_MP_COPY_C
+   #define BN_MP_RSHD_C
+   #define BN_MP_MUL_C
+   #define BN_MP_MUL_2_C
+   #define BN_MP_ADD_C
+   #define BN_MP_SUB_C
+   #define BN_MP_DIV_2_C
+   #define BN_MP_MUL_2D_C
+   #define BN_MP_MUL_D_C
+   #define BN_MP_DIV_3_C
+   #define BN_MP_LSHD_C
+   #define BN_MP_CLEAR_MULTI_C
+#endif
+
+#if defined(BN_MP_TOOM_SQR_C)
+   #define BN_MP_INIT_MULTI_C
+   #define BN_MP_MOD_2D_C
+   #define BN_MP_COPY_C
+   #define BN_MP_RSHD_C
+   #define BN_MP_SQR_C
+   #define BN_MP_MUL_2_C
+   #define BN_MP_ADD_C
+   #define BN_MP_SUB_C
+   #define BN_MP_DIV_2_C
+   #define BN_MP_MUL_2D_C
+   #define BN_MP_MUL_D_C
+   #define BN_MP_DIV_3_C
+   #define BN_MP_LSHD_C
+   #define BN_MP_CLEAR_MULTI_C
+#endif
+
+#if defined(BN_MP_TORADIX_C)
+   #define BN_MP_ISZERO_C
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_DIV_D_C
+   #define BN_MP_CLEAR_C
+   #define BN_MP_S_RMAP_C
+#endif
+
+#if defined(BN_MP_TORADIX_N_C)
+   #define BN_MP_ISZERO_C
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_DIV_D_C
+   #define BN_MP_CLEAR_C
+   #define BN_MP_S_RMAP_C
+#endif
+
+#if defined(BN_MP_UNSIGNED_BIN_SIZE_C)
+   #define BN_MP_COUNT_BITS_C
+#endif
+
+#if defined(BN_MP_XOR_C)
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_ZERO_C)
+#endif
+
+#if defined(BN_PRIME_TAB_C)
+#endif
+
+#if defined(BN_REVERSE_C)
+#endif
+
+#if defined(BN_S_MP_ADD_C)
+   #define BN_MP_GROW_C
+   #define BN_MP_CLAMP_C
+#endif
+
+#if defined(BN_S_MP_EXPTMOD_C)
+   #define BN_MP_COUNT_BITS_C
+   #define BN_MP_INIT_C
+   #define BN_MP_CLEAR_C
+   #define BN_MP_REDUCE_SETUP_C
+   #define BN_MP_MOD_C
+   #define BN_MP_COPY_C
+   #define BN_MP_SQR_C
+   #define BN_MP_REDUCE_C
+   #define BN_MP_MUL_C
+   #define BN_MP_SET_C
+   #define BN_MP_EXCH_C
+#endif
+
+#if defined(BN_S_MP_MUL_DIGS_C)
+   #define BN_FAST_S_MP_MUL_DIGS_C
+   #define BN_MP_INIT_SIZE_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_S_MP_MUL_HIGH_DIGS_C)
+   #define BN_FAST_S_MP_MUL_HIGH_DIGS_C
+   #define BN_MP_INIT_SIZE_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_S_MP_SQR_C)
+   #define BN_MP_INIT_SIZE_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_S_MP_SUB_C)
+   #define BN_MP_GROW_C
+   #define BN_MP_CLAMP_C
+#endif
+
+#if defined(BNCORE_C)
+#endif
+
+#ifdef LTM3
+#define LTM_LAST
+#endif
+#include <tommath_superclass.h>
+#include <tommath_class.h>
+#else
+#define LTM_LAST
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tommath_superclass.h	Sun Dec 19 15:57:19 2004 +0000
@@ -0,0 +1,72 @@
+/* super class file for PK algos */
+
+/* default ... include all MPI */
+#define LTM_ALL
+
+/* RSA only (does not support DH/DSA/ECC) */
+/* #define SC_RSA_1 */
+
+/* For reference.... On an Athlon64 optimizing for speed...
+
+   LTM's mpi.o with all functions [stripped] is 142KiB in size.
+
+*/
+
+/* Works for RSA only, mpi.o is 68KiB */
+#ifdef SC_RSA_1
+   #define BN_MP_SHRINK_C
+   #define BN_MP_LCM_C
+   #define BN_MP_PRIME_RANDOM_EX_C
+   #define BN_MP_INVMOD_C
+   #define BN_MP_GCD_C
+   #define BN_MP_MOD_C
+   #define BN_MP_MULMOD_C
+   #define BN_MP_ADDMOD_C
+   #define BN_MP_EXPTMOD_C
+   #define BN_MP_SET_INT_C
+   #define BN_MP_INIT_MULTI_C
+   #define BN_MP_CLEAR_MULTI_C
+   #define BN_MP_UNSIGNED_BIN_SIZE_C
+   #define BN_MP_TO_UNSIGNED_BIN_C
+   #define BN_MP_MOD_D_C
+   #define BN_MP_PRIME_RABIN_MILLER_TRIALS_C
+   #define BN_REVERSE_C
+   #define BN_PRIME_TAB_C
+
+   /* other modifiers */
+   #define BN_MP_DIV_SMALL                    /* Slower division, not critical */
+
+   /* here we are on the last pass so we turn things off.  The function classes are still there
+    * but we remove them specifically from the build.  This also invokes tweaks in functions
+    * like removing support for even moduli, etc...
+    */
+#ifdef LTM_LAST
+   #undef  BN_MP_TOOM_MUL_C
+   #undef  BN_MP_TOOM_SQR_C
+   #undef  BN_MP_KARATSUBA_MUL_C
+   #undef  BN_MP_KARATSUBA_SQR_C
+   #undef  BN_MP_REDUCE_C
+   #undef  BN_MP_REDUCE_SETUP_C
+   #undef  BN_MP_DR_IS_MODULUS_C
+   #undef  BN_MP_DR_SETUP_C
+   #undef  BN_MP_DR_REDUCE_C
+   #undef  BN_MP_REDUCE_IS_2K_C
+   #undef  BN_MP_REDUCE_2K_SETUP_C
+   #undef  BN_MP_REDUCE_2K_C
+   #undef  BN_S_MP_EXPTMOD_C
+   #undef  BN_MP_DIV_3_C
+   #undef  BN_S_MP_MUL_HIGH_DIGS_C
+   #undef  BN_FAST_S_MP_MUL_HIGH_DIGS_C
+   #undef  BN_FAST_MP_INVMOD_C
+
+   /* To safely undefine these you have to make sure your RSA key won't exceed the Comba threshold
+    * which is roughly 255 digits [7140 bits for 32-bit machines, 15300 bits for 64-bit machines]
+    * which means roughly speaking you can handle up to 2536-bit RSA keys with these defined without
+    * trouble.
+    */
+   #undef  BN_S_MP_MUL_DIGS_C
+   #undef  BN_S_MP_SQR_C
+   #undef  BN_MP_MONTGOMERY_REDUCE_C
+#endif
+
+#endif